summaryrefslogtreecommitdiff
path: root/drivers/staging
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/staging')
-rw-r--r--drivers/staging/rdma/hfi1/affinity.c93
-rw-r--r--drivers/staging/rdma/hfi1/affinity.h19
-rw-r--r--drivers/staging/rdma/hfi1/chip.c647
-rw-r--r--drivers/staging/rdma/hfi1/chip.h7
-rw-r--r--drivers/staging/rdma/hfi1/chip_registers.h1
-rw-r--r--drivers/staging/rdma/hfi1/diag.c3
-rw-r--r--drivers/staging/rdma/hfi1/driver.c3
-rw-r--r--drivers/staging/rdma/hfi1/firmware.c9
-rw-r--r--drivers/staging/rdma/hfi1/hfi.h11
-rw-r--r--drivers/staging/rdma/hfi1/init.c25
-rw-r--r--drivers/staging/rdma/hfi1/mad.c16
-rw-r--r--drivers/staging/rdma/hfi1/mmu_rb.c35
-rw-r--r--drivers/staging/rdma/hfi1/mmu_rb.h2
-rw-r--r--drivers/staging/rdma/hfi1/pio.c52
-rw-r--r--drivers/staging/rdma/hfi1/pio.h4
-rw-r--r--drivers/staging/rdma/hfi1/platform.c99
-rw-r--r--drivers/staging/rdma/hfi1/qp.c6
-rw-r--r--drivers/staging/rdma/hfi1/qsfp.c58
-rw-r--r--drivers/staging/rdma/hfi1/qsfp.h15
-rw-r--r--drivers/staging/rdma/hfi1/rc.c9
-rw-r--r--drivers/staging/rdma/hfi1/ruc.c20
-rw-r--r--drivers/staging/rdma/hfi1/sysfs.c4
-rw-r--r--drivers/staging/rdma/hfi1/ud.c8
-rw-r--r--drivers/staging/rdma/hfi1/user_exp_rcv.c7
-rw-r--r--drivers/staging/rdma/hfi1/user_sdma.c97
-rw-r--r--drivers/staging/rdma/hfi1/verbs.c108
-rw-r--r--drivers/staging/rdma/hfi1/verbs.h4
27 files changed, 919 insertions, 443 deletions
diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/staging/rdma/hfi1/affinity.c
index 2cb8ca7..6e7050a 100644
--- a/drivers/staging/rdma/hfi1/affinity.c
+++ b/drivers/staging/rdma/hfi1/affinity.c
@@ -53,20 +53,6 @@
#include "sdma.h"
#include "trace.h"
-struct cpu_mask_set {
- struct cpumask mask;
- struct cpumask used;
- uint gen;
-};
-
-struct hfi1_affinity {
- struct cpu_mask_set def_intr;
- struct cpu_mask_set rcv_intr;
- struct cpu_mask_set proc;
- /* spin lock to protect affinity struct */
- spinlock_t lock;
-};
-
/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
"SDMA",
@@ -82,6 +68,48 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
set->gen = 0;
}
+/* Initialize non-HT cpu cores mask */
+int init_real_cpu_mask(struct hfi1_devdata *dd)
+{
+ struct hfi1_affinity *info;
+ int possible, curr_cpu, i, ht;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ cpumask_clear(&info->real_cpu_mask);
+
+ /* Start with cpu online mask as the real cpu mask */
+ cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+
+ /*
+ * Remove HT cores from the real cpu mask. Do this in two steps below.
+ */
+ possible = cpumask_weight(&info->real_cpu_mask);
+ ht = cpumask_weight(topology_sibling_cpumask(
+ cpumask_first(&info->real_cpu_mask)));
+ /*
+ * Step 1. Skip over the first N HT siblings and use them as the
+ * "real" cores. Assumes that HT cores are not enumerated in
+ * succession (except in the single core case).
+ */
+ curr_cpu = cpumask_first(&info->real_cpu_mask);
+ for (i = 0; i < possible / ht; i++)
+ curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+ /*
+ * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
+ * skip any gaps.
+ */
+ for (; i < possible; i++) {
+ cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
+ curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+ }
+
+ dd->affinity = info;
+ return 0;
+}
+
/*
* Interrupt affinity.
*
@@ -93,20 +121,17 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
* to the node relative 1 as necessary.
*
*/
-int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
int node = pcibus_to_node(dd->pcidev->bus);
- struct hfi1_affinity *info;
+ struct hfi1_affinity *info = dd->affinity;
const struct cpumask *local_mask;
- int curr_cpu, possible, i, ht;
+ int curr_cpu, possible, i;
if (node < 0)
node = numa_node_id();
dd->node = node;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info)
- return -ENOMEM;
spin_lock_init(&info->lock);
init_cpu_mask_set(&info->def_intr);
@@ -116,30 +141,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
local_mask = cpumask_of_node(dd->node);
if (cpumask_first(local_mask) >= nr_cpu_ids)
local_mask = topology_core_cpumask(0);
- /* use local mask as default */
- cpumask_copy(&info->def_intr.mask, local_mask);
- /*
- * Remove HT cores from the default mask. Do this in two steps below.
- */
- possible = cpumask_weight(&info->def_intr.mask);
- ht = cpumask_weight(topology_sibling_cpumask(
- cpumask_first(&info->def_intr.mask)));
- /*
- * Step 1. Skip over the first N HT siblings and use them as the
- * "real" cores. Assumes that HT cores are not enumerated in
- * succession (except in the single core case).
- */
- curr_cpu = cpumask_first(&info->def_intr.mask);
- for (i = 0; i < possible / ht; i++)
- curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
- /*
- * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
- * skip any gaps.
- */
- for (; i < possible; i++) {
- cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
- curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
- }
+ /* Use the "real" cpu mask of this node as the default */
+ cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
/* fill in the receive list */
possible = cpumask_weight(&info->def_intr.mask);
@@ -167,8 +170,6 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
}
cpumask_copy(&info->proc.mask, cpu_online_mask);
- dd->affinity = info;
- return 0;
}
void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/staging/rdma/hfi1/affinity.h
index b287e49..20f52fe 100644
--- a/drivers/staging/rdma/hfi1/affinity.h
+++ b/drivers/staging/rdma/hfi1/affinity.h
@@ -64,10 +64,27 @@ enum affinity_flags {
AFF_IRQ_LOCAL
};
+struct cpu_mask_set {
+ struct cpumask mask;
+ struct cpumask used;
+ uint gen;
+};
+
+struct hfi1_affinity {
+ struct cpu_mask_set def_intr;
+ struct cpu_mask_set rcv_intr;
+ struct cpu_mask_set proc;
+ struct cpumask real_cpu_mask;
+ /* spin lock to protect affinity struct */
+ spinlock_t lock;
+};
+
struct hfi1_msix_entry;
+/* Initialize non-HT cpu cores mask */
+int init_real_cpu_mask(struct hfi1_devdata *);
/* Initialize driver affinity data */
-int hfi1_dev_affinity_init(struct hfi1_devdata *);
+void hfi1_dev_affinity_init(struct hfi1_devdata *);
/* Free driver affinity data */
void hfi1_dev_affinity_free(struct hfi1_devdata *);
/*
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
index 16eb653..dcae8e7 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -123,6 +123,8 @@ struct flag_table {
#define MIN_KERNEL_KCTXTS 2
#define FIRST_KERNEL_KCTXT 1
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES 256
#define NUM_MAP_REGS 32
/* Bit offset into the GUID which carries HFI id information */
@@ -1029,9 +1031,12 @@ static int thermal_init(struct hfi1_devdata *dd);
static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
int msecs);
static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
static void handle_temp_err(struct hfi1_devdata *);
static void dc_shutdown(struct hfi1_devdata *);
static void dc_start(struct hfi1_devdata *);
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+ unsigned int *np);
/*
* Error interrupt table entry. This is used as input to the interrupt
@@ -5661,7 +5666,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
sci = &dd->send_contexts[sw_index];
/* there is no information for user (PSM) and ack contexts */
- if (sci->type != SC_KERNEL)
+ if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
return -1;
sc = sci->sc;
@@ -6199,18 +6204,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
/*
* Handle host requests from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
*/
-void handle_8051_request(struct work_struct *work)
+static void handle_8051_request(struct hfi1_pportdata *ppd)
{
- struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
- dc_host_req_work);
struct hfi1_devdata *dd = ppd->dd;
u64 reg;
u16 data = 0;
- u8 type, i, lanes, *cache = ppd->qsfp_info.cache;
- u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
+ u8 type;
reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
@@ -6231,46 +6231,11 @@ void handle_8051_request(struct work_struct *work)
case HREQ_READ_CONFIG:
case HREQ_SET_TX_EQ_ABS:
case HREQ_SET_TX_EQ_REL:
+ case HREQ_ENABLE:
dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
type);
hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
break;
-
- case HREQ_ENABLE:
- lanes = data & 0xF;
- for (i = 0; lanes; lanes >>= 1, i++) {
- if (!(lanes & 1))
- continue;
- if (data & 0x200) {
- /* enable TX CDR */
- if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
- cache[QSFP_CDR_INFO_OFFS] & 0x80)
- cdr_ctrl_byte |= (1 << (i + 4));
- } else {
- /* disable TX CDR */
- if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
- cache[QSFP_CDR_INFO_OFFS] & 0x80)
- cdr_ctrl_byte &= ~(1 << (i + 4));
- }
-
- if (data & 0x800) {
- /* enable RX CDR */
- if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
- cache[QSFP_CDR_INFO_OFFS] & 0x40)
- cdr_ctrl_byte |= (1 << i);
- } else {
- /* disable RX CDR */
- if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
- cache[QSFP_CDR_INFO_OFFS] & 0x40)
- cdr_ctrl_byte &= ~(1 << i);
- }
- }
- one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
- &cdr_ctrl_byte, 1);
- hreq_response(dd, HREQ_SUCCESS, data);
- refresh_qsfp_cache(ppd, &ppd->qsfp_info);
- break;
-
case HREQ_CONFIG_DONE:
hreq_response(dd, HREQ_SUCCESS, 0);
break;
@@ -6278,7 +6243,6 @@ void handle_8051_request(struct work_struct *work)
case HREQ_INTERFACE_TEST:
hreq_response(dd, HREQ_SUCCESS, data);
break;
-
default:
dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
@@ -6849,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
ppd->neighbor_fm_security = 0;
}
+static const char * const link_down_reason_strs[] = {
+ [OPA_LINKDOWN_REASON_NONE] = "None",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0",
+ [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
+ [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
+ [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
+ [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
+ [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
+ [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
+ [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
+ [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
+ [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
+ [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
+ [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
+ [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
+ [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
+ [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
+ [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
+ [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
+ [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
+ [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
+ [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
+ [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
+ "Excessive buffer overrun",
+ [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
+ [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
+ [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
+ [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
+ [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
+ [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
+ [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
+ [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
+ "Local media not installed",
+ [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
+ [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
+ [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
+ "End to end not installed",
+ [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
+ [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
+ [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
+ [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
+ [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
+ [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
+};
+
+/* return the neighbor link down reason string */
+static const char *link_down_reason_str(u8 reason)
+{
+ const char *str = NULL;
+
+ if (reason < ARRAY_SIZE(link_down_reason_strs))
+ str = link_down_reason_strs[reason];
+ if (!str)
+ str = "(invalid)";
+
+ return str;
+}
+
/*
* Handle a link down interrupt from the 8051.
*
@@ -6857,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
void handle_link_down(struct work_struct *work)
{
u8 lcl_reason, neigh_reason = 0;
+ u8 link_down_reason;
struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
- link_down_work);
+ link_down_work);
+ int was_up;
+ static const char ldr_str[] = "Link down reason: ";
if ((ppd->host_link_state &
(HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
@@ -6867,20 +6903,63 @@ void handle_link_down(struct work_struct *work)
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
/* Go offline first, then deal with reading/writing through 8051 */
+ was_up = !!(ppd->host_link_state & HLS_UP);
set_link_state(ppd, HLS_DN_OFFLINE);
- lcl_reason = 0;
- read_planned_down_reason_code(ppd->dd, &neigh_reason);
+ if (was_up) {
+ lcl_reason = 0;
+ /* link down reason is only valid if the link was up */
+ read_link_down_reason(ppd->dd, &link_down_reason);
+ switch (link_down_reason) {
+ case LDR_LINK_TRANSFER_ACTIVE_LOW:
+ /* the link went down, no idle message reason */
+ dd_dev_info(ppd->dd, "%sUnexpected link down\n",
+ ldr_str);
+ break;
+ case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
+ /*
+ * The neighbor reason is only valid if an idle message
+ * was received for it.
+ */
+ read_planned_down_reason_code(ppd->dd, &neigh_reason);
+ dd_dev_info(ppd->dd,
+ "%sNeighbor link down message %d, %s\n",
+ ldr_str, neigh_reason,
+ link_down_reason_str(neigh_reason));
+ break;
+ case LDR_RECEIVED_HOST_OFFLINE_REQ:
+ dd_dev_info(ppd->dd,
+ "%sHost requested link to go offline\n",
+ ldr_str);
+ break;
+ default:
+ dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
+ ldr_str, link_down_reason);
+ break;
+ }
- /*
- * If no reason, assume peer-initiated but missed
- * LinkGoingDown idle flits.
- */
- if (neigh_reason == 0)
- lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+ /*
+ * If no reason, assume peer-initiated but missed
+ * LinkGoingDown idle flits.
+ */
+ if (neigh_reason == 0)
+ lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+ } else {
+ /* went down while polling or going up */
+ lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
+ }
set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+ /* inform the SMA when the link transitions from up to down */
+ if (was_up && ppd->local_link_down_reason.sma == 0 &&
+ ppd->neigh_link_down_reason.sma == 0) {
+ ppd->local_link_down_reason.sma =
+ ppd->local_link_down_reason.latest;
+ ppd->neigh_link_down_reason.sma =
+ ppd->neigh_link_down_reason.latest;
+ }
+
reset_neighbor_info(ppd);
/* disable the port */
@@ -6890,7 +6969,7 @@ void handle_link_down(struct work_struct *work)
* If there is no cable attached, turn the DC off. Otherwise,
* start the link bring up.
*/
- if (!qsfp_mod_present(ppd)) {
+ if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
dc_shutdown(ppd->dd);
} else {
tune_serdes(ppd);
@@ -7373,7 +7452,11 @@ retry:
ppd->link_width_downgrade_rx_active = rx;
}
- if (lwde == 0) {
+ if (ppd->link_width_downgrade_tx_active == 0 ||
+ ppd->link_width_downgrade_rx_active == 0) {
+ /* the 8051 reported a dead link as a downgrade */
+ dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
+ } else if (lwde == 0) {
/* downgrade is disabled */
/* bounce if not at starting active width */
@@ -7534,7 +7617,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
host_msg &= ~(u64)LINKUP_ACHIEVED;
}
if (host_msg & EXT_DEVICE_CFG_REQ) {
- queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work);
+ handle_8051_request(ppd);
host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
}
if (host_msg & VERIFY_CAP_FRAME) {
@@ -8660,6 +8743,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
*pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
}
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
+{
+ u32 frame;
+
+ read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame);
+ *ldr = (frame & 0xff);
+}
+
static int read_tx_settings(struct hfi1_devdata *dd,
u8 *enable_lane_tx,
u8 *tx_polarity_inversion,
@@ -9049,9 +9140,9 @@ set_local_link_attributes_fail:
}
/*
- * Call this to start the link. Schedule a retry if the cable is not
- * present or if unable to start polling. Do not do anything if the
- * link is disabled. Returns 0 if link is disabled or moved to polling
+ * Call this to start the link.
+ * Do not do anything if the link is disabled.
+ * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
*/
int start_link(struct hfi1_pportdata *ppd)
{
@@ -9068,15 +9159,7 @@ int start_link(struct hfi1_pportdata *ppd)
return 0;
}
- if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
- loopback == LOOPBACK_LCB ||
- ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
- return set_link_state(ppd, HLS_DN_POLL);
-
- dd_dev_info(ppd->dd,
- "%s: stopping link start because no cable is present\n",
- __func__);
- return -EAGAIN;
+ return set_link_state(ppd, HLS_DN_POLL);
}
static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
@@ -9247,7 +9330,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
return 0;
}
-/* This routine will only be scheduled if the QSFP module is present */
+/* This routine will only be scheduled if the QSFP module present is asserted */
void qsfp_event(struct work_struct *work)
{
struct qsfp_data *qd;
@@ -9676,6 +9759,7 @@ static void set_send_length(struct hfi1_pportdata *ppd)
& SEND_LEN_CHECK1_LEN_VL15_MASK) <<
SEND_LEN_CHECK1_LEN_VL15_SHIFT;
int i;
+ u32 thres;
for (i = 0; i < ppd->vls_supported; i++) {
if (dd->vld[i].mtu > maxvlmtu)
@@ -9694,16 +9778,17 @@ static void set_send_length(struct hfi1_pportdata *ppd)
/* adjust kernel credit return thresholds based on new MTUs */
/* all kernel receive contexts have the same hdrqentsize */
for (i = 0; i < ppd->vls_supported; i++) {
- sc_set_cr_threshold(dd->vld[i].sc,
- sc_mtu_to_threshold(dd->vld[i].sc,
- dd->vld[i].mtu,
- dd->rcd[0]->
- rcvhdrqentsize));
- }
- sc_set_cr_threshold(dd->vld[15].sc,
- sc_mtu_to_threshold(dd->vld[15].sc,
- dd->vld[15].mtu,
+ thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+ sc_mtu_to_threshold(dd->vld[i].sc,
+ dd->vld[i].mtu,
dd->rcd[0]->rcvhdrqentsize));
+ sc_set_cr_threshold(dd->vld[i].sc, thres);
+ }
+ thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+ sc_mtu_to_threshold(dd->vld[15].sc,
+ dd->vld[15].mtu,
+ dd->rcd[0]->rcvhdrqentsize));
+ sc_set_cr_threshold(dd->vld[15].sc, thres);
/* Adjust maximum MTU for the port in DC */
dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
@@ -10030,7 +10115,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
struct hfi1_devdata *dd = ppd->dd;
struct ib_event event = {.device = NULL};
int ret1, ret = 0;
- int was_up, is_down;
int orig_new_state, poll_bounce;
mutex_lock(&ppd->hls_lock);
@@ -10049,8 +10133,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
poll_bounce ? "(bounce) " : "",
link_state_reason_name(ppd, state));
- was_up = !!(ppd->host_link_state & HLS_UP);
-
/*
* If we're going to a (HLS_*) link state that implies the logical
* link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
@@ -10261,17 +10343,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
break;
}
- is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
- HLS_DN_DISABLE | HLS_DN_OFFLINE));
-
- if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
- ppd->neigh_link_down_reason.sma == 0) {
- ppd->local_link_down_reason.sma =
- ppd->local_link_down_reason.latest;
- ppd->neigh_link_down_reason.sma =
- ppd->neigh_link_down_reason.latest;
- }
-
goto done;
unexpected:
@@ -12673,22 +12744,24 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
int total_contexts;
int ret;
unsigned ngroups;
+ int qos_rmt_count;
+ int user_rmt_reduced;
/*
- * Kernel contexts: (to be fixed later):
- * - min or 2 or 1 context/numa
+ * Kernel receive contexts:
+ * - min of 2 or 1 context/numa (excluding control context)
* - Context 0 - control context (VL15/multicast/error)
- * - Context 1 - default context
+ * - Context 1 - first kernel context
+ * - Context 2 - second kernel context
+ * ...
*/
if (n_krcvqs)
/*
- * Don't count context 0 in n_krcvqs since
- * is isn't used for normal verbs traffic.
- *
- * krcvqs will reflect number of kernel
- * receive contexts above 0.
+ * n_krcvqs is the sum of module parameter kernel receive
+ * contexts, krcvqs[]. It does not include the control
+ * context, so add that.
*/
- num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1;
+ num_kernel_contexts = n_krcvqs + 1;
else
num_kernel_contexts = num_online_nodes() + 1;
num_kernel_contexts =
@@ -12705,12 +12778,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
}
/*
- * User contexts: (to be fixed later)
- * - default to 1 user context per CPU if num_user_contexts is
- * negative
+ * User contexts:
+ * - default to 1 user context per real (non-HT) CPU core if
+ * num_user_contexts is negative
*/
if (num_user_contexts < 0)
- num_user_contexts = num_online_cpus();
+ num_user_contexts =
+ cpumask_weight(&dd->affinity->real_cpu_mask);
total_contexts = num_kernel_contexts + num_user_contexts;
@@ -12727,6 +12801,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
total_contexts = num_kernel_contexts + num_user_contexts;
}
+ /* each user context requires an entry in the RMT */
+ qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
+ if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
+ user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+ dd_dev_err(dd,
+ "RMT size is reducing the number of user receive contexts from %d to %d\n",
+ (int)num_user_contexts,
+ user_rmt_reduced);
+ /* recalculate */
+ num_user_contexts = user_rmt_reduced;
+ total_contexts = num_kernel_contexts + num_user_contexts;
+ }
+
/* the first N are kernel contexts, the rest are user contexts */
dd->num_rcv_contexts = total_contexts;
dd->n_krcv_queues = num_kernel_contexts;
@@ -12776,12 +12863,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
dd->num_send_contexts = ret;
dd_dev_info(
dd,
- "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+ "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
dd->chip_send_contexts,
dd->num_send_contexts,
dd->sc_sizes[SC_KERNEL].count,
dd->sc_sizes[SC_ACK].count,
- dd->sc_sizes[SC_USER].count);
+ dd->sc_sizes[SC_USER].count,
+ dd->sc_sizes[SC_VL15].count);
ret = 0; /* success */
}
@@ -13451,122 +13539,224 @@ static void init_qpmap_table(struct hfi1_devdata *dd,
int i;
u64 ctxt = first_ctxt;
- for (i = 0; i < 256;) {
+ for (i = 0; i < 256; i++) {
reg |= ctxt << (8 * (i % 8));
- i++;
ctxt++;
if (ctxt > last_ctxt)
ctxt = first_ctxt;
- if (i % 8 == 0) {
+ if (i % 8 == 7) {
write_csr(dd, regno, reg);
reg = 0;
regno += 8;
}
}
- if (i % 8)
- write_csr(dd, regno, reg);
add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
| RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
}
-/**
- * init_qos - init RX qos
- * @dd - device data
- * @first_context
- *
- * This routine initializes Rule 0 and the
- * RSM map table to implement qos.
- *
- * If all of the limit tests succeed,
- * qos is applied based on the array
- * interpretation of krcvqs where
- * entry 0 is VL0.
- *
- * The number of vl bits (n) and the number of qpn
- * bits (m) are computed to feed both the RSM map table
- * and the single rule.
- *
+struct rsm_map_table {
+ u64 map[NUM_MAP_REGS];
+ unsigned int used;
+};
+
+struct rsm_rule_data {
+ u8 offset;
+ u8 pkt_type;
+ u32 field1_off;
+ u32 field2_off;
+ u32 index1_off;
+ u32 index1_width;
+ u32 index2_off;
+ u32 index2_width;
+ u32 mask1;
+ u32 value1;
+ u32 mask2;
+ u32 value2;
+};
+
+/*
+ * Return an initialized RMT map table for users to fill in. OK if it
+ * returns NULL, indicating no table.
*/
-static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
+static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
{
+ struct rsm_map_table *rmt;
+ u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */
+
+ rmt = kmalloc(sizeof(*rmt), GFP_KERNEL);
+ if (rmt) {
+ memset(rmt->map, rxcontext, sizeof(rmt->map));
+ rmt->used = 0;
+ }
+
+ return rmt;
+}
+
+/*
+ * Write the final RMT map table to the chip and free the table. OK if
+ * table is NULL.
+ */
+static void complete_rsm_map_table(struct hfi1_devdata *dd,
+ struct rsm_map_table *rmt)
+{
+ int i;
+
+ if (rmt) {
+ /* write table to chip */
+ for (i = 0; i < NUM_MAP_REGS; i++)
+ write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
+
+ /* enable RSM */
+ add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+ }
+}
+
+/*
+ * Add a receive side mapping rule.
+ */
+static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
+ struct rsm_rule_data *rrd)
+{
+ write_csr(dd, RCV_RSM_CFG + (8 * rule_index),
+ (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT |
+ 1ull << rule_index | /* enable bit */
+ (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+ write_csr(dd, RCV_RSM_SELECT + (8 * rule_index),
+ (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+ (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+ (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+ (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+ (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+ (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+ write_csr(dd, RCV_RSM_MATCH + (8 * rule_index),
+ (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT |
+ (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT |
+ (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT |
+ (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
+}
+
+/* return the number of RSM map table entries that will be used for QOS */
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+ unsigned int *np)
+{
+ int i;
+ unsigned int m, n;
u8 max_by_vl = 0;
- unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
- u64 *rsmmap;
- u64 reg;
- u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */
- /* validate */
+ /* is QOS active at all? */
if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
num_vls == 1 ||
krcvqsset <= 1)
- goto bail;
- for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+ goto no_qos;
+
+ /* determine bits for qpn */
+ for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++)
if (krcvqs[i] > max_by_vl)
max_by_vl = krcvqs[i];
if (max_by_vl > 32)
- goto bail;
- qpns_per_vl = __roundup_pow_of_two(max_by_vl);
- /* determine bits vl */
- n = ilog2(num_vls);
- /* determine bits for qpn */
- m = ilog2(qpns_per_vl);
+ goto no_qos;
+ m = ilog2(__roundup_pow_of_two(max_by_vl));
+
+ /* determine bits for vl */
+ n = ilog2(__roundup_pow_of_two(num_vls));
+
+ /* reject if too much is used */
if ((m + n) > 7)
+ goto no_qos;
+
+ if (mp)
+ *mp = m;
+ if (np)
+ *np = n;
+
+ return 1 << (m + n);
+
+no_qos:
+ if (mp)
+ *mp = 0;
+ if (np)
+ *np = 0;
+ return 0;
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @rmt - RSM map table
+ *
+ * This routine initializes Rule 0 and the RSM map table to implement
+ * quality of service (qos).
+ *
+ * If all of the limit tests succeed, qos is applied based on the array
+ * interpretation of krcvqs where entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn bits (m) are computed to
+ * feed both the RSM map table and the single rule.
+ */
+static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
+{
+ struct rsm_rule_data rrd;
+ unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+ unsigned int rmt_entries;
+ u64 reg;
+
+ if (!rmt)
goto bail;
- if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+ rmt_entries = qos_rmt_entries(dd, &m, &n);
+ if (rmt_entries == 0)
goto bail;
- rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
- if (!rsmmap)
+ qpns_per_vl = 1 << m;
+
+ /* enough room in the map table? */
+ rmt_entries = 1 << (m + n);
+ if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
goto bail;
- memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
- /* init the local copy of the table */
- for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+
+ /* add qos entries to the the RSM map table */
+ for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
unsigned tctxt;
for (qpn = 0, tctxt = ctxt;
krcvqs[i] && qpn < qpns_per_vl; qpn++) {
unsigned idx, regoff, regidx;
- /* generate index <= 128 */
- idx = (qpn << n) ^ i;
+ /* generate the index the hardware will produce */
+ idx = rmt->used + ((qpn << n) ^ i);
regoff = (idx % 8) * 8;
regidx = idx / 8;
- reg = rsmmap[regidx];
- /* replace 0xff with context number */
+ /* replace default with context number */
+ reg = rmt->map[regidx];
reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
<< regoff);
reg |= (u64)(tctxt++) << regoff;
- rsmmap[regidx] = reg;
+ rmt->map[regidx] = reg;
if (tctxt == ctxt + krcvqs[i])
tctxt = ctxt;
}
ctxt += krcvqs[i];
}
- /* flush cached copies to chip */
- for (i = 0; i < NUM_MAP_REGS; i++)
- write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
- /* add rule0 */
- write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
- RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK <<
- RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
- 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
- write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
- LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
- LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
- LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
- ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
- QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
- ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
- write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
- LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
- LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
- LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
- LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
- /* Enable RSM */
- add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
- kfree(rsmmap);
- /* map everything else to first context */
- init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1);
+
+ rrd.offset = rmt->used;
+ rrd.pkt_type = 2;
+ rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+ rrd.field2_off = LRH_SC_MATCH_OFFSET;
+ rrd.index1_off = LRH_SC_SELECT_OFFSET;
+ rrd.index1_width = n;
+ rrd.index2_off = QPN_SELECT_OFFSET;
+ rrd.index2_width = m + n;
+ rrd.mask1 = LRH_BTH_MASK;
+ rrd.value1 = LRH_BTH_VALUE;
+ rrd.mask2 = LRH_SC_MASK;
+ rrd.value2 = LRH_SC_VALUE;
+
+ /* add rule 0 */
+ add_rsm_rule(dd, 0, &rrd);
+
+ /* mark RSM map entries as used */
+ rmt->used += rmt_entries;
+ /* map everything else to the mcast/err/vl15 context */
+ init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
dd->qos_shift = n + 1;
return;
bail:
@@ -13574,13 +13764,86 @@ bail:
init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
}
+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+ struct rsm_map_table *rmt)
+{
+ struct rsm_rule_data rrd;
+ u64 reg;
+ int i, idx, regoff, regidx;
+ u8 offset;
+
+ /* there needs to be enough room in the map table */
+ if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+ dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+ return;
+ }
+
+ /*
+ * RSM will extract the destination context as an index into the
+ * map table. The destination contexts are a sequential block
+ * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+ * Map entries are accessed as offset + extracted value. Adjust
+ * the added offset so this sequence can be placed anywhere in
+ * the table - as long as the entries themselves do not wrap.
+ * There are only enough bits in offset for the table size, so
+ * start with that to allow for a "negative" offset.
+ */
+ offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
+ (int)dd->first_user_ctxt);
+
+ for (i = dd->first_user_ctxt, idx = rmt->used;
+ i < dd->num_rcv_contexts; i++, idx++) {
+ /* replace with identity mapping */
+ regoff = (idx % 8) * 8;
+ regidx = idx / 8;
+ reg = rmt->map[regidx];
+ reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
+ reg |= (u64)i << regoff;
+ rmt->map[regidx] = reg;
+ }
+
+ /*
+ * For RSM intercept of Expected FECN packets:
+ * o packet type 0 - expected
+ * o match on F (bit 95), using select/match 1, and
+ * o match on SH (bit 133), using select/match 2.
+ *
+ * Use index 1 to extract the 8-bit receive context from DestQP
+ * (start at bit 64). Use that as the RSM map table index.
+ */
+ rrd.offset = offset;
+ rrd.pkt_type = 0;
+ rrd.field1_off = 95;
+ rrd.field2_off = 133;
+ rrd.index1_off = 64;
+ rrd.index1_width = 8;
+ rrd.index2_off = 0;
+ rrd.index2_width = 0;
+ rrd.mask1 = 1;
+ rrd.value1 = 1;
+ rrd.mask2 = 1;
+ rrd.value2 = 1;
+
+ /* add rule 1 */
+ add_rsm_rule(dd, 1, &rrd);
+
+ rmt->used += dd->num_user_contexts;
+}
+
static void init_rxe(struct hfi1_devdata *dd)
{
+ struct rsm_map_table *rmt;
+
/* enable all receive errors */
write_csr(dd, RCV_ERR_MASK, ~0ull);
- /* setup QPN map table - start where VL15 context leaves off */
- init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ?
- MIN_KERNEL_KCTXTS : 0);
+
+ rmt = alloc_rsm_map_table(dd);
+ /* set up QOS, including the QPN map table */
+ init_qos(dd, rmt);
+ init_user_fecn_handling(dd, rmt);
+ complete_rsm_map_table(dd, rmt);
+ kfree(rmt);
+
/*
* make sure RcvCtrl.RcvWcb <= PCIe Device Control
* Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
@@ -13762,6 +14025,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+ reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
done:
return ret;
@@ -14148,6 +14412,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
(dd->revision >> CCE_REVISION_SW_SHIFT)
& CCE_REVISION_SW_MASK);
+ /*
+ * The real cpu mask is part of the affinity struct but has to be
+ * initialized earlier than the rest of the affinity struct because it
+ * is needed to calculate the number of user contexts in
+ * set_up_context_variables(). However, hfi1_dev_affinity_init(),
+ * which initializes the rest of the affinity struct members,
+ * depends on set_up_context_variables() for the number of kernel
+ * contexts, so it cannot be called before set_up_context_variables().
+ */
+ ret = init_real_cpu_mask(dd);
+ if (ret)
+ goto bail_cleanup;
+
ret = set_up_context_variables(dd);
if (ret)
goto bail_cleanup;
@@ -14161,9 +14438,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
/* set up KDETH QP prefix in both RX and TX CSRs */
init_kdeth_qp(dd);
- ret = hfi1_dev_affinity_init(dd);
- if (ret)
- goto bail_cleanup;
+ hfi1_dev_affinity_init(dd);
/* send contexts must be set up before receive contexts */
ret = init_send_contexts(dd);
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h
index 4f3b878..1948706 100644
--- a/drivers/staging/rdma/hfi1/chip.h
+++ b/drivers/staging/rdma/hfi1/chip.h
@@ -389,6 +389,7 @@
#define LAST_REMOTE_STATE_COMPLETE 0x13
#define LINK_QUALITY_INFO 0x14
#define REMOTE_DEVICE_ID 0x15
+#define LINK_DOWN_REASON 0x16
/* 8051 lane specific register field IDs */
#define TX_EQ_SETTINGS 0x00
@@ -497,6 +498,11 @@
#define PWRM_BER_CONTROL 0x1
#define PWRM_BANDWIDTH_CONTROL 0x2
+/* 8051 link down reasons */
+#define LDR_LINK_TRANSFER_ACTIVE_LOW 0xa
+#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb
+#define LDR_RECEIVED_HOST_OFFLINE_REQ 0xc
+
/* verify capability fabric CRC size bits */
enum {
CAP_CRC_14B = (1 << 0), /* 14b CRC */
@@ -691,7 +697,6 @@ void handle_verify_cap(struct work_struct *work);
void handle_freeze(struct work_struct *work);
void handle_link_up(struct work_struct *work);
void handle_link_down(struct work_struct *work);
-void handle_8051_request(struct work_struct *work);
void handle_link_downgrade(struct work_struct *work);
void handle_link_bounce(struct work_struct *work);
void handle_sma_message(struct work_struct *work);
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h
index 770f05c..8744de6 100644
--- a/drivers/staging/rdma/hfi1/chip_registers.h
+++ b/drivers/staging/rdma/hfi1/chip_registers.h
@@ -771,6 +771,7 @@
#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_CFG_OFFSET_SHIFT 32
#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
#define RCV_RSM_MATCH (RXE + 0x000000000800)
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c
index c5b520b..bb2409a 100644
--- a/drivers/staging/rdma/hfi1/diag.c
+++ b/drivers/staging/rdma/hfi1/diag.c
@@ -413,7 +413,8 @@ static ssize_t diagpkt_send(struct diag_pkt *dp)
goto bail;
}
/* can only use kernel contexts */
- if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) {
+ if (dd->send_contexts[dp->sw_index].type != SC_KERNEL &&
+ dd->send_contexts[dp->sw_index].type != SC_VL15) {
ret = -EINVAL;
goto bail;
}
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c
index 34511e5..700c6fa 100644
--- a/drivers/staging/rdma/hfi1/driver.c
+++ b/drivers/staging/rdma/hfi1/driver.c
@@ -75,7 +75,8 @@ DEFINE_MUTEX(hfi1_mutex); /* general driver use */
unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
-MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192");
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
+ HFI1_DEFAULT_MAX_MTU));
unsigned int hfi1_cu = 1;
module_param_named(cu, hfi1_cu, uint, S_IRUGO);
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c
index 3040162..ed680fd 100644
--- a/drivers/staging/rdma/hfi1/firmware.c
+++ b/drivers/staging/rdma/hfi1/firmware.c
@@ -1413,8 +1413,15 @@ static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource)
if (resource & CR_DYN_MASK) {
/* a dynamic resource is in use if either HFI has set the bit */
- all_bits = resource_mask(0, resource) |
+ if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 &&
+ (resource & (CR_I2C1 | CR_I2C2))) {
+ /* discrete devices must serialize across both chains */
+ all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) |
+ resource_mask(1, CR_I2C1 | CR_I2C2);
+ } else {
+ all_bits = resource_mask(0, resource) |
resource_mask(1, resource);
+ }
my_bit = resource_mask(dd->hfi1_id, resource);
} else {
/* non-dynamic resources are not split between HFIs */
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
index 16cbdc4..7b78d56 100644
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -455,9 +455,9 @@ struct rvt_sge_state;
#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
/* use this MTU size if none other is given */
-#define HFI1_DEFAULT_ACTIVE_MTU 8192
+#define HFI1_DEFAULT_ACTIVE_MTU 10240
/* use this MTU size as the default maximum */
-#define HFI1_DEFAULT_MAX_MTU 8192
+#define HFI1_DEFAULT_MAX_MTU 10240
/* default partition key */
#define DEFAULT_PKEY 0xffff
@@ -606,7 +606,6 @@ struct hfi1_pportdata {
struct work_struct link_vc_work;
struct work_struct link_up_work;
struct work_struct link_down_work;
- struct work_struct dc_host_req_work;
struct work_struct sma_message_work;
struct work_struct freeze_work;
struct work_struct link_downgrade_work;
@@ -1258,7 +1257,7 @@ void receive_interrupt_work(struct work_struct *work);
static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
{
return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
- ((!!(rhf & RHF_DC_INFO_MASK)) << 4);
+ ((!!(rhf & RHF_DC_INFO_SMASK)) << 4);
}
static inline u16 generate_jkey(kuid_t uid)
@@ -1333,6 +1332,9 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
u32 pkey, u32 slid, u32 dlid, u8 sc5,
const struct ib_grh *old_grh);
+#define PKEY_CHECK_INVALID -1
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+ u8 sc5, int8_t s_pkey_index);
#define PACKET_EGRESS_TIMEOUT 350
static inline void pause_for_credit_return(struct hfi1_devdata *dd)
@@ -1776,6 +1778,7 @@ extern struct mutex hfi1_mutex;
#define HFI1_PKT_USER_SC_INTEGRITY \
(SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \
| SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \
| SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
index cfcdc16..502b7cf 100644
--- a/drivers/staging/rdma/hfi1/init.c
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -422,9 +422,10 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
struct cca_timer *cca_timer;
struct hfi1_pportdata *ppd;
int sl;
- u16 ccti, ccti_timer, ccti_min;
+ u16 ccti_timer, ccti_min;
struct cc_state *cc_state;
unsigned long flags;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
cca_timer = container_of(t, struct cca_timer, hrtimer);
ppd = cca_timer->ppd;
@@ -450,24 +451,21 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
spin_lock_irqsave(&ppd->cca_timer_lock, flags);
- ccti = cca_timer->ccti;
-
- if (ccti > ccti_min) {
+ if (cca_timer->ccti > ccti_min) {
cca_timer->ccti--;
set_link_ipg(ppd);
}
- spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
- rcu_read_unlock();
-
- if (ccti > ccti_min) {
+ if (cca_timer->ccti > ccti_min) {
unsigned long nsec = 1024 * ccti_timer;
/* ccti_timer is in units of 1.024 usec */
hrtimer_forward_now(t, ns_to_ktime(nsec));
- return HRTIMER_RESTART;
+ ret = HRTIMER_RESTART;
}
- return HRTIMER_NORESTART;
+
+ spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -496,7 +494,6 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
INIT_WORK(&ppd->link_up_work, handle_link_up);
INIT_WORK(&ppd->link_down_work, handle_link_down);
- INIT_WORK(&ppd->dc_host_req_work, handle_8051_request);
INIT_WORK(&ppd->freeze_work, handle_freeze);
INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
INIT_WORK(&ppd->sma_message_work, handle_sma_message);
@@ -1007,7 +1004,7 @@ void hfi1_free_devdata(struct hfi1_devdata *dd)
free_percpu(dd->rcv_limit);
hfi1_dev_affinity_free(dd);
free_percpu(dd->send_schedule);
- ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+ rvt_dealloc_device(&dd->verbs_dev.rdi);
}
/*
@@ -1110,7 +1107,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
bail:
if (!list_empty(&dd->list))
list_del_init(&dd->list);
- ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+ rvt_dealloc_device(&dd->verbs_dev.rdi);
return ERR_PTR(ret);
}
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c
index d1e7f4d..ed58cf2 100644
--- a/drivers/staging/rdma/hfi1/mad.c
+++ b/drivers/staging/rdma/hfi1/mad.c
@@ -999,7 +999,21 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
break;
}
- set_link_state(ppd, link_state);
+ if ((link_state == HLS_DN_POLL ||
+ link_state == HLS_DN_DOWNDEF)) {
+ /*
+ * Going to poll. No matter what the current state,
+ * always move offline first, then tune and start the
+ * link. This correctly handles a FM link bounce and
+ * a link enable. Going offline is a no-op if already
+ * offline.
+ */
+ set_link_state(ppd, HLS_DN_OFFLINE);
+ tune_serdes(ppd);
+ start_link(ppd);
+ } else {
+ set_link_state(ppd, link_state);
+ }
if (link_state == HLS_DN_DISABLE &&
(ppd->offline_disabled_reason >
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c
index b3f0682..2b0e91d 100644
--- a/drivers/staging/rdma/hfi1/mmu_rb.c
+++ b/drivers/staging/rdma/hfi1/mmu_rb.c
@@ -91,7 +91,7 @@ static unsigned long mmu_node_start(struct mmu_rb_node *node)
static unsigned long mmu_node_last(struct mmu_rb_node *node)
{
- return PAGE_ALIGN((node->addr & PAGE_MASK) + node->len) - 1;
+ return PAGE_ALIGN(node->addr + node->len) - 1;
}
int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
@@ -126,10 +126,15 @@ void hfi1_mmu_rb_unregister(struct rb_root *root)
if (!handler)
return;
+ /* Unregister first so we don't get any more notifications. */
+ if (current->mm)
+ mmu_notifier_unregister(&handler->mn, current->mm);
+
spin_lock_irqsave(&mmu_rb_lock, flags);
list_del(&handler->list);
spin_unlock_irqrestore(&mmu_rb_lock, flags);
+ spin_lock_irqsave(&handler->lock, flags);
if (!RB_EMPTY_ROOT(root)) {
struct rb_node *node;
struct mmu_rb_node *rbnode;
@@ -141,9 +146,8 @@ void hfi1_mmu_rb_unregister(struct rb_root *root)
handler->ops->remove(root, rbnode, NULL);
}
}
+ spin_unlock_irqrestore(&handler->lock, flags);
- if (current->mm)
- mmu_notifier_unregister(&handler->mn, current->mm);
kfree(handler);
}
@@ -235,6 +239,25 @@ struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
return node;
}
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
+ unsigned long addr, unsigned long len)
+{
+ struct mmu_rb_handler *handler = find_mmu_handler(root);
+ struct mmu_rb_node *node;
+ unsigned long flags;
+
+ if (!handler)
+ return ERR_PTR(-EINVAL);
+
+ spin_lock_irqsave(&handler->lock, flags);
+ node = __mmu_rb_search(handler, addr, len);
+ if (node)
+ __mmu_int_rb_remove(node, handler->root);
+ spin_unlock_irqrestore(&handler->lock, flags);
+
+ return node;
+}
+
void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
{
struct mmu_rb_handler *handler = find_mmu_handler(root);
@@ -293,9 +316,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u",
node->addr, node->len);
if (handler->ops->invalidate(root, node)) {
- spin_unlock_irqrestore(&handler->lock, flags);
- __mmu_rb_remove(handler, node, mm);
- spin_lock_irqsave(&handler->lock, flags);
+ __mmu_int_rb_remove(node, root);
+ if (handler->ops->remove)
+ handler->ops->remove(root, node, mm);
}
}
spin_unlock_irqrestore(&handler->lock, flags);
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/staging/rdma/hfi1/mmu_rb.h
index 19a306e..7a57b9c 100644
--- a/drivers/staging/rdma/hfi1/mmu_rb.h
+++ b/drivers/staging/rdma/hfi1/mmu_rb.h
@@ -70,5 +70,7 @@ int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
unsigned long);
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
+ unsigned long);
#endif /* _HFI1_MMU_RB_H */
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c
index c6849ce..c67b9ad 100644
--- a/drivers/staging/rdma/hfi1/pio.c
+++ b/drivers/staging/rdma/hfi1/pio.c
@@ -139,23 +139,30 @@ void pio_send_control(struct hfi1_devdata *dd, int op)
/* Send Context Size (SCS) wildcards */
#define SCS_POOL_0 -1
#define SCS_POOL_1 -2
+
/* Send Context Count (SCC) wildcards */
#define SCC_PER_VL -1
#define SCC_PER_CPU -2
-
#define SCC_PER_KRCVQ -3
-#define SCC_ACK_CREDITS 32
+
+/* Send Context Size (SCS) constants */
+#define SCS_ACK_CREDITS 32
+#define SCS_VL15_CREDITS 102 /* 3 pkts of 2048B data + 128B header */
+
+#define PIO_THRESHOLD_CEILING 4096
#define PIO_WAIT_BATCH_SIZE 5
/* default send context sizes */
static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
[SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */
- .count = SCC_PER_VL },/* one per NUMA */
- [SC_ACK] = { .size = SCC_ACK_CREDITS,
+ .count = SCC_PER_VL }, /* one per NUMA */
+ [SC_ACK] = { .size = SCS_ACK_CREDITS,
.count = SCC_PER_KRCVQ },
[SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */
.count = SCC_PER_CPU }, /* one per CPU */
+ [SC_VL15] = { .size = SCS_VL15_CREDITS,
+ .count = 1 },
};
@@ -202,7 +209,8 @@ static int wildcard_to_pool(int wc)
static const char *sc_type_names[SC_MAX] = {
"kernel",
"ack",
- "user"
+ "user",
+ "vl15"
};
static const char *sc_type_name(int index)
@@ -231,6 +239,22 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
int i;
/*
+ * When SDMA is enabled, kernel context pio packet size is capped by
+ * "piothreshold". Reduce pio buffer allocation for kernel context by
+ * setting it to a fixed size. The allocation allows 3-deep buffering
+ * of the largest pio packets plus up to 128 bytes header, sufficient
+ * to maintain verbs performance.
+ *
+ * When SDMA is disabled, keep the default pooling allocation.
+ */
+ if (HFI1_CAP_IS_KSET(SDMA)) {
+ u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
+ piothreshold : PIO_THRESHOLD_CEILING;
+ sc_config_sizes[SC_KERNEL].size =
+ 3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
+ }
+
+ /*
* Step 0:
* - copy the centipercents/absolute sizes from the pool config
* - sanity check these values
@@ -311,7 +335,7 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
if (i == SC_ACK) {
count = dd->n_krcv_queues;
} else if (i == SC_KERNEL) {
- count = (INIT_SC_PER_VL * num_vls) + 1 /* VL15 */;
+ count = INIT_SC_PER_VL * num_vls;
} else if (count == SCC_PER_CPU) {
count = dd->num_rcv_contexts - dd->n_krcv_queues;
} else if (count < 0) {
@@ -596,7 +620,7 @@ u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
* Return value is what to write into the CSR: trigger return when
* unreturned credits pass this count.
*/
-static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
{
return (sc->credits * percent) / 100;
}
@@ -790,7 +814,10 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
* For Ack contexts, set a threshold for half the credits.
* For User contexts use the given percentage. This has been
* sanitized on driver start-up.
- * For Kernel contexts, use the default MTU plus a header.
+ * For Kernel contexts, use the default MTU plus a header
+ * or half the credits, whichever is smaller. This should
+ * work for both the 3-deep buffering allocation and the
+ * pooling allocation.
*/
if (type == SC_ACK) {
thresh = sc_percent_to_threshold(sc, 50);
@@ -798,7 +825,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
thresh = sc_percent_to_threshold(sc,
user_credit_return_threshold);
} else { /* kernel */
- thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
+ thresh = min(sc_percent_to_threshold(sc, 50),
+ sc_mtu_to_threshold(sc, hfi1_max_mtu,
+ hdrqentsize));
}
reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
/* add in early return */
@@ -1531,7 +1560,8 @@ static void sc_piobufavail(struct send_context *sc)
unsigned long flags;
unsigned i, n = 0;
- if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
+ if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
+ dd->send_contexts[sc->sw_index].type != SC_VL15)
return;
list = &sc->piowait;
/*
@@ -1900,7 +1930,7 @@ int init_pervl_scs(struct hfi1_devdata *dd)
u32 ctxt;
struct hfi1_pportdata *ppd = dd->pport;
- dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
+ dd->vld[15].sc = sc_alloc(dd, SC_VL15,
dd->rcd[0]->rcvhdrqentsize, dd->node);
if (!dd->vld[15].sc)
goto nomem;
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h
index 0026976..53a08ed 100644
--- a/drivers/staging/rdma/hfi1/pio.h
+++ b/drivers/staging/rdma/hfi1/pio.h
@@ -51,7 +51,8 @@
#define SC_KERNEL 0
#define SC_ACK 1
#define SC_USER 2
-#define SC_MAX 3
+#define SC_VL15 3
+#define SC_MAX 4
/* invalid send context index */
#define INVALID_SCI 0xff
@@ -293,6 +294,7 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
void sc_add_credit_return_intr(struct send_context *sc);
void sc_del_credit_return_intr(struct send_context *sc);
void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
void sc_wait(struct hfi1_devdata *dd);
diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c
index 0a1d074..8fe8a20 100644
--- a/drivers/staging/rdma/hfi1/platform.c
+++ b/drivers/staging/rdma/hfi1/platform.c
@@ -114,21 +114,11 @@ static int qual_power(struct hfi1_pportdata *ppd)
if (ret)
return ret;
- if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4)
- cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]);
- else
- cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]);
+ cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
- if (cable_power_class <= 3 && cable_power_class > (power_class_max - 1))
- ppd->offline_disabled_reason =
- HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
- else if (cable_power_class > 4 && cable_power_class > (power_class_max))
+ if (cable_power_class > power_class_max)
ppd->offline_disabled_reason =
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
- /*
- * cable_power_class will never have value 4 as this simply
- * means the high power settings are unused
- */
if (ppd->offline_disabled_reason ==
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
@@ -173,12 +163,9 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
u8 *cache = ppd->qsfp_info.cache;
int ret;
- if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4)
- cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]);
- else
- cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]);
+ cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
- if (cable_power_class) {
+ if (cable_power_class > QSFP_POWER_CLASS_1) {
power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS];
power_ctrl_byte |= 1;
@@ -190,8 +177,7 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
if (ret != 1)
return -EIO;
- if (cable_power_class > 3) {
- /* > power class 4*/
+ if (cable_power_class > QSFP_POWER_CLASS_4) {
power_ctrl_byte |= (1 << 2);
ret = qsfp_write(ppd, ppd->dd->hfi1_id,
QSFP_PWR_CTRL_BYTE_OFFS,
@@ -212,12 +198,21 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd,
{
u32 rx_preset;
u8 *cache = ppd->qsfp_info.cache;
+ int cable_power_class;
if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
(cache[QSFP_CDR_INFO_OFFS] & 0x40)))
return;
- /* rx_preset preset to zero to catch error */
+ /* RX CDR present, bypass supported */
+ cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+ if (cable_power_class <= QSFP_POWER_CLASS_3) {
+ /* Power class <= 3, ignore config & turn RX CDR on */
+ *cdr_ctrl_byte |= 0xF;
+ return;
+ }
+
get_platform_config_field(
ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
@@ -250,15 +245,25 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd,
static void apply_tx_cdr(struct hfi1_pportdata *ppd,
u32 tx_preset_index,
- u8 *ctr_ctrl_byte)
+ u8 *cdr_ctrl_byte)
{
u32 tx_preset;
u8 *cache = ppd->qsfp_info.cache;
+ int cable_power_class;
if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
(cache[QSFP_CDR_INFO_OFFS] & 0x80)))
return;
+ /* TX CDR present, bypass supported */
+ cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+ if (cable_power_class <= QSFP_POWER_CLASS_3) {
+ /* Power class <= 3, ignore config & turn TX CDR on */
+ *cdr_ctrl_byte |= 0xF0;
+ return;
+ }
+
get_platform_config_field(
ppd->dd,
PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
@@ -282,10 +287,10 @@ static void apply_tx_cdr(struct hfi1_pportdata *ppd,
(tx_preset << 2) | (tx_preset << 3));
if (tx_preset)
- *ctr_ctrl_byte |= (tx_preset << 4);
+ *cdr_ctrl_byte |= (tx_preset << 4);
else
/* Preserve current/determined RX CDR status */
- *ctr_ctrl_byte &= ((tx_preset << 4) | 0xF);
+ *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
}
static void apply_cdr_settings(
@@ -598,6 +603,7 @@ static void apply_tunings(
"Applying TX settings");
}
+/* Must be holding the QSFP i2c resource */
static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
u32 *ptr_rx_preset, u32 *ptr_total_atten)
{
@@ -605,26 +611,19 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
u8 *cache = ppd->qsfp_info.cache;
- ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT);
- if (ret) {
- dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
- __func__, (int)ppd->dd->hfi1_id);
- return ret;
- }
-
ppd->qsfp_info.limiting_active = 1;
ret = set_qsfp_tx(ppd, 0);
if (ret)
- goto bail_unlock;
+ return ret;
ret = qual_power(ppd);
if (ret)
- goto bail_unlock;
+ return ret;
ret = qual_bitrate(ppd);
if (ret)
- goto bail_unlock;
+ return ret;
if (ppd->qsfp_info.reset_needed) {
reset_qsfp(ppd);
@@ -636,7 +635,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
ret = set_qsfp_high_power(ppd);
if (ret)
- goto bail_unlock;
+ return ret;
if (cache[QSFP_EQ_INFO_OFFS] & 0x4) {
ret = get_platform_config_field(
@@ -646,7 +645,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
ptr_tx_preset, 4);
if (ret) {
*ptr_tx_preset = OPA_INVALID_INDEX;
- goto bail_unlock;
+ return ret;
}
} else {
ret = get_platform_config_field(
@@ -656,7 +655,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
ptr_tx_preset, 4);
if (ret) {
*ptr_tx_preset = OPA_INVALID_INDEX;
- goto bail_unlock;
+ return ret;
}
}
@@ -665,7 +664,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4);
if (ret) {
*ptr_rx_preset = OPA_INVALID_INDEX;
- goto bail_unlock;
+ return ret;
}
if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
@@ -685,8 +684,6 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
ret = set_qsfp_tx(ppd, 1);
-bail_unlock:
- release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
return ret;
}
@@ -833,12 +830,22 @@ void tune_serdes(struct hfi1_pportdata *ppd)
total_atten = platform_atten + remote_atten;
tuning_method = OPA_PASSIVE_TUNING;
- } else
+ } else {
ppd->offline_disabled_reason =
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
+ goto bail;
+ }
break;
case PORT_TYPE_QSFP:
if (qsfp_mod_present(ppd)) {
+ ret = acquire_chip_resource(ppd->dd,
+ qsfp_resource(ppd->dd),
+ QSFP_WAIT);
+ if (ret) {
+ dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
+ __func__, (int)ppd->dd->hfi1_id);
+ goto bail;
+ }
refresh_qsfp_cache(ppd, &ppd->qsfp_info);
if (ppd->qsfp_info.cache_valid) {
@@ -853,21 +860,23 @@ void tune_serdes(struct hfi1_pportdata *ppd)
* update the cache to reflect the changes
*/
refresh_qsfp_cache(ppd, &ppd->qsfp_info);
- if (ret)
- goto bail;
-
limiting_active =
ppd->qsfp_info.limiting_active;
} else {
dd_dev_err(dd,
"%s: Reading QSFP memory failed\n",
__func__);
- goto bail;
+ ret = -EINVAL; /* a fail indication */
}
- } else
+ release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
+ if (ret)
+ goto bail;
+ } else {
ppd->offline_disabled_reason =
HFI1_ODR_MASK(
OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+ goto bail;
+ }
break;
default:
dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
index dc9119e..91eb423 100644
--- a/drivers/staging/rdma/hfi1/qp.c
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -167,8 +167,12 @@ static inline int opa_mtu_enum_to_int(int mtu)
*/
static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
{
- int val = opa_mtu_enum_to_int((int)mtu);
+ int val;
+ /* Constraining 10KB packets to 8KB packets */
+ if (mtu == (enum ib_mtu)OPA_MTU_10240)
+ mtu = OPA_MTU_8192;
+ val = opa_mtu_enum_to_int((int)mtu);
if (val > 0)
return val;
return ib_mtu_enum_to_int(mtu);
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
index 9ed1963..2441669 100644
--- a/drivers/staging/rdma/hfi1/qsfp.c
+++ b/drivers/staging/rdma/hfi1/qsfp.c
@@ -96,7 +96,7 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
{
int ret;
- if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+ if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
return -EACCES;
/* make sure the TWSI bus is in a sane state */
@@ -162,7 +162,7 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
{
int ret;
- if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+ if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
return -EACCES;
/* make sure the TWSI bus is in a sane state */
@@ -192,7 +192,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
int ret;
u8 page;
- if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+ if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
return -EACCES;
/* make sure the TWSI bus is in a sane state */
@@ -276,7 +276,7 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
int ret;
u8 page;
- if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+ if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
return -EACCES;
/* make sure the TWSI bus is in a sane state */
@@ -355,6 +355,8 @@ int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
* The calls to qsfp_{read,write} in this function correctly handle the
* address map difference between this mapping and the mapping implemented
* by those functions
+ *
+ * The caller must be holding the QSFP i2c chain resource.
*/
int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
{
@@ -371,13 +373,9 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
if (!qsfp_mod_present(ppd)) {
ret = -ENODEV;
- goto bail_no_release;
+ goto bail;
}
- ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT);
- if (ret)
- goto bail_no_release;
-
ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
if (ret != QSFP_PAGESIZE) {
dd_dev_info(ppd->dd,
@@ -440,8 +438,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
}
}
- release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-
spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
ppd->qsfp_info.cache_valid = 1;
ppd->qsfp_info.cache_refresh_required = 0;
@@ -450,8 +446,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
return 0;
bail:
- release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-bail_no_release:
memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
return ret;
}
@@ -466,7 +460,28 @@ const char * const hfi1_qsfp_devtech[16] = {
#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
#define QSFP_DEFAULT_HDR_CNT 224
-static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
+/* For use with QSFP_HIGH_PWR macro */
+#define QSFP_HIGH_PWR_UNUSED 0 /* Bits [1:0] = 00 implies low power module */
+
+/*
+ * Takes power class byte [Page 00 Byte 129] in SFF 8636
+ * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
+ */
+int get_qsfp_power_class(u8 power_byte)
+{
+ if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED)
+ /* power classes count from 1, their bit encodings from 0 */
+ return (QSFP_PWR(power_byte) + 1);
+ /*
+ * 00 in the high power classes stands for unused, bringing
+ * balance to the off-by-1 offset above, we add 4 here to
+ * account for the difference between the low and high power
+ * groups
+ */
+ return (QSFP_HIGH_PWR(power_byte) + 4);
+}
int qsfp_mod_present(struct hfi1_pportdata *ppd)
{
@@ -537,6 +552,16 @@ set_zeroes:
return ret;
}
+static const char *pwr_codes[8] = {"N/AW",
+ "1.5W",
+ "2.0W",
+ "2.5W",
+ "3.5W",
+ "4.0W",
+ "4.5W",
+ "5.0W"
+ };
+
int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
{
u8 *cache = &ppd->qsfp_info.cache[0];
@@ -546,6 +571,7 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
int bidx = 0;
u8 *atten = &cache[QSFP_ATTEN_OFFS];
u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+ u8 power_byte = 0;
sofar = 0;
lenstr[0] = ' ';
@@ -555,9 +581,9 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
+ power_byte = cache[QSFP_MOD_PWR_OFFS];
sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
- pwr_codes +
- (QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4));
+ pwr_codes[get_qsfp_power_class(power_byte)]);
sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
lenstr,
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h
index 831fe4c..dadc66c 100644
--- a/drivers/staging/rdma/hfi1/qsfp.h
+++ b/drivers/staging/rdma/hfi1/qsfp.h
@@ -82,8 +82,9 @@
/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
#define QSFP_MOD_ID_OFFS 128
/*
- * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
- * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * Byte 129 is "Extended Identifier".
+ * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
*/
#define QSFP_MOD_PWR_OFFS 129
/* Byte 130 is Connector type. Not Intel req'd */
@@ -190,6 +191,9 @@ extern const char *const hfi1_qsfp_devtech[16];
#define QSFP_HIGH_BIAS_WARNING 0x22
#define QSFP_LOW_BIAS_WARNING 0x11
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
/*
* struct qsfp_data encapsulates state of QSFP device for one port.
* it will be part of port-specific data if a board supports QSFP.
@@ -201,12 +205,6 @@ extern const char *const hfi1_qsfp_devtech[16];
* and let the qsfp_lock arbitrate access to common resources.
*
*/
-
-#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
-#define QSFP_HIGH_PWR(pbyte) (((pbyte) & 3) | 4)
-#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
-#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
-
struct qsfp_data {
/* Helps to find our way */
struct hfi1_pportdata *ppd;
@@ -223,6 +221,7 @@ struct qsfp_data {
int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
struct qsfp_data *cp);
+int get_qsfp_power_class(u8 power_byte);
int qsfp_mod_present(struct hfi1_pportdata *ppd);
int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
u32 len, u8 *data);
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c
index 0d7e101..792f15e 100644
--- a/drivers/staging/rdma/hfi1/rc.c
+++ b/drivers/staging/rdma/hfi1/rc.c
@@ -1497,7 +1497,7 @@ reserved:
/* Ignore reserved NAK codes. */
goto bail_stop;
}
- return ret;
+ /* cannot be reached */
bail_stop:
hfi1_stop_rc_timers(qp);
return ret;
@@ -2021,8 +2021,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
if (sl >= OPA_MAX_SLS)
return;
- cca_timer = &ppd->cca_timer[sl];
-
cc_state = get_cc_state(ppd);
if (!cc_state)
@@ -2041,6 +2039,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
spin_lock_irqsave(&ppd->cca_timer_lock, flags);
+ cca_timer = &ppd->cca_timer[sl];
if (cca_timer->ccti < ccti_limit) {
if (cca_timer->ccti + ccti_incr <= ccti_limit)
cca_timer->ccti += ccti_incr;
@@ -2049,8 +2048,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
set_link_ipg(ppd);
}
- spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
ccti = cca_timer->ccti;
if (!hrtimer_active(&cca_timer->hrtimer)) {
@@ -2061,6 +2058,8 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
HRTIMER_MODE_REL);
}
+ spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+
if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
}
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
index 08813cd..a659aec 100644
--- a/drivers/staging/rdma/hfi1/ruc.c
+++ b/drivers/staging/rdma/hfi1/ruc.c
@@ -831,7 +831,6 @@ void hfi1_do_send(struct rvt_qp *qp)
struct hfi1_pkt_state ps;
struct hfi1_qp_priv *priv = qp->priv;
int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
- unsigned long flags;
unsigned long timeout;
unsigned long timeout_int;
int cpu;
@@ -866,11 +865,11 @@ void hfi1_do_send(struct rvt_qp *qp)
timeout_int = SEND_RESCHED_TIMEOUT;
}
- spin_lock_irqsave(&qp->s_lock, flags);
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
/* Return if we are already busy processing a work request. */
if (!hfi1_send_ok(qp)) {
- spin_unlock_irqrestore(&qp->s_lock, flags);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
return;
}
@@ -884,7 +883,7 @@ void hfi1_do_send(struct rvt_qp *qp)
do {
/* Check for a constructed packet to be sent. */
if (qp->s_hdrwords != 0) {
- spin_unlock_irqrestore(&qp->s_lock, flags);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
/*
* If the packet cannot be sent now, return and
* the send tasklet will be woken up later.
@@ -897,11 +896,14 @@ void hfi1_do_send(struct rvt_qp *qp)
if (unlikely(time_after(jiffies, timeout))) {
if (workqueue_congested(cpu,
ps.ppd->hfi1_wq)) {
- spin_lock_irqsave(&qp->s_lock, flags);
+ spin_lock_irqsave(
+ &qp->s_lock,
+ ps.flags);
qp->s_flags &= ~RVT_S_BUSY;
hfi1_schedule_send(qp);
- spin_unlock_irqrestore(&qp->s_lock,
- flags);
+ spin_unlock_irqrestore(
+ &qp->s_lock,
+ ps.flags);
this_cpu_inc(
*ps.ppd->dd->send_schedule);
return;
@@ -913,11 +915,11 @@ void hfi1_do_send(struct rvt_qp *qp)
}
timeout = jiffies + (timeout_int) / 8;
}
- spin_lock_irqsave(&qp->s_lock, flags);
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
}
} while (make_req(qp, &ps));
- spin_unlock_irqrestore(&qp->s_lock, flags);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
}
/*
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
index c7f1271..8cd6df8 100644
--- a/drivers/staging/rdma/hfi1/sysfs.c
+++ b/drivers/staging/rdma/hfi1/sysfs.c
@@ -84,7 +84,7 @@ static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
rcu_read_unlock();
return -EINVAL;
}
- memcpy(buf, &cc_state->cct, count);
+ memcpy(buf, (void *)&cc_state->cct + pos, count);
rcu_read_unlock();
return count;
@@ -131,7 +131,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
rcu_read_unlock();
return -EINVAL;
}
- memcpy(buf, &cc_state->cong_setting, count);
+ memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
rcu_read_unlock();
return count;
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
index ae8a70f..1e503ad 100644
--- a/drivers/staging/rdma/hfi1/ud.c
+++ b/drivers/staging/rdma/hfi1/ud.c
@@ -322,7 +322,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
(lid == ppd->lid ||
(lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
qp->ibqp.qp_type == IB_QPT_GSI)))) {
- unsigned long flags;
+ unsigned long tflags = ps->flags;
/*
* If DMAs are in progress, we can't generate
* a completion for the loopback packet since
@@ -335,10 +335,10 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
goto bail;
}
qp->s_cur = next_cur;
- local_irq_save(flags);
- spin_unlock_irqrestore(&qp->s_lock, flags);
+ spin_unlock_irqrestore(&qp->s_lock, tflags);
ud_loopback(qp, wqe);
- spin_lock_irqsave(&qp->s_lock, flags);
+ spin_lock_irqsave(&qp->s_lock, tflags);
+ ps->flags = tflags;
hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
goto done_free_tx;
}
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c
index 8bd56d5..1b640a3 100644
--- a/drivers/staging/rdma/hfi1/user_exp_rcv.c
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c
@@ -399,8 +399,11 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
* pages, accept the amount pinned so far and program only that.
* User space knows how to deal with partially programmed buffers.
*/
- if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages))
- return -ENOMEM;
+ if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
if (pinned <= 0) {
ret = pinned;
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index d53a659..0014c9c 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -180,6 +180,8 @@ struct user_sdma_iovec {
u64 offset;
};
+#define SDMA_CACHE_NODE_EVICT BIT(0)
+
struct sdma_mmu_node {
struct mmu_rb_node rb;
struct list_head list;
@@ -187,6 +189,7 @@ struct sdma_mmu_node {
atomic_t refcount;
struct page **pages;
unsigned npages;
+ unsigned long flags;
};
struct user_sdma_request {
@@ -597,6 +600,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
goto free_req;
}
+ /* Checking P_KEY for requests from user-space */
+ if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+ PKEY_CHECK_INVALID)) {
+ ret = -EINVAL;
+ goto free_req;
+ }
+
/*
* Also should check the BTH.lnh. If it says the next header is GRH then
* the RXE parsing will be off and will land in the middle of the KDETH
@@ -1030,27 +1040,29 @@ static inline int num_user_pages(const struct iovec *iov)
return 1 + ((epage - spage) >> PAGE_SHIFT);
}
-/* Caller must hold pq->evict_lock */
static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
u32 cleared = 0;
struct sdma_mmu_node *node, *ptr;
+ struct list_head to_evict = LIST_HEAD_INIT(to_evict);
+ spin_lock(&pq->evict_lock);
list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
/* Make sure that no one is still using the node. */
if (!atomic_read(&node->refcount)) {
- /*
- * Need to use the page count now as the remove callback
- * will free the node.
- */
+ set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
+ list_del_init(&node->list);
+ list_add(&node->list, &to_evict);
cleared += node->npages;
- spin_unlock(&pq->evict_lock);
- hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
- spin_lock(&pq->evict_lock);
if (cleared >= npages)
break;
}
}
+ spin_unlock(&pq->evict_lock);
+
+ list_for_each_entry_safe(node, ptr, &to_evict, list)
+ hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+
return cleared;
}
@@ -1062,9 +1074,9 @@ static int pin_vector_pages(struct user_sdma_request *req,
struct sdma_mmu_node *node = NULL;
struct mmu_rb_node *rb_node;
- rb_node = hfi1_mmu_rb_search(&pq->sdma_rb_root,
- (unsigned long)iovec->iov.iov_base,
- iovec->iov.iov_len);
+ rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+ (unsigned long)iovec->iov.iov_base,
+ iovec->iov.iov_len);
if (rb_node && !IS_ERR(rb_node))
node = container_of(rb_node, struct sdma_mmu_node, rb);
else
@@ -1076,7 +1088,6 @@ static int pin_vector_pages(struct user_sdma_request *req,
return -ENOMEM;
node->rb.addr = (unsigned long)iovec->iov.iov_base;
- node->rb.len = iovec->iov.iov_len;
node->pq = pq;
atomic_set(&node->refcount, 0);
INIT_LIST_HEAD(&node->list);
@@ -1093,11 +1104,25 @@ static int pin_vector_pages(struct user_sdma_request *req,
memcpy(pages, node->pages, node->npages * sizeof(*pages));
npages -= node->npages;
+
+ /*
+ * If rb_node is NULL, it means that this is brand new node
+ * and, therefore not on the eviction list.
+ * If, however, the rb_node is non-NULL, it means that the
+ * node is already in RB tree and, therefore on the eviction
+ * list (nodes are unconditionally inserted in the eviction
+ * list). In that case, we have to remove the node prior to
+ * calling the eviction function in order to prevent it from
+ * freeing this node.
+ */
+ if (rb_node) {
+ spin_lock(&pq->evict_lock);
+ list_del_init(&node->list);
+ spin_unlock(&pq->evict_lock);
+ }
retry:
if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
- spin_lock(&pq->evict_lock);
cleared = sdma_cache_evict(pq, npages);
- spin_unlock(&pq->evict_lock);
if (cleared >= npages)
goto retry;
}
@@ -1117,37 +1142,32 @@ retry:
goto bail;
}
kfree(node->pages);
+ node->rb.len = iovec->iov.iov_len;
node->pages = pages;
node->npages += pinned;
npages = node->npages;
spin_lock(&pq->evict_lock);
- if (!rb_node)
- list_add(&node->list, &pq->evict);
- else
- list_move(&node->list, &pq->evict);
+ list_add(&node->list, &pq->evict);
pq->n_locked += pinned;
spin_unlock(&pq->evict_lock);
}
iovec->pages = node->pages;
iovec->npages = npages;
- if (!rb_node) {
- ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
- if (ret) {
- spin_lock(&pq->evict_lock);
+ ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+ if (ret) {
+ spin_lock(&pq->evict_lock);
+ if (!list_empty(&node->list))
list_del(&node->list);
- pq->n_locked -= node->npages;
- spin_unlock(&pq->evict_lock);
- ret = 0;
- goto bail;
- }
- } else {
- atomic_inc(&node->refcount);
+ pq->n_locked -= node->npages;
+ spin_unlock(&pq->evict_lock);
+ goto bail;
}
return 0;
bail:
- if (!rb_node)
- kfree(node);
+ if (rb_node)
+ unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+ kfree(node);
return ret;
}
@@ -1558,7 +1578,20 @@ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
container_of(mnode, struct sdma_mmu_node, rb);
spin_lock(&node->pq->evict_lock);
- list_del(&node->list);
+ /*
+ * We've been called by the MMU notifier but this node has been
+ * scheduled for eviction. The eviction function will take care
+ * of freeing this node.
+ * We have to take the above lock first because we are racing
+ * against the setting of the bit in the eviction function.
+ */
+ if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
+ spin_unlock(&node->pq->evict_lock);
+ return;
+ }
+
+ if (!list_empty(&node->list))
+ list_del(&node->list);
node->pq->n_locked -= node->npages;
spin_unlock(&node->pq->evict_lock);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index 89f2aad..9cdc85f 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -545,7 +545,7 @@ static inline int qp_ok(int opcode, struct hfi1_packet *packet)
if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
goto dropit;
- if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+ if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
(opcode == IB_OPCODE_CNP))
return 1;
dropit:
@@ -1089,16 +1089,16 @@ bail:
/*
* egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the ingress partition key table), return 0
+ * being an entry from the partition key table), return 0
* otherwise. Use the matching criteria for egress partition keys
* specified in the OPAv1 spec., section 9.1l.7.
*/
static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
{
u16 mkey = pkey & PKEY_LOW_15_MASK;
- u16 ment = ent & PKEY_LOW_15_MASK;
+ u16 mentry = ent & PKEY_LOW_15_MASK;
- if (mkey == ment) {
+ if (mkey == mentry) {
/*
* If pkey[15] is set (full partition member),
* is bit 15 in the corresponding table element
@@ -1111,32 +1111,32 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
return 0;
}
-/*
- * egress_pkey_check - return 0 if hdr's pkey matches according to the
- * criteria in the OPAv1 spec., section 9.11.7.
+/**
+ * egress_pkey_check - check P_KEY of a packet
+ * @ppd: Physical IB port data
+ * @lrh: Local route header
+ * @bth: Base transport header
+ * @sc5: SC for packet
+ * @s_pkey_index: It will be used for look up optimization for kernel contexts
+ * only. If it is negative value, then it means user contexts is calling this
+ * function.
+ *
+ * It checks if hdr's pkey is valid.
+ *
+ * Return: 0 on success, otherwise, 1
*/
-static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
- struct hfi1_ib_header *hdr,
- struct rvt_qp *qp)
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+ u8 sc5, int8_t s_pkey_index)
{
- struct hfi1_qp_priv *priv = qp->priv;
- struct hfi1_other_headers *ohdr;
struct hfi1_devdata *dd;
- int i = 0;
+ int i;
u16 pkey;
- u8 lnh, sc5 = priv->s_sc;
+ int is_user_ctxt_mechanism = (s_pkey_index < 0);
if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
return 0;
- /* locate the pkey within the headers */
- lnh = be16_to_cpu(hdr->lrh[0]) & 3;
- if (lnh == HFI1_LRH_GRH)
- ohdr = &hdr->u.l.oth;
- else
- ohdr = &hdr->u.oth;
-
- pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+ pkey = (u16)be32_to_cpu(bth[0]);
/* If SC15, pkey[0:14] must be 0x7fff */
if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
@@ -1146,28 +1146,37 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
if ((pkey & PKEY_LOW_15_MASK) == 0)
goto bad;
- /* The most likely matching pkey has index qp->s_pkey_index */
- if (unlikely(!egress_pkey_matches_entry(pkey,
- ppd->pkeys
- [qp->s_pkey_index]))) {
- /* no match - try the entire table */
- for (; i < MAX_PKEY_VALUES; i++) {
- if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
- break;
- }
+ /*
+ * For the kernel contexts only, if a qp is passed into the function,
+ * the most likely matching pkey has index qp->s_pkey_index
+ */
+ if (!is_user_ctxt_mechanism &&
+ egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
+ return 0;
}
- if (i < MAX_PKEY_VALUES)
- return 0;
+ for (i = 0; i < MAX_PKEY_VALUES; i++) {
+ if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+ return 0;
+ }
bad:
- incr_cntr64(&ppd->port_xmit_constraint_errors);
- dd = ppd->dd;
- if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
- u16 slid = be16_to_cpu(hdr->lrh[3]);
-
- dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
- dd->err_info_xmit_constraint.slid = slid;
- dd->err_info_xmit_constraint.pkey = pkey;
+ /*
+ * For the user-context mechanism, the P_KEY check would only happen
+ * once per SDMA request, not once per packet. Therefore, there's no
+ * need to increment the counter for the user-context mechanism.
+ */
+ if (!is_user_ctxt_mechanism) {
+ incr_cntr64(&ppd->port_xmit_constraint_errors);
+ dd = ppd->dd;
+ if (!(dd->err_info_xmit_constraint.status &
+ OPA_EI_STATUS_SMASK)) {
+ u16 slid = be16_to_cpu(lrh[3]);
+
+ dd->err_info_xmit_constraint.status |=
+ OPA_EI_STATUS_SMASK;
+ dd->err_info_xmit_constraint.slid = slid;
+ dd->err_info_xmit_constraint.pkey = pkey;
+ }
}
return 1;
}
@@ -1227,11 +1236,26 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_other_headers *ohdr;
+ struct hfi1_ib_header *hdr;
send_routine sr;
int ret;
+ u8 lnh;
+
+ hdr = &ps->s_txreq->phdr.hdr;
+ /* locate the pkey within the headers */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == HFI1_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else
+ ohdr = &hdr->u.oth;
sr = get_send_routine(qp, ps->s_txreq);
- ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp);
+ ret = egress_pkey_check(dd->pport,
+ hdr->lrh,
+ ohdr->bth,
+ priv->s_sc,
+ qp->s_pkey_index);
if (unlikely(ret)) {
/*
* The value we are returning here does not get propagated to
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
index 6c4670f..3ee2239 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -215,6 +215,7 @@ struct hfi1_pkt_state {
struct hfi1_ibport *ibp;
struct hfi1_pportdata *ppd;
struct verbs_txreq *s_txreq;
+ unsigned long flags;
};
#define HFI1_PSN_CREDIT 16
@@ -334,9 +335,6 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
#endif
#define PSN_MODIFY_MASK 0xFFFFFF
-/* Number of bits to pay attention to in the opcode for checking qp type */
-#define OPCODE_QP_MASK 0xE0
-
/*
* Compare the lower 24 bits of the msn values.
* Returns an integer <, ==, or > than zero.