Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/Kconfig | 8
-rw-r--r--  drivers/infiniband/Makefile | 1
-rw-r--r--  drivers/infiniband/core/Makefile | 4
-rw-r--r--  drivers/infiniband/core/cache.c | 1
-rw-r--r--  drivers/infiniband/core/cm.c | 29
-rw-r--r--  drivers/infiniband/core/cma.c | 110
-rw-r--r--  drivers/infiniband/core/device.c | 138
-rw-r--r--  drivers/infiniband/core/umem.c (renamed from drivers/infiniband/core/uverbs_mem.c) | 162
-rw-r--r--  drivers/infiniband/core/uverbs.h | 6
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c | 60
-rw-r--r--  drivers/infiniband/core/uverbs_main.c | 11
-rw-r--r--  drivers/infiniband/hw/amso1100/c2.c | 2
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_provider.c | 42
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_provider.h | 1
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_provider.c | 28
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_provider.h | 1
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_classes.h | 2
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.c | 13
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_iverbs.h | 3
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_main.c | 96
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_mrmw.c | 76
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_qp.c | 17
-rw-r--r--  drivers/infiniband/hw/ehca/hcp_if.c | 15
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_fs.c | 2
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_iba6120.c | 7
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_intr.c | 7
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_kernel.h | 2
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_mr.c | 38
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.c | 12
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.h | 5
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs_mcast.c | 16
-rw-r--r--  drivers/infiniband/hw/mlx4/Kconfig | 9
-rw-r--r--  drivers/infiniband/hw/mlx4/Makefile | 3
-rw-r--r--  drivers/infiniband/hw/mlx4/ah.c | 100
-rw-r--r--  drivers/infiniband/hw/mlx4/cq.c | 530
-rw-r--r--  drivers/infiniband/hw/mlx4/doorbell.c | 216
-rw-r--r--  drivers/infiniband/hw/mlx4/mad.c | 339
-rw-r--r--  drivers/infiniband/hw/mlx4/main.c | 658
-rw-r--r--  drivers/infiniband/hw/mlx4/mlx4_ib.h | 288
-rw-r--r--  drivers/infiniband/hw/mlx4/mr.c | 184
-rw-r--r--  drivers/infiniband/hw/mlx4/qp.c | 1457
-rw-r--r--  drivers/infiniband/hw/mlx4/srq.c | 340
-rw-r--r--  drivers/infiniband/hw/mlx4/user.h | 96
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_av.c | 1
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_cmd.c | 3
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_cq.c | 5
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_main.c | 4
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_memfree.c | 1
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_provider.c | 38
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_provider.h | 1
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_qp.c | 166
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_srq.c | 1
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib.h | 49
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_cm.c | 251
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 118
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_main.c | 7
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 40
-rw-r--r--  drivers/infiniband/ulp/iser/iser_verbs.c | 1
59 files changed, 5276 insertions, 547 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 66b36de..994decc 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -1,4 +1,5 @@
menu "InfiniBand support"
+ depends on HAS_IOMEM
config INFINIBAND
depends on PCI || BROKEN
@@ -29,6 +30,11 @@ config INFINIBAND_USER_ACCESS
libibverbs, libibcm and a hardware driver library from
<http://www.openib.org>.
+config INFINIBAND_USER_MEM
+ bool
+ depends on INFINIBAND_USER_ACCESS != n
+ default y
+
config INFINIBAND_ADDR_TRANS
bool
depends on INFINIBAND && INET
@@ -40,6 +46,8 @@ source "drivers/infiniband/hw/ehca/Kconfig"
source "drivers/infiniband/hw/amso1100/Kconfig"
source "drivers/infiniband/hw/cxgb3/Kconfig"
+source "drivers/infiniband/hw/mlx4/Kconfig"
+
source "drivers/infiniband/ulp/ipoib/Kconfig"
source "drivers/infiniband/ulp/srp/Kconfig"
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
index da2066c..75f325e 100644
--- a/drivers/infiniband/Makefile
+++ b/drivers/infiniband/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_IPATH) += hw/ipath/
obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/
obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/
obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/
+obj-$(CONFIG_MLX4_INFINIBAND) += hw/mlx4/
obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/
obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/
obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 189e5d4..cb1ab3e 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
device.o fmr_pool.o cache.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
@@ -28,5 +29,4 @@ ib_umad-y := user_mad.o
ib_ucm-y := ucm.o
-ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_mem.o \
- uverbs_marshall.o
+ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 558c9a0..e85f701 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -38,6 +38,7 @@
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include <rdma/ib_cache.h>
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index eff591d..40c004a 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -306,7 +306,9 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv)
do {
spin_lock_irqsave(&cm.lock, flags);
ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
- next_id++, &id);
+ next_id, &id);
+ if (!ret)
+ next_id = ((unsigned) id + 1) & MAX_ID_MASK;
spin_unlock_irqrestore(&cm.lock, flags);
} while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
@@ -1295,26 +1297,29 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
- /* Check for duplicate REQ and stale connections. */
+ /* Check for possible duplicate REQ. */
spin_lock_irqsave(&cm.lock, flags);
timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
- if (!timewait_info)
- timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
-
if (timewait_info) {
cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
timewait_info->work.remote_id);
- cm_cleanup_timewait(cm_id_priv->timewait_info);
spin_unlock_irqrestore(&cm.lock, flags);
if (cur_cm_id_priv) {
cm_dup_req_handler(work, cur_cm_id_priv);
cm_deref_id(cur_cm_id_priv);
- } else
- cm_issue_rej(work->port, work->mad_recv_wc,
- IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
- NULL, 0);
- listen_cm_id_priv = NULL;
- goto out;
+ }
+ return NULL;
+ }
+
+ /* Check for stale connections. */
+ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irqrestore(&cm.lock, flags);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ return NULL;
}
/* Find matching listen request. */
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index fde92ce..32a0e66 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -346,12 +346,33 @@ static void cma_deref_id(struct rdma_id_private *id_priv)
complete(&id_priv->comp);
}
-static void cma_release_remove(struct rdma_id_private *id_priv)
+static int cma_disable_remove(struct rdma_id_private *id_priv,
+ enum cma_state state)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if (id_priv->state == state) {
+ atomic_inc(&id_priv->dev_remove);
+ ret = 0;
+ } else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+
+static void cma_enable_remove(struct rdma_id_private *id_priv)
{
if (atomic_dec_and_test(&id_priv->dev_remove))
wake_up(&id_priv->wait_remove);
}
+static int cma_has_cm_dev(struct rdma_id_private *id_priv)
+{
+ return (id_priv->id.device && id_priv->cm_id.ib);
+}
+
struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
void *context, enum rdma_port_space ps)
{
@@ -884,9 +905,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
struct rdma_cm_event event;
int ret = 0;
- atomic_inc(&id_priv->dev_remove);
- if (!cma_comp(id_priv, CMA_CONNECT))
- goto out;
+ if (cma_disable_remove(id_priv, CMA_CONNECT))
+ return 0;
memset(&event, 0, sizeof event);
switch (ib_event->event) {
@@ -942,12 +962,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
cma_exch(id_priv, CMA_DESTROYING);
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
rdma_destroy_id(&id_priv->id);
return ret;
}
out:
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
return ret;
}
@@ -1057,11 +1077,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
int offset, ret;
listen_id = cm_id->context;
- atomic_inc(&listen_id->dev_remove);
- if (!cma_comp(listen_id, CMA_LISTEN)) {
- ret = -ECONNABORTED;
- goto out;
- }
+ if (cma_disable_remove(listen_id, CMA_LISTEN))
+ return -ECONNABORTED;
memset(&event, 0, sizeof event);
offset = cma_user_data_offset(listen_id->id.ps);
@@ -1101,11 +1118,11 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
release_conn_id:
cma_exch(conn_id, CMA_DESTROYING);
- cma_release_remove(conn_id);
+ cma_enable_remove(conn_id);
rdma_destroy_id(&conn_id->id);
out:
- cma_release_remove(listen_id);
+ cma_enable_remove(listen_id);
return ret;
}
@@ -1171,9 +1188,10 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
struct sockaddr_in *sin;
int ret = 0;
- memset(&event, 0, sizeof event);
- atomic_inc(&id_priv->dev_remove);
+ if (cma_disable_remove(id_priv, CMA_CONNECT))
+ return 0;
+ memset(&event, 0, sizeof event);
switch (iw_event->event) {
case IW_CM_EVENT_CLOSE:
event.event = RDMA_CM_EVENT_DISCONNECTED;
@@ -1214,12 +1232,12 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.iw = NULL;
cma_exch(id_priv, CMA_DESTROYING);
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
rdma_destroy_id(&id_priv->id);
return ret;
}
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
return ret;
}
@@ -1234,11 +1252,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
int ret;
listen_id = cm_id->context;
- atomic_inc(&listen_id->dev_remove);
- if (!cma_comp(listen_id, CMA_LISTEN)) {
- ret = -ECONNABORTED;
- goto out;
- }
+ if (cma_disable_remove(listen_id, CMA_LISTEN))
+ return -ECONNABORTED;
/* Create a new RDMA id for the new IW CM ID */
new_cm_id = rdma_create_id(listen_id->id.event_handler,
@@ -1255,13 +1270,13 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
dev = ip_dev_find(iw_event->local_addr.sin_addr.s_addr);
if (!dev) {
ret = -EADDRNOTAVAIL;
- cma_release_remove(conn_id);
+ cma_enable_remove(conn_id);
rdma_destroy_id(new_cm_id);
goto out;
}
ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
if (ret) {
- cma_release_remove(conn_id);
+ cma_enable_remove(conn_id);
rdma_destroy_id(new_cm_id);
goto out;
}
@@ -1270,7 +1285,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
ret = cma_acquire_dev(conn_id);
mutex_unlock(&lock);
if (ret) {
- cma_release_remove(conn_id);
+ cma_enable_remove(conn_id);
rdma_destroy_id(new_cm_id);
goto out;
}
@@ -1293,14 +1308,14 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
/* User wants to destroy the CM ID */
conn_id->cm_id.iw = NULL;
cma_exch(conn_id, CMA_DESTROYING);
- cma_release_remove(conn_id);
+ cma_enable_remove(conn_id);
rdma_destroy_id(&conn_id->id);
}
out:
if (dev)
dev_put(dev);
- cma_release_remove(listen_id);
+ cma_enable_remove(listen_id);
return ret;
}
@@ -1519,7 +1534,7 @@ static void cma_work_handler(struct work_struct *_work)
destroy = 1;
}
out:
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
cma_deref_id(id_priv);
if (destroy)
rdma_destroy_id(&id_priv->id);
@@ -1711,13 +1726,13 @@ static void addr_handler(int status, struct sockaddr *src_addr,
if (id_priv->id.event_handler(&id_priv->id, &event)) {
cma_exch(id_priv, CMA_DESTROYING);
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
cma_deref_id(id_priv);
rdma_destroy_id(&id_priv->id);
return;
}
out:
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
cma_deref_id(id_priv);
}
@@ -2042,11 +2057,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
int ret = 0;
- memset(&event, 0, sizeof event);
- atomic_inc(&id_priv->dev_remove);
- if (!cma_comp(id_priv, CMA_CONNECT))
- goto out;
+ if (cma_disable_remove(id_priv, CMA_CONNECT))
+ return 0;
+ memset(&event, 0, sizeof event);
switch (ib_event->event) {
case IB_CM_SIDR_REQ_ERROR:
event.event = RDMA_CM_EVENT_UNREACHABLE;
@@ -2084,12 +2098,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
cma_exch(id_priv, CMA_DESTROYING);
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
rdma_destroy_id(&id_priv->id);
return ret;
}
out:
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
return ret;
}
@@ -2413,7 +2427,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp(id_priv, CMA_CONNECT))
+ if (!cma_has_cm_dev(id_priv))
return -EINVAL;
switch (id->device->node_type) {
@@ -2435,7 +2449,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp(id_priv, CMA_CONNECT))
+ if (!cma_has_cm_dev(id_priv))
return -EINVAL;
switch (rdma_node_get_transport(id->device->node_type)) {
@@ -2466,8 +2480,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp(id_priv, CMA_CONNECT) &&
- !cma_comp(id_priv, CMA_DISCONNECT))
+ if (!cma_has_cm_dev(id_priv))
return -EINVAL;
switch (rdma_node_get_transport(id->device->node_type)) {
@@ -2499,10 +2512,9 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
int ret;
id_priv = mc->id_priv;
- atomic_inc(&id_priv->dev_remove);
- if (!cma_comp(id_priv, CMA_ADDR_BOUND) &&
- !cma_comp(id_priv, CMA_ADDR_RESOLVED))
- goto out;
+ if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) &&
+ cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
+ return 0;
if (!status && id_priv->id.qp)
status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
@@ -2524,12 +2536,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
ret = id_priv->id.event_handler(&id_priv->id, &event);
if (ret) {
cma_exch(id_priv, CMA_DESTROYING);
- cma_release_remove(id_priv);
+ cma_enable_remove(id_priv);
rdma_destroy_id(&id_priv->id);
return 0;
}
-out:
- cma_release_remove(id_priv);
+
+ cma_enable_remove(id_priv);
return 0;
}
@@ -2761,8 +2773,8 @@ static int cma_init(void)
int ret;
get_random_bytes(&next_port, sizeof next_port);
- next_port = (next_port % (sysctl_local_port_range[1] -
- sysctl_local_port_range[0])) +
+ next_port = ((unsigned int) next_port %
+ (sysctl_local_port_range[1] - sysctl_local_port_range[0])) +
sysctl_local_port_range[0];
cma_wq = create_singlethread_workqueue("rdma_cm");
if (!cma_wq)
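The cma.c hunks above all apply the same conversion: the unconditional atomic_inc(&id_priv->dev_remove) followed by a state check becomes a single cma_disable_remove() call, matched by cma_enable_remove() on every exit path. A condensed sketch of the resulting handler shape; the function name is illustrative, and the state and destroy path follow cma_ib_handler above:

/*
 * Condensed illustration of the pattern used by the cma.c event
 * handlers after this change: take the dev_remove reference only if
 * the id is still in the expected state, drop it on every exit path.
 */
static int example_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
	struct rdma_id_private *id_priv = cm_id->context;
	struct rdma_cm_event event;
	int ret;

	/* Only proceed (and pin against device removal) in the right state. */
	if (cma_disable_remove(id_priv, CMA_CONNECT))
		return 0;

	memset(&event, 0, sizeof event);
	/* ... translate ib_event into event ... */

	ret = id_priv->id.event_handler(&id_priv->id, &event);
	if (ret) {
		/* Consumer wants the id destroyed: drop the pin first. */
		id_priv->cm_id.ib = NULL;
		cma_exch(id_priv, CMA_DESTROYING);
		cma_enable_remove(id_priv);
		rdma_destroy_id(&id_priv->id);
		return ret;
	}

	cma_enable_remove(id_priv);
	return 0;
}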
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7fabb42..3ada17c 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -40,6 +40,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mutex.h>
+#include <linux/workqueue.h>
#include "core_priv.h"
@@ -149,6 +150,18 @@ static int alloc_name(char *name)
return 0;
}
+static int start_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+ 0 : device->phys_port_cnt;
+}
+
/**
* ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
@@ -208,6 +221,45 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
return 0;
}
+static int read_port_table_lengths(struct ib_device *device)
+{
+ struct ib_port_attr *tprops = NULL;
+ int num_ports, ret = -ENOMEM;
+ u8 port_index;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ goto out;
+
+ num_ports = end_port(device) - start_port(device) + 1;
+
+ device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+ GFP_KERNEL);
+ device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+ GFP_KERNEL);
+ if (!device->pkey_tbl_len || !device->gid_tbl_len)
+ goto err;
+
+ for (port_index = 0; port_index < num_ports; ++port_index) {
+ ret = ib_query_port(device, port_index + start_port(device),
+ tprops);
+ if (ret)
+ goto err;
+ device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+ device->gid_tbl_len[port_index] = tprops->gid_tbl_len;
+ }
+
+ ret = 0;
+ goto out;
+
+err:
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+out:
+ kfree(tprops);
+ return ret;
+}
+
/**
* ib_register_device - Register an IB device with IB core
* @device:Device to register
@@ -239,10 +291,19 @@ int ib_register_device(struct ib_device *device)
spin_lock_init(&device->event_handler_lock);
spin_lock_init(&device->client_data_lock);
+ ret = read_port_table_lengths(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+ device->name);
+ goto out;
+ }
+
ret = ib_device_register_sysfs(device);
if (ret) {
printk(KERN_WARNING "Couldn't register device %s with driver model\n",
device->name);
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
goto out;
}
@@ -284,6 +345,9 @@ void ib_unregister_device(struct ib_device *device)
list_del(&device->core_list);
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+
mutex_unlock(&device_mutex);
spin_lock_irqsave(&device->client_data_lock, flags);
@@ -506,10 +570,7 @@ int ib_query_port(struct ib_device *device,
u8 port_num,
struct ib_port_attr *port_attr)
{
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- if (port_num)
- return -EINVAL;
- } else if (port_num < 1 || port_num > device->phys_port_cnt)
+ if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
return device->query_port(device, port_num, port_attr);
@@ -581,10 +642,7 @@ int ib_modify_port(struct ib_device *device,
u8 port_num, int port_modify_mask,
struct ib_port_modify *port_modify)
{
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- if (port_num)
- return -EINVAL;
- } else if (port_num < 1 || port_num > device->phys_port_cnt)
+ if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
return device->modify_port(device, port_num, port_modify_mask,
@@ -592,6 +650,68 @@ int ib_modify_port(struct ib_device *device,
}
EXPORT_SYMBOL(ib_modify_port);
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ * a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found. This
+ * parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index)
+{
+ union ib_gid tmp_gid;
+ int ret, port, i;
+
+ for (port = start_port(device); port <= end_port(device); ++port) {
+ for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+ ret = ib_query_gid(device, port, i, &tmp_gid);
+ if (ret)
+ return ret;
+ if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+ *port_num = port;
+ if (index)
+ *index = i;
+ return 0;
+ }
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index)
+{
+ int ret, i;
+ u16 tmp_pkey;
+
+ for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+ ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+ if (ret)
+ return ret;
+
+ if (pkey == tmp_pkey) {
+ *index = i;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
static int __init ib_core_init(void)
{
int ret;
@@ -613,6 +733,8 @@ static void __exit ib_core_cleanup(void)
{
ib_cache_cleanup();
ib_sysfs_cleanup();
+ /* Make sure that any pending umem accounting work is done. */
+ flush_scheduled_work();
}
module_init(ib_core_init);
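For context, a minimal sketch of how the two helpers added above might be called; example_lookup() and its arguments are invented for illustration and are not part of the patch:

/*
 * Hypothetical caller of the new ib_find_gid()/ib_find_pkey() helpers;
 * "dev", "gid" and "pkey" come from the surrounding driver/ULP code.
 */
static int example_lookup(struct ib_device *dev, union ib_gid *gid, u16 pkey)
{
	u16 pkey_index;
	u8 port;
	int ret;

	/* Which port (if any) has this GID?  The table index is not needed
	 * here, so NULL is passed for it (allowed per the kerneldoc above). */
	ret = ib_find_gid(dev, gid, &port, NULL);
	if (ret)
		return ret;	/* -ENOENT if the GID is not on any port */

	/* Then locate the PKey in that port's PKey table. */
	return ib_find_pkey(dev, port, pkey, &pkey_index);
}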
diff --git a/drivers/infiniband/core/uverbs_mem.c b/drivers/infiniband/core/umem.c
index c95fe95..d40652a 100644
--- a/drivers/infiniband/core/uverbs_mem.c
+++ b/drivers/infiniband/core/umem.c
@@ -36,16 +36,10 @@
#include <linux/mm.h>
#include <linux/dma-mapping.h>
+#include <linux/sched.h>
#include "uverbs.h"
-struct ib_umem_account_work {
- struct work_struct work;
- struct mm_struct *mm;
- unsigned long diff;
-};
-
-
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
struct ib_umem_chunk *chunk, *tmp;
@@ -64,35 +58,56 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
}
}
-int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
- void *addr, size_t size, int write)
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access)
{
+ struct ib_umem *umem;
struct page **page_list;
struct ib_umem_chunk *chunk;
unsigned long locked;
unsigned long lock_limit;
unsigned long cur_base;
unsigned long npages;
- int ret = 0;
+ int ret;
int off;
int i;
if (!can_do_mlock())
- return -EPERM;
+ return ERR_PTR(-EPERM);
- page_list = (struct page **) __get_free_page(GFP_KERNEL);
- if (!page_list)
- return -ENOMEM;
+ umem = kmalloc(sizeof *umem, GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
- mem->user_base = (unsigned long) addr;
- mem->length = size;
- mem->offset = (unsigned long) addr & ~PAGE_MASK;
- mem->page_size = PAGE_SIZE;
- mem->writable = write;
+ umem->context = context;
+ umem->length = size;
+ umem->offset = addr & ~PAGE_MASK;
+ umem->page_size = PAGE_SIZE;
+ /*
+ * We ask for writable memory if any access flags other than
+ * "remote read" are set. "Local write" and "remote write"
+ * obviously require write access. "Remote atomic" can do
+ * things like fetch and add, which will modify memory, and
+ * "MW bind" can change permissions by binding a window.
+ */
+ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
- INIT_LIST_HEAD(&mem->chunk_list);
+ INIT_LIST_HEAD(&umem->chunk_list);
- npages = PAGE_ALIGN(size + mem->offset) >> PAGE_SHIFT;
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list) {
+ kfree(umem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
down_write(&current->mm->mmap_sem);
@@ -104,13 +119,13 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
goto out;
}
- cur_base = (unsigned long) addr & PAGE_MASK;
+ cur_base = addr & PAGE_MASK;
while (npages) {
ret = get_user_pages(current, current->mm, cur_base,
min_t(int, npages,
PAGE_SIZE / sizeof (struct page *)),
- 1, !write, page_list, NULL);
+ 1, !umem->writable, page_list, NULL);
if (ret < 0)
goto out;
@@ -136,7 +151,7 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
chunk->page_list[i].length = PAGE_SIZE;
}
- chunk->nmap = ib_dma_map_sg(dev,
+ chunk->nmap = ib_dma_map_sg(context->device,
&chunk->page_list[0],
chunk->nents,
DMA_BIDIRECTIONAL);
@@ -151,75 +166,98 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
ret -= chunk->nents;
off += chunk->nents;
- list_add_tail(&chunk->list, &mem->chunk_list);
+ list_add_tail(&chunk->list, &umem->chunk_list);
}
ret = 0;
}
out:
- if (ret < 0)
- __ib_umem_release(dev, mem, 0);
- else
+ if (ret < 0) {
+ __ib_umem_release(context->device, umem, 0);
+ kfree(umem);
+ } else
current->mm->locked_vm = locked;
up_write(&current->mm->mmap_sem);
free_page((unsigned long) page_list);
- return ret;
+ return ret < 0 ? ERR_PTR(ret) : umem;
}
+EXPORT_SYMBOL(ib_umem_get);
-void ib_umem_release(struct ib_device *dev, struct ib_umem *umem)
+static void ib_umem_account(struct work_struct *work)
{
- __ib_umem_release(dev, umem, 1);
-
- down_write(&current->mm->mmap_sem);
- current->mm->locked_vm -=
- PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
- up_write(&current->mm->mmap_sem);
-}
+ struct ib_umem *umem = container_of(work, struct ib_umem, work);
-static void ib_umem_account(struct work_struct *_work)
-{
- struct ib_umem_account_work *work =
- container_of(_work, struct ib_umem_account_work, work);
-
- down_write(&work->mm->mmap_sem);
- work->mm->locked_vm -= work->diff;
- up_write(&work->mm->mmap_sem);
- mmput(work->mm);
- kfree(work);
+ down_write(&umem->mm->mmap_sem);
+ umem->mm->locked_vm -= umem->diff;
+ up_write(&umem->mm->mmap_sem);
+ mmput(umem->mm);
+ kfree(umem);
}
-void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem)
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
{
- struct ib_umem_account_work *work;
+ struct ib_ucontext *context = umem->context;
struct mm_struct *mm;
+ unsigned long diff;
- __ib_umem_release(dev, umem, 1);
+ __ib_umem_release(umem->context->device, umem, 1);
mm = get_task_mm(current);
- if (!mm)
+ if (!mm) {
+ kfree(umem);
return;
+ }
+
+ diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
/*
* We may be called with the mm's mmap_sem already held. This
* can happen when a userspace munmap() is the call that drops
* the last reference to our file and calls our release
* method. If there are memory regions to destroy, we'll end
- * up here and not be able to take the mmap_sem. Therefore we
- * defer the vm_locked accounting to the system workqueue.
+ * up here and not be able to take the mmap_sem. In that case
+ * we defer the vm_locked accounting to the system workqueue.
*/
+ if (context->closing) {
+ if (!down_write_trylock(&mm->mmap_sem)) {
+ INIT_WORK(&umem->work, ib_umem_account);
+ umem->mm = mm;
+ umem->diff = diff;
+
+ schedule_work(&umem->work);
+ return;
+ }
+ } else
+ down_write(&mm->mmap_sem);
- work = kmalloc(sizeof *work, GFP_KERNEL);
- if (!work) {
- mmput(mm);
- return;
- }
+ current->mm->locked_vm -= diff;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+ struct ib_umem_chunk *chunk;
+ int shift;
+ int i;
+ int n;
+
+ shift = ilog2(umem->page_size);
- INIT_WORK(&work->work, ib_umem_account);
- work->mm = mm;
- work->diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+ n = 0;
+ list_for_each_entry(chunk, &umem->chunk_list, list)
+ for (i = 0; i < chunk->nmap; ++i)
+ n += sg_dma_len(&chunk->page_list[i]) >> shift;
- schedule_work(&work->work);
+ return n;
}
+EXPORT_SYMBOL(ib_umem_page_count);
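Every hardware driver touched below (amso1100, cxgb3, ehca, and the new mlx4) adopts this interface in the same way: the provider's reg_user_mr() now receives the raw start/length/virt/access arguments, pins the pages itself with ib_umem_get(), and releases them with ib_umem_release() on error and in dereg_mr(). A condensed sketch of that shape, with the driver-private MR structure and the hardware-programming step invented/elided for illustration:

/*
 * Rough shape of a driver's ->reg_user_mr() after this change.
 * "struct example_mr" stands in for the driver-private MR structure
 * (c2_mr, iwch_mr, ehca_mr below); hardware programming is elided.
 */
struct example_mr {
	struct ib_mr	ibmr;
	struct ib_umem *umem;	/* drivers now keep the umem themselves */
};

static struct ib_mr *example_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
					 u64 virt, int acc, struct ib_udata *udata)
{
	struct ib_umem_chunk *chunk;
	struct example_mr *mr;
	int n = 0;

	mr = kzalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	/* Pin and DMA-map the user buffer; the uverbs core no longer does this. */
	mr->umem = ib_umem_get(pd->uobject->context, start, length, acc);
	if (IS_ERR(mr->umem)) {
		int err = PTR_ERR(mr->umem);
		kfree(mr);
		return ERR_PTR(err);
	}

	/* Walk the DMA-mapped chunks to size the HCA's page list. */
	list_for_each_entry(chunk, &mr->umem->chunk_list, list)
		n += chunk->nents;

	/* ... register the n pages with the hardware, using virt,
	 *     mr->umem->offset and mr->umem->page_size, as the c2/cxgb3/ehca
	 *     hunks below do ... */

	return &mr->ibmr;	/* dereg_mr() must call ib_umem_release(mr->umem) */
}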
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 102a59c..c33546f 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -45,6 +45,7 @@
#include <linux/completion.h>
#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
/*
@@ -163,11 +164,6 @@ void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_event_handler(struct ib_event_handler *handler,
struct ib_event *event);
-int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
- void *addr, size_t size, int write);
-void ib_umem_release(struct ib_device *dev, struct ib_umem *umem);
-void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem);
-
#define IB_UVERBS_DECLARE_CMD(name) \
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
const char __user *buf, int in_len, \
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index bab6676..01d7008 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1,6 +1,6 @@
/*
* Copyright (c) 2005 Topspin Communications. All rights reserved.
- * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
* Copyright (c) 2006 Mellanox Technologies. All rights reserved.
*
@@ -295,6 +295,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&ucontext->qp_list);
INIT_LIST_HEAD(&ucontext->srq_list);
INIT_LIST_HEAD(&ucontext->ah_list);
+ ucontext->closing = 0;
resp.num_comp_vectors = file->device->num_comp_vectors;
@@ -573,7 +574,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
struct ib_uverbs_reg_mr cmd;
struct ib_uverbs_reg_mr_resp resp;
struct ib_udata udata;
- struct ib_umem_object *obj;
+ struct ib_uobject *uobj;
struct ib_pd *pd;
struct ib_mr *mr;
int ret;
@@ -599,35 +600,21 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
!(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
return -EINVAL;
- obj = kmalloc(sizeof *obj, GFP_KERNEL);
- if (!obj)
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
return -ENOMEM;
- init_uobj(&obj->uobject, 0, file->ucontext, &mr_lock_key);
- down_write(&obj->uobject.mutex);
-
- /*
- * We ask for writable memory if any access flags other than
- * "remote read" are set. "Local write" and "remote write"
- * obviously require write access. "Remote atomic" can do
- * things like fetch and add, which will modify memory, and
- * "MW bind" can change permissions by binding a window.
- */
- ret = ib_umem_get(file->device->ib_dev, &obj->umem,
- (void *) (unsigned long) cmd.start, cmd.length,
- !!(cmd.access_flags & ~IB_ACCESS_REMOTE_READ));
- if (ret)
- goto err_free;
-
- obj->umem.virt_base = cmd.hca_va;
+ init_uobj(uobj, 0, file->ucontext, &mr_lock_key);
+ down_write(&uobj->mutex);
pd = idr_read_pd(cmd.pd_handle, file->ucontext);
if (!pd) {
ret = -EINVAL;
- goto err_release;
+ goto err_free;
}
- mr = pd->device->reg_user_mr(pd, &obj->umem, cmd.access_flags, &udata);
+ mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+ cmd.access_flags, &udata);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
goto err_put;
@@ -635,19 +622,19 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
mr->device = pd->device;
mr->pd = pd;
- mr->uobject = &obj->uobject;
+ mr->uobject = uobj;
atomic_inc(&pd->usecnt);
atomic_set(&mr->usecnt, 0);
- obj->uobject.object = mr;
- ret = idr_add_uobj(&ib_uverbs_mr_idr, &obj->uobject);
+ uobj->object = mr;
+ ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
if (ret)
goto err_unreg;
memset(&resp, 0, sizeof resp);
resp.lkey = mr->lkey;
resp.rkey = mr->rkey;
- resp.mr_handle = obj->uobject.id;
+ resp.mr_handle = uobj->id;
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp)) {
@@ -658,17 +645,17 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
put_pd_read(pd);
mutex_lock(&file->mutex);
- list_add_tail(&obj->uobject.list, &file->ucontext->mr_list);
+ list_add_tail(&uobj->list, &file->ucontext->mr_list);
mutex_unlock(&file->mutex);
- obj->uobject.live = 1;
+ uobj->live = 1;
- up_write(&obj->uobject.mutex);
+ up_write(&uobj->mutex);
return in_len;
err_copy:
- idr_remove_uobj(&ib_uverbs_mr_idr, &obj->uobject);
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
err_unreg:
ib_dereg_mr(mr);
@@ -676,11 +663,8 @@ err_unreg:
err_put:
put_pd_read(pd);
-err_release:
- ib_umem_release(file->device->ib_dev, &obj->umem);
-
err_free:
- put_uobj_write(&obj->uobject);
+ put_uobj_write(uobj);
return ret;
}
@@ -691,7 +675,6 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
struct ib_uverbs_dereg_mr cmd;
struct ib_mr *mr;
struct ib_uobject *uobj;
- struct ib_umem_object *memobj;
int ret = -EINVAL;
if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -701,8 +684,7 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
if (!uobj)
return -EINVAL;
- memobj = container_of(uobj, struct ib_umem_object, uobject);
- mr = uobj->object;
+ mr = uobj->object;
ret = ib_dereg_mr(mr);
if (!ret)
@@ -719,8 +701,6 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
list_del(&uobj->list);
mutex_unlock(&file->mutex);
- ib_umem_release(file->device->ib_dev, &memobj->umem);
-
put_uobj(uobj);
return in_len;
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index d44e547..14d7ccd 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -183,6 +183,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
if (!context)
return 0;
+ context->closing = 1;
+
list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
struct ib_ah *ah = uobj->object;
@@ -230,16 +232,10 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
struct ib_mr *mr = uobj->object;
- struct ib_device *mrdev = mr->device;
- struct ib_umem_object *memobj;
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
ib_dereg_mr(mr);
-
- memobj = container_of(uobj, struct ib_umem_object, uobject);
- ib_umem_release_on_close(mrdev, &memobj->umem);
-
- kfree(memobj);
+ kfree(uobj);
}
list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
@@ -906,7 +902,6 @@ static void __exit ib_uverbs_cleanup(void)
unregister_filesystem(&uverbs_event_fs);
class_destroy(uverbs_class);
unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
- flush_scheduled_work();
idr_destroy(&ib_uverbs_pd_idr);
idr_destroy(&ib_uverbs_mr_idr);
idr_destroy(&ib_uverbs_mw_idr);
diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c
index 58bc272..0aecea6 100644
--- a/drivers/infiniband/hw/amso1100/c2.c
+++ b/drivers/infiniband/hw/amso1100/c2.c
@@ -672,7 +672,7 @@ static int c2_up(struct net_device *netdev)
* rdma interface.
*/
in_dev = in_dev_get(netdev);
- in_dev->cnf.arp_ignore = 1;
+ IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
in_dev_put(in_dev);
return 0;
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
index 1091662..997cf15 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -56,6 +56,7 @@
#include <asm/byteorder.h>
#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
#include "c2.h"
#include "c2_provider.h"
@@ -396,6 +397,7 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
}
mr->pd = to_c2pd(ib_pd);
+ mr->umem = NULL;
pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
"*iova_start %llx, first pa %llx, last pa %llx\n",
__FUNCTION__, page_shift, pbl_depth, total_len,
@@ -428,8 +430,8 @@ static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
}
-static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
- int acc, struct ib_udata *udata)
+static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
{
u64 *pages;
u64 kva = 0;
@@ -441,15 +443,23 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
struct c2_mr *c2mr;
pr_debug("%s:%u\n", __FUNCTION__, __LINE__);
- shift = ffs(region->page_size) - 1;
c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
if (!c2mr)
return ERR_PTR(-ENOMEM);
c2mr->pd = c2pd;
+ c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc);
+ if (IS_ERR(c2mr->umem)) {
+ err = PTR_ERR(c2mr->umem);
+ kfree(c2mr);
+ return ERR_PTR(err);
+ }
+
+ shift = ffs(c2mr->umem->page_size) - 1;
+
n = 0;
- list_for_each_entry(chunk, &region->chunk_list, list)
+ list_for_each_entry(chunk, &c2mr->umem->chunk_list, list)
n += chunk->nents;
pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
@@ -459,35 +469,34 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
}
i = 0;
- list_for_each_entry(chunk, &region->chunk_list, list) {
+ list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) {
for (j = 0; j < chunk->nmap; ++j) {
len = sg_dma_len(&chunk->page_list[j]) >> shift;
for (k = 0; k < len; ++k) {
pages[i++] =
sg_dma_address(&chunk->page_list[j]) +
- (region->page_size * k);
+ (c2mr->umem->page_size * k);
}
}
}
- kva = (u64)region->virt_base;
+ kva = virt;
err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
pages,
- region->page_size,
+ c2mr->umem->page_size,
i,
- region->length,
- region->offset,
+ length,
+ c2mr->umem->offset,
&kva,
c2_convert_access(acc),
c2mr);
kfree(pages);
- if (err) {
- kfree(c2mr);
- return ERR_PTR(err);
- }
+ if (err)
+ goto err;
return &c2mr->ibmr;
err:
+ ib_umem_release(c2mr->umem);
kfree(c2mr);
return ERR_PTR(err);
}
@@ -502,8 +511,11 @@ static int c2_dereg_mr(struct ib_mr *ib_mr)
err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
if (err)
pr_debug("c2_stag_dealloc failed: %d\n", err);
- else
+ else {
+ if (mr->umem)
+ ib_umem_release(mr->umem);
kfree(mr);
+ }
return err;
}
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.h b/drivers/infiniband/hw/amso1100/c2_provider.h
index fc90622..1076df2 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.h
+++ b/drivers/infiniband/hw/amso1100/c2_provider.h
@@ -73,6 +73,7 @@ struct c2_pd {
struct c2_mr {
struct ib_mr ibmr;
struct c2_pd *pd;
+ struct ib_umem *umem;
};
struct c2_av;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index a891493..e7c2c39 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -47,6 +47,7 @@
#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
#include "cxio_hal.h"
@@ -443,6 +444,8 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr)
remove_handle(rhp, &rhp->mmidr, mmid);
if (mhp->kva)
kfree((void *) (unsigned long) mhp->kva);
+ if (mhp->umem)
+ ib_umem_release(mhp->umem);
PDBG("%s mmid 0x%x ptr %p\n", __FUNCTION__, mmid, mhp);
kfree(mhp);
return 0;
@@ -577,8 +580,8 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr,
}
-static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
- int acc, struct ib_udata *udata)
+static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
{
__be64 *pages;
int shift, n, len;
@@ -591,7 +594,6 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
struct iwch_reg_user_mr_resp uresp;
PDBG("%s ib_pd %p\n", __FUNCTION__, pd);
- shift = ffs(region->page_size) - 1;
php = to_iwch_pd(pd);
rhp = php->rhp;
@@ -599,8 +601,17 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
if (!mhp)
return ERR_PTR(-ENOMEM);
+ mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc);
+ if (IS_ERR(mhp->umem)) {
+ err = PTR_ERR(mhp->umem);
+ kfree(mhp);
+ return ERR_PTR(err);
+ }
+
+ shift = ffs(mhp->umem->page_size) - 1;
+
n = 0;
- list_for_each_entry(chunk, &region->chunk_list, list)
+ list_for_each_entry(chunk, &mhp->umem->chunk_list, list)
n += chunk->nents;
pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
@@ -611,13 +622,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
i = n = 0;
- list_for_each_entry(chunk, &region->chunk_list, list)
+ list_for_each_entry(chunk, &mhp->umem->chunk_list, list)
for (j = 0; j < chunk->nmap; ++j) {
len = sg_dma_len(&chunk->page_list[j]) >> shift;
for (k = 0; k < len; ++k) {
pages[i++] = cpu_to_be64(sg_dma_address(
&chunk->page_list[j]) +
- region->page_size * k);
+ mhp->umem->page_size * k);
}
}
@@ -625,9 +636,9 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
mhp->attr.pdid = php->pdid;
mhp->attr.zbva = 0;
mhp->attr.perms = iwch_ib_to_tpt_access(acc);
- mhp->attr.va_fbo = region->virt_base;
+ mhp->attr.va_fbo = virt;
mhp->attr.page_size = shift - 12;
- mhp->attr.len = (u32) region->length;
+ mhp->attr.len = (u32) length;
mhp->attr.pbl_size = i;
err = iwch_register_mem(rhp, php, mhp, shift, pages);
kfree(pages);
@@ -650,6 +661,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
return &mhp->ibmr;
err:
+ ib_umem_release(mhp->umem);
kfree(mhp);
return ERR_PTR(err);
}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
index 93bcc56..48833f3 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h
@@ -73,6 +73,7 @@ struct tpt_attributes {
struct iwch_mr {
struct ib_mr ibmr;
+ struct ib_umem *umem;
struct iwch_dev *rhp;
u64 kva;
struct tpt_attributes attr;
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index 10fb8fb..1d286d3 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -176,6 +176,7 @@ struct ehca_mr {
struct ib_mr ib_mr; /* must always be first in ehca_mr */
struct ib_fmr ib_fmr; /* must always be first in ehca_mr */
} ib;
+ struct ib_umem *umem;
spinlock_t mrlock;
enum ehca_mr_flag flags;
@@ -276,6 +277,7 @@ void ehca_cleanup_mrmw_cache(void);
extern spinlock_t ehca_qp_idr_lock;
extern spinlock_t ehca_cq_idr_lock;
+extern spinlock_t hcall_lock;
extern struct idr ehca_qp_idr;
extern struct idr ehca_cq_idr;
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index f284be1..100329b 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -517,12 +517,11 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq)
else {
struct ehca_cq *cq = eq->eqe_cache[i].cq;
comp_event_callback(cq);
- spin_lock_irqsave(&ehca_cq_idr_lock, flags);
+ spin_lock(&ehca_cq_idr_lock);
cq->nr_events--;
if (!cq->nr_events)
wake_up(&cq->wait_completion);
- spin_unlock_irqrestore(&ehca_cq_idr_lock,
- flags);
+ spin_unlock(&ehca_cq_idr_lock);
}
} else {
ehca_dbg(&shca->ib_device, "Got non completion event");
@@ -711,6 +710,7 @@ static void destroy_comp_task(struct ehca_comp_pool *pool,
kthread_stop(task);
}
+#ifdef CONFIG_HOTPLUG_CPU
static void take_over_work(struct ehca_comp_pool *pool,
int cpu)
{
@@ -735,7 +735,6 @@ static void take_over_work(struct ehca_comp_pool *pool,
}
-#ifdef CONFIG_HOTPLUG_CPU
static int comp_pool_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
@@ -745,6 +744,7 @@ static int comp_pool_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu);
if(!create_comp_task(pool, cpu)) {
ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
@@ -752,24 +752,29 @@ static int comp_pool_callback(struct notifier_block *nfb,
}
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
kthread_bind(cct->task, any_online_cpu(cpu_online_map));
destroy_comp_task(pool, cpu);
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
kthread_bind(cct->task, cpu);
wake_up_process(cct->task);
break;
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
break;
case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
destroy_comp_task(pool, cpu);
take_over_work(pool, cpu);
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
index e14b029..37e7fe0 100644
--- a/drivers/infiniband/hw/ehca/ehca_iverbs.h
+++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h
@@ -78,8 +78,7 @@ struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
int num_phys_buf,
int mr_access_flags, u64 *iova_start);
-struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd,
- struct ib_umem *region,
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt,
int mr_access_flags, struct ib_udata *udata);
int ehca_rereg_phys_mr(struct ib_mr *mr,
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 2d37054..c3f99f3 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -52,7 +52,7 @@
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
-MODULE_VERSION("SVNEHCA_0022");
+MODULE_VERSION("SVNEHCA_0023");
int ehca_open_aqp1 = 0;
int ehca_debug_level = 0;
@@ -62,7 +62,7 @@ int ehca_use_hp_mr = 0;
int ehca_port_act_time = 30;
int ehca_poll_all_eqs = 1;
int ehca_static_rate = -1;
-int ehca_scaling_code = 1;
+int ehca_scaling_code = 0;
module_param_named(open_aqp1, ehca_open_aqp1, int, 0);
module_param_named(debug_level, ehca_debug_level, int, 0);
@@ -98,6 +98,7 @@ MODULE_PARM_DESC(scaling_code,
spinlock_t ehca_qp_idr_lock;
spinlock_t ehca_cq_idr_lock;
+spinlock_t hcall_lock;
DEFINE_IDR(ehca_qp_idr);
DEFINE_IDR(ehca_cq_idr);
@@ -453,15 +454,14 @@ static ssize_t ehca_store_debug_level(struct device_driver *ddp,
DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
ehca_show_debug_level, ehca_store_debug_level);
-void ehca_create_driver_sysfs(struct ibmebus_driver *drv)
-{
- driver_create_file(&drv->driver, &driver_attr_debug_level);
-}
+static struct attribute *ehca_drv_attrs[] = {
+ &driver_attr_debug_level.attr,
+ NULL
+};
-void ehca_remove_driver_sysfs(struct ibmebus_driver *drv)
-{
- driver_remove_file(&drv->driver, &driver_attr_debug_level);
-}
+static struct attribute_group ehca_drv_attr_grp = {
+ .attrs = ehca_drv_attrs
+};
#define EHCA_RESOURCE_ATTR(name) \
static ssize_t ehca_show_##name(struct device *dev, \
@@ -523,44 +523,28 @@ static ssize_t ehca_show_adapter_handle(struct device *dev,
}
static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
+static struct attribute *ehca_dev_attrs[] = {
+ &dev_attr_adapter_handle.attr,
+ &dev_attr_num_ports.attr,
+ &dev_attr_hw_ver.attr,
+ &dev_attr_max_eq.attr,
+ &dev_attr_cur_eq.attr,
+ &dev_attr_max_cq.attr,
+ &dev_attr_cur_cq.attr,
+ &dev_attr_max_qp.attr,
+ &dev_attr_cur_qp.attr,
+ &dev_attr_max_mr.attr,
+ &dev_attr_cur_mr.attr,
+ &dev_attr_max_mw.attr,
+ &dev_attr_cur_mw.attr,
+ &dev_attr_max_pd.attr,
+ &dev_attr_max_ah.attr,
+ NULL
+};
-void ehca_create_device_sysfs(struct ibmebus_dev *dev)
-{
- device_create_file(&dev->ofdev.dev, &dev_attr_adapter_handle);
- device_create_file(&dev->ofdev.dev, &dev_attr_num_ports);
- device_create_file(&dev->ofdev.dev, &dev_attr_hw_ver);
- device_create_file(&dev->ofdev.dev, &dev_attr_max_eq);
- device_create_file(&dev->ofdev.dev, &dev_attr_cur_eq);
- device_create_file(&dev->ofdev.dev, &dev_attr_max_cq);
- device_create_file(&dev->ofdev.dev, &dev_attr_cur_cq);
- device_create_file(&dev->ofdev.dev, &dev_attr_max_qp);
- device_create_file(&dev->ofdev.dev, &dev_attr_cur_qp);
- device_create_file(&dev->ofdev.dev, &dev_attr_max_mr);
- device_create_file(&dev->ofdev.dev, &dev_attr_cur_mr);
- device_create_file(&dev->ofdev.dev, &dev_attr_max_mw);
- device_create_file(&dev->ofdev.dev, &dev_attr_cur_mw);
- device_create_file(&dev->ofdev.dev, &dev_attr_max_pd);
- device_create_file(&dev->ofdev.dev, &dev_attr_max_ah);
-}
-
-void ehca_remove_device_sysfs(struct ibmebus_dev *dev)
-{
- device_remove_file(&dev->ofdev.dev, &dev_attr_adapter_handle);
- device_remove_file(&dev->ofdev.dev, &dev_attr_num_ports);
- device_remove_file(&dev->ofdev.dev, &dev_attr_hw_ver);
- device_remove_file(&dev->ofdev.dev, &dev_attr_max_eq);
- device_remove_file(&dev->ofdev.dev, &dev_attr_cur_eq);
- device_remove_file(&dev->ofdev.dev, &dev_attr_max_cq);
- device_remove_file(&dev->ofdev.dev, &dev_attr_cur_cq);
- device_remove_file(&dev->ofdev.dev, &dev_attr_max_qp);
- device_remove_file(&dev->ofdev.dev, &dev_attr_cur_qp);
- device_remove_file(&dev->ofdev.dev, &dev_attr_max_mr);
- device_remove_file(&dev->ofdev.dev, &dev_attr_cur_mr);
- device_remove_file(&dev->ofdev.dev, &dev_attr_max_mw);
- device_remove_file(&dev->ofdev.dev, &dev_attr_cur_mw);
- device_remove_file(&dev->ofdev.dev, &dev_attr_max_pd);
- device_remove_file(&dev->ofdev.dev, &dev_attr_max_ah);
-}
+static struct attribute_group ehca_dev_attr_grp = {
+ .attrs = ehca_dev_attrs
+};
static int __devinit ehca_probe(struct ibmebus_dev *dev,
const struct of_device_id *id)
@@ -570,7 +554,7 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev,
struct ib_pd *ibpd;
int ret;
- handle = get_property(dev->ofdev.node, "ibm,hca-handle", NULL);
+ handle = of_get_property(dev->ofdev.node, "ibm,hca-handle", NULL);
if (!handle) {
ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
dev->ofdev.node->full_name);
@@ -668,7 +652,10 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev,
}
}
- ehca_create_device_sysfs(dev);
+ ret = sysfs_create_group(&dev->ofdev.dev.kobj, &ehca_dev_attr_grp);
+ if (ret) /* only complain; we can live without attributes */
+ ehca_err(&shca->ib_device,
+ "Cannot create device attributes ret=%d", ret);
spin_lock(&shca_list_lock);
list_add(&shca->shca_list, &shca_list);
@@ -720,7 +707,7 @@ static int __devexit ehca_remove(struct ibmebus_dev *dev)
struct ehca_shca *shca = dev->ofdev.dev.driver_data;
int ret;
- ehca_remove_device_sysfs(dev);
+ sysfs_remove_group(&dev->ofdev.dev.kobj, &ehca_dev_attr_grp);
if (ehca_open_aqp1 == 1) {
int i;
@@ -812,11 +799,12 @@ int __init ehca_module_init(void)
int ret;
printk(KERN_INFO "eHCA Infiniband Device Driver "
- "(Rel.: SVNEHCA_0022)\n");
+ "(Rel.: SVNEHCA_0023)\n");
idr_init(&ehca_qp_idr);
idr_init(&ehca_cq_idr);
spin_lock_init(&ehca_qp_idr_lock);
spin_lock_init(&ehca_cq_idr_lock);
+ spin_lock_init(&hcall_lock);
INIT_LIST_HEAD(&shca_list);
spin_lock_init(&shca_list_lock);
@@ -838,7 +826,9 @@ int __init ehca_module_init(void)
goto module_init2;
}
- ehca_create_driver_sysfs(&ehca_driver);
+ ret = sysfs_create_group(&ehca_driver.driver.kobj, &ehca_drv_attr_grp);
+ if (ret) /* only complain; we can live without attributes */
+ ehca_gen_err("Cannot create driver attributes ret=%d", ret);
if (ehca_poll_all_eqs != 1) {
ehca_gen_err("WARNING!!!");
@@ -865,7 +855,7 @@ void __exit ehca_module_exit(void)
if (ehca_poll_all_eqs == 1)
del_timer_sync(&poll_eqs_timer);
- ehca_remove_driver_sysfs(&ehca_driver);
+ sysfs_remove_group(&ehca_driver.driver.kobj, &ehca_drv_attr_grp);
ibmebus_unregister_driver(&ehca_driver);
ehca_destroy_slab_caches();
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index d22ab56..add79bd 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -39,6 +39,8 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
+#include <rdma/ib_umem.h>
+
#include <asm/current.h>
#include "ehca_iverbs.h"
@@ -238,10 +240,8 @@ reg_phys_mr_exit0:
/*----------------------------------------------------------------------*/
-struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd,
- struct ib_umem *region,
- int mr_access_flags,
- struct ib_udata *udata)
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt,
+ int mr_access_flags, struct ib_udata *udata)
{
struct ib_mr *ib_mr;
struct ehca_mr *e_mr;
@@ -257,11 +257,7 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd,
ehca_gen_err("bad pd=%p", pd);
return ERR_PTR(-EFAULT);
}
- if (!region) {
- ehca_err(pd->device, "bad input values: region=%p", region);
- ib_mr = ERR_PTR(-EINVAL);
- goto reg_user_mr_exit0;
- }
+
if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
!(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
@@ -275,17 +271,10 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd,
ib_mr = ERR_PTR(-EINVAL);
goto reg_user_mr_exit0;
}
- if (region->page_size != PAGE_SIZE) {
- ehca_err(pd->device, "page size not supported, "
- "region->page_size=%x", region->page_size);
- ib_mr = ERR_PTR(-EINVAL);
- goto reg_user_mr_exit0;
- }
- if ((region->length == 0) ||
- ((region->virt_base + region->length) < region->virt_base)) {
+ if (length == 0 || virt + length < virt) {
ehca_err(pd->device, "bad input values: length=%lx "
- "virt_base=%lx", region->length, region->virt_base);
+ "virt_base=%lx", length, virt);
ib_mr = ERR_PTR(-EINVAL);
goto reg_user_mr_exit0;
}
@@ -297,40 +286,55 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd,
goto reg_user_mr_exit0;
}
+ e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
+ mr_access_flags);
+ if (IS_ERR(e_mr->umem)) {
+ ib_mr = (void *) e_mr->umem;
+ goto reg_user_mr_exit1;
+ }
+
+ if (e_mr->umem->page_size != PAGE_SIZE) {
+ ehca_err(pd->device, "page size not supported, "
+ "e_mr->umem->page_size=%x", e_mr->umem->page_size);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_user_mr_exit2;
+ }
+
/* determine number of MR pages */
- num_pages_mr = (((region->virt_base % PAGE_SIZE) + region->length +
- PAGE_SIZE - 1) / PAGE_SIZE);
- num_pages_4k = (((region->virt_base % EHCA_PAGESIZE) + region->length +
- EHCA_PAGESIZE - 1) / EHCA_PAGESIZE);
+ num_pages_mr = (((virt % PAGE_SIZE) + length + PAGE_SIZE - 1) /
+ PAGE_SIZE);
+ num_pages_4k = (((virt % EHCA_PAGESIZE) + length + EHCA_PAGESIZE - 1) /
+ EHCA_PAGESIZE);
/* register MR on HCA */
pginfo.type = EHCA_MR_PGI_USER;
pginfo.num_pages = num_pages_mr;
pginfo.num_4k = num_pages_4k;
- pginfo.region = region;
- pginfo.next_4k = region->offset / EHCA_PAGESIZE;
+ pginfo.region = e_mr->umem;
+ pginfo.next_4k = e_mr->umem->offset / EHCA_PAGESIZE;
pginfo.next_chunk = list_prepare_entry(pginfo.next_chunk,
- (&region->chunk_list),
+ (&e_mr->umem->chunk_list),
list);
- ret = ehca_reg_mr(shca, e_mr, (u64*)region->virt_base,
- region->length, mr_access_flags, e_pd, &pginfo,
- &e_mr->ib.ib_mr.lkey, &e_mr->ib.ib_mr.rkey);
+ ret = ehca_reg_mr(shca, e_mr, (u64*) virt, length, mr_access_flags, e_pd,
+ &pginfo, &e_mr->ib.ib_mr.lkey, &e_mr->ib.ib_mr.rkey);
if (ret) {
ib_mr = ERR_PTR(ret);
- goto reg_user_mr_exit1;
+ goto reg_user_mr_exit2;
}
/* successful registration of all pages */
return &e_mr->ib.ib_mr;
+reg_user_mr_exit2:
+ ib_umem_release(e_mr->umem);
reg_user_mr_exit1:
ehca_mr_delete(e_mr);
reg_user_mr_exit0:
if (IS_ERR(ib_mr))
- ehca_err(pd->device, "rc=%lx pd=%p region=%p mr_access_flags=%x"
+ ehca_err(pd->device, "rc=%lx pd=%p mr_access_flags=%x"
" udata=%p",
- PTR_ERR(ib_mr), pd, region, mr_access_flags, udata);
+ PTR_ERR(ib_mr), pd, mr_access_flags, udata);
return ib_mr;
} /* end ehca_reg_user_mr() */
@@ -596,6 +600,9 @@ int ehca_dereg_mr(struct ib_mr *mr)
goto dereg_mr_exit0;
}
+ if (e_mr->umem)
+ ib_umem_release(e_mr->umem);
+
/* successful deregistration */
ehca_mr_delete(e_mr);
@@ -2043,13 +2050,10 @@ int ehca_mrmw_map_hrc_alloc(const u64 hipz_rc)
switch (hipz_rc) {
case H_SUCCESS: /* successful completion */
return 0;
- case H_ADAPTER_PARM: /* invalid adapter handle */
- case H_RT_PARM: /* invalid resource type */
case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
- case H_MLENGTH_PARM: /* invalid memory length */
- case H_MEM_ACCESS_PARM: /* invalid access controls */
case H_CONSTRAINED: /* resource constraint */
- return -EINVAL;
+ case H_NO_MEM:
+ return -ENOMEM;
case H_BUSY: /* long busy */
return -EBUSY;
default:
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index df0516f..b5bc787 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -523,6 +523,8 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd,
goto create_qp_exit1;
}
+ my_qp->ib_qp.qp_num = my_qp->real_qp_num;
+
switch (init_attr->qp_type) {
case IB_QPT_RC:
if (isdaqp == 0) {
@@ -568,7 +570,7 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd,
parms.act_nr_recv_wqes = init_attr->cap.max_recv_wr;
parms.act_nr_send_sges = init_attr->cap.max_send_sge;
parms.act_nr_recv_sges = init_attr->cap.max_recv_sge;
- my_qp->real_qp_num =
+ my_qp->ib_qp.qp_num =
(init_attr->qp_type == IB_QPT_SMI) ? 0 : 1;
}
@@ -595,7 +597,6 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd,
my_qp->ib_qp.recv_cq = init_attr->recv_cq;
my_qp->ib_qp.send_cq = init_attr->send_cq;
- my_qp->ib_qp.qp_num = my_qp->real_qp_num;
my_qp->ib_qp.qp_type = init_attr->qp_type;
my_qp->qp_type = init_attr->qp_type;
@@ -968,17 +969,21 @@ static int internal_modify_qp(struct ib_qp *ibqp,
((ehca_mult - 1) / ah_mult) : 0;
else
mqpcb->max_static_rate = 0;
-
update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
/*
+ * Always supply the GRH flag, even if it's zero, to give the
+ * hypervisor a clear "yes" or "no" instead of a "perhaps"
+ */
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+
+ /*
* only if GRH is TRUE we might consider SOURCE_GID_IDX
* and DEST_GID otherwise phype will return H_ATTR_PARM!!!
*/
if (attr->ah_attr.ah_flags == IB_AH_GRH) {
- mqpcb->send_grh_flag = 1 << 31;
- update_mask |=
- EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+ mqpcb->send_grh_flag = 1;
+
mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
update_mask |=
EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
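
The reordering above separates the field's valid bit from its value: the SEND_GRH_FLAG mask bit is now set unconditionally so the hypervisor gets an explicit yes/no, and send_grh_flag itself carries the answer (also corrected from 1 << 31 to 1). The idiom, with hypothetical names:

        /* hypothetical names; shows only the value-plus-valid-bit idiom */
        ctrl->send_grh_flag = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
        update_mask |= SEND_GRH_FLAG_VALID;     /* always mark the field as meaningful */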
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
index b564fcd..5766ae3 100644
--- a/drivers/infiniband/hw/ehca/hcp_if.c
+++ b/drivers/infiniband/hw/ehca/hcp_if.c
@@ -154,7 +154,8 @@ static long ehca_plpar_hcall9(unsigned long opcode,
unsigned long arg9)
{
long ret;
- int i, sleep_msecs;
+ int i, sleep_msecs, lock_is_set = 0;
+ unsigned long flags;
ehca_gen_dbg("opcode=%lx arg1=%lx arg2=%lx arg3=%lx arg4=%lx "
"arg5=%lx arg6=%lx arg7=%lx arg8=%lx arg9=%lx",
@@ -162,10 +163,18 @@ static long ehca_plpar_hcall9(unsigned long opcode,
arg8, arg9);
for (i = 0; i < 5; i++) {
+ if ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5)) {
+ spin_lock_irqsave(&hcall_lock, flags);
+ lock_is_set = 1;
+ }
+
ret = plpar_hcall9(opcode, outs,
arg1, arg2, arg3, arg4, arg5,
arg6, arg7, arg8, arg9);
+ if (lock_is_set)
+ spin_unlock_irqrestore(&hcall_lock, flags);
+
if (H_IS_LONG_BUSY(ret)) {
sleep_msecs = get_longbusy_msecs(ret);
msleep_interruptible(sleep_msecs);
@@ -193,11 +202,11 @@ static long ehca_plpar_hcall9(unsigned long opcode,
opcode, ret, outs[0], outs[1], outs[2], outs[3],
outs[4], outs[5], outs[6], outs[7], outs[8]);
return ret;
-
}
return H_BUSY;
}
+
u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
struct ehca_pfeq *pfeq,
const u32 neq_control,
@@ -322,7 +331,7 @@ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
0);
qp->ipz_qp_handle.handle = outs[0];
qp->real_qp_num = (u32)outs[1];
- parms->act_nr_send_sges =
+ parms->act_nr_send_wqes =
(u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
parms->act_nr_recv_wqes =
(u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
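
ehca_plpar_hcall9() above now serializes QP-allocation hcalls with hcall_lock while keeping its bounded retry on long-busy returns. A sketch of that loop shape, with do_hcall() as a hypothetical stand-in for plpar_hcall9():

        for (i = 0; i < 5; i++) {
                int locked = 0;

                if (opcode == H_ALLOC_RESOURCE && arg2 == 5) {  /* QP allocation only */
                        spin_lock_irqsave(&hcall_lock, flags);
                        locked = 1;
                }

                ret = do_hcall(opcode);         /* stand-in for plpar_hcall9() */

                if (locked)
                        spin_unlock_irqrestore(&hcall_lock, flags);

                if (H_IS_LONG_BUSY(ret)) {
                        msleep_interruptible(get_longbusy_msecs(ret));
                        continue;
                }
                return ret;
        }
        return H_BUSY;                          /* gave up after five attempts */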
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index 036ed1e..ebd5c7b 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -523,7 +523,7 @@ static int ipathfs_fill_super(struct super_block *sb, void *data,
int ret;
static struct tree_descr files[] = {
- [1] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
+ [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
{""},
};
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c
index 1b9c308..4e2e3df 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba6120.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c
@@ -747,7 +747,6 @@ static void ipath_pe_quiet_serdes(struct ipath_devdata *dd)
static int ipath_pe_intconfig(struct ipath_devdata *dd)
{
- u64 val;
u32 chiprev;
/*
@@ -760,9 +759,9 @@ static int ipath_pe_intconfig(struct ipath_devdata *dd)
if ((chiprev & INFINIPATH_R_CHIPREVMINOR_MASK) > 1) {
/* Rev2+ reports extra errors via internal GPIO pins */
dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
- val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
- val |= IPATH_GPIO_ERRINTR_MASK;
- ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val);
+ dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
}
return 0;
}
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
index 45d0331..a90d3b5 100644
--- a/drivers/infiniband/hw/ipath/ipath_intr.c
+++ b/drivers/infiniband/hw/ipath/ipath_intr.c
@@ -1056,7 +1056,7 @@ irqreturn_t ipath_intr(int irq, void *data)
gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
chk0rcv = 1;
}
- if (unlikely(gpiostatus)) {
+ if (gpiostatus) {
/*
* Some unexpected bits remain. If they could have
* caused the interrupt, complain and clear.
@@ -1065,9 +1065,8 @@ irqreturn_t ipath_intr(int irq, void *data)
* GPIO interrupts, possibly on a "three strikes"
* basis.
*/
- u32 mask;
- mask = ipath_read_kreg32(
- dd, dd->ipath_kregs->kr_gpio_mask);
+ const u32 mask = (u32) dd->ipath_gpio_mask;
+
if (mask & gpiostatus) {
ipath_dbg("Unexpected GPIO IRQ bits %x\n",
gpiostatus & mask);
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
index e900c25..12194f3 100644
--- a/drivers/infiniband/hw/ipath/ipath_kernel.h
+++ b/drivers/infiniband/hw/ipath/ipath_kernel.h
@@ -397,6 +397,8 @@ struct ipath_devdata {
unsigned long ipath_pioavailshadow[8];
/* shadow of kr_gpio_out, for rmw ops */
u64 ipath_gpio_out;
+ /* shadow the gpio mask register */
+ u64 ipath_gpio_mask;
/* kr_revision shadow */
u64 ipath_revision;
/*
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
index 31e7073..bdeef8d 100644
--- a/drivers/infiniband/hw/ipath/ipath_mr.c
+++ b/drivers/infiniband/hw/ipath/ipath_mr.c
@@ -31,6 +31,7 @@
* SOFTWARE.
*/
+#include <rdma/ib_umem.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_smi.h>
@@ -147,6 +148,7 @@ struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
mr->mr.offset = 0;
mr->mr.access_flags = acc;
mr->mr.max_segs = num_phys_buf;
+ mr->umem = NULL;
m = 0;
n = 0;
@@ -170,46 +172,56 @@ bail:
/**
* ipath_reg_user_mr - register a userspace memory region
* @pd: protection domain for this memory region
- * @region: the user memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @virt_addr: virtual address to use (from HCA's point of view)
* @mr_access_flags: access flags for this memory region
* @udata: unused by the InfiniPath driver
*
* Returns the memory region on success, otherwise returns an errno.
*/
-struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
- int mr_access_flags, struct ib_udata *udata)
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata)
{
struct ipath_mr *mr;
+ struct ib_umem *umem;
struct ib_umem_chunk *chunk;
int n, m, i;
struct ib_mr *ret;
- if (region->length == 0) {
+ if (length == 0) {
ret = ERR_PTR(-EINVAL);
goto bail;
}
+ umem = ib_umem_get(pd->uobject->context, start, length, mr_access_flags);
+ if (IS_ERR(umem))
+ return (void *) umem;
+
n = 0;
- list_for_each_entry(chunk, &region->chunk_list, list)
+ list_for_each_entry(chunk, &umem->chunk_list, list)
n += chunk->nents;
mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
if (!mr) {
ret = ERR_PTR(-ENOMEM);
+ ib_umem_release(umem);
goto bail;
}
mr->mr.pd = pd;
- mr->mr.user_base = region->user_base;
- mr->mr.iova = region->virt_base;
- mr->mr.length = region->length;
- mr->mr.offset = region->offset;
+ mr->mr.user_base = start;
+ mr->mr.iova = virt_addr;
+ mr->mr.length = length;
+ mr->mr.offset = umem->offset;
mr->mr.access_flags = mr_access_flags;
mr->mr.max_segs = n;
+ mr->umem = umem;
m = 0;
n = 0;
- list_for_each_entry(chunk, &region->chunk_list, list) {
+ list_for_each_entry(chunk, &umem->chunk_list, list) {
for (i = 0; i < chunk->nents; i++) {
void *vaddr;
@@ -219,7 +231,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
goto bail;
}
mr->mr.map[m]->segs[n].vaddr = vaddr;
- mr->mr.map[m]->segs[n].length = region->page_size;
+ mr->mr.map[m]->segs[n].length = umem->page_size;
n++;
if (n == IPATH_SEGSZ) {
m++;
@@ -253,6 +265,10 @@ int ipath_dereg_mr(struct ib_mr *ibmr)
i--;
kfree(mr->mr.map[i]);
}
+
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+
kfree(mr);
return 0;
}
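
ipath_reg_user_mr() now pins the region itself and walks the umem's chunk list to size and fill its segment table. The counting walk, lifted into a helper for clarity (this chunk_list layout is specific to the ib_umem of this era):

static int umem_page_count(struct ib_umem *umem)
{
        struct ib_umem_chunk *chunk;
        int n = 0;

        /* each chunk covers chunk->nents pinned pages */
        list_for_each_entry(chunk, &umem->chunk_list, list)
                n += chunk->nents;

        return n;
}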
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 12933e7..bb70845 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -1387,13 +1387,12 @@ static int enable_timer(struct ipath_devdata *dd)
* processing.
*/
if (dd->ipath_flags & IPATH_GPIO_INTR) {
- u64 val;
ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
0x2074076542310ULL);
/* Enable GPIO bit 2 interrupt */
- val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
- val |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
- ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val);
+ dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
}
init_timer(&dd->verbs_timer);
@@ -1412,8 +1411,9 @@ static int disable_timer(struct ipath_devdata *dd)
u64 val;
/* Disable GPIO bit 2 interrupt */
val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
- val &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
- ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val);
+ dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
/*
* We might want to undo changes to debugportselect,
* but how?
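
The ipath hunks here and in ipath_iba6120.c/ipath_intr.c replace read-modify-write of the GPIO mask register with a software shadow (dd->ipath_gpio_mask), so enable and disable paths never depend on reading the register back. A toy userspace model of the shadow idea; names and bit values are illustrative only:

#include <stdint.h>
#include <stdio.h>

static uint64_t chip_reg;                               /* pretend hardware register */

static void write_kreg(uint64_t v) { chip_reg = v; }    /* ~ ipath_write_kreg() */

int main(void)
{
        uint64_t shadow = 0;                    /* ~ dd->ipath_gpio_mask */

        shadow |= 1ull << 2;                    /* enable port0 GPIO interrupt */
        write_kreg(shadow);

        shadow |= 0xf0;                         /* enable error interrupt bits */
        write_kreg(shadow);

        shadow &= ~(1ull << 2);                 /* disable port0 again, no readback */
        write_kreg(shadow);

        printf("register now %#llx\n", (unsigned long long) chip_reg);
        return 0;
}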
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
index 7064fc2..088b837 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -251,6 +251,7 @@ struct ipath_sge {
/* Memory region */
struct ipath_mr {
struct ib_mr ibmr;
+ struct ib_umem *umem;
struct ipath_mregion mr; /* must be last */
};
@@ -751,8 +752,8 @@ struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
struct ib_phys_buf *buffer_list,
int num_phys_buf, int acc, u64 *iova_start);
-struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
- int mr_access_flags,
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
struct ib_udata *udata);
int ipath_dereg_mr(struct ib_mr *ibmr);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
index 085e28b..dd691cf 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
@@ -165,10 +165,9 @@ static int ipath_mcast_add(struct ipath_ibdev *dev,
{
struct rb_node **n = &mcast_tree.rb_node;
struct rb_node *pn = NULL;
- unsigned long flags;
int ret;
- spin_lock_irqsave(&mcast_lock, flags);
+ spin_lock_irq(&mcast_lock);
while (*n) {
struct ipath_mcast *tmcast;
@@ -228,7 +227,7 @@ static int ipath_mcast_add(struct ipath_ibdev *dev,
ret = 0;
bail:
- spin_unlock_irqrestore(&mcast_lock, flags);
+ spin_unlock_irq(&mcast_lock);
return ret;
}
@@ -289,17 +288,16 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
struct ipath_mcast *mcast = NULL;
struct ipath_mcast_qp *p, *tmp;
struct rb_node *n;
- unsigned long flags;
int last = 0;
int ret;
- spin_lock_irqsave(&mcast_lock, flags);
+ spin_lock_irq(&mcast_lock);
/* Find the GID in the mcast table. */
n = mcast_tree.rb_node;
while (1) {
if (n == NULL) {
- spin_unlock_irqrestore(&mcast_lock, flags);
+ spin_unlock_irq(&mcast_lock);
ret = -EINVAL;
goto bail;
}
@@ -334,7 +332,7 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
break;
}
- spin_unlock_irqrestore(&mcast_lock, flags);
+ spin_unlock_irq(&mcast_lock);
if (p) {
/*
@@ -348,9 +346,9 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
atomic_dec(&mcast->refcount);
wait_event(mcast->wait, !atomic_read(&mcast->refcount));
ipath_mcast_free(mcast);
- spin_lock(&dev->n_mcast_grps_lock);
+ spin_lock_irq(&dev->n_mcast_grps_lock);
dev->n_mcast_grps_allocated--;
- spin_unlock(&dev->n_mcast_grps_lock);
+ spin_unlock_irq(&dev->n_mcast_grps_lock);
}
ret = 0;
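
The mcast hunks switch from spin_lock_irqsave() to spin_lock_irq(): these paths run only in process context with interrupts enabled, so saving and restoring the caller's IRQ state is unnecessary. Side by side:

        unsigned long flags;

        spin_lock_irqsave(&mcast_lock, flags);  /* before: preserves caller's IRQ state */
        spin_unlock_irqrestore(&mcast_lock, flags);

        spin_lock_irq(&mcast_lock);             /* after: caller known to have IRQs on */
        spin_unlock_irq(&mcast_lock);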
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
new file mode 100644
index 0000000..b8912cd
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -0,0 +1,9 @@
+config MLX4_INFINIBAND
+ tristate "Mellanox ConnectX HCA support"
+ depends on INFINIBAND
+ select MLX4_CORE
+ ---help---
+ This driver provides low-level InfiniBand support for
+ Mellanox ConnectX PCI Express host channel adapters (HCAs).
+ This is required to use InfiniBand protocols such as
+ IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
new file mode 100644
index 0000000..70f09c7
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o
+
+mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
new file mode 100644
index 0000000..c75ac94
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+ struct mlx4_ib_ah *ah;
+
+ ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ memset(&ah->av, 0, sizeof ah->av);
+
+ ah->av.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+ ah->av.g_slid = ah_attr->src_path_bits;
+ ah->av.dlid = cpu_to_be16(ah_attr->dlid);
+ if (ah_attr->static_rate) {
+ ah->av.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (ah->av.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << ah->av.stat_rate & dev->caps.stat_rate_support))
+ --ah->av.stat_rate;
+ }
+ ah->av.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ ah->av.g_slid |= 0x80;
+ ah->av.gid_index = ah_attr->grh.sgid_index;
+ ah->av.hop_limit = ah_attr->grh.hop_limit;
+ ah->av.sl_tclass_flowlabel |=
+ cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+ ah_attr->grh.flow_label);
+ memcpy(ah->av.dgid, ah_attr->grh.dgid.raw, 16);
+ }
+
+ return &ah->ibah;
+}
+
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct mlx4_ib_ah *ah = to_mah(ibah);
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->dlid = be16_to_cpu(ah->av.dlid);
+ ah_attr->sl = be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
+ ah_attr->port_num = be32_to_cpu(ah->av.port_pd) >> 24;
+ if (ah->av.stat_rate)
+ ah_attr->static_rate = ah->av.stat_rate - MLX4_STAT_RATE_OFFSET;
+ ah_attr->src_path_bits = ah->av.g_slid & 0x7F;
+
+ if (mlx4_ib_ah_grh_present(ah)) {
+ ah_attr->ah_flags = IB_AH_GRH;
+
+ ah_attr->grh.traffic_class =
+ be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20;
+ ah_attr->grh.flow_label =
+ be32_to_cpu(ah->av.sl_tclass_flowlabel) & 0xfffff;
+ ah_attr->grh.hop_limit = ah->av.hop_limit;
+ ah_attr->grh.sgid_index = ah->av.gid_index;
+ memcpy(ah_attr->grh.dgid.raw, ah->av.dgid, 16);
+ }
+
+ return 0;
+}
+
+int mlx4_ib_destroy_ah(struct ib_ah *ah)
+{
+ kfree(to_mah(ah));
+ return 0;
+}
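
mlx4_ib_create_ah() above clamps the requested static rate down to the nearest rate the HCA reports in caps.stat_rate_support. A standalone model of that loop; the offset and rate codes here are illustrative, not the real constants:

#include <stdio.h>

#define STAT_RATE_OFFSET 5      /* illustrative stand-in for MLX4_STAT_RATE_OFFSET */
#define RATE_2_5_GBPS    2      /* illustrative stand-in for IB_RATE_2_5_GBPS */

/* walk down from the requested rate until the support bitmask says yes */
static int clamp_rate(int ib_rate, unsigned int supported)
{
        int r = ib_rate + STAT_RATE_OFFSET;

        while (r > RATE_2_5_GBPS + STAT_RATE_OFFSET && !((1u << r) & supported))
                --r;

        return r;
}

int main(void)
{
        unsigned int supported = (1u << 9) | (1u << 7); /* pretend two rates work */

        printf("rate 6 -> hw code %d\n", clamp_rate(6, supported));
        printf("rate 3 -> hw code %d\n", clamp_rate(3, supported));
        return 0;
}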
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
new file mode 100644
index 0000000..660b27a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+ struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+ ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_cq *ibcq;
+
+ if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+ printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+ "on CQ %06x\n", type, cq->cqn);
+ return;
+ }
+
+ ibcq = &to_mibcq(cq)->ibcq;
+ if (ibcq->event_handler) {
+ event.device = ibcq->device;
+ event.event = IB_EVENT_CQ_ERR;
+ event.element.cq = ibcq;
+ ibcq->event_handler(&event, ibcq->cq_context);
+ }
+}
+
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+ int offset = n * sizeof (struct mlx4_cqe);
+
+ if (buf->buf.nbufs == 1)
+ return buf->buf.u.direct.buf + offset;
+ else
+ return buf->buf.u.page_list[offset >> PAGE_SHIFT].buf +
+ (offset & (PAGE_SIZE - 1));
+}
+
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+ return get_cqe_from_buf(&cq->buf, n);
+}
+
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+ struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+
+ return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+ return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_cq *cq;
+ struct mlx4_uar *uar;
+ int buf_size;
+ int err;
+
+ if (entries < 1 || entries > dev->dev->caps.max_cqes)
+ return ERR_PTR(-EINVAL);
+
+ cq = kmalloc(sizeof *cq, GFP_KERNEL);
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+
+ entries = roundup_pow_of_two(entries + 1);
+ cq->ibcq.cqe = entries - 1;
+ buf_size = entries * sizeof (struct mlx4_cqe);
+ spin_lock_init(&cq->lock);
+
+ if (context) {
+ struct mlx4_ib_create_cq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_cq;
+ }
+
+ cq->umem = ib_umem_get(context, ucmd.buf_addr, buf_size,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(cq->umem)) {
+ err = PTR_ERR(cq->umem);
+ goto err_cq;
+ }
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(cq->umem),
+ ilog2(cq->umem->page_size), &cq->buf.mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &cq->buf.mtt, cq->umem);
+ if (err)
+ goto err_mtt;
+
+ err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+ &cq->db);
+ if (err)
+ goto err_mtt;
+
+ uar = &to_mucontext(context)->uar;
+ } else {
+ err = mlx4_ib_db_alloc(dev, &cq->db, 1);
+ if (err)
+ goto err_cq;
+
+ cq->mcq.set_ci_db = cq->db.db;
+ cq->mcq.arm_db = cq->db.db + 1;
+ *cq->mcq.set_ci_db = 0;
+ *cq->mcq.arm_db = 0;
+
+ if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &cq->buf.buf)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ err = mlx4_mtt_init(dev->dev, cq->buf.buf.npages, cq->buf.buf.page_shift,
+ &cq->buf.mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &cq->buf.mtt, &cq->buf.buf);
+ if (err)
+ goto err_mtt;
+
+ uar = &dev->priv_uar;
+ }
+
+ err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
+ cq->db.dma, &cq->mcq);
+ if (err)
+ goto err_dbmap;
+
+ cq->mcq.comp = mlx4_ib_cq_comp;
+ cq->mcq.event = mlx4_ib_cq_event;
+
+ if (context)
+ if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
+ err = -EFAULT;
+ goto err_dbmap;
+ }
+
+ return &cq->ibcq;
+
+err_dbmap:
+ if (context)
+ mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+
+err_buf:
+ if (context)
+ ib_umem_release(cq->umem);
+ else
+ mlx4_buf_free(dev->dev, entries * sizeof (struct mlx4_cqe),
+ &cq->buf.buf);
+
+err_db:
+ if (!context)
+ mlx4_ib_db_free(dev, &cq->db);
+
+err_cq:
+ kfree(cq);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_destroy_cq(struct ib_cq *cq)
+{
+ struct mlx4_ib_dev *dev = to_mdev(cq->device);
+ struct mlx4_ib_cq *mcq = to_mcq(cq);
+
+ mlx4_cq_free(dev->dev, &mcq->mcq);
+ mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
+
+ if (cq->uobject) {
+ mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
+ ib_umem_release(mcq->umem);
+ } else {
+ mlx4_buf_free(dev->dev, (cq->cqe + 1) * sizeof (struct mlx4_cqe),
+ &mcq->buf.buf);
+ mlx4_ib_db_free(dev, &mcq->db);
+ }
+
+ kfree(mcq);
+
+ return 0;
+}
+
+static void dump_cqe(void *cqe)
+{
+ __be32 *buf = cqe;
+
+ printk(KERN_DEBUG "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]),
+ be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]),
+ be32_to_cpu(buf[6]), be32_to_cpu(buf[7]));
+}
+
+static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe,
+ struct ib_wc *wc)
+{
+ if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) {
+ printk(KERN_DEBUG "local QP operation err "
+ "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+ "opcode = %02x)\n",
+ be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
+ cqe->vendor_err_syndrome,
+ cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ dump_cqe(cqe);
+ }
+
+ switch (cqe->syndrome) {
+ case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+ wc->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+ wc->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+ wc->status = IB_WC_MW_BIND_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+ wc->status = IB_WC_BAD_RESP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+ wc->status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+ wc->status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+ wc->status = IB_WC_REM_OP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+ wc->status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+ wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+ wc->status = IB_WC_REM_ABORT_ERR;
+ break;
+ default:
+ wc->status = IB_WC_GENERAL_ERR;
+ break;
+ }
+
+ wc->vendor_err = cqe->vendor_err_syndrome;
+}
+
+static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
+ struct mlx4_ib_qp **cur_qp,
+ struct ib_wc *wc)
+{
+ struct mlx4_cqe *cqe;
+ struct mlx4_qp *mqp;
+ struct mlx4_ib_wq *wq;
+ struct mlx4_ib_srq *srq;
+ int is_send;
+ int is_error;
+ u16 wqe_ctr;
+
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return -EAGAIN;
+
+ ++cq->mcq.cons_index;
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR;
+
+ if (!*cur_qp ||
+ (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
+ be32_to_cpu(cqe->my_qpn));
+ if (unlikely(!mqp)) {
+ printk(KERN_WARNING "CQ %06x with entry for unknown QPN %06x\n",
+ cq->mcq.cqn, be32_to_cpu(cqe->my_qpn) & 0xffffff);
+ return -EINVAL;
+ }
+
+ *cur_qp = to_mibqp(mqp);
+ }
+
+ wc->qp = &(*cur_qp)->ibqp;
+
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ } else if ((*cur_qp)->ibqp.srq) {
+ srq = to_msrq((*cur_qp)->ibqp.srq);
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+ } else {
+ wq = &(*cur_qp)->rq;
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ }
+
+ if (unlikely(is_error)) {
+ mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+ return 0;
+ }
+
+ wc->status = IB_WC_SUCCESS;
+
+ if (is_send) {
+ wc->wc_flags = 0;
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ case MLX4_OPCODE_RDMA_WRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case MLX4_OPCODE_SEND_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ case MLX4_OPCODE_SEND:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case MLX4_OPCODE_RDMA_READ:
+ wc->opcode = IB_WC_RDMA_READ;
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+ break;
+ case MLX4_OPCODE_ATOMIC_CS:
+ wc->opcode = IB_WC_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_ATOMIC_FA:
+ wc->opcode = IB_WC_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_BIND_MW:
+ wc->opcode = IB_WC_BIND_MW;
+ break;
+ }
+ } else {
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->imm_data = cqe->immed_rss_invalid;
+ break;
+ case MLX4_RECV_OPCODE_SEND:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = 0;
+ break;
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->imm_data = cqe->immed_rss_invalid;
+ break;
+ }
+
+ wc->slid = be16_to_cpu(cqe->rlid);
+ wc->sl = cqe->sl >> 4;
+ wc->src_qp = be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff;
+ wc->dlid_path_bits = (be32_to_cpu(cqe->g_mlpath_rqpn) >> 24) & 0x7f;
+ wc->wc_flags |= be32_to_cpu(cqe->g_mlpath_rqpn) & 0x80000000 ?
+ IB_WC_GRH : 0;
+ wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) >> 16;
+ }
+
+ return 0;
+}
+
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ struct mlx4_ib_qp *cur_qp = NULL;
+ unsigned long flags;
+ int npolled;
+ int err = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+ err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
+ if (err)
+ break;
+ }
+
+ if (npolled)
+ mlx4_cq_set_ci(&cq->mcq);
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (err == 0 || err == -EAGAIN)
+ return npolled;
+ else
+ return err;
+}
+
+int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ mlx4_cq_arm(&to_mcq(ibcq)->mcq,
+ (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
+ to_mdev(ibcq->device)->uar_map,
+ MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock));
+
+ return 0;
+}
+
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+ u32 prod_index;
+ int nfreed = 0;
+ struct mlx4_cqe *cqe, *dest;
+ u8 owner_bit;
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+ if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+ break;
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+ if ((be32_to_cpu(cqe->my_qpn) & 0xffffff) == qpn) {
+ if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+ mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
+ ++nfreed;
+ } else if (nfreed) {
+ dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+ owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+ memcpy(dest, cqe, sizeof *cqe);
+ dest->owner_sr_opcode = owner_bit |
+ (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ }
+ }
+
+ if (nfreed) {
+ cq->mcq.cons_index += nfreed;
+ /*
+ * Make sure update of buffer contents is done before
+ * updating consumer index.
+ */
+ wmb();
+ mlx4_cq_set_ci(&cq->mcq);
+ }
+}
+
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+ spin_lock_irq(&cq->lock);
+ __mlx4_ib_cq_clean(cq, qpn, srq);
+ spin_unlock_irq(&cq->lock);
+}
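
get_sw_cqe() above decides whether a CQE has been written by hardware by comparing the CQE's owner bit with the wrap ("lap") parity of the consumer index; because the buffer size is a power of two, n & (cqe + 1) extracts exactly that parity bit. A toy model of the test:

#include <stdio.h>

#define CQ_SIZE 8                       /* power of two, as mlx4 rounds up to */

static int cqe_owner[CQ_SIZE];          /* owner bit stored in each toy CQE */

/* entry is software-owned when its owner bit matches the wrap parity */
static int cqe_is_sw(unsigned int cons_index)
{
        int stored = cqe_owner[cons_index & (CQ_SIZE - 1)];
        int parity = !!(cons_index & CQ_SIZE);

        return stored == parity;
}

int main(void)
{
        unsigned int ci;

        cqe_owner[0] = cqe_owner[1] = cqe_owner[2] = 0; /* written this lap */
        cqe_owner[3] = 1;                               /* stale, still HW-owned */

        for (ci = 0; ci < 4; ci++)
                printf("index %u: %s\n", ci,
                       cqe_is_sw(ci) ? "valid CQE" : "owned by HW");
        return 0;
}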
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
new file mode 100644
index 0000000..1c36087
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
+struct mlx4_ib_db_pgdir {
+ struct list_head list;
+ DECLARE_BITMAP(order0, MLX4_IB_DB_PER_PAGE);
+ DECLARE_BITMAP(order1, MLX4_IB_DB_PER_PAGE / 2);
+ unsigned long *bits[2];
+ __be32 *db_page;
+ dma_addr_t db_dma;
+};
+
+static struct mlx4_ib_db_pgdir *mlx4_ib_alloc_db_pgdir(struct mlx4_ib_dev *dev)
+{
+ struct mlx4_ib_db_pgdir *pgdir;
+
+ pgdir = kzalloc(sizeof *pgdir, GFP_KERNEL);
+ if (!pgdir)
+ return NULL;
+
+ bitmap_fill(pgdir->order1, MLX4_IB_DB_PER_PAGE / 2);
+ pgdir->bits[0] = pgdir->order0;
+ pgdir->bits[1] = pgdir->order1;
+ pgdir->db_page = dma_alloc_coherent(dev->ib_dev.dma_device,
+ PAGE_SIZE, &pgdir->db_dma,
+ GFP_KERNEL);
+ if (!pgdir->db_page) {
+ kfree(pgdir);
+ return NULL;
+ }
+
+ return pgdir;
+}
+
+static int mlx4_ib_alloc_db_from_pgdir(struct mlx4_ib_db_pgdir *pgdir,
+ struct mlx4_ib_db *db, int order)
+{
+ int o;
+ int i;
+
+ for (o = order; o <= 1; ++o) {
+ i = find_first_bit(pgdir->bits[o], MLX4_IB_DB_PER_PAGE >> o);
+ if (i < MLX4_IB_DB_PER_PAGE >> o)
+ goto found;
+ }
+
+ return -ENOMEM;
+
+found:
+ clear_bit(i, pgdir->bits[o]);
+
+ i <<= o;
+
+ if (o > order)
+ set_bit(i ^ 1, pgdir->bits[order]);
+
+ db->u.pgdir = pgdir;
+ db->index = i;
+ db->db = pgdir->db_page + db->index;
+ db->dma = pgdir->db_dma + db->index * 4;
+ db->order = order;
+
+ return 0;
+}
+
+int mlx4_ib_db_alloc(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db, int order)
+{
+ struct mlx4_ib_db_pgdir *pgdir;
+ int ret = 0;
+
+ mutex_lock(&dev->pgdir_mutex);
+
+ list_for_each_entry(pgdir, &dev->pgdir_list, list)
+ if (!mlx4_ib_alloc_db_from_pgdir(pgdir, db, order))
+ goto out;
+
+ pgdir = mlx4_ib_alloc_db_pgdir(dev);
+ if (!pgdir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ list_add(&pgdir->list, &dev->pgdir_list);
+
+ /* This should never fail -- we just allocated an empty page: */
+ WARN_ON(mlx4_ib_alloc_db_from_pgdir(pgdir, db, order));
+
+out:
+ mutex_unlock(&dev->pgdir_mutex);
+
+ return ret;
+}
+
+void mlx4_ib_db_free(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db)
+{
+ int o;
+ int i;
+
+ mutex_lock(&dev->pgdir_mutex);
+
+ o = db->order;
+ i = db->index;
+
+ if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) {
+ clear_bit(i ^ 1, db->u.pgdir->order0);
+ ++o;
+ }
+
+ i >>= o;
+ set_bit(i, db->u.pgdir->bits[o]);
+
+ if (bitmap_full(db->u.pgdir->order1, MLX4_IB_DB_PER_PAGE / 2)) {
+ dma_free_coherent(dev->ib_dev.dma_device, PAGE_SIZE,
+ db->u.pgdir->db_page, db->u.pgdir->db_dma);
+ list_del(&db->u.pgdir->list);
+ kfree(db->u.pgdir);
+ }
+
+ mutex_unlock(&dev->pgdir_mutex);
+}
+
+struct mlx4_ib_user_db_page {
+ struct list_head list;
+ struct ib_umem *umem;
+ unsigned long user_virt;
+ int refcnt;
+};
+
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+ struct mlx4_ib_db *db)
+{
+ struct mlx4_ib_user_db_page *page;
+ struct ib_umem_chunk *chunk;
+ int err = 0;
+
+ mutex_lock(&context->db_page_mutex);
+
+ list_for_each_entry(page, &context->db_page_list, list)
+ if (page->user_virt == (virt & PAGE_MASK))
+ goto found;
+
+ page = kmalloc(sizeof *page, GFP_KERNEL);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ page->user_virt = (virt & PAGE_MASK);
+ page->refcnt = 0;
+ page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+ PAGE_SIZE, 0);
+ if (IS_ERR(page->umem)) {
+ err = PTR_ERR(page->umem);
+ kfree(page);
+ goto out;
+ }
+
+ list_add(&page->list, &context->db_page_list);
+
+found:
+ chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list);
+ db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK);
+ db->u.user_page = page;
+ ++page->refcnt;
+
+out:
+ mutex_unlock(&context->db_page_mutex);
+
+ return err;
+}
+
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_ib_db *db)
+{
+ mutex_lock(&context->db_page_mutex);
+
+ if (!--db->u.user_page->refcnt) {
+ list_del(&db->u.user_page->list);
+ ib_umem_release(db->u.user_page->umem);
+ kfree(db->u.user_page);
+ }
+
+ mutex_unlock(&context->db_page_mutex);
+}
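
The doorbell page directory above is a small two-level allocator: each DMA page holds MLX4_IB_DB_PER_PAGE records, order-0 requests take one record, and order-1 requests take a naturally aligned pair (a CQ's set_ci and arm doorbells). When an order-0 request has to split a free pair, the buddy record goes back to the order-0 bitmap. A standalone model with made-up sizes:

#include <stdio.h>
#include <string.h>

#define DB_PER_PAGE 16                  /* illustrative; not the driver's value */

static unsigned char order0[DB_PER_PAGE];       /* 1 = free single record */
static unsigned char order1[DB_PER_PAGE / 2];   /* 1 = free aligned pair */

static int alloc_db(int order)
{
        int o, i;

        for (o = order; o <= 1; ++o) {
                unsigned char *bits = o ? order1 : order0;
                int n = DB_PER_PAGE >> o;

                for (i = 0; i < n; ++i)
                        if (bits[i])
                                goto found;
        }
        return -1;                      /* page full; driver allocates a new pgdir */

found:
        (o ? order1 : order0)[i] = 0;
        i <<= o;                        /* convert to a record index */
        if (o > order)
                order0[i ^ 1] = 1;      /* hand the buddy back as a free single */
        return i;
}

int main(void)
{
        memset(order1, 1, sizeof order1);       /* fresh page: every pair free */

        printf("pair   -> %d\n", alloc_db(1));  /* e.g. CQ set_ci + arm */
        printf("single -> %d\n", alloc_db(0));
        printf("single -> %d\n", alloc_db(0));
        return 0;
}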
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
new file mode 100644
index 0000000..3330917
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+
+enum {
+ MLX4_IB_VENDOR_CLASS1 = 0x9,
+ MLX4_IB_VENDOR_CLASS2 = 0xa
+};
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad)
+{
+ struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+ void *inbox;
+ int err;
+ u32 in_modifier = port;
+ u8 op_modifier = 0;
+
+ inmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(inmailbox))
+ return PTR_ERR(inmailbox);
+ inbox = inmailbox->buf;
+
+ outmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(outmailbox)) {
+ mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+ return PTR_ERR(outmailbox);
+ }
+
+ memcpy(inbox, in_mad, 256);
+
+ /*
+ * Key check traps can't be generated unless we have in_wc to
+ * tell us where to send the trap.
+ */
+ if (ignore_mkey || !in_wc)
+ op_modifier |= 0x1;
+ if (ignore_bkey || !in_wc)
+ op_modifier |= 0x2;
+
+ if (in_wc) {
+ struct {
+ __be32 my_qpn;
+ u32 reserved1;
+ __be32 rqpn;
+ u8 sl;
+ u8 g_path;
+ u16 reserved2[2];
+ __be16 pkey;
+ u32 reserved3[11];
+ u8 grh[40];
+ } *ext_info;
+
+ memset(inbox + 256, 0, 256);
+ ext_info = inbox + 256;
+
+ ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num);
+ ext_info->rqpn = cpu_to_be32(in_wc->src_qp);
+ ext_info->sl = in_wc->sl << 4;
+ ext_info->g_path = in_wc->dlid_path_bits |
+ (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+ ext_info->pkey = cpu_to_be16(in_wc->pkey_index);
+
+ if (in_grh)
+ memcpy(ext_info->grh, in_grh, 40);
+
+ op_modifier |= 0x4;
+
+ in_modifier |= in_wc->slid << 16;
+ }
+
+ err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma,
+ in_modifier, op_modifier,
+ MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+
+ if (!err)
+ memcpy(response_mad, outmailbox->buf, 256);
+
+ mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+ mlx4_free_cmd_mailbox(dev->dev, outmailbox);
+
+ return err;
+}
+
+static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
+{
+ struct ib_ah *new_ah;
+ struct ib_ah_attr ah_attr;
+
+ if (!dev->send_agent[port_num - 1][0])
+ return;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = lid;
+ ah_attr.sl = sl;
+ ah_attr.port_num = port_num;
+
+ new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+ &ah_attr);
+ if (IS_ERR(new_ah))
+ return;
+
+ spin_lock(&dev->sm_lock);
+ if (dev->sm_ah[port_num - 1])
+ ib_destroy_ah(dev->sm_ah[port_num - 1]);
+ dev->sm_ah[port_num - 1] = new_ah;
+ spin_unlock(&dev->sm_lock);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad)
+{
+ struct ib_event event;
+
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_SET) {
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+ struct ib_port_info *pinfo =
+ (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+
+ update_sm_ah(to_mdev(ibdev), port_num,
+ be16_to_cpu(pinfo->sm_lid),
+ pinfo->neighbormtu_mastersmsl & 0xf);
+
+ event.device = ibdev;
+ event.element.port_num = port_num;
+
+ if (pinfo->clientrereg_resv_subnetto & 0x80)
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ else
+ event.event = IB_EVENT_LID_CHANGE;
+
+ ib_dispatch_event(&event);
+ }
+
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+ event.device = ibdev;
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.element.port_num = port_num;
+ ib_dispatch_event(&event);
+ }
+ }
+}
+
+static void node_desc_override(struct ib_device *dev,
+ struct ib_mad *mad)
+{
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+ mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+ spin_lock(&to_mdev(dev)->sm_lock);
+ memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+ spin_unlock(&to_mdev(dev)->sm_lock);
+ }
+}
+
+static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad)
+{
+ int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+ int ret;
+
+ if (agent) {
+ send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+ IB_MGMT_MAD_DATA, GFP_ATOMIC);
+ /*
+ * We rely here on the fact that MLX QPs don't use the
+ * address handle after the send is posted (this is
+ * wrong following the IB spec strictly, but we know
+ * it's OK for our devices).
+ */
+ spin_lock(&dev->sm_lock);
+ memcpy(send_buf->mad, mad, sizeof *mad);
+ if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+ ret = ib_post_send_mad(send_buf, NULL);
+ else
+ ret = -EINVAL;
+ spin_unlock(&dev->sm_lock);
+
+ if (ret)
+ ib_free_send_mad(send_buf);
+ }
+}
+
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ u16 slid;
+ int err;
+
+ slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
+ forward_trap(to_mdev(ibdev), port_num, in_mad);
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ }
+
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /*
+ * Don't process SMInfo queries or vendor-specific
+ * MADs -- the SMA can't handle them.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+ ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+ IB_SMP_ATTR_VENDOR_MASK))
+ return IB_MAD_RESULT_SUCCESS;
+ } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 ||
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } else
+ return IB_MAD_RESULT_SUCCESS;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev),
+ mad_flags & IB_MAD_IGNORE_MKEY,
+ mad_flags & IB_MAD_IGNORE_BKEY,
+ port_num, in_wc, in_grh, in_mad, out_mad);
+ if (err)
+ return IB_MAD_RESULT_FAILURE;
+
+ if (!out_mad->mad_hdr.status) {
+ smp_snoop(ibdev, port_num, in_mad);
+ node_desc_override(ibdev, out_mad);
+ }
+
+ /* set return bit in status of directed route responses */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+ int ret;
+
+ for (p = 0; p < dev->dev->caps.num_ports; ++p)
+ for (q = 0; q <= 1; ++q) {
+ agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+ q ? IB_QPT_GSI : IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+ dev->send_agent[p][q] = agent;
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dev->dev->caps.num_ports; ++p)
+ for (q = 0; q <= 1; ++q)
+ if (dev->send_agent[p][q])
+ ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+ return ret;
+}
+
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+
+ for (p = 0; p < dev->dev->caps.num_ports; ++p) {
+ for (q = 0; q <= 1; ++q) {
+ agent = dev->send_agent[p][q];
+ dev->send_agent[p][q] = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+
+ if (dev->sm_ah[p])
+ ib_destroy_ah(dev->sm_ah[p]);
+ }
+}
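
mlx4_MAD_IFC() above packs three decisions into op_modifier: skip the M_Key check, skip the B_Key check, and whether the extra work-completion/GRH information was copied into the mailbox (needed so firmware can generate key-violation traps). A small standalone rendering of that flag composition:

#include <stdio.h>

static unsigned int build_op_modifier(int ignore_mkey, int ignore_bkey,
                                      int have_wc)
{
        unsigned int op = 0;

        if (ignore_mkey || !have_wc)
                op |= 0x1;              /* no M_Key check (or no way to trap) */
        if (ignore_bkey || !have_wc)
                op |= 0x2;              /* no B_Key check (or no way to trap) */
        if (have_wc)
                op |= 0x4;              /* mailbox carries wc/GRH info */

        return op;
}

int main(void)
{
        printf("locally built query, no wc: %#x\n", build_op_modifier(0, 0, 0));
        printf("snooped MAD with wc       : %#x\n", build_op_modifier(0, 0, 1));
        return 0;
}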
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
new file mode 100644
index 0000000..c591616
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -0,0 +1,658 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+#define DRV_NAME "mlx4_ib"
+#define DRV_VERSION "0.01"
+#define DRV_RELDATE "May 1, 2006"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static const char mlx4_ib_version[] __devinitdata =
+ DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static void init_query_mad(struct ib_smp *mad)
+{
+ mad->base_version = 1;
+ mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ mad->class_version = 1;
+ mad->method = IB_MGMT_METHOD_GET;
+}
+
+static int mlx4_ib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ props->fw_ver = dev->dev->caps.fw_ver;
+ props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM)
+ props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
+ props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+ props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+ 0xffffff;
+ props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30));
+ props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
+
+ props->max_mr_size = ~0ull;
+ props->page_size_cap = dev->dev->caps.page_size_cap;
+ props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps;
+ props->max_qp_wr = dev->dev->caps.max_wqes;
+ props->max_sge = min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg);
+ props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs;
+ props->max_cqe = dev->dev->caps.max_cqes;
+ props->max_mr = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws;
+ props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
+ props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma;
+ props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
+ props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
+ props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs;
+ props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1;
+ props->max_srq_sge = dev->dev->caps.max_srq_sge;
+ props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay;
+ props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+ props->max_pkeys = dev->dev->caps.pkey_table_len[1];
+ props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
+ props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+ props->max_map_per_fmr = (1 << (32 - ilog2(dev->dev->caps.num_mpts))) - 1;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return err;
+}
+
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16));
+ props->lmc = out_mad->data[34] & 0x7;
+ props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18));
+ props->sm_sl = out_mad->data[36] & 0xf;
+ props->state = out_mad->data[32] & 0xf;
+ props->phys_state = out_mad->data[33] >> 4;
+ props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20));
+ props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
+ props->max_msg_sz = 0x80000000;
+ props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port];
+ props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46));
+ props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48));
+ props->active_width = out_mad->data[31] & 0xf;
+ props->active_speed = out_mad->data[35] >> 4;
+ props->max_mtu = out_mad->data[41] & 0xf;
+ props->active_mtu = out_mad->data[36] >> 4;
+ props->subnet_timeout = out_mad->data[51] & 0x1f;
+ props->max_vl_num = out_mad->data[37] >> 4;
+ props->init_type_reply = out_mad->data[41] >> 4;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return err;
+}
+
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw, out_mad->data + 8, 8);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(index / 8);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
+ in_mad->attr_mod = cpu_to_be32(index / 32);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
+ struct ib_device_modify *props)
+{
+ if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+ return -EOPNOTSUPP;
+
+ if (mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ spin_lock(&to_mdev(ibdev)->sm_lock);
+ memcpy(ibdev->node_desc, props->node_desc, 64);
+ spin_unlock(&to_mdev(ibdev)->sm_lock);
+ }
+
+ return 0;
+}
+
+static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
+ u32 cap_mask)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ memset(mailbox->buf, 0, 256);
+
+ if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+ *(u8 *) mailbox->buf = !!reset_qkey_viols << 6;
+ ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask);
+ } else {
+ ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols;
+ ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask);
+ }
+
+ err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B);
+
+ mlx4_free_cmd_mailbox(dev->dev, mailbox);
+ return err;
+}
+
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+ struct ib_port_modify *props)
+{
+ struct ib_port_attr attr;
+ u32 cap_mask;
+ int err;
+
+ mutex_lock(&to_mdev(ibdev)->cap_mask_mutex);
+
+ err = mlx4_ib_query_port(ibdev, port, &attr);
+ if (err)
+ goto out;
+
+ cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+ ~props->clr_port_cap_mask;
+
+ err = mlx4_SET_PORT(to_mdev(ibdev), port,
+ !!(mask & IB_PORT_RESET_QKEY_CNTR),
+ cap_mask);
+
+out:
+ mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+ return err;
+}
+
+static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_ucontext *context;
+ struct mlx4_ib_alloc_ucontext_resp resp;
+ int err;
+
+ resp.qp_tab_size = dev->dev->caps.num_qps;
+ resp.bf_reg_size = dev->dev->caps.bf_reg_size;
+ resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
+ if (err) {
+ kfree(context);
+ return ERR_PTR(err);
+ }
+
+ INIT_LIST_HEAD(&context->db_page_list);
+ mutex_init(&context->db_page_mutex);
+
+ err = ib_copy_to_udata(udata, &resp, sizeof resp);
+ if (err) {
+ mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
+ kfree(context);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return &context->ibucontext;
+}
+
+static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+ struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+
+ mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
+ kfree(context);
+
+ return 0;
+}
+
+static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct mlx4_ib_dev *dev = to_mdev(context->device);
+
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ if (vma->vm_pgoff == 0) {
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ to_mucontext(context)->uar.pfn,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+ } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
+ /* FIXME want pgprot_writecombine() for BlueFlame pages */
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ to_mucontext(context)->uar.pfn +
+ dev->dev->caps.num_uars,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+ } else
+ return -EINVAL;
+
+ return 0;
+}
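For orientation, a hypothetical userspace sketch of how these two mmap offsets could be exercised: page offset 0 maps the UAR doorbell page, page offset 1 the BlueFlame page. The device path, the bare open(), and mapping without first allocating a verbs context are assumptions made purely for illustration; real applications go through libibverbs/libmlx4 rather than mapping these pages themselves.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long ps = sysconf(_SC_PAGESIZE);
	/* Hypothetical: a uverbs fd would normally be set up by libibverbs. */
	int fd = open("/dev/infiniband/uverbs0", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* page offset 0 -> UAR doorbell page */
	void *uar = mmap(NULL, ps, PROT_WRITE, MAP_SHARED, fd, 0);
	/* page offset 1 -> BlueFlame page (only offered when bf_reg_size != 0) */
	void *bf  = mmap(NULL, ps, PROT_WRITE, MAP_SHARED, fd, ps);

	printf("uar=%p bf=%p\n", uar, bf);
	return 0;
}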
+
+static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_pd *pd;
+ int err;
+
+ pd = kmalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ if (context)
+ if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
+ mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+ kfree(pd);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return &pd->ibpd;
+}
+
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+{
+ mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
+ kfree(pd);
+
+ return 0;
+}
+
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ return mlx4_multicast_attach(to_mdev(ibqp->device)->dev,
+ &to_mqp(ibqp)->mqp, gid->raw);
+}
+
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ return mlx4_multicast_detach(to_mdev(ibqp->device)->dev,
+ &to_mqp(ibqp)->mqp, gid->raw);
+}
+
+static int init_node_data(struct mlx4_ib_dev *dev)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+ err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static void *mlx4_ib_add(struct mlx4_dev *dev)
+{
+ struct mlx4_ib_dev *ibdev;
+
+ ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+ if (!ibdev) {
+ dev_err(&dev->pdev->dev, "Device struct alloc failed\n");
+ return NULL;
+ }
+
+ if (mlx4_pd_alloc(dev, &ibdev->priv_pdn))
+ goto err_dealloc;
+
+ if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
+ goto err_pd;
+
+ ibdev->uar_map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!ibdev->uar_map)
+ goto err_uar;
+ MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
+
+ INIT_LIST_HEAD(&ibdev->pgdir_list);
+ mutex_init(&ibdev->pgdir_mutex);
+
+ ibdev->dev = dev;
+
+ strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
+ ibdev->ib_dev.owner = THIS_MODULE;
+ ibdev->ib_dev.node_type = RDMA_NODE_IB_CA;
+ ibdev->ib_dev.phys_port_cnt = dev->caps.num_ports;
+ ibdev->ib_dev.num_comp_vectors = 1;
+ ibdev->ib_dev.dma_device = &dev->pdev->dev;
+
+ ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
+ ibdev->ib_dev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+ ibdev->ib_dev.query_device = mlx4_ib_query_device;
+ ibdev->ib_dev.query_port = mlx4_ib_query_port;
+ ibdev->ib_dev.query_gid = mlx4_ib_query_gid;
+ ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey;
+ ibdev->ib_dev.modify_device = mlx4_ib_modify_device;
+ ibdev->ib_dev.modify_port = mlx4_ib_modify_port;
+ ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext;
+ ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext;
+ ibdev->ib_dev.mmap = mlx4_ib_mmap;
+ ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd;
+ ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd;
+ ibdev->ib_dev.create_ah = mlx4_ib_create_ah;
+ ibdev->ib_dev.query_ah = mlx4_ib_query_ah;
+ ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah;
+ ibdev->ib_dev.create_srq = mlx4_ib_create_srq;
+ ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq;
+ ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq;
+ ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv;
+ ibdev->ib_dev.create_qp = mlx4_ib_create_qp;
+ ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp;
+ ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp;
+ ibdev->ib_dev.post_send = mlx4_ib_post_send;
+ ibdev->ib_dev.post_recv = mlx4_ib_post_recv;
+ ibdev->ib_dev.create_cq = mlx4_ib_create_cq;
+ ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq;
+ ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq;
+ ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq;
+ ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr;
+ ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr;
+ ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr;
+ ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach;
+ ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
+ ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
+
+ if (init_node_data(ibdev))
+ goto err_map;
+
+ spin_lock_init(&ibdev->sm_lock);
+ mutex_init(&ibdev->cap_mask_mutex);
+
+ if (ib_register_device(&ibdev->ib_dev))
+ goto err_map;
+
+ if (mlx4_ib_mad_init(ibdev))
+ goto err_reg;
+
+ return ibdev;
+
+err_reg:
+ ib_unregister_device(&ibdev->ib_dev);
+
+err_map:
+ iounmap(ibdev->uar_map);
+
+err_uar:
+ mlx4_uar_free(dev, &ibdev->priv_uar);
+
+err_pd:
+ mlx4_pd_free(dev, ibdev->priv_pdn);
+
+err_dealloc:
+ ib_dealloc_device(&ibdev->ib_dev);
+
+ return NULL;
+}
+
+static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
+{
+ struct mlx4_ib_dev *ibdev = ibdev_ptr;
+ int p;
+
+ for (p = 1; p <= dev->caps.num_ports; ++p)
+ mlx4_CLOSE_PORT(dev, p);
+
+ mlx4_ib_mad_cleanup(ibdev);
+ ib_unregister_device(&ibdev->ib_dev);
+ iounmap(ibdev->uar_map);
+ mlx4_uar_free(dev, &ibdev->priv_uar);
+ mlx4_pd_free(dev, ibdev->priv_pdn);
+ ib_dealloc_device(&ibdev->ib_dev);
+}
+
+static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
+ enum mlx4_dev_event event, int subtype,
+ int port)
+{
+ struct ib_event ibev;
+
+ switch (event) {
+ case MLX4_EVENT_TYPE_PORT_CHANGE:
+ ibev.event = subtype == MLX4_PORT_CHANGE_SUBTYPE_ACTIVE ?
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+ break;
+
+ case MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR:
+ ibev.event = IB_EVENT_DEVICE_FATAL;
+ break;
+
+ default:
+ return;
+ }
+
+ ibev.device = ibdev_ptr;
+ ibev.element.port_num = port;
+
+ ib_dispatch_event(&ibev);
+}
+
+static struct mlx4_interface mlx4_ib_interface = {
+ .add = mlx4_ib_add,
+ .remove = mlx4_ib_remove,
+ .event = mlx4_ib_event
+};
+
+static int __init mlx4_ib_init(void)
+{
+ return mlx4_register_interface(&mlx4_ib_interface);
+}
+
+static void __exit mlx4_ib_cleanup(void)
+{
+ mlx4_unregister_interface(&mlx4_ib_interface);
+}
+
+module_init(mlx4_ib_init);
+module_exit(mlx4_ib_cleanup);
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
new file mode 100644
index 0000000..24ccadd
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_H
+#define MLX4_IB_H
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+enum {
+ MLX4_IB_DB_PER_PAGE = PAGE_SIZE / 4
+};
+
+struct mlx4_ib_db_pgdir;
+struct mlx4_ib_user_db_page;
+
+struct mlx4_ib_db {
+ __be32 *db;
+ union {
+ struct mlx4_ib_db_pgdir *pgdir;
+ struct mlx4_ib_user_db_page *user_page;
+ } u;
+ dma_addr_t dma;
+ int index;
+ int order;
+};
+
+struct mlx4_ib_ucontext {
+ struct ib_ucontext ibucontext;
+ struct mlx4_uar uar;
+ struct list_head db_page_list;
+ struct mutex db_page_mutex;
+};
+
+struct mlx4_ib_pd {
+ struct ib_pd ibpd;
+ u32 pdn;
+};
+
+struct mlx4_ib_cq_buf {
+ struct mlx4_buf buf;
+ struct mlx4_mtt mtt;
+};
+
+struct mlx4_ib_cq {
+ struct ib_cq ibcq;
+ struct mlx4_cq mcq;
+ struct mlx4_ib_cq_buf buf;
+ struct mlx4_ib_db db;
+ spinlock_t lock;
+ struct ib_umem *umem;
+};
+
+struct mlx4_ib_mr {
+ struct ib_mr ibmr;
+ struct mlx4_mr mmr;
+ struct ib_umem *umem;
+};
+
+struct mlx4_ib_wq {
+ u64 *wrid;
+ spinlock_t lock;
+ int wqe_cnt;
+ int max_post;
+ int max_gs;
+ int offset;
+ int wqe_shift;
+ unsigned head;
+ unsigned tail;
+};
+
+struct mlx4_ib_qp {
+ struct ib_qp ibqp;
+ struct mlx4_qp mqp;
+ struct mlx4_buf buf;
+
+ struct mlx4_ib_db db;
+ struct mlx4_ib_wq rq;
+
+ u32 doorbell_qpn;
+ __be32 sq_signal_bits;
+ int sq_spare_wqes;
+ struct mlx4_ib_wq sq;
+
+ struct ib_umem *umem;
+ struct mlx4_mtt mtt;
+ int buf_size;
+ struct mutex mutex;
+ u8 port;
+ u8 alt_port;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+ u8 sq_no_prefetch;
+ u8 state;
+};
+
+struct mlx4_ib_srq {
+ struct ib_srq ibsrq;
+ struct mlx4_srq msrq;
+ struct mlx4_buf buf;
+ struct mlx4_ib_db db;
+ u64 *wrid;
+ spinlock_t lock;
+ int head;
+ int tail;
+ u16 wqe_ctr;
+ struct ib_umem *umem;
+ struct mlx4_mtt mtt;
+ struct mutex mutex;
+};
+
+struct mlx4_ib_ah {
+ struct ib_ah ibah;
+ struct mlx4_av av;
+};
+
+struct mlx4_ib_dev {
+ struct ib_device ib_dev;
+ struct mlx4_dev *dev;
+ void __iomem *uar_map;
+
+ struct list_head pgdir_list;
+ struct mutex pgdir_mutex;
+
+ struct mlx4_uar priv_uar;
+ u32 priv_pdn;
+ MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
+
+ struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2];
+ struct ib_ah *sm_ah[MLX4_MAX_PORTS];
+ spinlock_t sm_lock;
+
+ struct mutex cap_mask_mutex;
+};
+
+static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
+}
+
+static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext);
+}
+
+static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct mlx4_ib_pd, ibpd);
+}
+
+static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct mlx4_ib_cq, ibcq);
+}
+
+static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq)
+{
+ return container_of(mcq, struct mlx4_ib_cq, mcq);
+}
+
+static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct mlx4_ib_mr, ibmr);
+}
+
+static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct mlx4_ib_qp, ibqp);
+}
+
+static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp)
+{
+ return container_of(mqp, struct mlx4_ib_qp, mqp);
+}
+
+static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct mlx4_ib_srq, ibsrq);
+}
+
+static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq)
+{
+ return container_of(msrq, struct mlx4_ib_srq, msrq);
+}
+
+static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct mlx4_ib_ah, ibah);
+}
+
+int mlx4_ib_db_alloc(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db, int order);
+void mlx4_ib_db_free(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db);
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+ struct mlx4_ib_db *db);
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_ib_db *db);
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+ struct ib_umem *umem);
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata);
+int mlx4_ib_dereg_mr(struct ib_mr *mr);
+
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx4_ib_destroy_ah(struct ib_ah *ah);
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad);
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad);
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
+
+static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+{
+ return !!(ah->av.g_slid & 0x80);
+}
+
+#endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
new file mode 100644
index 0000000..85ae906
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+
+static u32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) |
+ MLX4_PERM_LOCAL_READ;
+}
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct mlx4_ib_mr *mr;
+ int err;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
+ ~0ull, convert_access(acc), 0, 0, &mr->mmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_mr:
+ mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+ struct ib_umem *umem)
+{
+ u64 *pages;
+ struct ib_umem_chunk *chunk;
+ int i, j, k;
+ int n;
+ int len;
+ int err = 0;
+
+ pages = (u64 *) __get_free_page(GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ i = n = 0;
+
+ list_for_each_entry(chunk, &umem->chunk_list, list)
+ for (j = 0; j < chunk->nmap; ++j) {
+ len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] = sg_dma_address(&chunk->page_list[j]) +
+ umem->page_size * k;
+ /*
+ * Be friendly to WRITE_MTT firmware
+ * command, and pass it chunks of
+ * appropriate size.
+ */
+ if (i == PAGE_SIZE / sizeof (u64) - 2) {
+ err = mlx4_write_mtt(dev->dev, mtt, n,
+ i, pages);
+ if (err)
+ goto out;
+ n += i;
+ i = 0;
+ }
+ }
+ }
+
+ if (i)
+ err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+
+out:
+ free_page((unsigned long) pages);
+ return err;
+}
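A minimal userspace sketch of the chunking arithmetic above, assuming 4 KiB pages and 8-byte MTT entries; flush() is a hypothetical stand-in for mlx4_write_mtt(). Matching the "- 2" in the loop above, each firmware call is handed at most 4096/8 - 2 = 510 entries.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096
/* Matches "PAGE_SIZE / sizeof (u64) - 2" in mlx4_ib_umem_write_mtt(): 510 */
#define MTT_CHUNK ((int)(PAGE_SIZE / sizeof(uint64_t)) - 2)

static void flush(int start, int count)  /* hypothetical stand-in for mlx4_write_mtt() */
{
	printf("WRITE_MTT entries [%d, %d)\n", start, start + count);
}

int main(void)
{
	int npages = 1200, i = 0, n = 0, k;

	for (k = 0; k < npages; ++k) {
		++i;                    /* pages[i++] = DMA address of page k */
		if (i == MTT_CHUNK) {   /* pass firmware a full chunk */
			flush(n, i);
			n += i;
			i = 0;
		}
	}
	if (i)                          /* pass the remainder */
		flush(n, i);
	return 0;
}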
+
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mr *mr;
+ int shift;
+ int err;
+ int n;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ mr->umem = ib_umem_get(pd->uobject->context, start, length, access_flags);
+ if (IS_ERR(mr->umem)) {
+ err = PTR_ERR(mr->umem);
+ goto err_free;
+ }
+
+ n = ib_umem_page_count(mr->umem);
+ shift = ilog2(mr->umem->page_size);
+
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
+ convert_access(access_flags), n, shift, &mr->mmr);
+ if (err)
+ goto err_umem;
+
+ err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
+ if (err)
+ goto err_mr;
+
+ err = mlx4_mr_enable(dev->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+
+ return &mr->ibmr;
+
+err_mr:
+ mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_umem:
+ ib_umem_release(mr->umem);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+{
+ struct mlx4_ib_mr *mr = to_mmr(ibmr);
+
+ mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
new file mode 100644
index 0000000..28a08bd
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -0,0 +1,1457 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+enum {
+ MLX4_IB_ACK_REQ_FREQ = 8,
+};
+
+enum {
+ MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83,
+ MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f
+};
+
+enum {
+ /*
+ * Largest possible UD header: send with GRH and immediate data.
+ */
+ MLX4_IB_UD_HEADER_SIZE = 72
+};
+
+struct mlx4_ib_sqp {
+ struct mlx4_ib_qp qp;
+ int pkey_index;
+ u32 qkey;
+ u32 send_psn;
+ struct ib_ud_header ud_header;
+ u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+static const __be32 mlx4_ib_opcode[] = {
+ [IB_WR_SEND] = __constant_cpu_to_be32(MLX4_OPCODE_SEND),
+ [IB_WR_SEND_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+ [IB_WR_RDMA_WRITE] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+ [IB_WR_RDMA_WRITE_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+ [IB_WR_RDMA_READ] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+ [IB_WR_ATOMIC_CMP_AND_SWP] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+};
+
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+ return container_of(mqp, struct mlx4_ib_sqp, qp);
+}
+
+static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
+ qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
+}
+
+static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
+ qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
+}
+
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+ if (qp->buf.nbufs == 1)
+ return qp->buf.u.direct.buf + offset;
+ else
+ return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
+ (offset & (PAGE_SIZE - 1));
+}
+
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
+}
+
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with 0xffffffff, except for
+ * the very first chunk of the WQE.
+ */
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ u32 *wqe = get_send_wqe(qp, n);
+ int i;
+
+ for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
+ wqe[i] = 0xffffffff;
+}
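A standalone sketch of the stamping pattern above, assuming a 256-byte WQE (wqe_shift == 8): the WQE is viewed as 32-bit words, a stride of 16 words is one 64-byte chunk, and the loop starts at word 16 so the very first chunk is skipped; byte offsets 64, 128 and 192 end up stamped.

#include <stdio.h>
#include <stdint.h>

#define WQE_SHIFT 8                              /* assumed: 256-byte WQE */

int main(void)
{
	uint32_t wqe[1 << (WQE_SHIFT - 2)] = { 0 };  /* WQE as 32-bit words */
	int i;

	/* Same loop shape as stamp_send_wqe(): skip the first 64-byte chunk. */
	for (i = 16; i < 1 << (WQE_SHIFT - 2); i += 16)
		wqe[i] = 0xffffffff;

	for (i = 0; i < 1 << (WQE_SHIFT - 2); ++i)
		if (wqe[i] == 0xffffffff)
			printf("stamped byte offset %d\n", i * 4);  /* 64, 128, 192 */
	return 0;
}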
+
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+
+ if (type == MLX4_EVENT_TYPE_PATH_MIG)
+ to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+ if (ibqp->event_handler) {
+ event.device = ibqp->device;
+ event.element.qp = ibqp;
+ switch (type) {
+ case MLX4_EVENT_TYPE_PATH_MIG:
+ event.event = IB_EVENT_PATH_MIG;
+ break;
+ case MLX4_EVENT_TYPE_COMM_EST:
+ event.event = IB_EVENT_COMM_EST;
+ break;
+ case MLX4_EVENT_TYPE_SQ_DRAINED:
+ event.event = IB_EVENT_SQ_DRAINED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+ event.event = IB_EVENT_QP_FATAL;
+ break;
+ case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+ event.event = IB_EVENT_PATH_MIG_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ event.event = IB_EVENT_QP_REQ_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+ event.event = IB_EVENT_QP_ACCESS_ERR;
+ break;
+ default:
+ printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+ "on QP %06x\n", type, qp->qpn);
+ return;
+ }
+
+ ibqp->event_handler(&event, ibqp->qp_context);
+ }
+}
+
+static int send_wqe_overhead(enum ib_qp_type type)
+{
+ /*
+ * UD WQEs must have a datagram segment.
+ * RC and UC WQEs might have a remote address segment.
+ * MLX WQEs need two extra inline data segments (for the UD
+ * header and space for the ICRC).
+ */
+ switch (type) {
+ case IB_QPT_UD:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg);
+ case IB_QPT_UC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case IB_QPT_RC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ ALIGN(MLX4_IB_UD_HEADER_SIZE +
+ DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
+ MLX4_INLINE_ALIGN) *
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg)) +
+ ALIGN(4 +
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg));
+ default:
+ return sizeof (struct mlx4_wqe_ctrl_seg);
+ }
+}
+
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ int is_user, int has_srq, struct mlx4_ib_qp *qp)
+{
+ /* Sanity check RQ size before proceeding */
+ if (cap->max_recv_wr > dev->dev->caps.max_wqes ||
+ cap->max_recv_sge > dev->dev->caps.max_rq_sg)
+ return -EINVAL;
+
+ if (has_srq) {
+ /* QPs attached to an SRQ should have no RQ */
+ if (cap->max_recv_wr)
+ return -EINVAL;
+
+ qp->rq.wqe_cnt = qp->rq.max_gs = 0;
+ } else {
+ /* HW requires >= 1 RQ entry with >= 1 gather entry */
+ if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
+ return -EINVAL;
+
+ qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr));
+ qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+ qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
+ }
+
+ cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt;
+ cap->max_recv_sge = qp->rq.max_gs;
+
+ return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ enum ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+ /* Sanity check SQ size before proceeding */
+ if (cap->max_send_wr > dev->dev->caps.max_wqes ||
+ cap->max_send_sge > dev->dev->caps.max_sq_sg ||
+ cap->max_inline_data + send_wqe_overhead(type) +
+ sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ /*
+ * For MLX transport we need 2 extra S/G entries:
+ * one for the header and one for the checksum at the end
+ */
+ if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
+ cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
+ return -EINVAL;
+
+ qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
+ sizeof (struct mlx4_wqe_data_seg),
+ cap->max_inline_data +
+ sizeof (struct mlx4_wqe_inline_seg)) +
+ send_wqe_overhead(type)));
+ qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
+ sizeof (struct mlx4_wqe_data_seg);
+
+ /*
+ * We need to leave 2 KB + 1 WQE of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+ qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.offset = 0;
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ } else {
+ qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+ qp->sq.offset = 0;
+ }
+
+ cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+ cap->max_send_sge = qp->sq.max_gs;
+ /* We don't support inline sends for kernel QPs (yet) */
+ cap->max_inline_data = 0;
+
+ return 0;
+}
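To make the headroom arithmetic concrete, a small sketch assuming 64-byte send WQEs (wqe_shift == 6) and a requested max_send_wr of 100: the reserve is (2048 >> 6) + 1 = 33 spare WQEs, the ring rounds 100 + 33 up to 256 entries, and max_post comes back as 223.

#include <stdio.h>

/* Userspace stand-in for the kernel's roundup_pow_of_two(). */
static unsigned roundup_pow2(unsigned v)
{
	unsigned r = 1;
	while (r < v)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned wqe_shift = 6;                    /* assumed: 64-byte WQE */
	unsigned max_send_wr = 100;                /* assumed: consumer request */

	unsigned spare = (2048 >> wqe_shift) + 1;  /* 2 KB + 1 WQE of headroom */
	unsigned wqe_cnt = roundup_pow2(max_send_wr + spare);
	unsigned max_post = wqe_cnt - spare;

	printf("spare=%u wqe_cnt=%u max_post=%u\n", spare, wqe_cnt, max_post);
	return 0;
}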
+
+static int set_user_sq_size(struct mlx4_ib_qp *qp,
+ struct mlx4_ib_create_qp *ucmd)
+{
+ qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count;
+ qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+ return 0;
+}
+
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+{
+ int err;
+
+ mutex_init(&qp->mutex);
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+
+ qp->state = IB_QPS_RESET;
+ qp->atomic_rd_en = 0;
+ qp->resp_depth = 0;
+
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+
+ err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
+ if (err)
+ goto err;
+
+ if (pd->uobject) {
+ struct mlx4_ib_create_qp ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err;
+ }
+
+ qp->sq_no_prefetch = ucmd.sq_no_prefetch;
+
+ err = set_user_sq_size(qp, &ucmd);
+ if (err)
+ goto err;
+
+ qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+ qp->buf_size, 0);
+ if (IS_ERR(qp->umem)) {
+ err = PTR_ERR(qp->umem);
+ goto err;
+ }
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
+ ilog2(qp->umem->page_size), &qp->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+ if (err)
+ goto err_mtt;
+
+ if (!init_attr->srq) {
+ err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+ ucmd.db_addr, &qp->db);
+ if (err)
+ goto err_mtt;
+ }
+ } else {
+ qp->sq_no_prefetch = 0;
+
+ err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+ if (err)
+ goto err;
+
+ if (!init_attr->srq) {
+ err = mlx4_ib_db_alloc(dev, &qp->db, 0);
+ if (err)
+ goto err;
+
+ *qp->db.db = 0;
+ }
+
+ if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+ &qp->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
+ if (err)
+ goto err_mtt;
+
+ qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
+ qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);
+
+ if (!qp->sq.wrid || !qp->rq.wrid) {
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ }
+
+ err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp);
+ if (err)
+ goto err_wrid;
+
+ /*
+ * Hardware wants QPN written in big-endian order (after
+ * shifting) for send doorbell. Precompute this value to save
+ * a little bit when posting sends.
+ */
+ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+ else
+ qp->sq_signal_bits = 0;
+
+ qp->mqp.event = mlx4_ib_qp_event;
+
+ return 0;
+
+err_wrid:
+ if (pd->uobject && !init_attr->srq)
+ mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
+ else {
+ kfree(qp->sq.wrid);
+ kfree(qp->rq.wrid);
+ }
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+ if (pd->uobject)
+ ib_umem_release(qp->umem);
+ else
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+ if (!pd->uobject && !init_attr->srq)
+ mlx4_ib_db_free(dev, &qp->db);
+
+err:
+ return err;
+}
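A small byte-order sketch for the doorbell_qpn precomputation above, assuming a little-endian host (where swab32() has the same effect as cpu_to_be32()): for QPN 0x000123 the stored word is 0x00230100, whose in-memory bytes are 00 01 23 00, i.e. the shifted QPN laid out in big-endian order as the comment describes.

#include <stdio.h>
#include <stdint.h>

static uint32_t swab32(uint32_t x)              /* userspace stand-in */
{
	return (x >> 24) | ((x >> 8) & 0xff00) |
	       ((x << 8) & 0xff0000) | (x << 24);
}

int main(void)
{
	uint32_t qpn = 0x000123;                    /* assumed example QPN */
	uint32_t doorbell = swab32(qpn << 8);
	unsigned char *b = (unsigned char *) &doorbell;

	/* On a little-endian host this prints bytes 00 01 23 00. */
	printf("doorbell=0x%08x bytes=%02x %02x %02x %02x\n",
	       doorbell, b[0], b[1], b[2], b[3]);
	return 0;
}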
+
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+ switch (state) {
+ case IB_QPS_RESET: return MLX4_QP_STATE_RST;
+ case IB_QPS_INIT: return MLX4_QP_STATE_INIT;
+ case IB_QPS_RTR: return MLX4_QP_STATE_RTR;
+ case IB_QPS_RTS: return MLX4_QP_STATE_RTS;
+ case IB_QPS_SQD: return MLX4_QP_STATE_SQD;
+ case IB_QPS_SQE: return MLX4_QP_STATE_SQER;
+ case IB_QPS_ERR: return MLX4_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_lock_irq(&send_cq->lock);
+ else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_lock_irq(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock_irq(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_unlock_irq(&send_cq->lock);
+ else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock_irq(&recv_cq->lock);
+ }
+}
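The lock-by-lower-cqn ordering above is the usual way to avoid an ABBA deadlock when the same pair of locks can be taken with either one in the "send" or "receive" role; a generic userspace sketch of the same idea with pthreads (not driver code, and without the kernel's irq handling):

#include <pthread.h>
#include <stdio.h>

struct cq { int cqn; pthread_mutex_t lock; };

static struct cq x = { 1, PTHREAD_MUTEX_INITIALIZER };
static struct cq y = { 2, PTHREAD_MUTEX_INITIALIZER };

/* Always take the lower-numbered lock first, as mlx4_ib_lock_cqs() does. */
static void lock_pair(struct cq *a, struct cq *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if (a->cqn < b->cqn) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void unlock_pair(struct cq *a, struct cq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	lock_pair(&y, &x);     /* either argument order acquires x, then y */
	unlock_pair(&y, &x);
	puts("no deadlock");
	return 0;
}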
+
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+ int is_user)
+{
+ struct mlx4_ib_cq *send_cq, *recv_cq;
+
+ if (qp->state != IB_QPS_RESET)
+ if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+ MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+ printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
+ qp->mqp.qpn);
+
+ send_cq = to_mcq(qp->ibqp.send_cq);
+ recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+ mlx4_ib_lock_cqs(send_cq, recv_cq);
+
+ if (!is_user) {
+ __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
+ if (send_cq != recv_cq)
+ __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+ }
+
+ mlx4_qp_remove(dev->dev, &qp->mqp);
+
+ mlx4_ib_unlock_cqs(send_cq, recv_cq);
+
+ mlx4_qp_free(dev->dev, &qp->mqp);
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+ if (is_user) {
+ if (!qp->ibqp.srq)
+ mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
+ &qp->db);
+ ib_umem_release(qp->umem);
+ } else {
+ kfree(qp->sq.wrid);
+ kfree(qp->rq.wrid);
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+ if (!qp->ibqp.srq)
+ mlx4_ib_db_free(dev, &qp->db);
+ }
+}
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_sqp *sqp;
+ struct mlx4_ib_qp *qp;
+ int err;
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ {
+ qp = kmalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ qp->ibqp.qp_num = qp->mqp.qpn;
+
+ break;
+ }
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ {
+ /* Userspace is not allowed to create special QPs: */
+ if (pd->uobject)
+ return ERR_PTR(-EINVAL);
+
+ sqp = kmalloc(sizeof *sqp, GFP_KERNEL);
+ if (!sqp)
+ return ERR_PTR(-ENOMEM);
+
+ qp = &sqp->qp;
+
+ err = create_qp_common(dev, pd, init_attr, udata,
+ dev->dev->caps.sqp_start +
+ (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) +
+ init_attr->port_num - 1,
+ qp);
+ if (err) {
+ kfree(sqp);
+ return ERR_PTR(err);
+ }
+
+ qp->port = init_attr->port_num;
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+ break;
+ }
+ default:
+ /* Don't support raw QPs */
+ return ERR_PTR(-EINVAL);
+ }
+
+ return &qp->ibqp;
+}
+
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+ struct mlx4_ib_dev *dev = to_mdev(qp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+ if (is_qp0(dev, mqp))
+ mlx4_CLOSE_PORT(dev->dev, mqp->port);
+
+ destroy_qp_common(dev, mqp, !!qp->pd->uobject);
+
+ if (is_sqp(dev, mqp))
+ kfree(to_msqp(mqp));
+ else
+ kfree(mqp);
+
+ return 0;
+}
+
+static int to_mlx4_st(enum ib_qp_type type)
+{
+ switch (type) {
+ case IB_QPT_RC: return MLX4_QP_ST_RC;
+ case IB_QPT_UC: return MLX4_QP_ST_UC;
+ case IB_QPT_UD: return MLX4_QP_ST_UD;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI: return MLX4_QP_ST_MLX;
+ default: return -1;
+ }
+}
+
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ u8 dest_rd_atomic;
+ u32 access_flags;
+ u32 hw_access_flags = 0;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ dest_rd_atomic = attr->max_dest_rd_atomic;
+ else
+ dest_rd_atomic = qp->resp_depth;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ access_flags = attr->qp_access_flags;
+ else
+ access_flags = qp->atomic_rd_en;
+
+ if (!dest_rd_atomic)
+ access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+ if (access_flags & IB_ACCESS_REMOTE_READ)
+ hw_access_flags |= MLX4_QP_BIT_RRE;
+ if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ hw_access_flags |= MLX4_QP_BIT_RAE;
+ if (access_flags & IB_ACCESS_REMOTE_WRITE)
+ hw_access_flags |= MLX4_QP_BIT_RWE;
+
+ return cpu_to_be32(hw_access_flags);
+}
+
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ sqp->pkey_index = attr->pkey_index;
+ if (attr_mask & IB_QP_QKEY)
+ sqp->qkey = attr->qkey;
+ if (attr_mask & IB_QP_SQ_PSN)
+ sqp->send_psn = attr->sq_psn;
+}
+
+static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
+{
+ path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
+}
+
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
+ struct mlx4_qp_path *path, u8 port)
+{
+ path->grh_mylmc = ah->src_path_bits & 0x7f;
+ path->rlid = cpu_to_be16(ah->dlid);
+ if (ah->static_rate) {
+ path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
+ --path->static_rate;
+ } else
+ path->static_rate = 0;
+ path->counter_index = 0xff;
+
+ if (ah->ah_flags & IB_AH_GRH) {
+ if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+ printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
+ ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+ return -1;
+ }
+
+ path->grh_mylmc |= 1 << 7;
+ path->mgid_index = ah->grh.sgid_index;
+ path->hop_limit = ah->grh.hop_limit;
+ path->tclass_flowlabel =
+ cpu_to_be32((ah->grh.traffic_class << 20) |
+ (ah->grh.flow_label));
+ memcpy(path->rgid, ah->grh.dgid.raw, 16);
+ }
+
+ path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+ ((port - 1) << 6) | ((ah->sl & 0xf) << 2);
+
+ return 0;
+}
+
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_qp_context *context;
+ enum mlx4_qp_optpar optpar = 0;
+ int sqd_event;
+ int err = -EINVAL;
+
+ context = kzalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return -ENOMEM;
+
+ context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
+ (to_mlx4_st(ibqp->qp_type) << 16));
+ context->flags |= cpu_to_be32(1 << 8); /* DE? */
+
+ if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+ context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+ else {
+ optpar |= MLX4_QP_OPTPAR_PM_STATE;
+ switch (attr->path_mig_state) {
+ case IB_MIG_MIGRATED:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+ break;
+ case IB_MIG_REARM:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
+ break;
+ case IB_MIG_ARMED:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
+ break;
+ }
+ }
+
+ if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+ ibqp->qp_type == IB_QPT_UD)
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+ else if (attr_mask & IB_QP_PATH_MTU) {
+ if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
+ printk(KERN_ERR "path MTU (%u) is invalid\n",
+ attr->path_mtu);
+			goto out;
+ }
+ context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+ }
+
+ if (qp->rq.wqe_cnt)
+ context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
+ context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+ if (qp->sq.wqe_cnt)
+ context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
+ context->sq_size_stride |= qp->sq.wqe_shift - 4;
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
+
+ if (qp->ibqp.uobject)
+ context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
+ else
+ context->usr_page = cpu_to_be32(dev->priv_uar.index);
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+ if (attr_mask & IB_QP_PORT) {
+ if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
+ !(attr_mask & IB_QP_AV)) {
+ mlx4_set_sched(&context->pri_path, attr->port_num);
+ optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
+ }
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ context->pri_path.pkey_index = attr->pkey_index;
+ optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
+ }
+
+ if (attr_mask & IB_QP_AV) {
+ if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
+ attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
+ MLX4_QP_OPTPAR_SCHED_QUEUE);
+ }
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ context->pri_path.ackto = attr->timeout << 3;
+ optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_port_num == 0 ||
+ attr->alt_port_num > dev->dev->caps.num_ports)
+			goto out;
+
+ if (attr->alt_pkey_index >=
+ dev->dev->caps.pkey_table_len[attr->alt_port_num])
+			goto out;
+
+ if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+ attr->alt_port_num))
+			goto out;
+
+ context->alt_path.pkey_index = attr->alt_pkey_index;
+ context->alt_path.ackto = attr->alt_timeout << 3;
+ optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
+ }
+
+ context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
+ context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+ optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
+ }
+
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+ optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic)
+ context->params1 |=
+ cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+ optpar |= MLX4_QP_OPTPAR_SRA_MAX;
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN)
+ context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+ context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ if (attr->max_dest_rd_atomic)
+ context->params2 |=
+ cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+ optpar |= MLX4_QP_OPTPAR_RRA_MAX;
+ }
+
+ if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+ context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
+ optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
+ }
+
+ if (ibqp->srq)
+ context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+ optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
+ }
+ if (attr_mask & IB_QP_RQ_PSN)
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+ context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);
+
+ if (attr_mask & IB_QP_QKEY) {
+ context->qkey = cpu_to_be32(attr->qkey);
+ optpar |= MLX4_QP_OPTPAR_Q_KEY;
+ }
+
+ if (ibqp->srq)
+ context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
+
+ if (!ibqp->srq && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+ if (cur_state == IB_QPS_INIT &&
+ new_state == IB_QPS_RTR &&
+ (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+ ibqp->qp_type == IB_QPT_UD)) {
+ context->pri_path.sched_queue = (qp->port - 1) << 6;
+ if (is_qp0(dev, qp))
+ context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
+ else
+ context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+ }
+
+ if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
+ attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+ sqd_event = 1;
+ else
+ sqd_event = 0;
+
+ /*
+ * Before passing a kernel QP to the HW, make sure that the
+ * ownership bits of the send queue are set and the SQ
+ * headroom is stamped so that the hardware doesn't start
+ * processing stale work requests.
+ */
+ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ int i;
+
+ for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+ ctrl = get_send_wqe(qp, i);
+ ctrl->owner_opcode = cpu_to_be32(1 << 31);
+
+ stamp_send_wqe(qp, i);
+ }
+ }
+
+ err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
+ to_mlx4_state(new_state), context, optpar,
+ sqd_event, &qp->mqp);
+ if (err)
+ goto out;
+
+ qp->state = new_state;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->atomic_rd_en = attr->qp_access_flags;
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->resp_depth = attr->max_dest_rd_atomic;
+ if (attr_mask & IB_QP_PORT)
+ qp->port = attr->port_num;
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_port = attr->alt_port_num;
+
+ if (is_sqp(dev, qp))
+ store_sqp_attrs(to_msqp(qp), attr, attr_mask);
+
+ /*
+ * If we moved QP0 to RTR, bring the IB link up; if we moved
+ * QP0 to RESET or ERROR, bring the link back down.
+ */
+ if (is_qp0(dev, qp)) {
+ if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
+ if (mlx4_INIT_PORT(dev->dev, qp->port))
+ printk(KERN_WARNING "INIT_PORT failed for port %d\n",
+ qp->port);
+
+ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+ mlx4_CLOSE_PORT(dev->dev, qp->port);
+ }
+
+ /*
+ * If we moved a kernel QP to RESET, clean up all old CQ
+ * entries and reinitialize the QP.
+ */
+ if (new_state == IB_QPS_RESET && !ibqp->uobject) {
+ mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
+ ibqp->srq ? to_msrq(ibqp->srq): NULL);
+ if (ibqp->send_cq != ibqp->recv_cq)
+ mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);
+
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ if (!ibqp->srq)
+ *qp->db.db = 0;
+ }
+
+out:
+ kfree(context);
+ return err;
+}
+
+static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
+static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+};
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+
+ mutex_lock(&qp->mutex);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
+ goto out;
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p])
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+ err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
+ mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
+ IB_QPS_RESET, IB_QPS_INIT);
+ if (err)
+ goto out;
+ cur_state = IB_QPS_INIT;
+ }
+
+ err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+ void *wqe)
+{
+ struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ u16 pkey;
+ int send_size;
+ int header_size;
+ int spc;
+ int i;
+
+ send_size = 0;
+ for (i = 0; i < wr->num_sge; ++i)
+ send_size += wr->sg_list[i].length;
+
+ ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->ud_header);
+
+ sqp->ud_header.lrh.service_level =
+ be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
+ sqp->ud_header.lrh.destination_lid = ah->av.dlid;
+ sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.g_slid & 0x7f);
+ if (mlx4_ib_ah_grh_present(ah)) {
+ sqp->ud_header.grh.traffic_class =
+ (be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
+ sqp->ud_header.grh.flow_label =
+ ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ sqp->ud_header.grh.hop_limit = ah->av.hop_limit;
+ ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
+ ah->av.gid_index, &sqp->ud_header.grh.source_gid);
+ memcpy(sqp->ud_header.grh.destination_gid.raw,
+ ah->av.dgid, 16);
+ }
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+ mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+ (sqp->ud_header.lrh.destination_lid ==
+ IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+ (sqp->ud_header.lrh.service_level << 8));
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ sqp->ud_header.immediate_present = 1;
+ sqp->ud_header.immediate_data = wr->imm_data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+ if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+ sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ if (!sqp->qp.ibqp.qp_num)
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+ else
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+ sqp->qkey : wr->wr.ud.remote_qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+ header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+ if (0) {
+ printk(KERN_ERR "built UD header of size %d:\n", header_size);
+ for (i = 0; i < header_size / 4; ++i) {
+ if (i % 8 == 0)
+ printk(" [%02x] ", i * 4);
+ printk(" %08x",
+ be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ printk("\n");
+ }
+
+ /*
+ * Inline data segments may not cross a 64 byte boundary. If
+ * our UD header is bigger than the space available up to the
+ * next 64 byte boundary in the WQE, use two inline data
+ * segments to hold the UD header.
+ */
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (header_size <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ memcpy(inl + 1, sqp->header_buf, header_size);
+ i = 1;
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ memcpy(inl + 1, sqp->header_buf, spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+ /*
+ * Need a barrier here to make sure all the data is
+ * visible before the byte_count field is set.
+ * Otherwise the HCA prefetcher could grab the 64-byte
+ * chunk with this inline segment and get a valid (!=
+ * 0xffffffff) byte count but stale data, and end up
+ * generating a packet with bad headers.
+ *
+ * The first inline segment's byte_count field doesn't
+ * need a barrier, because it comes after a
+ * control/MLX segment and therefore is at an offset
+ * of 16 mod 64.
+ */
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ i = 2;
+ }
+
+ return ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+}
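To make the inline-split arithmetic concrete, a sketch assuming the MLX segment is 16 bytes and the inline byte_count header is 4 bytes (consistent with the "offset of 16 mod 64" comment above): for the worst-case 72-byte UD header, spc = 64 - 20 = 44, so 44 bytes land in the first inline segment and the remaining 28 go after the 64-byte boundary.

#include <stdio.h>

#define MLX4_INLINE_ALIGN 64

int main(void)
{
	int header_size = 72;       /* MLX4_IB_UD_HEADER_SIZE: worst-case UD header */
	int inl_off = 16;           /* assumed: inline seg follows the 16-byte MLX seg */
	int inl_hdr = 4;            /* assumed: byte_count field of the inline segment */

	/* Space left up to the next 64-byte boundary, as in build_mlx_header(). */
	int spc = MLX4_INLINE_ALIGN -
		  ((inl_off + inl_hdr) & (MLX4_INLINE_ALIGN - 1));

	if (header_size <= spc)
		printf("single inline segment of %d bytes\n", header_size);
	else
		printf("split: %d bytes, then %d bytes after the boundary\n",
		       spc, header_size - spc);
	return 0;
}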
+
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mlx4_ib_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max_post))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
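The head - tail computation above relies on unsigned wraparound: head and tail are free-running counters that are only masked by (wqe_cnt - 1) when used as ring indices, so the difference stays correct even after head wraps past UINT_MAX. A minimal sketch:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned head = UINT_MAX - 1;  /* free-running producer counter */
	unsigned tail = UINT_MAX - 5;  /* free-running consumer counter */

	printf("in flight before wrap: %u\n", head - tail);  /* 4 */

	head += 10;                    /* head wraps past UINT_MAX */
	printf("in flight after wrap:  %u\n", head - tail);   /* 14 */
	return 0;
}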
+
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ unsigned long flags;
+ int nreq;
+ int err = 0;
+ int ind;
+ int size;
+ int i;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+ ind = qp->sq.head;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ctrl->srcrb_flags =
+ (wr->send_flags & IB_SEND_SIGNALED ?
+ cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+ (wr->send_flags & IB_SEND_SOLICITED ?
+ cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+ qp->sq_signal_bits;
+
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ ctrl->imm = wr->imm_data;
+ else
+ ctrl->imm = 0;
+
+ wqe += sizeof *ctrl;
+ size = sizeof *ctrl / 16;
+
+ switch (ibqp->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ ((struct mlx4_wqe_raddr_seg *) wqe)->raddr =
+ cpu_to_be64(wr->wr.atomic.remote_addr);
+ ((struct mlx4_wqe_raddr_seg *) wqe)->rkey =
+ cpu_to_be32(wr->wr.atomic.rkey);
+ ((struct mlx4_wqe_raddr_seg *) wqe)->reserved = 0;
+
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ ((struct mlx4_wqe_atomic_seg *) wqe)->swap_add =
+ cpu_to_be64(wr->wr.atomic.swap);
+ ((struct mlx4_wqe_atomic_seg *) wqe)->compare =
+ cpu_to_be64(wr->wr.atomic.compare_add);
+ } else {
+ ((struct mlx4_wqe_atomic_seg *) wqe)->swap_add =
+ cpu_to_be64(wr->wr.atomic.compare_add);
+ ((struct mlx4_wqe_atomic_seg *) wqe)->compare = 0;
+ }
+
+ wqe += sizeof (struct mlx4_wqe_atomic_seg);
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ ((struct mlx4_wqe_raddr_seg *) wqe)->raddr =
+ cpu_to_be64(wr->wr.rdma.remote_addr);
+ ((struct mlx4_wqe_raddr_seg *) wqe)->rkey =
+ cpu_to_be32(wr->wr.rdma.rkey);
+ ((struct mlx4_wqe_raddr_seg *) wqe)->reserved = 0;
+
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+ size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+ break;
+
+ case IB_QPT_UD:
+ memcpy(((struct mlx4_wqe_datagram_seg *) wqe)->av,
+ &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+ ((struct mlx4_wqe_datagram_seg *) wqe)->dqpn =
+ cpu_to_be32(wr->wr.ud.remote_qpn);
+ ((struct mlx4_wqe_datagram_seg *) wqe)->qkey =
+ cpu_to_be32(wr->wr.ud.remote_qkey);
+
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ break;
+
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ err = build_mlx_header(to_msqp(qp), wr, ctrl);
+ if (err < 0) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += err;
+ size += err / 16;
+
+ err = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ ((struct mlx4_wqe_data_seg *) wqe)->byte_count =
+ cpu_to_be32(wr->sg_list[i].length);
+ ((struct mlx4_wqe_data_seg *) wqe)->lkey =
+ cpu_to_be32(wr->sg_list[i].lkey);
+ ((struct mlx4_wqe_data_seg *) wqe)->addr =
+ cpu_to_be64(wr->sg_list[i].addr);
+
+ wqe += sizeof (struct mlx4_wqe_data_seg);
+ size += sizeof (struct mlx4_wqe_data_seg) / 16;
+ }
+
+ /* Add one more inline data segment for ICRC for MLX sends */
+ if (qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) {
+ ((struct mlx4_wqe_inline_seg *) wqe)->byte_count =
+ cpu_to_be32((1 << 31) | 4);
+ ((u32 *) wqe)[1] = 0;
+ wqe += sizeof (struct mlx4_wqe_data_seg);
+ size += sizeof (struct mlx4_wqe_data_seg) / 16;
+ }
+
+ ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+
+ if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
+ (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+ /*
+ * We can improve latency by not stamping the last
+ * send queue WQE until after ringing the doorbell, so
+ * only stamp here if there are still more WQEs to post.
+ */
+ if (wr->next)
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
+ (qp->sq.wqe_cnt - 1));
+
+ ++ind;
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ writel(qp->doorbell_qpn,
+ to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
+
+ /*
+ * Make sure doorbells don't leak out of SQ spinlock
+ * and reach the HCA out of order.
+ */
+ mmiowb();
+
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
+ (qp->sq.wqe_cnt - 1));
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+ return err;
+}
+
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int i;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.send_cq)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ scat = get_recv_wqe(qp, ind);
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+ scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey);
+ scat[i].addr = cpu_to_be64(wr->sg_list[i].addr);
+ }
+
+ if (i < qp->rq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+
+ qp->rq.wrid[ind] = wr->wr_id;
+
+ ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+ return err;
+}
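
A side note on the inline-segment split in build_mlx_header() above: the spc calculation and the barrier comment that follows it are easiest to see in isolation. The stand-alone sketch below reproduces only that arithmetic; it is illustrative, not part of the patch, and the starting offset used in main() is made up.

#include <stdio.h>

#define MLX4_INLINE_ALIGN 64	/* inline data may not cross a 64-byte boundary */

/*
 * Given the offset of the first inline data byte within the send WQE
 * and the UD header size, split the header the same way the driver
 * does: whatever fits before the next 64-byte boundary goes into the
 * first inline segment, the remainder into a second one.
 */
static unsigned int split_inline(unsigned long data_offset,
				 unsigned int header_size,
				 unsigned int *second_chunk)
{
	unsigned int spc = MLX4_INLINE_ALIGN -
		(data_offset & (MLX4_INLINE_ALIGN - 1));

	if (header_size <= spc) {
		*second_chunk = 0;
		return header_size;		/* one inline segment is enough */
	}
	*second_chunk = header_size - spc;	/* needs a second inline segment */
	return spc;
}

int main(void)
{
	unsigned int second;
	/* made-up example: inline data starts 40 bytes into a 64-byte chunk */
	unsigned int first = split_inline(40, 72, &second);

	printf("first chunk: %u bytes, second chunk: %u bytes\n", first, second);
	return 0;
}
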
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
new file mode 100644
index 0000000..12fac1c
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void *get_wqe(struct mlx4_ib_srq *srq, int n)
+{
+ int offset = n << srq->msrq.wqe_shift;
+
+ if (srq->buf.nbufs == 1)
+ return srq->buf.u.direct.buf + offset;
+ else
+ return srq->buf.u.page_list[offset >> PAGE_SHIFT].buf +
+ (offset & (PAGE_SIZE - 1));
+}
+
+static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+ if (ibsrq->event_handler) {
+ event.device = ibsrq->device;
+ event.element.srq = ibsrq;
+ switch (type) {
+ case MLX4_EVENT_TYPE_SRQ_LIMIT:
+ event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+ event.event = IB_EVENT_SRQ_ERR;
+ break;
+ default:
+ printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+ "on SRQ %06x\n", type, srq->srqn);
+ return;
+ }
+
+ ibsrq->event_handler(&event, ibsrq->srq_context);
+ }
+}
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_srq *srq;
+ struct mlx4_wqe_srq_next_seg *next;
+ int desc_size;
+ int buf_size;
+ int err;
+ int i;
+
+ /* Sanity check SRQ size before proceeding */
+ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes ||
+ init_attr->attr.max_sge > dev->dev->caps.max_srq_sge)
+ return ERR_PTR(-EINVAL);
+
+ srq = kmalloc(sizeof *srq, GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&srq->mutex);
+ spin_lock_init(&srq->lock);
+ srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+ srq->msrq.max_gs = init_attr->attr.max_sge;
+
+ desc_size = max(32UL,
+ roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) +
+ srq->msrq.max_gs *
+ sizeof (struct mlx4_wqe_data_seg)));
+ srq->msrq.wqe_shift = ilog2(desc_size);
+
+ buf_size = srq->msrq.max * desc_size;
+
+ if (pd->uobject) {
+ struct mlx4_ib_create_srq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_srq;
+ }
+
+ srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+ buf_size, 0);
+ if (IS_ERR(srq->umem)) {
+ err = PTR_ERR(srq->umem);
+ goto err_srq;
+ }
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
+ ilog2(srq->umem->page_size), &srq->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem);
+ if (err)
+ goto err_mtt;
+
+ err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+ ucmd.db_addr, &srq->db);
+ if (err)
+ goto err_mtt;
+ } else {
+ err = mlx4_ib_db_alloc(dev, &srq->db, 0);
+ if (err)
+ goto err_srq;
+
+ *srq->db.db = 0;
+
+ if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ srq->head = 0;
+ srq->tail = srq->msrq.max - 1;
+ srq->wqe_ctr = 0;
+
+ for (i = 0; i < srq->msrq.max; ++i) {
+ next = get_wqe(srq, i);
+ next->next_wqe_index =
+ cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+ }
+
+ err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift,
+ &srq->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf);
+ if (err)
+ goto err_mtt;
+
+ srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+ if (!srq->wrid) {
+ err = -ENOMEM;
+ goto err_mtt;
+ }
+ }
+
+ err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, &srq->mtt,
+ srq->db.dma, &srq->msrq);
+ if (err)
+ goto err_wrid;
+
+ srq->msrq.event = mlx4_ib_srq_event;
+
+ if (pd->uobject)
+ if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+ err = -EFAULT;
+ goto err_wrid;
+ }
+
+ init_attr->attr.max_wr = srq->msrq.max - 1;
+
+ return &srq->ibsrq;
+
+err_wrid:
+ if (pd->uobject)
+ mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+ else
+ kfree(srq->wrid);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &srq->mtt);
+
+err_buf:
+ if (pd->uobject)
+ ib_umem_release(srq->umem);
+ else
+ mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+
+err_db:
+ if (!pd->uobject)
+ mlx4_ib_db_free(dev, &srq->db);
+
+err_srq:
+ kfree(srq);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+
+ /* We don't support resizing SRQs (yet?) */
+ if (attr_mask & IB_SRQ_MAX_WR)
+ return -EINVAL;
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ if (attr->srq_limit >= srq->msrq.max)
+ return -EINVAL;
+
+ mutex_lock(&srq->mutex);
+ ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
+ mutex_unlock(&srq->mutex);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int mlx4_ib_destroy_srq(struct ib_srq *srq)
+{
+ struct mlx4_ib_dev *dev = to_mdev(srq->device);
+ struct mlx4_ib_srq *msrq = to_msrq(srq);
+
+ mlx4_srq_free(dev->dev, &msrq->msrq);
+ mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
+
+ if (srq->uobject) {
+ mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+ ib_umem_release(msrq->umem);
+ } else {
+ kfree(msrq->wrid);
+ mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
+ &msrq->buf);
+ mlx4_ib_db_free(dev, &msrq->db);
+ }
+
+ kfree(msrq);
+
+ return 0;
+}
+
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+
+ /* always called with interrupts disabled. */
+ spin_lock(&srq->lock);
+
+ next = get_wqe(srq, srq->tail);
+ next->next_wqe_index = cpu_to_be16(wqe_index);
+ srq->tail = wqe_index;
+
+ spin_unlock(&srq->lock);
+}
+
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (unlikely(srq->head == srq->tail)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ srq->wrid[srq->head] = wr->wr_id;
+
+ next = get_wqe(srq, srq->head);
+ srq->head = be16_to_cpu(next->next_wqe_index);
+ scat = (struct mlx4_wqe_data_seg *) (next + 1);
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+ scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey);
+ scat[i].addr = cpu_to_be64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->msrq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+ }
+
+ if (likely(nreq)) {
+ srq->wqe_ctr += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *srq->db.db = cpu_to_be32(srq->wqe_ctr);
+ }
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+
+ return err;
+}
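
A side note on the SRQ code above: free receive WQEs are kept on a singly linked list threaded through next_wqe_index. mlx4_ib_create_srq() links every WQE to the next one, mlx4_ib_post_srq_recv() pops from head, mlx4_ib_free_srq_wqe() re-links the reclaimed index at tail, and head == tail means the list is exhausted, so post_srq_recv() fails with -ENOMEM. The stand-alone simulation below mirrors that behaviour; it is illustrative only, and SRQ_MAX and the helper names are invented for the example.

#include <stdio.h>

#define SRQ_MAX 8	/* must be a power of two, like srq->msrq.max above */

/* free list threaded through next_wqe_index, as set up in create_srq */
static unsigned short next_wqe_index[SRQ_MAX];
static int head, tail;

static void srq_init(void)
{
	int i;

	for (i = 0; i < SRQ_MAX; ++i)
		next_wqe_index[i] = (i + 1) & (SRQ_MAX - 1);
	head = 0;
	tail = SRQ_MAX - 1;
}

/* take a free WQE as post_srq_recv does; -1 stands in for -ENOMEM */
static int srq_take(void)
{
	int wqe;

	if (head == tail)
		return -1;
	wqe = head;
	head = next_wqe_index[head];
	return wqe;
}

/* return a completed WQE to the list as free_srq_wqe does */
static void srq_put(int wqe_index)
{
	next_wqe_index[tail] = wqe_index;
	tail = wqe_index;
}

int main(void)
{
	int i;

	srq_init();
	for (i = 0; i < SRQ_MAX; ++i)
		printf("take -> %d\n", srq_take());	/* the last take reports "full" */
	srq_put(3);
	printf("after put(3): take -> %d\n", srq_take());
	return 0;
}
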
diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h
new file mode 100644
index 0000000..e2d11be
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/user.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_USER_H
+#define MLX4_IB_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX4_IB_UVERBS_ABI_VERSION 3
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx4_ib_alloc_ucontext_resp {
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+};
+
+struct mlx4_ib_alloc_pd_resp {
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_create_cq {
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_ib_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_resize_cq {
+ __u64 buf_addr;
+};
+
+struct mlx4_ib_create_srq {
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_ib_create_srq_resp {
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_create_qp {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 sq_no_prefetch;
+ __u8 reserved[5];
+};
+
+#endif /* MLX4_IB_USER_H */
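
A side note on user.h above: as its comment says, these command structures must pack identically for 32-bit userspace and 64-bit kernels, so buffer addresses travel as __u64 rather than pointers and the reserved bytes pad each struct to a multiple of 8 bytes. The rough user-space sketch below shows how such a command could be filled; the mirrored struct, the helper and the field values are hypothetical and are not taken from libmlx4.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* user-space mirror of struct mlx4_ib_create_qp: fixed-width fields only */
struct create_qp_cmd {
	uint64_t buf_addr;
	uint64_t db_addr;
	uint8_t  log_sq_bb_count;
	uint8_t  log_sq_stride;
	uint8_t  sq_no_prefetch;
	uint8_t  reserved[5];
};

/* 24 bytes on both 32-bit and 64-bit ABIs; fails to compile otherwise */
typedef char create_qp_size_check[sizeof(struct create_qp_cmd) == 24 ? 1 : -1];

static void fill_create_qp(struct create_qp_cmd *cmd, void *qp_buf, void *db_page)
{
	memset(cmd, 0, sizeof *cmd);
	/* addresses go through the 64-bit fields, never as raw pointers */
	cmd->buf_addr = (uintptr_t) qp_buf;
	cmd->db_addr  = (uintptr_t) db_page;
	cmd->log_sq_bb_count = 6;	/* hypothetical: 64 send WQE basic blocks */
	cmd->log_sq_stride   = 6;	/* hypothetical: 64-byte WQE stride */
}

int main(void)
{
	struct create_qp_cmd cmd;
	static char buf[4096], db[4096];

	fill_create_qp(&cmd, buf, db);
	printf("command size %zu, buf_addr %#llx\n",
	       sizeof cmd, (unsigned long long) cmd.buf_addr);
	return 0;
}
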
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 27caf3b..4b111a8 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -279,6 +279,7 @@ int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
(be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
header->grh.flow_label =
ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ header->grh.hop_limit = ah->av->hop_limit;
ib_get_cached_gid(&dev->ib_dev,
be32_to_cpu(ah->av->port_pd) >> 24,
ah->av->gid_index % dev->limits.gid_table_len,
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 7131446..f40558d 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -37,6 +37,7 @@
#include <linux/completion.h>
#include <linux/pci.h>
#include <linux/errno.h>
+#include <linux/sched.h>
#include <asm/io.h>
#include <rdma/ib_mad.h>
@@ -771,7 +772,7 @@ int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status)
MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET);
/*
- * FW subminor version is at more signifant bits than minor
+ * FW subminor version is at more significant bits than minor
* version, so swap here.
*/
dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) |
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index cf0868f..be6e1e0 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -37,6 +37,7 @@
*/
#include <linux/hardirq.h>
+#include <linux/sched.h>
#include <asm/io.h>
@@ -284,7 +285,7 @@ void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
{
struct mthca_cqe *cqe;
u32 prod_index;
- int nfreed = 0;
+ int i, nfreed = 0;
spin_lock_irq(&cq->lock);
@@ -321,6 +322,8 @@ void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
}
if (nfreed) {
+ for (i = 0; i < nfreed; ++i)
+ set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe));
wmb();
cq->cons_index += nfreed;
update_cons_index(dev, cq, nfreed);
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 773145e..aa563e6 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -1250,12 +1250,14 @@ static void __mthca_remove_one(struct pci_dev *pdev)
int __mthca_restart_one(struct pci_dev *pdev)
{
struct mthca_dev *mdev;
+ int hca_type;
mdev = pci_get_drvdata(pdev);
if (!mdev)
return -ENODEV;
+ hca_type = mdev->hca_type;
__mthca_remove_one(pdev);
- return __mthca_init_one(pdev, mdev->hca_type);
+ return __mthca_init_one(pdev, hca_type);
}
static int __devinit mthca_init_one(struct pci_dev *pdev,
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index 48f7c65..e61f3e6 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -36,6 +36,7 @@
#include <linux/mm.h>
#include <linux/scatterlist.h>
+#include <linux/sched.h>
#include <asm/page.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 1c05486..6bcde1c 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -37,6 +37,7 @@
*/
#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
#include <linux/mm.h>
@@ -908,6 +909,8 @@ static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
return ERR_PTR(err);
}
+ mr->umem = NULL;
+
return &mr->ibmr;
}
@@ -1003,11 +1006,13 @@ static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd,
}
kfree(page_list);
+ mr->umem = NULL;
+
return &mr->ibmr;
}
-static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
- int acc, struct ib_udata *udata)
+static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
{
struct mthca_dev *dev = to_mdev(pd->device);
struct ib_umem_chunk *chunk;
@@ -1018,20 +1023,26 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
int err = 0;
int write_mtt_size;
- shift = ffs(region->page_size) - 1;
-
mr = kmalloc(sizeof *mr, GFP_KERNEL);
if (!mr)
return ERR_PTR(-ENOMEM);
+ mr->umem = ib_umem_get(pd->uobject->context, start, length, acc);
+ if (IS_ERR(mr->umem)) {
+ err = PTR_ERR(mr->umem);
+ goto err;
+ }
+
+ shift = ffs(mr->umem->page_size) - 1;
+
n = 0;
- list_for_each_entry(chunk, &region->chunk_list, list)
+ list_for_each_entry(chunk, &mr->umem->chunk_list, list)
n += chunk->nents;
mr->mtt = mthca_alloc_mtt(dev, n);
if (IS_ERR(mr->mtt)) {
err = PTR_ERR(mr->mtt);
- goto err;
+ goto err_umem;
}
pages = (u64 *) __get_free_page(GFP_KERNEL);
@@ -1044,12 +1055,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
- list_for_each_entry(chunk, &region->chunk_list, list)
+ list_for_each_entry(chunk, &mr->umem->chunk_list, list)
for (j = 0; j < chunk->nmap; ++j) {
len = sg_dma_len(&chunk->page_list[j]) >> shift;
for (k = 0; k < len; ++k) {
pages[i++] = sg_dma_address(&chunk->page_list[j]) +
- region->page_size * k;
+ mr->umem->page_size * k;
/*
* Be friendly to write_mtt and pass it chunks
* of appropriate size.
@@ -1071,8 +1082,8 @@ mtt_done:
if (err)
goto err_mtt;
- err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, region->virt_base,
- region->length, convert_access(acc), mr);
+ err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length,
+ convert_access(acc), mr);
if (err)
goto err_mtt;
@@ -1082,6 +1093,9 @@ mtt_done:
err_mtt:
mthca_free_mtt(dev, mr->mtt);
+err_umem:
+ ib_umem_release(mr->umem);
+
err:
kfree(mr);
return ERR_PTR(err);
@@ -1090,8 +1104,12 @@ err:
static int mthca_dereg_mr(struct ib_mr *mr)
{
struct mthca_mr *mmr = to_mmr(mr);
+
mthca_free_mr(to_mdev(mr->device), mmr);
+ if (mmr->umem)
+ ib_umem_release(mmr->umem);
kfree(mmr);
+
return 0;
}
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
index 1d266ac..262616c 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.h
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -73,6 +73,7 @@ struct mthca_mtt;
struct mthca_mr {
struct ib_mr ibmr;
+ struct ib_umem *umem;
struct mthca_mtt *mtt;
};
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index fee60c8..eef415b 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -37,6 +37,7 @@
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/sched.h>
#include <asm/io.h>
@@ -295,7 +296,7 @@ static int to_mthca_st(int transport)
}
}
-static void store_attrs(struct mthca_sqp *sqp, struct ib_qp_attr *attr,
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
int attr_mask)
{
if (attr_mask & IB_QP_PKEY_INDEX)
@@ -327,7 +328,7 @@ static void init_port(struct mthca_dev *dev, int port)
mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
}
-static __be32 get_hw_access_flags(struct mthca_qp *qp, struct ib_qp_attr *attr,
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
int attr_mask)
{
u8 dest_rd_atomic;
@@ -510,7 +511,7 @@ out:
return err;
}
-static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah,
+static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah,
struct mthca_qp_path *path, u8 port)
{
path->g_mylmc = ah->src_path_bits & 0x7f;
@@ -538,12 +539,12 @@ static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah,
return 0;
}
-int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
- struct ib_udata *udata)
+static int __mthca_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
{
struct mthca_dev *dev = to_mdev(ibqp->device);
struct mthca_qp *qp = to_mqp(ibqp);
- enum ib_qp_state cur_state, new_state;
struct mthca_mailbox *mailbox;
struct mthca_qp_param *qp_param;
struct mthca_qp_context *qp_context;
@@ -551,60 +552,6 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
u8 status;
int err = -EINVAL;
- mutex_lock(&qp->mutex);
-
- if (attr_mask & IB_QP_CUR_STATE) {
- cur_state = attr->cur_qp_state;
- } else {
- spin_lock_irq(&qp->sq.lock);
- spin_lock(&qp->rq.lock);
- cur_state = qp->state;
- spin_unlock(&qp->rq.lock);
- spin_unlock_irq(&qp->sq.lock);
- }
-
- new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
- if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
- mthca_dbg(dev, "Bad QP transition (transport %d) "
- "%d->%d with attr 0x%08x\n",
- qp->transport, cur_state, new_state,
- attr_mask);
- goto out;
- }
-
- if (cur_state == new_state && cur_state == IB_QPS_RESET) {
- err = 0;
- goto out;
- }
-
- if ((attr_mask & IB_QP_PKEY_INDEX) &&
- attr->pkey_index >= dev->limits.pkey_table_len) {
- mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
- attr->pkey_index, dev->limits.pkey_table_len-1);
- goto out;
- }
-
- if ((attr_mask & IB_QP_PORT) &&
- (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
- mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
- goto out;
- }
-
- if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
- attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
- mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
- attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
- goto out;
- }
-
- if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
- attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
- mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
- attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
- goto out;
- }
-
mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
if (IS_ERR(mailbox)) {
err = PTR_ERR(mailbox);
@@ -891,6 +838,98 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
out_mailbox:
mthca_free_mailbox(dev, mailbox);
+out:
+ return err;
+}
+
+static const struct ib_qp_attr dummy_init_attr = { .port_num = 1 };
+static const int dummy_init_attr_mask[] = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+};
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+
+ mutex_lock(&qp->mutex);
+ if (attr_mask & IB_QP_CUR_STATE) {
+ cur_state = attr->cur_qp_state;
+ } else {
+ spin_lock_irq(&qp->sq.lock);
+ spin_lock(&qp->rq.lock);
+ cur_state = qp->state;
+ spin_unlock(&qp->rq.lock);
+ spin_unlock_irq(&qp->sq.lock);
+ }
+
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+ mthca_dbg(dev, "Bad QP transition (transport %d) "
+ "%d->%d with attr 0x%08x\n",
+ qp->transport, cur_state, new_state,
+ attr_mask);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PKEY_INDEX) &&
+ attr->pkey_index >= dev->limits.pkey_table_len) {
+ mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
+ attr->pkey_index, dev->limits.pkey_table_len-1);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
+ mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
+ mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
+ attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
+ mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
+ attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+ err = __mthca_modify_qp(ibqp, &dummy_init_attr,
+ dummy_init_attr_mask[ibqp->qp_type],
+ IB_QPS_RESET, IB_QPS_INIT);
+ if (err)
+ goto out;
+ cur_state = IB_QPS_INIT;
+ }
+
+ err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
out:
mutex_unlock(&qp->mutex);
@@ -1862,6 +1901,7 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
dev->kar + MTHCA_RECEIVE_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ qp->rq.next_ind = ind;
qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
size0 = 0;
}
@@ -2244,10 +2284,10 @@ void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
struct mthca_next_seg *next;
/*
- * For SRQs, all WQEs generate a CQE, so we're always at the
- * end of the doorbell chain.
+ * For SRQs, all receive WQEs generate a CQE, so we're always
+ * at the end of the doorbell chain.
*/
- if (qp->ibqp.srq) {
+ if (qp->ibqp.srq && !is_send) {
*new_wqe = 0;
return;
}
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index 61974b0..b8f05a5 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -34,6 +34,7 @@
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/sched.h>
#include <asm/io.h>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 87310ee..285c143 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -132,12 +132,46 @@ struct ipoib_cm_data {
__be32 mtu;
};
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State. The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ * drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ * to be empty or the number of Poll CQ operations has exceeded
+ * CQ capacity size;
+ * - or
+ * post another WR that completes on the same CQ and wait for this
+ * WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * same CQ before destroying QPs attached to our SRQ.
+ */
+
+enum ipoib_cm_state {
+ IPOIB_CM_RX_LIVE,
+ IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+ IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */
+};
+
struct ipoib_cm_rx {
struct ib_cm_id *id;
struct ib_qp *qp;
struct list_head list;
struct net_device *dev;
unsigned long jiffies;
+ enum ipoib_cm_state state;
};
struct ipoib_cm_tx {
@@ -165,10 +199,15 @@ struct ipoib_cm_dev_priv {
struct ib_srq *srq;
struct ipoib_cm_rx_buf *srq_ring;
struct ib_cm_id *id;
- struct list_head passive_ids;
+ struct list_head passive_ids; /* state: LIVE */
+ struct list_head rx_error_list; /* state: ERROR */
+ struct list_head rx_flush_list; /* state: FLUSH, drain not started */
+ struct list_head rx_drain_list; /* state: FLUSH, drain started */
+ struct list_head rx_reap_list; /* state: FLUSH, drain done */
struct work_struct start_task;
struct work_struct reap_task;
struct work_struct skb_task;
+ struct work_struct rx_reap_task;
struct delayed_work stale_task;
struct sk_buff_head skb_queue;
struct list_head start_list;
@@ -201,15 +240,17 @@ struct ipoib_dev_priv {
struct list_head multicast_list;
struct rb_root multicast_tree;
- struct delayed_work pkey_task;
+ struct delayed_work pkey_poll_task;
struct delayed_work mcast_task;
struct work_struct flush_task;
struct work_struct restart_task;
struct delayed_work ah_reap_task;
+ struct work_struct pkey_event_task;
struct ib_device *ca;
u8 port;
u16 pkey;
+ u16 pkey_index;
struct ib_pd *pd;
struct ib_mr *mr;
struct ib_cq *cq;
@@ -333,12 +374,13 @@ struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
void ipoib_ib_dev_cleanup(struct net_device *dev);
int ipoib_ib_dev_open(struct net_device *dev);
int ipoib_ib_dev_up(struct net_device *dev);
int ipoib_ib_dev_down(struct net_device *dev, int flush);
-int ipoib_ib_dev_stop(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev, int flush);
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
void ipoib_dev_cleanup(struct net_device *dev);
@@ -386,6 +428,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
void ipoib_pkey_poll(struct work_struct *work);
int ipoib_pkey_dev_delay_open(struct net_device *dev);
+void ipoib_drain_cq(struct net_device *dev);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 785bc85..ea74d1e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -37,6 +37,7 @@
#include <net/dst.h>
#include <net/icmp.h>
#include <linux/icmpv6.h>
+#include <linux/delay.h>
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;
@@ -55,11 +56,15 @@ MODULE_PARM_DESC(cm_data_debug_level,
#define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
#define IPOIB_CM_RX_UPDATE_MASK (0x3)
-struct ipoib_cm_id {
- struct ib_cm_id *id;
- int flags;
- u32 remote_qpn;
- u32 remote_mtu;
+static struct ib_qp_attr ipoib_cm_err_attr = {
+ .qp_state = IB_QPS_ERR
+};
+
+#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
+
+static struct ib_send_wr ipoib_cm_rx_drain_wr = {
+ .wr_id = IPOIB_CM_RX_DRAIN_WRID,
+ .opcode = IB_WR_SEND,
};
static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
@@ -143,22 +148,61 @@ partial_error:
ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
- for (; i >= 0; --i)
- ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
+ for (; i > 0; --i)
+ ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
dev_kfree_skb_any(skb);
return NULL;
}
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
+{
+ struct ib_send_wr *bad_wr;
+ struct ipoib_cm_rx *p;
+
+ /* We only reserved 1 extra slot in CQ for drain WRs, so
+ * make sure we have at most 1 outstanding WR. */
+ if (list_empty(&priv->cm.rx_flush_list) ||
+ !list_empty(&priv->cm.rx_drain_list))
+ return;
+
+ /*
+	 * QPs on flush list are in error state. This way, a "flush
+ * error" WC will be immediately generated for each WR we post.
+ */
+ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+ if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+ ipoib_warn(priv, "failed to post drain wr\n");
+
+ list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+ struct ipoib_cm_rx *p = ctx;
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ unsigned long flags;
+
+ if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
+ return;
+
+ spin_lock_irqsave(&priv->lock, flags);
+ list_move(&p->list, &priv->cm.rx_flush_list);
+ p->state = IPOIB_CM_RX_FLUSH;
+ ipoib_cm_start_rx_drain(priv);
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
struct ipoib_cm_rx *p)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_qp_init_attr attr = {
- .send_cq = priv->cq, /* does not matter, we never send anything */
+ .event_handler = ipoib_cm_rx_event_handler,
+ .send_cq = priv->cq, /* For drain WR */
.recv_cq = priv->cq,
.srq = priv->cm.srq,
- .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
+ .cap.max_send_wr = 1, /* For drain WR */
.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
.sq_sig_type = IB_SIGNAL_ALL_WR,
.qp_type = IB_QPT_RC,
@@ -198,6 +242,27 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
return ret;
}
+
+ /*
+ * Current Mellanox HCA firmware won't generate completions
+ * with error for drain WRs unless the QP has been moved to
+ * RTS first. This work-around leaves a window where a QP has
+ * moved to error asynchronously, but this will eventually get
+ * fixed in firmware, so let's not error out if modify QP
+ * fails.
+ */
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+ return 0;
+ }
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+ return 0;
+ }
+
return 0;
}
@@ -237,6 +302,11 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
return -ENOMEM;
p->dev = dev;
p->id = cm_id;
+ cm_id->context = p;
+ p->state = IPOIB_CM_RX_LIVE;
+ p->jiffies = jiffies;
+ INIT_LIST_HEAD(&p->list);
+
p->qp = ipoib_cm_create_rx_qp(dev, p);
if (IS_ERR(p->qp)) {
ret = PTR_ERR(p->qp);
@@ -248,22 +318,24 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
if (ret)
goto err_modify;
+ spin_lock_irq(&priv->lock);
+ queue_delayed_work(ipoib_workqueue,
+ &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+ /* Add this entry to passive ids list head, but do not re-add it
+ * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
+ p->jiffies = jiffies;
+ if (p->state == IPOIB_CM_RX_LIVE)
+ list_move(&p->list, &priv->cm.passive_ids);
+ spin_unlock_irq(&priv->lock);
+
ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
if (ret) {
ipoib_warn(priv, "failed to send REP: %d\n", ret);
- goto err_rep;
+ if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+ ipoib_warn(priv, "unable to move qp to error state\n");
}
-
- cm_id->context = p;
- p->jiffies = jiffies;
- spin_lock_irq(&priv->lock);
- list_add(&p->list, &priv->cm.passive_ids);
- spin_unlock_irq(&priv->lock);
- queue_delayed_work(ipoib_workqueue,
- &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
return 0;
-err_rep:
err_modify:
ib_destroy_qp(p->qp);
err_qp:
@@ -276,7 +348,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
{
struct ipoib_cm_rx *p;
struct ipoib_dev_priv *priv;
- int ret;
switch (event->event) {
case IB_CM_REQ_RECEIVED:
@@ -288,20 +359,9 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
case IB_CM_REJ_RECEIVED:
p = cm_id->context;
priv = netdev_priv(p->dev);
- spin_lock_irq(&priv->lock);
- if (list_empty(&p->list))
- ret = 0; /* Connection is going away already. */
- else {
- list_del_init(&p->list);
- ret = -ECONNRESET;
- }
- spin_unlock_irq(&priv->lock);
- if (ret) {
- ib_destroy_qp(p->qp);
- kfree(p);
- return ret;
- }
- return 0;
+ if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+ ipoib_warn(priv, "unable to move qp to error state\n");
+ /* Fall through */
default:
return 0;
}
@@ -353,8 +413,15 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
wr_id, wc->status);
if (unlikely(wr_id >= ipoib_recvq_size)) {
- ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
- wr_id, ipoib_recvq_size);
+ if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+ spin_lock_irqsave(&priv->lock, flags);
+ list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+ ipoib_cm_start_rx_drain(priv);
+ queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ } else
+ ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_recvq_size);
return;
}
@@ -373,13 +440,11 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
spin_lock_irqsave(&priv->lock, flags);
p->jiffies = jiffies;
- /* Move this entry to list head, but do
- * not re-add it if it has been removed. */
- if (!list_empty(&p->list))
+ /* Move this entry to list head, but do not re-add it
+		 * if it has been moved out of the list. */
+ if (p->state == IPOIB_CM_RX_LIVE)
list_move(&p->list, &priv->cm.passive_ids);
spin_unlock_irqrestore(&priv->lock, flags);
- queue_delayed_work(ipoib_workqueue,
- &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
}
}
@@ -593,8 +658,7 @@ int ipoib_cm_dev_open(struct net_device *dev)
if (IS_ERR(priv->cm.id)) {
printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
ret = PTR_ERR(priv->cm.id);
- priv->cm.id = NULL;
- return ret;
+ goto err_cm;
}
ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
@@ -602,34 +666,76 @@ int ipoib_cm_dev_open(struct net_device *dev)
if (ret) {
printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
IPOIB_CM_IETF_ID | priv->qp->qp_num);
- ib_destroy_cm_id(priv->cm.id);
- priv->cm.id = NULL;
- return ret;
+ goto err_listen;
}
+
return 0;
+
+err_listen:
+ ib_destroy_cm_id(priv->cm.id);
+err_cm:
+ priv->cm.id = NULL;
+ return ret;
}
void ipoib_cm_dev_stop(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ipoib_cm_rx *p;
+ struct ipoib_cm_rx *p, *n;
+ unsigned long begin;
+ LIST_HEAD(list);
+ int ret;
if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
return;
ib_destroy_cm_id(priv->cm.id);
priv->cm.id = NULL;
+
spin_lock_irq(&priv->lock);
while (!list_empty(&priv->cm.passive_ids)) {
p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
- list_del_init(&p->list);
+ list_move(&p->list, &priv->cm.rx_error_list);
+ p->state = IPOIB_CM_RX_ERROR;
+ spin_unlock_irq(&priv->lock);
+ ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret)
+ ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+ spin_lock_irq(&priv->lock);
+ }
+
+ /* Wait for all RX to be drained */
+ begin = jiffies;
+
+ while (!list_empty(&priv->cm.rx_error_list) ||
+ !list_empty(&priv->cm.rx_flush_list) ||
+ !list_empty(&priv->cm.rx_drain_list)) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "RX drain timing out\n");
+
+ /*
+ * assume the HW is wedged and just free up everything.
+ */
+ list_splice_init(&priv->cm.rx_flush_list, &list);
+ list_splice_init(&priv->cm.rx_error_list, &list);
+ list_splice_init(&priv->cm.rx_drain_list, &list);
+ break;
+ }
spin_unlock_irq(&priv->lock);
+ msleep(1);
+ ipoib_drain_cq(dev);
+ spin_lock_irq(&priv->lock);
+ }
+
+ list_splice_init(&priv->cm.rx_reap_list, &list);
+
+ spin_unlock_irq(&priv->lock);
+
+ list_for_each_entry_safe(p, n, &list, list) {
ib_destroy_cm_id(p->id);
ib_destroy_qp(p->qp);
kfree(p);
- spin_lock_irq(&priv->lock);
}
- spin_unlock_irq(&priv->lock);
cancel_delayed_work(&priv->cm.stale_task);
}
@@ -646,9 +752,9 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
p->mtu = be32_to_cpu(data->mtu);
- if (p->mtu < priv->dev->mtu + IPOIB_ENCAP_LEN) {
- ipoib_warn(priv, "Rejecting connection: mtu %d < device mtu %d + 4\n",
- p->mtu, priv->dev->mtu);
+ if (p->mtu <= IPOIB_ENCAP_LEN) {
+ ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
+ p->mtu, IPOIB_ENCAP_LEN);
return -EINVAL;
}
@@ -1080,26 +1186,50 @@ void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
queue_work(ipoib_workqueue, &priv->cm.skb_task);
}
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.rx_reap_task);
+ struct ipoib_cm_rx *p, *n;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&priv->lock);
+ list_splice_init(&priv->cm.rx_reap_list, &list);
+ spin_unlock_irq(&priv->lock);
+
+ list_for_each_entry_safe(p, n, &list, list) {
+ ib_destroy_cm_id(p->id);
+ ib_destroy_qp(p->qp);
+ kfree(p);
+ }
+}
+
static void ipoib_cm_stale_task(struct work_struct *work)
{
struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
cm.stale_task.work);
struct ipoib_cm_rx *p;
+ int ret;
spin_lock_irq(&priv->lock);
while (!list_empty(&priv->cm.passive_ids)) {
- /* List if sorted by LRU, start from tail,
+ /* List is sorted by LRU, start from tail,
* stop when we see a recently used entry */
p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
break;
- list_del_init(&p->list);
+ list_move(&p->list, &priv->cm.rx_error_list);
+ p->state = IPOIB_CM_RX_ERROR;
spin_unlock_irq(&priv->lock);
- ib_destroy_cm_id(p->id);
- ib_destroy_qp(p->qp);
- kfree(p);
+ ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret)
+ ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
spin_lock_irq(&priv->lock);
}
+
+ if (!list_empty(&priv->cm.passive_ids))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
spin_unlock_irq(&priv->lock);
}
@@ -1161,9 +1291,14 @@ int ipoib_cm_dev_init(struct net_device *dev)
INIT_LIST_HEAD(&priv->cm.passive_ids);
INIT_LIST_HEAD(&priv->cm.reap_list);
INIT_LIST_HEAD(&priv->cm.start_list);
+ INIT_LIST_HEAD(&priv->cm.rx_error_list);
+ INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+ INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+ INIT_LIST_HEAD(&priv->cm.rx_reap_list);
INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+ INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
skb_queue_head_init(&priv->cm.skb_queue);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 68d72c6..8404f05 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -448,6 +448,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
+ ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ return -1;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
ret = ipoib_init_qp(dev);
if (ret) {
ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
@@ -457,14 +464,14 @@ int ipoib_ib_dev_open(struct net_device *dev)
ret = ipoib_ib_post_receives(dev);
if (ret) {
ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
- ipoib_ib_dev_stop(dev);
+ ipoib_ib_dev_stop(dev, 1);
return -1;
}
ret = ipoib_cm_dev_open(dev);
if (ret) {
- ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
- ipoib_ib_dev_stop(dev);
+ ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+ ipoib_ib_dev_stop(dev, 1);
return -1;
}
@@ -516,7 +523,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
mutex_lock(&pkey_mutex);
set_bit(IPOIB_PKEY_STOP, &priv->flags);
- cancel_delayed_work(&priv->pkey_task);
+ cancel_delayed_work(&priv->pkey_poll_task);
mutex_unlock(&pkey_mutex);
if (flush)
flush_workqueue(ipoib_workqueue);
@@ -543,13 +550,30 @@ static int recvs_pending(struct net_device *dev)
return pending;
}
-int ipoib_ib_dev_stop(struct net_device *dev)
+void ipoib_drain_cq(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i, n;
+ do {
+ n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
+ for (i = 0; i < n; ++i) {
+ if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ)
+ ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
+ else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV)
+ ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
+ else
+ ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+ }
+ } while (n == IPOIB_NUM_WC);
+}
+
+int ipoib_ib_dev_stop(struct net_device *dev, int flush)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_qp_attr qp_attr;
unsigned long begin;
struct ipoib_tx_buf *tx_req;
- int i, n;
+ int i;
clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
netif_poll_disable(dev);
@@ -604,17 +628,7 @@ int ipoib_ib_dev_stop(struct net_device *dev)
goto timeout;
}
- do {
- n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
- for (i = 0; i < n; ++i) {
- if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ)
- ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
- else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV)
- ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
- else
- ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
- }
- } while (n == IPOIB_NUM_WC);
+ ipoib_drain_cq(dev);
msleep(1);
}
@@ -629,7 +643,8 @@ timeout:
/* Wait for all AHs to be reaped */
set_bit(IPOIB_STOP_REAPER, &priv->flags);
cancel_delayed_work(&priv->ah_reap_task);
- flush_workqueue(ipoib_workqueue);
+ if (flush)
+ flush_workqueue(ipoib_workqueue);
begin = jiffies;
@@ -673,13 +688,24 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
return 0;
}
-void ipoib_ib_dev_flush(struct work_struct *work)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
{
- struct ipoib_dev_priv *cpriv, *priv =
- container_of(work, struct ipoib_dev_priv, flush_task);
+ struct ipoib_dev_priv *cpriv;
struct net_device *dev = priv->dev;
+ u16 new_index;
+
+ mutex_lock(&priv->vlan_mutex);
+
+ /*
+ * Flush any child interfaces too -- they might be up even if
+ * the parent is down.
+ */
+ list_for_each_entry(cpriv, &priv->child_intfs, list)
+ __ipoib_ib_dev_flush(cpriv, pkey_event);
- if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) ) {
+ mutex_unlock(&priv->vlan_mutex);
+
+ if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
return;
}
@@ -689,10 +715,32 @@ void ipoib_ib_dev_flush(struct work_struct *work)
return;
}
+ if (pkey_event) {
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ ipoib_ib_dev_down(dev, 0);
+ ipoib_pkey_dev_delay_open(dev);
+ return;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+ /* restart QP only if P_Key index is changed */
+ if (new_index == priv->pkey_index) {
+ ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
+ return;
+ }
+ priv->pkey_index = new_index;
+ }
+
ipoib_dbg(priv, "flushing\n");
ipoib_ib_dev_down(dev, 0);
+ if (pkey_event) {
+ ipoib_ib_dev_stop(dev, 0);
+ ipoib_ib_dev_open(dev);
+ }
+
/*
* The device could have been brought down between the start and when
* we get here, don't bring it back up if it's not configured up
@@ -701,14 +749,24 @@ void ipoib_ib_dev_flush(struct work_struct *work)
ipoib_ib_dev_up(dev);
ipoib_mcast_restart_task(&priv->restart_task);
}
+}
- mutex_lock(&priv->vlan_mutex);
+void ipoib_ib_dev_flush(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_task);
- /* Flush any child interfaces too */
- list_for_each_entry(cpriv, &priv->child_intfs, list)
- ipoib_ib_dev_flush(&cpriv->flush_task);
+ ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
+ __ipoib_ib_dev_flush(priv, 0);
+}
- mutex_unlock(&priv->vlan_mutex);
+void ipoib_pkey_event(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, pkey_event_task);
+
+ ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
+ __ipoib_ib_dev_flush(priv, 1);
}
void ipoib_ib_dev_cleanup(struct net_device *dev)
@@ -736,7 +794,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
void ipoib_pkey_poll(struct work_struct *work)
{
struct ipoib_dev_priv *priv =
- container_of(work, struct ipoib_dev_priv, pkey_task.work);
+ container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
struct net_device *dev = priv->dev;
ipoib_pkey_dev_check_presence(dev);
@@ -747,7 +805,7 @@ void ipoib_pkey_poll(struct work_struct *work)
mutex_lock(&pkey_mutex);
if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
queue_delayed_work(ipoib_workqueue,
- &priv->pkey_task,
+ &priv->pkey_poll_task,
HZ);
mutex_unlock(&pkey_mutex);
}
@@ -766,7 +824,7 @@ int ipoib_pkey_dev_delay_open(struct net_device *dev)
mutex_lock(&pkey_mutex);
clear_bit(IPOIB_PKEY_STOP, &priv->flags);
queue_delayed_work(ipoib_workqueue,
- &priv->pkey_task,
+ &priv->pkey_poll_task,
HZ);
mutex_unlock(&pkey_mutex);
return 1;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 0a428f2..894b1dcd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -107,7 +107,7 @@ int ipoib_open(struct net_device *dev)
return -EINVAL;
if (ipoib_ib_dev_up(dev)) {
- ipoib_ib_dev_stop(dev);
+ ipoib_ib_dev_stop(dev, 1);
return -EINVAL;
}
@@ -152,7 +152,7 @@ static int ipoib_stop(struct net_device *dev)
flush_workqueue(ipoib_workqueue);
ipoib_ib_dev_down(dev, 1);
- ipoib_ib_dev_stop(dev);
+ ipoib_ib_dev_stop(dev, 1);
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
struct ipoib_dev_priv *cpriv;
@@ -988,7 +988,8 @@ static void ipoib_setup(struct net_device *dev)
INIT_LIST_HEAD(&priv->dead_ahs);
INIT_LIST_HEAD(&priv->multicast_list);
- INIT_DELAYED_WORK(&priv->pkey_task, ipoib_pkey_poll);
+ INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
+ INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush);
INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 54fbead..aae3670 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -524,7 +524,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
return;
if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
- ipoib_warn(priv, "ib_gid_entry_get() failed\n");
+ ipoib_warn(priv, "ib_query_gid() failed\n");
else
memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 5c3c6a4..982eb88 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -33,8 +33,6 @@
* $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
*/
-#include <rdma/ib_cache.h>
-
#include "ipoib.h"
int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
@@ -49,7 +47,7 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
if (!qp_attr)
goto out;
- if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
ret = -ENXIO;
goto out;
@@ -94,26 +92,16 @@ int ipoib_init_qp(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
- u16 pkey_index;
struct ib_qp_attr qp_attr;
int attr_mask;
- /*
- * Search through the port P_Key table for the requested pkey value.
- * The port has to be assigned to the respective IB partition in
- * advance.
- */
- ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
- if (ret) {
- clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
- return ret;
- }
- set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+ return -1;
qp_attr.qp_state = IB_QPS_INIT;
qp_attr.qkey = 0;
qp_attr.port_num = priv->port;
- qp_attr.pkey_index = pkey_index;
+ qp_attr.pkey_index = priv->pkey_index;
attr_mask =
IB_QP_QKEY |
IB_QP_PORT |
@@ -185,7 +173,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
size = ipoib_sendq_size + ipoib_recvq_size + 1;
ret = ipoib_cm_dev_init(dev);
if (!ret)
- size += ipoib_recvq_size;
+ size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
if (IS_ERR(priv->cq)) {
@@ -259,14 +247,18 @@ void ipoib_event(struct ib_event_handler *handler,
struct ipoib_dev_priv *priv =
container_of(handler, struct ipoib_dev_priv, event_handler);
- if ((record->event == IB_EVENT_PORT_ERR ||
- record->event == IB_EVENT_PKEY_CHANGE ||
- record->event == IB_EVENT_PORT_ACTIVE ||
- record->event == IB_EVENT_LID_CHANGE ||
- record->event == IB_EVENT_SM_CHANGE ||
- record->event == IB_EVENT_CLIENT_REREGISTER) &&
- record->element.port_num == priv->port) {
+ if (record->element.port_num != priv->port)
+ return;
+
+ if (record->event == IB_EVENT_PORT_ERR ||
+ record->event == IB_EVENT_PORT_ACTIVE ||
+ record->event == IB_EVENT_LID_CHANGE ||
+ record->event == IB_EVENT_SM_CHANGE ||
+ record->event == IB_EVENT_CLIENT_REREGISTER) {
ipoib_dbg(priv, "Port state change event\n");
queue_work(ipoib_workqueue, &priv->flush_task);
+ } else if (record->event == IB_EVENT_PKEY_CHANGE) {
+ ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
+ queue_work(ipoib_workqueue, &priv->pkey_event_task);
}
}
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 89d6008..3702e23 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -35,7 +35,6 @@
#include <asm/io.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/delay.h>
#include <linux/version.h>