From a3c874200cbcd95ed914ba84f33f571a0ef7adfa Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Mon, 20 Jul 2015 19:54:36 +0300
Subject: mlx5: Fix missing device local_dma_lkey

The mlx5 driver exposes device capability IB_DEVICE_LOCAL_DMA_LKEY
but does not set the the device local_dma_lkey. This breaks
rpcrdma drivers.

Query and set this lkey when creating the device resources.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 085c24b..5ab5bb3 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1256,10 +1256,18 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
 	struct ib_srq_init_attr attr;
 	struct mlx5_ib_dev *dev;
 	struct ib_cq_init_attr cq_attr = {.cqe = 1};
+	u32 rsvd_lkey;
 	int ret = 0;
 
 	dev = container_of(devr, struct mlx5_ib_dev, devr);
 
+	ret = mlx5_core_query_special_context(dev->mdev, &rsvd_lkey);
+	if (ret) {
+		pr_err("Failed to query special context %d\n", ret);
+		return ret;
+	}
+	dev->ib_dev.local_dma_lkey = rsvd_lkey;
+
 	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
 	if (IS_ERR(devr->p0)) {
 		ret = PTR_ERR(devr->p0);
@@ -1421,7 +1429,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
 	dev->ib_dev.owner		= THIS_MODULE;
 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
-	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
 	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
 	dev->ib_dev.phys_port_cnt     = dev->num_ports;
 	dev->ib_dev.num_comp_vectors    =
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 9335e5a..aa0d5ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -200,3 +200,25 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev)
 
 	return err;
 }
+
+int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey)
+{
+	struct mlx5_cmd_query_special_contexts_mbox_in in;
+	struct mlx5_cmd_query_special_contexts_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	*rsvd_lkey = be32_to_cpu(out.resd_lkey);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_special_context);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index b943cd9..6e4169c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -402,6 +402,17 @@ struct mlx5_cmd_teardown_hca_mbox_out {
 	u8			rsvd[8];
 };
 
+struct mlx5_cmd_query_special_contexts_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cmd_query_special_contexts_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32                  dump_fill_mkey;
+	__be32                  resd_lkey;
+};
+
 struct mlx5_cmd_layout {
 	u8		type;
 	u8		rsvd0[3];
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5722d88..1e2e48c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -828,6 +828,7 @@ void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol);
 int mlx5_register_interface(struct mlx5_interface *intf);
 void mlx5_unregister_interface(struct mlx5_interface *intf);
 int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
+int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey);
 
 struct mlx5_profile {
 	u64	mask;
-- 
cgit v0.10.2


From e0238a6a369eaa2e41b2c0321453272fb859f618 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Tue, 21 Jul 2015 14:40:12 +0300
Subject: mlx5: Expose correct page_size_cap in device attributes

Should be all the page sizes that are supported by the
device.

Reported-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5ab5bb3..1ece3a7 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -212,6 +212,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	int err = -ENOMEM;
 	int max_rq_sg;
 	int max_sq_sg;
+	u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
 
 	if (uhw->inlen || uhw->outlen)
 		return -EINVAL;
@@ -264,7 +265,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	props->hw_ver		   = mdev->pdev->revision;
 
 	props->max_mr_size	   = ~0ull;
-	props->page_size_cap	   = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
+	props->page_size_cap	   = ~(min_page_size - 1);
 	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
 	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
 	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
-- 
cgit v0.10.2


From d6f1c17e162b2a11e708f28fa93f2f79c164b442 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Tue, 21 Jul 2015 08:36:07 -0400
Subject: IB/qib: Change lkey table allocation to support more MRs

The lkey table is allocated with with a get_user_pages() with an
order based on a number of index bits from a module parameter.

The underlying kernel code cannot allocate that many contiguous pages.

There is no reason the underlying memory needs to be physically
contiguous.

This patch:
- switches the allocation/deallocation to vmalloc/vfree
- caps the number of bits to 23 to insure at least 1 generation bit
  o this matches the module parameter description

Cc: stable@vger.kernel.org
Reviewed-by: Vinit Agnihotri <vinit.abhay.agnihotri@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c
index ad843c7..5afaa21 100644
--- a/drivers/infiniband/hw/qib/qib_keys.c
+++ b/drivers/infiniband/hw/qib/qib_keys.c
@@ -86,6 +86,10 @@ int qib_alloc_lkey(struct qib_mregion *mr, int dma_region)
 	 * unrestricted LKEY.
 	 */
 	rkt->gen++;
+	/*
+	 * bits are capped in qib_verbs.c to insure enough bits
+	 * for generation number
+	 */
 	mr->lkey = (r << (32 - ib_qib_lkey_table_size)) |
 		((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen)
 		 << 8);
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index a05d1a3..77e981a 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -40,6 +40,7 @@
 #include <linux/rculist.h>
 #include <linux/mm.h>
 #include <linux/random.h>
+#include <linux/vmalloc.h>
 
 #include "qib.h"
 #include "qib_common.h"
@@ -2109,10 +2110,16 @@ int qib_register_ib_device(struct qib_devdata *dd)
 	 * the LKEY).  The remaining bits act as a generation number or tag.
 	 */
 	spin_lock_init(&dev->lk_table.lock);
+	/* insure generation is at least 4 bits see keys.c */
+	if (ib_qib_lkey_table_size > MAX_LKEY_TABLE_BITS) {
+		qib_dev_warn(dd, "lkey bits %u too large, reduced to %u\n",
+			ib_qib_lkey_table_size, MAX_LKEY_TABLE_BITS);
+		ib_qib_lkey_table_size = MAX_LKEY_TABLE_BITS;
+	}
 	dev->lk_table.max = 1 << ib_qib_lkey_table_size;
 	lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
 	dev->lk_table.table = (struct qib_mregion __rcu **)
-		__get_free_pages(GFP_KERNEL, get_order(lk_tab_size));
+		vmalloc(lk_tab_size);
 	if (dev->lk_table.table == NULL) {
 		ret = -ENOMEM;
 		goto err_lk;
@@ -2286,7 +2293,7 @@ err_tx:
 					sizeof(struct qib_pio_header),
 				  dev->pio_hdrs, dev->pio_hdrs_phys);
 err_hdrs:
-	free_pages((unsigned long) dev->lk_table.table, get_order(lk_tab_size));
+	vfree(dev->lk_table.table);
 err_lk:
 	kfree(dev->qp_table);
 err_qpt:
@@ -2340,8 +2347,7 @@ void qib_unregister_ib_device(struct qib_devdata *dd)
 					sizeof(struct qib_pio_header),
 				  dev->pio_hdrs, dev->pio_hdrs_phys);
 	lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
-	free_pages((unsigned long) dev->lk_table.table,
-		   get_order(lk_tab_size));
+	vfree(dev->lk_table.table);
 	kfree(dev->qp_table);
 }
 
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 1635572..bce0fa5 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -647,6 +647,8 @@ struct qib_qpn_table {
 	struct qpn_map map[QPNMAP_ENTRIES];
 };
 
+#define MAX_LKEY_TABLE_BITS 23
+
 struct qib_lkey_table {
 	spinlock_t lock; /* protect changes in this struct */
 	u32 next;               /* next unused index (speeds search) */
-- 
cgit v0.10.2


From d6c7276be180fbd0a30cda8fac5c82d483be47b6 Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@purestorage.com>
Date: Mon, 27 Jul 2015 14:43:23 -0700
Subject: IB/mlx5: Remove dead code from alloc_cached_mr()

The only place that assigns mr inside the loop already does a break.
So "if (mr)" will never be true here since the function initializes mr
to NULL at the top.  We can just drop the extra if and break here.

Signed-off-by: Roland Dreier <roland@purestorage.com>
Acked-by: Eli Cohen  <eli@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index bc9a0de..10d2b21 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -441,9 +441,6 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 		spin_unlock_irq(&ent->lock);
 
 		queue_work(cache->wq, &ent->work);
-
-		if (mr)
-			break;
 	}
 
 	if (!mr)
-- 
cgit v0.10.2


From 7854550ae6d89bb90980b9885c7a71e471820bf2 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Tue, 28 Jul 2015 09:13:52 -0500
Subject: RDMA/iser: Limit sgs to the device fastreg depth

Currently the sg tablesize, which dictates fast register page list
depth to use, does not take into account the limits of the rdma device.
So adjust it once we discover the device fastreg max depth limit.  Also
adjust the max_sectors based on the resulting sg tablesize.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 6a594aa..de8730d 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -640,6 +640,15 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 						   SHOST_DIX_GUARD_CRC);
 		}
 
+		/*
+		 * Limit the sg_tablesize and max_sectors based on the device
+		 * max fastreg page list length.
+		 */
+		shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize,
+			ib_conn->device->dev_attr.max_fast_reg_page_list_len);
+		shost->max_sectors = min_t(unsigned int,
+			1024, (shost->sg_tablesize * PAGE_SIZE) >> 9);
+
 		if (iscsi_host_add(shost,
 				   ib_conn->device->ib_device->dma_device)) {
 			mutex_unlock(&iser_conn->state_mutex);
-- 
cgit v0.10.2


From b8ac3112462900a55a45df8e6098c67a139d8c2d Mon Sep 17 00:00:00 2001
From: Hariprasad S <hariprasad@chelsio.com>
Date: Mon, 27 Jul 2015 14:08:52 +0530
Subject: iw_cxgb4: set the default MPA version to 2

This enables ORD/IRD negotiation and its about time to enable it by
default

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Hariprasad Shenai <hariprasad@chelsio.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 3ad8dc7..75144d9 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -115,11 +115,11 @@ module_param(ep_timeout_secs, int, 0644);
 MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
 				   "in seconds (default=60)");
 
-static int mpa_rev = 1;
+static int mpa_rev = 2;
 module_param(mpa_rev, int, 0644);
 MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
 		"1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft"
-		" compliant (default=1)");
+		" compliant (default=2)");
 
 static int markers_enabled;
 module_param(markers_enabled, int, 0644);
-- 
cgit v0.10.2


From 7e967fd0b84a843b2475acc67a5a8df138c5f5c0 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Tue, 4 Aug 2015 17:13:32 -0600
Subject: IB/ucma: Fix theoretical user triggered use-after-free

Something like this:

CPU A                         CPU B
Acked-by: Sean Hefty <sean.hefty@intel.com>

========================      ================================
ucma_destroy_id()
 wait_for_completion()
                              .. anything
                                ucma_put_ctx()
                                  complete()
 .. continues ...
                              ucma_leave_multicast()
                               mutex_lock(mut)
                                 atomic_inc(ctx->ref)
                               mutex_unlock(mut)
 ucma_free_ctx()
  ucma_cleanup_multicast()
   mutex_lock(mut)
     kfree(mc)
                               rdma_leave_multicast(mc->ctx->cm_id,..

Fix it by latching the ref at 0. Once it goes to 0 mc and ctx cannot
leave the mutex(mut) protection.

The other atomic_inc in ucma_get_ctx is OK because mutex(mut) protects
it from racing with ucma_destroy_id.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Acked-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 29b2121..acac9ea 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1321,10 +1321,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
 		mc = ERR_PTR(-ENOENT);
 	else if (mc->ctx->file != file)
 		mc = ERR_PTR(-EINVAL);
-	else {
+	else if (!atomic_inc_not_zero(&mc->ctx->ref))
+		mc = ERR_PTR(-ENXIO);
+	else
 		idr_remove(&multicast_idr, mc->id);
-		atomic_inc(&mc->ctx->ref);
-	}
 	mutex_unlock(&mut);
 
 	if (IS_ERR(mc)) {
-- 
cgit v0.10.2


From 6c26a77124ff94102ea59ac23a54cdad2c49e644 Mon Sep 17 00:00:00 2001
From: Spencer Baugh <sbaugh@catern.com>
Date: Thu, 13 Aug 2015 12:19:10 -0700
Subject: RDMA/cma: fix IPv6 address resolution

Resolving a link-local IPv6 address with an unspecified source address
was broken by commit 5462eddd7a, which prevented the IPv6 stack from
learning the scope id of the link-local IPv6 address, causing random
failures as the IP stack chose a random link to resolve the address on.

This commit 5462eddd7a made us bail out of cma_check_linklocal early if
the address passed in was not an IPv6 link-local address. On the address
resolution path, the address passed in is the source address; if the
source address is the unspecified address, which is not link-local, we
will bail out early.

This is mostly correct, but if the destination address is a link-local
address, then we will be following a link-local route, and we'll need to
tell the IPv6 stack what the scope id of the destination address is.
This used to be done by last line of cma_check_linklocal, which is
skipped when bailing out early:

	dev_addr->bound_dev_if = sin6->sin6_scope_id;

(In cma_bind_addr, the sin6_scope_id of the source address is set to the
sin6_scope_id of the destination address, so this is correct)
This line is required in turn for the following line, L279 of
addr6_resolve, to actually inform the IPv6 stack of the scope id:

      fl6.flowi6_oif = addr->bound_dev_if;

Since we can only know we are in this failure case when we have access
to both the source IPv6 address and destination IPv6 address, we have to
deal with this further up the stack. So detect this failure case in
cma_bind_addr, and set bound_dev_if to the destination address scope id
to correct it.

Signed-off-by: Spencer Baugh <sbaugh@catern.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 143ded2..4e72e4c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2203,8 +2203,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 		src_addr = (struct sockaddr *) &id->route.addr.src_addr;
 		src_addr->sa_family = dst_addr->sa_family;
 		if (dst_addr->sa_family == AF_INET6) {
-			((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
-				((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+			struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
+			struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
+			src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+			if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+				id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
 		} else if (dst_addr->sa_family == AF_IB) {
 			((struct sockaddr_ib *) src_addr)->sib_pkey =
 				((struct sockaddr_ib *) dst_addr)->sib_pkey;
-- 
cgit v0.10.2


From 84cc6ac62d4386f5b6d9ccf2900686b5648e230f Mon Sep 17 00:00:00 2001
From: Hariprasad S <hariprasad@chelsio.com>
Date: Tue, 25 Aug 2015 14:08:23 +0530
Subject: iw_cxgb4: Add support for clip

Add support for ipv6 address handling clip api provided by lld

Signed-off-by: Hariprasad Shenai <hariprasad@chelsio.com>
Acked-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 75144d9..f0c1512 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -50,6 +50,7 @@
 #include <rdma/ib_addr.h>
 
 #include "iw_cxgb4.h"
+#include "clip_tbl.h"
 
 static char *states[] = {
 	"idle",
@@ -298,6 +299,16 @@ void _c4iw_free_ep(struct kref *kref)
 	if (test_bit(QP_REFERENCED, &ep->com.flags))
 		deref_qp(ep);
 	if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
+		if (ep->com.remote_addr.ss_family == AF_INET6) {
+			struct sockaddr_in6 *sin6 =
+					(struct sockaddr_in6 *)
+					&ep->com.mapped_local_addr;
+
+			cxgb4_clip_release(
+					ep->com.dev->rdev.lldi.ports[0],
+					(const u32 *)&sin6->sin6_addr.s6_addr,
+					1);
+		}
 		remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
 		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
 		dst_release(ep->dst);
@@ -442,6 +453,12 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
 	kfree_skb(skb);
 	connect_reply_upcall(ep, -EHOSTUNREACH);
 	state_set(&ep->com, DEAD);
+	if (ep->com.remote_addr.ss_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 =
+			(struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+		cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+				   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+	}
 	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
 	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
 	dst_release(ep->dst);
@@ -640,6 +657,7 @@ static int send_connect(struct c4iw_ep *ep)
 	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)
 				   &ep->com.mapped_remote_addr;
 	int win;
+	int ret;
 
 	wrlen = (ep->com.remote_addr.ss_family == AF_INET) ?
 			roundup(sizev4, 16) :
@@ -693,6 +711,11 @@ static int send_connect(struct c4iw_ep *ep)
 		opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE);
 		opt2 |= T5_ISS_F;
 	}
+
+	if (ep->com.remote_addr.ss_family == AF_INET6)
+		cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
+			       (const u32 *)&la6->sin6_addr.s6_addr, 1);
+
 	t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure);
 
 	if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) {
@@ -790,7 +813,11 @@ static int send_connect(struct c4iw_ep *ep)
 	}
 
 	set_bit(ACT_OPEN_REQ, &ep->com.history);
-	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+	ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+	if (ret && ep->com.remote_addr.ss_family == AF_INET6)
+		cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+				   (const u32 *)&la6->sin6_addr.s6_addr, 1);
+	return ret;
 }
 
 static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
@@ -2091,6 +2118,15 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	case CPL_ERR_CONN_EXIST:
 		if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
 			set_bit(ACT_RETRY_INUSE, &ep->com.history);
+			if (ep->com.remote_addr.ss_family == AF_INET6) {
+				struct sockaddr_in6 *sin6 =
+						(struct sockaddr_in6 *)
+						&ep->com.mapped_local_addr;
+				cxgb4_clip_release(
+						ep->com.dev->rdev.lldi.ports[0],
+						(const u32 *)
+						&sin6->sin6_addr.s6_addr, 1);
+			}
 			remove_handle(ep->com.dev, &ep->com.dev->atid_idr,
 					atid);
 			cxgb4_free_atid(t, atid);
@@ -2118,6 +2154,12 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	connect_reply_upcall(ep, status2errno(status));
 	state_set(&ep->com, DEAD);
 
+	if (ep->com.remote_addr.ss_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 =
+			(struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+		cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+				   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+	}
 	if (status && act_open_has_tid(status))
 		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl));
 
@@ -2302,6 +2344,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 	struct dst_entry *dst;
 	__u8 local_ip[16], peer_ip[16];
 	__be16 local_port, peer_port;
+	struct sockaddr_in6 *sin6;
 	int err;
 	u16 peer_mss = ntohs(req->tcpopt.mss);
 	int iptype;
@@ -2400,9 +2443,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 		sin->sin_port = peer_port;
 		sin->sin_addr.s_addr = *(__be32 *)peer_ip;
 	} else {
-		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
-			&child_ep->com.mapped_local_addr;
-
+		sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_local_addr;
 		sin6->sin6_family = PF_INET6;
 		sin6->sin6_port = local_port;
 		memcpy(sin6->sin6_addr.s6_addr, local_ip, 16);
@@ -2436,6 +2477,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 	insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid);
 	accept_cr(child_ep, skb, req);
 	set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+	if (iptype == 6) {
+		sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_local_addr;
+		cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0],
+			       (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+	}
 	goto out;
 reject:
 	reject_cr(dev, hwtid, skb);
@@ -2672,6 +2718,15 @@ out:
 	if (release)
 		release_ep_resources(ep);
 	else if (ep->retry_with_mpa_v1) {
+		if (ep->com.remote_addr.ss_family == AF_INET6) {
+			struct sockaddr_in6 *sin6 =
+					(struct sockaddr_in6 *)
+					&ep->com.mapped_local_addr;
+			cxgb4_clip_release(
+					ep->com.dev->rdev.lldi.ports[0],
+					(const u32 *)&sin6->sin6_addr.s6_addr,
+					1);
+		}
 		remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
 		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
 		dst_release(ep->dst);
@@ -3186,6 +3241,9 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
 		pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
 		       err, ep->stid,
 		       sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port));
+	else
+		cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
+			       (const u32 *)&sin6->sin6_addr.s6_addr, 1);
 	return err;
 }
 
@@ -3334,6 +3392,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
 			ep->com.dev->rdev.lldi.ports[0], ep->stid,
 			ep->com.dev->rdev.lldi.rxq_ids[0], 0);
 	} else {
+		struct sockaddr_in6 *sin6;
 		c4iw_init_wr_wait(&ep->com.wr_wait);
 		err = cxgb4_remove_server(
 				ep->com.dev->rdev.lldi.ports[0], ep->stid,
@@ -3342,6 +3401,9 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
 			goto done;
 		err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait,
 					  0, 0, __func__);
+		sin6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+		cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+				   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
 	}
 	remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
 	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
@@ -3461,6 +3523,12 @@ static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
 	mutex_unlock(&dev->rdev.stats.lock);
 	connect_reply_upcall(ep, status2errno(req->retval));
 	state_set(&ep->com, DEAD);
+	if (ep->com.remote_addr.ss_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 =
+			(struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+		cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+				   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+	}
 	remove_handle(dev, &dev->atid_idr, atid);
 	cxgb4_free_atid(dev->rdev.lldi.tids, atid);
 	dst_release(ep->dst);
-- 
cgit v0.10.2


From 2dfcad3adea961916f013387889bc418c65421cd Mon Sep 17 00:00:00 2001
From: Doug Ledford <dledford@redhat.com>
Date: Thu, 27 Aug 2015 14:18:57 -0400
Subject: Staging: Add staging/rdma directory and update MAINTAINERS

Create the rdma directory in the staging area for use as we deprecate
some older drivers and as we bring in some new drivers that are in
need of work.  Update the MAINTAINERS file so that updates to these
files go to linux-rdma@vger.kernel.org.  Expected lifespan of this
directory is three releases for any deprecated drivers moved here
and an unknown, but theoretically bounded amount of time for the new
drivers as a new core RDMA transfer library needs to be written and
the drivers modified to use it in order for them to move out of this
directory.

Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index 569568f..a82aa07 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5278,6 +5278,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git
 S:	Supported
 F:	Documentation/infiniband/
 F:	drivers/infiniband/
+F:	drivers/staging/rdma/
 F:	include/uapi/linux/if_infiniband.h
 F:	include/uapi/rdma/
 F:	include/rdma/
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 7f6cae5..936016f 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -74,6 +74,8 @@ source "drivers/staging/nvec/Kconfig"
 
 source "drivers/staging/media/Kconfig"
 
+source "drivers/staging/rdma/Kconfig"
+
 source "drivers/staging/android/Kconfig"
 
 source "drivers/staging/board/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 347f647..c6c4454 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_FT1000)		+= ft1000/
 obj-$(CONFIG_SPEAKUP)		+= speakup/
 obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI4)	+= ste_rmi4/
 obj-$(CONFIG_MFD_NVEC)		+= nvec/
+obj-$(CONFIG_STAGING_RDMA)	+= rdma/
 obj-$(CONFIG_ANDROID)		+= android/
 obj-$(CONFIG_STAGING_BOARD)	+= board/
 obj-$(CONFIG_USB_WPAN_HCD)	+= ozwpan/
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
new file mode 100644
index 0000000..0bff438
--- /dev/null
+++ b/drivers/staging/rdma/Kconfig
@@ -0,0 +1,25 @@
+menuconfig STAGING_RDMA
+        bool "RDMA staging drivers"
+	depends on INFINIBAND
+	depends on PCI || BROKEN
+	depends on HAS_IOMEM
+	depends on NET
+	depends on INET
+        default n
+        ---help---
+          This option allows you to select a number of RDMA drivers that
+	  fall into one of two categories: deprecated drivers being held
+	  here before finally being removed or new drivers that still need
+	  some work before being moved to the normal RDMA driver area.
+
+          If you wish to work on these drivers, to help improve them, or
+          to report problems you have with them, please use the
+	  linux-rdma@vger.kernel.org mailing list.
+
+          If in doubt, say N here.
+
+
+# Please keep entries in alphabetic order
+if STAGING_RDMA
+
+endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
new file mode 100644
index 0000000..b5e94f1
--- /dev/null
+++ b/drivers/staging/rdma/Makefile
@@ -0,0 +1 @@
+# Entries for RDMA_STAGING tree
-- 
cgit v0.10.2


From 6f9b38903c06c159d167344821cd7b4bae864380 Mon Sep 17 00:00:00 2001
From: Dennis Dalessandro <dennis.dalessandro@intel.com>
Date: Thu, 30 Jul 2015 09:25:42 -0400
Subject: IB/ipath: Deprecate ipath driver and move to staging.

It is now time for the ipath driver to begin to be phased out of the kernel.
This patch moves the ipath driver from the Infiniband sub tree to the staging
area where it will remain until the code is removed from the kernel in a few
releases.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index a82aa07..db1a523 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5536,7 +5536,7 @@ IPATH DRIVER
 M:	Mike Marciniszyn <infinipath@intel.com>
 L:	linux-rdma@vger.kernel.org
 S:	Maintained
-F:	drivers/infiniband/hw/ipath/
+F:	drivers/staging/rdma/ipath/
 
 IPMI SUBSYSTEM
 M:	Corey Minyard <minyard@acm.org>
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index b899531..7d6034f 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -55,7 +55,6 @@ config INFINIBAND_ADDR_TRANS
 	default y
 
 source "drivers/infiniband/hw/mthca/Kconfig"
-source "drivers/infiniband/hw/ipath/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
 source "drivers/infiniband/hw/ehca/Kconfig"
 source "drivers/infiniband/hw/amso1100/Kconfig"
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index e900b03..d1212c4 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -1,5 +1,4 @@
 obj-$(CONFIG_INFINIBAND_MTHCA)		+= mthca/
-obj-$(CONFIG_INFINIBAND_IPATH)		+= ipath/
 obj-$(CONFIG_INFINIBAND_QIB)		+= qib/
 obj-$(CONFIG_INFINIBAND_EHCA)		+= ehca/
 obj-$(CONFIG_INFINIBAND_AMSO1100)	+= amso1100/
diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig
deleted file mode 100644
index 8fe54ff..0000000
--- a/drivers/infiniband/hw/ipath/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config INFINIBAND_IPATH
-	tristate "QLogic HTX HCA support"
-	depends on 64BIT && NET && HT_IRQ
-	---help---
-	This is a driver for the obsolete QLogic Hyper-Transport
-	IB host channel adapter (model QHT7140),
-	including InfiniBand verbs support.  This driver allows these
-	devices to be used with both kernel upper level protocols such
-	as IP-over-InfiniBand as well as with userspace applications
-	(in conjunction with InfiniBand userspace access).
-	For QLogic PCIe QLE based cards, use the QIB driver instead.
-
-	If you have this hardware you will need to boot with PAT disabled
-	on your x86-64 systems, use the nopat kernel parameter.
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile
deleted file mode 100644
index 4496f28..0000000
--- a/drivers/infiniband/hw/ipath/Makefile
+++ /dev/null
@@ -1,37 +0,0 @@
-ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \
-	-DIPATH_KERN_TYPE=0
-
-obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
-
-ib_ipath-y := \
-	ipath_cq.o \
-	ipath_diag.o \
-	ipath_dma.o \
-	ipath_driver.o \
-	ipath_eeprom.o \
-	ipath_file_ops.o \
-	ipath_fs.o \
-	ipath_init_chip.o \
-	ipath_intr.o \
-	ipath_keys.o \
-	ipath_mad.o \
-	ipath_mmap.o \
-	ipath_mr.o \
-	ipath_qp.o \
-	ipath_rc.o \
-	ipath_ruc.o \
-	ipath_sdma.o \
-	ipath_srq.o \
-	ipath_stats.o \
-	ipath_sysfs.o \
-	ipath_uc.o \
-	ipath_ud.o \
-	ipath_user_pages.o \
-	ipath_user_sdma.o \
-	ipath_verbs_mcast.o \
-	ipath_verbs.o
-
-ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o
-
-ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
-ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h
deleted file mode 100644
index 28cfe97..0000000
--- a/drivers/infiniband/hw/ipath/ipath_common.h
+++ /dev/null
@@ -1,851 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_COMMON_H
-#define _IPATH_COMMON_H
-
-/*
- * This file contains defines, structures, etc. that are used
- * to communicate between kernel and user code.
- */
-
-
-/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
-#define IPATH_SRC_OUI_1 0x00
-#define IPATH_SRC_OUI_2 0x11
-#define IPATH_SRC_OUI_3 0x75
-
-/* version of protocol header (known to chip also). In the long run,
- * we should be able to generate and accept a range of version numbers;
- * for now we only accept one, and it's compiled in.
- */
-#define IPS_PROTO_VERSION 2
-
-/*
- * These are compile time constants that you may want to enable or disable
- * if you are trying to debug problems with code or performance.
- * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
- * fastpath code
- * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
- * traced in faspath code
- * _IPATH_TRACING define as 0 if you want to remove all tracing in a
- * compilation unit
- * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
- */
-
-/*
- * The value in the BTH QP field that InfiniPath uses to differentiate
- * an infinipath protocol IB packet vs standard IB transport
- */
-#define IPATH_KD_QP 0x656b79
-
-/*
- * valid states passed to ipath_set_linkstate() user call
- */
-#define IPATH_IB_LINKDOWN		0
-#define IPATH_IB_LINKARM		1
-#define IPATH_IB_LINKACTIVE		2
-#define IPATH_IB_LINKDOWN_ONLY		3
-#define IPATH_IB_LINKDOWN_SLEEP		4
-#define IPATH_IB_LINKDOWN_DISABLE	5
-#define IPATH_IB_LINK_LOOPBACK	6 /* enable local loopback */
-#define IPATH_IB_LINK_EXTERNAL	7 /* normal, disable local loopback */
-#define IPATH_IB_LINK_NO_HRTBT	8 /* disable Heartbeat, e.g. for loopback */
-#define IPATH_IB_LINK_HRTBT	9 /* enable heartbeat, normal, non-loopback */
-
-/*
- * These 3 values (SDR and DDR may be ORed for auto-speed
- * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
- * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
- * are also the the possible values for ipath_link_speed_enabled and active
- * The values were chosen to match values used within the IB spec.
- */
-#define IPATH_IB_SDR 1
-#define IPATH_IB_DDR 2
-
-/*
- * stats maintained by the driver.  For now, at least, this is global
- * to all minor devices.
- */
-struct infinipath_stats {
-	/* number of interrupts taken */
-	__u64 sps_ints;
-	/* number of interrupts for errors */
-	__u64 sps_errints;
-	/* number of errors from chip (not incl. packet errors or CRC) */
-	__u64 sps_errs;
-	/* number of packet errors from chip other than CRC */
-	__u64 sps_pkterrs;
-	/* number of packets with CRC errors (ICRC and VCRC) */
-	__u64 sps_crcerrs;
-	/* number of hardware errors reported (parity, etc.) */
-	__u64 sps_hwerrs;
-	/* number of times IB link changed state unexpectedly */
-	__u64 sps_iblink;
-	__u64 sps_unused; /* was fastrcvint, no longer implemented */
-	/* number of kernel (port0) packets received */
-	__u64 sps_port0pkts;
-	/* number of "ethernet" packets sent by driver */
-	__u64 sps_ether_spkts;
-	/* number of "ethernet" packets received by driver */
-	__u64 sps_ether_rpkts;
-	/* number of SMA packets sent by driver. Obsolete. */
-	__u64 sps_sma_spkts;
-	/* number of SMA packets received by driver. Obsolete. */
-	__u64 sps_sma_rpkts;
-	/* number of times all ports rcvhdrq was full and packet dropped */
-	__u64 sps_hdrqfull;
-	/* number of times all ports egrtid was full and packet dropped */
-	__u64 sps_etidfull;
-	/*
-	 * number of times we tried to send from driver, but no pio buffers
-	 * avail
-	 */
-	__u64 sps_nopiobufs;
-	/* number of ports currently open */
-	__u64 sps_ports;
-	/* list of pkeys (other than default) accepted (0 means not set) */
-	__u16 sps_pkeys[4];
-	__u16 sps_unused16[4]; /* available; maintaining compatible layout */
-	/* number of user ports per chip (not IB ports) */
-	__u32 sps_nports;
-	/* not our interrupt, or already handled */
-	__u32 sps_nullintr;
-	/* max number of packets handled per receive call */
-	__u32 sps_maxpkts_call;
-	/* avg number of packets handled per receive call */
-	__u32 sps_avgpkts_call;
-	/* total number of pages locked */
-	__u64 sps_pagelocks;
-	/* total number of pages unlocked */
-	__u64 sps_pageunlocks;
-	/*
-	 * Number of packets dropped in kernel other than errors (ether
-	 * packets if ipath not configured, etc.)
-	 */
-	__u64 sps_krdrops;
-	__u64 sps_txeparity; /* PIO buffer parity error, recovered */
-	/* pad for future growth */
-	__u64 __sps_pad[45];
-};
-
-/*
- * These are the status bits readable (in ascii form, 64bit value)
- * from the "status" sysfs file.
- */
-#define IPATH_STATUS_INITTED       0x1	/* basic initialization done */
-#define IPATH_STATUS_DISABLED      0x2	/* hardware disabled */
-/* Device has been disabled via admin request */
-#define IPATH_STATUS_ADMIN_DISABLED    0x4
-/* Chip has been found and initted */
-#define IPATH_STATUS_CHIP_PRESENT 0x20
-/* IB link is at ACTIVE, usable for data traffic */
-#define IPATH_STATUS_IB_READY     0x40
-/* link is configured, LID, MTU, etc. have been set */
-#define IPATH_STATUS_IB_CONF      0x80
-/* no link established, probably no cable */
-#define IPATH_STATUS_IB_NOCABLE  0x100
-/* A Fatal hardware error has occurred. */
-#define IPATH_STATUS_HWERROR     0x200
-
-/*
- * The list of usermode accessible registers.  Also see Reg_* later in file.
- */
-typedef enum _ipath_ureg {
-	/* (RO)  DMA RcvHdr to be used next. */
-	ur_rcvhdrtail = 0,
-	/* (RW)  RcvHdr entry to be processed next by host. */
-	ur_rcvhdrhead = 1,
-	/* (RO)  Index of next Eager index to use. */
-	ur_rcvegrindextail = 2,
-	/* (RW)  Eager TID to be processed next */
-	ur_rcvegrindexhead = 3,
-	/* For internal use only; max register number. */
-	_IPATH_UregMax
-} ipath_ureg;
-
-/* bit values for spi_runtime_flags */
-#define IPATH_RUNTIME_HT	0x1
-#define IPATH_RUNTIME_PCIE	0x2
-#define IPATH_RUNTIME_FORCE_WC_ORDER	0x4
-#define IPATH_RUNTIME_RCVHDR_COPY	0x8
-#define IPATH_RUNTIME_MASTER	0x10
-#define IPATH_RUNTIME_NODMA_RTAIL 0x80
-#define IPATH_RUNTIME_SDMA	      0x200
-#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
-#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
-
-/*
- * This structure is returned by ipath_userinit() immediately after
- * open to get implementation-specific info, and info specific to this
- * instance.
- *
- * This struct must have explict pad fields where type sizes
- * may result in different alignments between 32 and 64 bit
- * programs, since the 64 bit * bit kernel requires the user code
- * to have matching offsets
- */
-struct ipath_base_info {
-	/* version of hardware, for feature checking. */
-	__u32 spi_hw_version;
-	/* version of software, for feature checking. */
-	__u32 spi_sw_version;
-	/* InfiniPath port assigned, goes into sent packets */
-	__u16 spi_port;
-	__u16 spi_subport;
-	/*
-	 * IB MTU, packets IB data must be less than this.
-	 * The MTU is in bytes, and will be a multiple of 4 bytes.
-	 */
-	__u32 spi_mtu;
-	/*
-	 * Size of a PIO buffer.  Any given packet's total size must be less
-	 * than this (in words).  Included is the starting control word, so
-	 * if 513 is returned, then total pkt size is 512 words or less.
-	 */
-	__u32 spi_piosize;
-	/* size of the TID cache in infinipath, in entries */
-	__u32 spi_tidcnt;
-	/* size of the TID Eager list in infinipath, in entries */
-	__u32 spi_tidegrcnt;
-	/* size of a single receive header queue entry in words. */
-	__u32 spi_rcvhdrent_size;
-	/*
-	 * Count of receive header queue entries allocated.
-	 * This may be less than the spu_rcvhdrcnt passed in!.
-	 */
-	__u32 spi_rcvhdr_cnt;
-
-	/* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
-	__u32 spi_runtime_flags;
-
-	/* address where receive buffer queue is mapped into */
-	__u64 spi_rcvhdr_base;
-
-	/* user program. */
-
-	/* base address of eager TID receive buffers. */
-	__u64 spi_rcv_egrbufs;
-
-	/* Allocated by initialization code, not by protocol. */
-
-	/*
-	 * Size of each TID buffer in host memory, starting at
-	 * spi_rcv_egrbufs.  The buffers are virtually contiguous.
-	 */
-	__u32 spi_rcv_egrbufsize;
-	/*
-	 * The special QP (queue pair) value that identifies an infinipath
-	 * protocol packet from standard IB packets.  More, probably much
-	 * more, to be added.
-	 */
-	__u32 spi_qpair;
-
-	/*
-	 * User register base for init code, not to be used directly by
-	 * protocol or applications.
-	 */
-	__u64 __spi_uregbase;
-	/*
-	 * Maximum buffer size in bytes that can be used in a single TID
-	 * entry (assuming the buffer is aligned to this boundary).  This is
-	 * the minimum of what the hardware and software support Guaranteed
-	 * to be a power of 2.
-	 */
-	__u32 spi_tid_maxsize;
-	/*
-	 * alignment of each pio send buffer (byte count
-	 * to add to spi_piobufbase to get to second buffer)
-	 */
-	__u32 spi_pioalign;
-	/*
-	 * The index of the first pio buffer available to this process;
-	 * needed to do lookup in spi_pioavailaddr; not added to
-	 * spi_piobufbase.
-	 */
-	__u32 spi_pioindex;
-	 /* number of buffers mapped for this process */
-	__u32 spi_piocnt;
-
-	/*
-	 * Base address of writeonly pio buffers for this process.
-	 * Each buffer has spi_piosize words, and is aligned on spi_pioalign
-	 * boundaries.  spi_piocnt buffers are mapped from this address
-	 */
-	__u64 spi_piobufbase;
-
-	/*
-	 * Base address of readonly memory copy of the pioavail registers.
-	 * There are 2 bits for each buffer.
-	 */
-	__u64 spi_pioavailaddr;
-
-	/*
-	 * Address where driver updates a copy of the interface and driver
-	 * status (IPATH_STATUS_*) as a 64 bit value.  It's followed by a
-	 * string indicating hardware error, if there was one.
-	 */
-	__u64 spi_status;
-
-	/* number of chip ports available to user processes */
-	__u32 spi_nports;
-	/* unit number of chip we are using */
-	__u32 spi_unit;
-	/* num bufs in each contiguous set */
-	__u32 spi_rcv_egrperchunk;
-	/* size in bytes of each contiguous set */
-	__u32 spi_rcv_egrchunksize;
-	/* total size of mmap to cover full rcvegrbuffers */
-	__u32 spi_rcv_egrbuftotlen;
-	__u32 spi_filler_for_align;
-	/* address of readonly memory copy of the rcvhdrq tail register. */
-	__u64 spi_rcvhdr_tailaddr;
-
-	/* shared memory pages for subports if port is shared */
-	__u64 spi_subport_uregbase;
-	__u64 spi_subport_rcvegrbuf;
-	__u64 spi_subport_rcvhdr_base;
-
-	/* shared memory page for hardware port if it is shared */
-	__u64 spi_port_uregbase;
-	__u64 spi_port_rcvegrbuf;
-	__u64 spi_port_rcvhdr_base;
-	__u64 spi_port_rcvhdr_tailaddr;
-
-} __attribute__ ((aligned(8)));
-
-
-/*
- * This version number is given to the driver by the user code during
- * initialization in the spu_userversion field of ipath_user_info, so
- * the driver can check for compatibility with user code.
- *
- * The major version changes when data structures
- * change in an incompatible way.  The driver must be the same or higher
- * for initialization to succeed.  In some cases, a higher version
- * driver will not interoperate with older software, and initialization
- * will return an error.
- */
-#define IPATH_USER_SWMAJOR 1
-
-/*
- * Minor version differences are always compatible
- * a within a major version, however if user software is larger
- * than driver software, some new features and/or structure fields
- * may not be implemented; the user code must deal with this if it
- * cares, or it must abort after initialization reports the difference.
- */
-#define IPATH_USER_SWMINOR 6
-
-#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
-
-#define IPATH_KERN_TYPE 0
-
-/*
- * Similarly, this is the kernel version going back to the user.  It's
- * slightly different, in that we want to tell if the driver was built as
- * part of a QLogic release, or from the driver from openfabrics.org,
- * kernel.org, or a standard distribution, for support reasons.
- * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
- *
- * It's returned by the driver to the user code during initialization in the
- * spi_sw_version field of ipath_base_info, so the user code can in turn
- * check for compatibility with the kernel.
-*/
-#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
-
-/*
- * This structure is passed to ipath_userinit() to tell the driver where
- * user code buffers are, sizes, etc.   The offsets and sizes of the
- * fields must remain unchanged, for binary compatibility.  It can
- * be extended, if userversion is changed so user code can tell, if needed
- */
-struct ipath_user_info {
-	/*
-	 * version of user software, to detect compatibility issues.
-	 * Should be set to IPATH_USER_SWVERSION.
-	 */
-	__u32 spu_userversion;
-
-	/* desired number of receive header queue entries */
-	__u32 spu_rcvhdrcnt;
-
-	/* size of struct base_info to write to */
-	__u32 spu_base_info_size;
-
-	/*
-	 * number of words in KD protocol header
-	 * This tells InfiniPath how many words to copy to rcvhdrq.  If 0,
-	 * kernel uses a default.  Once set, attempts to set any other value
-	 * are an error (EAGAIN) until driver is reloaded.
-	 */
-	__u32 spu_rcvhdrsize;
-
-	/*
-	 * If two or more processes wish to share a port, each process
-	 * must set the spu_subport_cnt and spu_subport_id to the same
-	 * values.  The only restriction on the spu_subport_id is that
-	 * it be unique for a given node.
-	 */
-	__u16 spu_subport_cnt;
-	__u16 spu_subport_id;
-
-	__u32 spu_unused; /* kept for compatible layout */
-
-	/*
-	 * address of struct base_info to write to
-	 */
-	__u64 spu_base_info;
-
-} __attribute__ ((aligned(8)));
-
-/* User commands. */
-
-#define IPATH_CMD_MIN		16
-
-#define __IPATH_CMD_USER_INIT	16	/* old set up userspace (for old user code) */
-#define IPATH_CMD_PORT_INFO	17	/* find out what resources we got */
-#define IPATH_CMD_RECV_CTRL	18	/* control receipt of packets */
-#define IPATH_CMD_TID_UPDATE	19	/* update expected TID entries */
-#define IPATH_CMD_TID_FREE	20	/* free expected TID entries */
-#define IPATH_CMD_SET_PART_KEY	21	/* add partition key */
-#define __IPATH_CMD_SLAVE_INFO	22	/* return info on slave processes (for old user code) */
-#define IPATH_CMD_ASSIGN_PORT	23	/* allocate HCA and port */
-#define IPATH_CMD_USER_INIT 	24	/* set up userspace */
-#define IPATH_CMD_UNUSED_1	25
-#define IPATH_CMD_UNUSED_2	26
-#define IPATH_CMD_PIOAVAILUPD	27	/* force an update of PIOAvail reg */
-#define IPATH_CMD_POLL_TYPE	28	/* set the kind of polling we want */
-#define IPATH_CMD_ARMLAUNCH_CTRL	29 /* armlaunch detection control */
-/* 30 is unused */
-#define IPATH_CMD_SDMA_INFLIGHT 31	/* sdma inflight counter request */
-#define IPATH_CMD_SDMA_COMPLETE 32	/* sdma completion counter request */
-
-/*
- * Poll types
- */
-#define IPATH_POLL_TYPE_URGENT	 0x01
-#define IPATH_POLL_TYPE_OVERFLOW 0x02
-
-struct ipath_port_info {
-	__u32 num_active;	/* number of active units */
-	__u32 unit;		/* unit (chip) assigned to caller */
-	__u16 port;		/* port on unit assigned to caller */
-	__u16 subport;		/* subport on unit assigned to caller */
-	__u16 num_ports;	/* number of ports available on unit */
-	__u16 num_subports;	/* number of subports opened on port */
-};
-
-struct ipath_tid_info {
-	__u32 tidcnt;
-	/* make structure same size in 32 and 64 bit */
-	__u32 tid__unused;
-	/* virtual address of first page in transfer */
-	__u64 tidvaddr;
-	/* pointer (same size 32/64 bit) to __u16 tid array */
-	__u64 tidlist;
-
-	/*
-	 * pointer (same size 32/64 bit) to bitmap of TIDs used
-	 * for this call; checked for being large enough at open
-	 */
-	__u64 tidmap;
-};
-
-struct ipath_cmd {
-	__u32 type;			/* command type */
-	union {
-		struct ipath_tid_info tid_info;
-		struct ipath_user_info user_info;
-
-		/*
-		 * address in userspace where we should put the sdma
-		 * inflight counter
-		 */
-		__u64 sdma_inflight;
-		/*
-		 * address in userspace where we should put the sdma
-		 * completion counter
-		 */
-		__u64 sdma_complete;
-		/* address in userspace of struct ipath_port_info to
-		   write result to */
-		__u64 port_info;
-		/* enable/disable receipt of packets */
-		__u32 recv_ctrl;
-		/* enable/disable armlaunch errors (non-zero to enable) */
-		__u32 armlaunch_ctrl;
-		/* partition key to set */
-		__u16 part_key;
-		/* user address of __u32 bitmask of active slaves */
-		__u64 slave_mask_addr;
-		/* type of polling we want */
-		__u16 poll_type;
-	} cmd;
-};
-
-struct ipath_iovec {
-	/* Pointer to data, but same size 32 and 64 bit */
-	__u64 iov_base;
-
-	/*
-	 * Length of data; don't need 64 bits, but want
-	 * ipath_sendpkt to remain same size as before 32 bit changes, so...
-	 */
-	__u64 iov_len;
-};
-
-/*
- * Describes a single packet for send.  Each packet can have one or more
- * buffers, but the total length (exclusive of IB headers) must be less
- * than the MTU, and if using the PIO method, entire packet length,
- * including IB headers, must be less than the ipath_piosize value (words).
- * Use of this necessitates including sys/uio.h
- */
-struct __ipath_sendpkt {
-	__u32 sps_flags;	/* flags for packet (TBD) */
-	__u32 sps_cnt;		/* number of entries to use in sps_iov */
-	/* array of iov's describing packet. TEMPORARY */
-	struct ipath_iovec sps_iov[4];
-};
-
-/*
- * diagnostics can send a packet by "writing" one of the following
- * two structs to diag data special file
- * The first is the legacy version for backward compatibility
- */
-struct ipath_diag_pkt {
-	__u32 unit;
-	__u64 data;
-	__u32 len;
-};
-
-/* The second diag_pkt struct is the expanded version that allows
- * more control over the packet, specifically, by allowing a custom
- * pbc (+ static rate) qword, so that special modes and deliberate
- * changes to CRCs can be used. The elements were also re-ordered
- * for better alignment and to avoid padding issues.
- */
-struct ipath_diag_xpkt {
-	__u64 data;
-	__u64 pbc_wd;
-	__u32 unit;
-	__u32 len;
-};
-
-/*
- * Data layout in I2C flash (for GUID, etc.)
- * All fields are little-endian binary unless otherwise stated
- */
-#define IPATH_FLASH_VERSION 2
-struct ipath_flash {
-	/* flash layout version (IPATH_FLASH_VERSION) */
-	__u8 if_fversion;
-	/* checksum protecting if_length bytes */
-	__u8 if_csum;
-	/*
-	 * valid length (in use, protected by if_csum), including
-	 * if_fversion and if_csum themselves)
-	 */
-	__u8 if_length;
-	/* the GUID, in network order */
-	__u8 if_guid[8];
-	/* number of GUIDs to use, starting from if_guid */
-	__u8 if_numguid;
-	/* the (last 10 characters of) board serial number, in ASCII */
-	char if_serial[12];
-	/* board mfg date (YYYYMMDD ASCII) */
-	char if_mfgdate[8];
-	/* last board rework/test date (YYYYMMDD ASCII) */
-	char if_testdate[8];
-	/* logging of error counts, TBD */
-	__u8 if_errcntp[4];
-	/* powered on hours, updated at driver unload */
-	__u8 if_powerhour[2];
-	/* ASCII free-form comment field */
-	char if_comment[32];
-	/* Backwards compatible prefix for longer QLogic Serial Numbers */
-	char if_sprefix[4];
-	/* 82 bytes used, min flash size is 128 bytes */
-	__u8 if_future[46];
-};
-
-/*
- * These are the counters implemented in the chip, and are listed in order.
- * The InterCaps naming is taken straight from the chip spec.
- */
-struct infinipath_counters {
-	__u64 LBIntCnt;
-	__u64 LBFlowStallCnt;
-	__u64 TxSDmaDescCnt;	/* was Reserved1 */
-	__u64 TxUnsupVLErrCnt;
-	__u64 TxDataPktCnt;
-	__u64 TxFlowPktCnt;
-	__u64 TxDwordCnt;
-	__u64 TxLenErrCnt;
-	__u64 TxMaxMinLenErrCnt;
-	__u64 TxUnderrunCnt;
-	__u64 TxFlowStallCnt;
-	__u64 TxDroppedPktCnt;
-	__u64 RxDroppedPktCnt;
-	__u64 RxDataPktCnt;
-	__u64 RxFlowPktCnt;
-	__u64 RxDwordCnt;
-	__u64 RxLenErrCnt;
-	__u64 RxMaxMinLenErrCnt;
-	__u64 RxICRCErrCnt;
-	__u64 RxVCRCErrCnt;
-	__u64 RxFlowCtrlErrCnt;
-	__u64 RxBadFormatCnt;
-	__u64 RxLinkProblemCnt;
-	__u64 RxEBPCnt;
-	__u64 RxLPCRCErrCnt;
-	__u64 RxBufOvflCnt;
-	__u64 RxTIDFullErrCnt;
-	__u64 RxTIDValidErrCnt;
-	__u64 RxPKeyMismatchCnt;
-	__u64 RxP0HdrEgrOvflCnt;
-	__u64 RxP1HdrEgrOvflCnt;
-	__u64 RxP2HdrEgrOvflCnt;
-	__u64 RxP3HdrEgrOvflCnt;
-	__u64 RxP4HdrEgrOvflCnt;
-	__u64 RxP5HdrEgrOvflCnt;
-	__u64 RxP6HdrEgrOvflCnt;
-	__u64 RxP7HdrEgrOvflCnt;
-	__u64 RxP8HdrEgrOvflCnt;
-	__u64 RxP9HdrEgrOvflCnt;	/* was Reserved6 */
-	__u64 RxP10HdrEgrOvflCnt;	/* was Reserved7 */
-	__u64 RxP11HdrEgrOvflCnt;	/* new for IBA7220 */
-	__u64 RxP12HdrEgrOvflCnt;	/* new for IBA7220 */
-	__u64 RxP13HdrEgrOvflCnt;	/* new for IBA7220 */
-	__u64 RxP14HdrEgrOvflCnt;	/* new for IBA7220 */
-	__u64 RxP15HdrEgrOvflCnt;	/* new for IBA7220 */
-	__u64 RxP16HdrEgrOvflCnt;	/* new for IBA7220 */
-	__u64 IBStatusChangeCnt;
-	__u64 IBLinkErrRecoveryCnt;
-	__u64 IBLinkDownedCnt;
-	__u64 IBSymbolErrCnt;
-	/* The following are new for IBA7220 */
-	__u64 RxVL15DroppedPktCnt;
-	__u64 RxOtherLocalPhyErrCnt;
-	__u64 PcieRetryBufDiagQwordCnt;
-	__u64 ExcessBufferOvflCnt;
-	__u64 LocalLinkIntegrityErrCnt;
-	__u64 RxVlErrCnt;
-	__u64 RxDlidFltrCnt;
-};
-
-/*
- * The next set of defines are for packet headers, and chip register
- * and memory bits that are visible to and/or used by user-mode software
- * The other bits that are used only by the driver or diags are in
- * ipath_registers.h
- */
-
-/* RcvHdrFlags bits */
-#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
-#define INFINIPATH_RHF_LENGTH_SHIFT 0
-#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
-#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
-#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
-#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
-#define INFINIPATH_RHF_SEQ_MASK 0xF
-#define INFINIPATH_RHF_SEQ_SHIFT 0
-#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
-#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
-#define INFINIPATH_RHF_H_ICRCERR   0x80000000
-#define INFINIPATH_RHF_H_VCRCERR   0x40000000
-#define INFINIPATH_RHF_H_PARITYERR 0x20000000
-#define INFINIPATH_RHF_H_LENERR    0x10000000
-#define INFINIPATH_RHF_H_MTUERR    0x08000000
-#define INFINIPATH_RHF_H_IHDRERR   0x04000000
-#define INFINIPATH_RHF_H_TIDERR    0x02000000
-#define INFINIPATH_RHF_H_MKERR     0x01000000
-#define INFINIPATH_RHF_H_IBERR     0x00800000
-#define INFINIPATH_RHF_H_ERR_MASK  0xFF800000
-#define INFINIPATH_RHF_L_USE_EGR   0x80000000
-#define INFINIPATH_RHF_L_SWA       0x00008000
-#define INFINIPATH_RHF_L_SWB       0x00004000
-
-/* infinipath header fields */
-#define INFINIPATH_I_VERS_MASK 0xF
-#define INFINIPATH_I_VERS_SHIFT 28
-#define INFINIPATH_I_PORT_MASK 0xF
-#define INFINIPATH_I_PORT_SHIFT 24
-#define INFINIPATH_I_TID_MASK 0x7FF
-#define INFINIPATH_I_TID_SHIFT 13
-#define INFINIPATH_I_OFFSET_MASK 0x1FFF
-#define INFINIPATH_I_OFFSET_SHIFT 0
-
-/* K_PktFlags bits */
-#define INFINIPATH_KPF_INTR 0x1
-#define INFINIPATH_KPF_SUBPORT_MASK 0x3
-#define INFINIPATH_KPF_SUBPORT_SHIFT 1
-
-#define INFINIPATH_MAX_SUBPORT	4
-
-/* SendPIO per-buffer control */
-#define INFINIPATH_SP_TEST    0x40
-#define INFINIPATH_SP_TESTEBP 0x20
-#define INFINIPATH_SP_TRIGGER_SHIFT  15
-
-/* SendPIOAvail bits */
-#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
-#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
-
-/* infinipath header format */
-struct ipath_header {
-	/*
-	 * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
-	 * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
-	 * Port 4, TID 11, offset 13.
-	 */
-	__le32 ver_port_tid_offset;
-	__le16 chksum;
-	__le16 pkt_flags;
-};
-
-/* infinipath user message header format.
- * This structure contains the first 4 fields common to all protocols
- * that employ infinipath.
- */
-struct ipath_message_header {
-	__be16 lrh[4];
-	__be32 bth[3];
-	/* fields below this point are in host byte order */
-	struct ipath_header iph;
-	__u8 sub_opcode;
-};
-
-/* infinipath ethernet header format */
-struct ether_header {
-	__be16 lrh[4];
-	__be32 bth[3];
-	struct ipath_header iph;
-	__u8 sub_opcode;
-	__u8 cmd;
-	__be16 lid;
-	__u16 mac[3];
-	__u8 frag_num;
-	__u8 seq_num;
-	__le32 len;
-	/* MUST be of word size due to PIO write requirements */
-	__le32 csum;
-	__le16 csum_offset;
-	__le16 flags;
-	__u16 first_2_bytes;
-	__u8 unused[2];		/* currently unused */
-};
-
-
-/* IB - LRH header consts */
-#define IPATH_LRH_GRH 0x0003	/* 1. word of IB LRH - next header: GRH */
-#define IPATH_LRH_BTH 0x0002	/* 1. word of IB LRH - next header: BTH */
-
-/* misc. */
-#define SIZE_OF_CRC 1
-
-#define IPATH_DEFAULT_P_KEY 0xFFFF
-#define IPATH_PERMISSIVE_LID 0xFFFF
-#define IPATH_AETH_CREDIT_SHIFT 24
-#define IPATH_AETH_CREDIT_MASK 0x1F
-#define IPATH_AETH_CREDIT_INVAL 0x1F
-#define IPATH_PSN_MASK 0xFFFFFF
-#define IPATH_MSN_MASK 0xFFFFFF
-#define IPATH_QPN_MASK 0xFFFFFF
-#define IPATH_MULTICAST_LID_BASE 0xC000
-#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
-#define IPATH_MULTICAST_QPN 0xFFFFFF
-
-/* Receive Header Queue: receive type (from infinipath) */
-#define RCVHQ_RCV_TYPE_EXPECTED  0
-#define RCVHQ_RCV_TYPE_EAGER     1
-#define RCVHQ_RCV_TYPE_NON_KD    2
-#define RCVHQ_RCV_TYPE_ERROR     3
-
-
-/* sub OpCodes - ith4x  */
-#define IPATH_ITH4X_OPCODE_ENCAP 0x81
-#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
-
-#define IPATH_HEADER_QUEUE_WORDS 9
-
-/* functions for extracting fields from rcvhdrq entries for the driver.
- */
-static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf)
-{
-	return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK;
-}
-
-static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf)
-{
-	return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
-	    & INFINIPATH_RHF_RCVTYPE_MASK;
-}
-
-static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf)
-{
-	return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
-		& INFINIPATH_RHF_LENGTH_MASK) << 2;
-}
-
-static inline __u32 ipath_hdrget_index(const __le32 * rbuf)
-{
-	return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
-	    & INFINIPATH_RHF_EGRINDEX_MASK;
-}
-
-static inline __u32 ipath_hdrget_seq(const __le32 *rbuf)
-{
-	return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT)
-		& INFINIPATH_RHF_SEQ_MASK;
-}
-
-static inline __u32 ipath_hdrget_offset(const __le32 *rbuf)
-{
-	return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT)
-		& INFINIPATH_RHF_HDRQ_OFFSET_MASK;
-}
-
-static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf)
-{
-	return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR;
-}
-
-static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword)
-{
-	return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
-	    & INFINIPATH_I_VERS_MASK;
-}
-
-#endif				/* _IPATH_COMMON_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
deleted file mode 100644
index e9dd911..0000000
--- a/drivers/infiniband/hw/ipath/ipath_cq.c
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_cq_enter - add a new entry to the completion queue
- * @cq: completion queue
- * @entry: work completion entry to add
- * @sig: true if @entry is a solicitated entry
- *
- * This may be called with qp->s_lock held.
- */
-void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
-{
-	struct ipath_cq_wc *wc;
-	unsigned long flags;
-	u32 head;
-	u32 next;
-
-	spin_lock_irqsave(&cq->lock, flags);
-
-	/*
-	 * Note that the head pointer might be writable by user processes.
-	 * Take care to verify it is a sane value.
-	 */
-	wc = cq->queue;
-	head = wc->head;
-	if (head >= (unsigned) cq->ibcq.cqe) {
-		head = cq->ibcq.cqe;
-		next = 0;
-	} else
-		next = head + 1;
-	if (unlikely(next == wc->tail)) {
-		spin_unlock_irqrestore(&cq->lock, flags);
-		if (cq->ibcq.event_handler) {
-			struct ib_event ev;
-
-			ev.device = cq->ibcq.device;
-			ev.element.cq = &cq->ibcq;
-			ev.event = IB_EVENT_CQ_ERR;
-			cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
-		}
-		return;
-	}
-	if (cq->ip) {
-		wc->uqueue[head].wr_id = entry->wr_id;
-		wc->uqueue[head].status = entry->status;
-		wc->uqueue[head].opcode = entry->opcode;
-		wc->uqueue[head].vendor_err = entry->vendor_err;
-		wc->uqueue[head].byte_len = entry->byte_len;
-		wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
-		wc->uqueue[head].qp_num = entry->qp->qp_num;
-		wc->uqueue[head].src_qp = entry->src_qp;
-		wc->uqueue[head].wc_flags = entry->wc_flags;
-		wc->uqueue[head].pkey_index = entry->pkey_index;
-		wc->uqueue[head].slid = entry->slid;
-		wc->uqueue[head].sl = entry->sl;
-		wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
-		wc->uqueue[head].port_num = entry->port_num;
-		/* Make sure entry is written before the head index. */
-		smp_wmb();
-	} else
-		wc->kqueue[head] = *entry;
-	wc->head = next;
-
-	if (cq->notify == IB_CQ_NEXT_COMP ||
-	    (cq->notify == IB_CQ_SOLICITED && solicited)) {
-		cq->notify = IB_CQ_NONE;
-		cq->triggered++;
-		/*
-		 * This will cause send_complete() to be called in
-		 * another thread.
-		 */
-		tasklet_hi_schedule(&cq->comptask);
-	}
-
-	spin_unlock_irqrestore(&cq->lock, flags);
-
-	if (entry->status != IB_WC_SUCCESS)
-		to_idev(cq->ibcq.device)->n_wqe_errs++;
-}
-
-/**
- * ipath_poll_cq - poll for work completion entries
- * @ibcq: the completion queue to poll
- * @num_entries: the maximum number of entries to return
- * @entry: pointer to array where work completions are placed
- *
- * Returns the number of completion entries polled.
- *
- * This may be called from interrupt context.  Also called by ib_poll_cq()
- * in the generic verbs code.
- */
-int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
-{
-	struct ipath_cq *cq = to_icq(ibcq);
-	struct ipath_cq_wc *wc;
-	unsigned long flags;
-	int npolled;
-	u32 tail;
-
-	/* The kernel can only poll a kernel completion queue */
-	if (cq->ip) {
-		npolled = -EINVAL;
-		goto bail;
-	}
-
-	spin_lock_irqsave(&cq->lock, flags);
-
-	wc = cq->queue;
-	tail = wc->tail;
-	if (tail > (u32) cq->ibcq.cqe)
-		tail = (u32) cq->ibcq.cqe;
-	for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
-		if (tail == wc->head)
-			break;
-		/* The kernel doesn't need a RMB since it has the lock. */
-		*entry = wc->kqueue[tail];
-		if (tail >= cq->ibcq.cqe)
-			tail = 0;
-		else
-			tail++;
-	}
-	wc->tail = tail;
-
-	spin_unlock_irqrestore(&cq->lock, flags);
-
-bail:
-	return npolled;
-}
-
-static void send_complete(unsigned long data)
-{
-	struct ipath_cq *cq = (struct ipath_cq *)data;
-
-	/*
-	 * The completion handler will most likely rearm the notification
-	 * and poll for all pending entries.  If a new completion entry
-	 * is added while we are in this routine, tasklet_hi_schedule()
-	 * won't call us again until we return so we check triggered to
-	 * see if we need to call the handler again.
-	 */
-	for (;;) {
-		u8 triggered = cq->triggered;
-
-		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
-
-		if (cq->triggered == triggered)
-			return;
-	}
-}
-
-/**
- * ipath_create_cq - create a completion queue
- * @ibdev: the device this completion queue is attached to
- * @attr: creation attributes
- * @context: unused by the InfiniPath driver
- * @udata: unused by the InfiniPath driver
- *
- * Returns a pointer to the completion queue or negative errno values
- * for failure.
- *
- * Called by ib_create_cq() in the generic verbs code.
- */
-struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
-			      const struct ib_cq_init_attr *attr,
-			      struct ib_ucontext *context,
-			      struct ib_udata *udata)
-{
-	int entries = attr->cqe;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_cq *cq;
-	struct ipath_cq_wc *wc;
-	struct ib_cq *ret;
-	u32 sz;
-
-	if (attr->flags)
-		return ERR_PTR(-EINVAL);
-
-	if (entries < 1 || entries > ib_ipath_max_cqes) {
-		ret = ERR_PTR(-EINVAL);
-		goto done;
-	}
-
-	/* Allocate the completion queue structure. */
-	cq = kmalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq) {
-		ret = ERR_PTR(-ENOMEM);
-		goto done;
-	}
-
-	/*
-	 * Allocate the completion queue entries and head/tail pointers.
-	 * This is allocated separately so that it can be resized and
-	 * also mapped into user space.
-	 * We need to use vmalloc() in order to support mmap and large
-	 * numbers of entries.
-	 */
-	sz = sizeof(*wc);
-	if (udata && udata->outlen >= sizeof(__u64))
-		sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
-	else
-		sz += sizeof(struct ib_wc) * (entries + 1);
-	wc = vmalloc_user(sz);
-	if (!wc) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail_cq;
-	}
-
-	/*
-	 * Return the address of the WC as the offset to mmap.
-	 * See ipath_mmap() for details.
-	 */
-	if (udata && udata->outlen >= sizeof(__u64)) {
-		int err;
-
-		cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
-		if (!cq->ip) {
-			ret = ERR_PTR(-ENOMEM);
-			goto bail_wc;
-		}
-
-		err = ib_copy_to_udata(udata, &cq->ip->offset,
-				       sizeof(cq->ip->offset));
-		if (err) {
-			ret = ERR_PTR(err);
-			goto bail_ip;
-		}
-	} else
-		cq->ip = NULL;
-
-	spin_lock(&dev->n_cqs_lock);
-	if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
-		spin_unlock(&dev->n_cqs_lock);
-		ret = ERR_PTR(-ENOMEM);
-		goto bail_ip;
-	}
-
-	dev->n_cqs_allocated++;
-	spin_unlock(&dev->n_cqs_lock);
-
-	if (cq->ip) {
-		spin_lock_irq(&dev->pending_lock);
-		list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
-		spin_unlock_irq(&dev->pending_lock);
-	}
-
-	/*
-	 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
-	 * The number of entries should be >= the number requested or return
-	 * an error.
-	 */
-	cq->ibcq.cqe = entries;
-	cq->notify = IB_CQ_NONE;
-	cq->triggered = 0;
-	spin_lock_init(&cq->lock);
-	tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
-	wc->head = 0;
-	wc->tail = 0;
-	cq->queue = wc;
-
-	ret = &cq->ibcq;
-
-	goto done;
-
-bail_ip:
-	kfree(cq->ip);
-bail_wc:
-	vfree(wc);
-bail_cq:
-	kfree(cq);
-done:
-	return ret;
-}
-
-/**
- * ipath_destroy_cq - destroy a completion queue
- * @ibcq: the completion queue to destroy.
- *
- * Returns 0 for success.
- *
- * Called by ib_destroy_cq() in the generic verbs code.
- */
-int ipath_destroy_cq(struct ib_cq *ibcq)
-{
-	struct ipath_ibdev *dev = to_idev(ibcq->device);
-	struct ipath_cq *cq = to_icq(ibcq);
-
-	tasklet_kill(&cq->comptask);
-	spin_lock(&dev->n_cqs_lock);
-	dev->n_cqs_allocated--;
-	spin_unlock(&dev->n_cqs_lock);
-	if (cq->ip)
-		kref_put(&cq->ip->ref, ipath_release_mmap_info);
-	else
-		vfree(cq->queue);
-	kfree(cq);
-
-	return 0;
-}
-
-/**
- * ipath_req_notify_cq - change the notification type for a completion queue
- * @ibcq: the completion queue
- * @notify_flags: the type of notification to request
- *
- * Returns 0 for success.
- *
- * This may be called from interrupt context.  Also called by
- * ib_req_notify_cq() in the generic verbs code.
- */
-int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
-{
-	struct ipath_cq *cq = to_icq(ibcq);
-	unsigned long flags;
-	int ret = 0;
-
-	spin_lock_irqsave(&cq->lock, flags);
-	/*
-	 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
-	 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
-	 */
-	if (cq->notify != IB_CQ_NEXT_COMP)
-		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
-
-	if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
-	    cq->queue->head != cq->queue->tail)
-		ret = 1;
-
-	spin_unlock_irqrestore(&cq->lock, flags);
-
-	return ret;
-}
-
-/**
- * ipath_resize_cq - change the size of the CQ
- * @ibcq: the completion queue
- *
- * Returns 0 for success.
- */
-int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
-{
-	struct ipath_cq *cq = to_icq(ibcq);
-	struct ipath_cq_wc *old_wc;
-	struct ipath_cq_wc *wc;
-	u32 head, tail, n;
-	int ret;
-	u32 sz;
-
-	if (cqe < 1 || cqe > ib_ipath_max_cqes) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/*
-	 * Need to use vmalloc() if we want to support large #s of entries.
-	 */
-	sz = sizeof(*wc);
-	if (udata && udata->outlen >= sizeof(__u64))
-		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
-	else
-		sz += sizeof(struct ib_wc) * (cqe + 1);
-	wc = vmalloc_user(sz);
-	if (!wc) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	/* Check that we can write the offset to mmap. */
-	if (udata && udata->outlen >= sizeof(__u64)) {
-		__u64 offset = 0;
-
-		ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
-		if (ret)
-			goto bail_free;
-	}
-
-	spin_lock_irq(&cq->lock);
-	/*
-	 * Make sure head and tail are sane since they
-	 * might be user writable.
-	 */
-	old_wc = cq->queue;
-	head = old_wc->head;
-	if (head > (u32) cq->ibcq.cqe)
-		head = (u32) cq->ibcq.cqe;
-	tail = old_wc->tail;
-	if (tail > (u32) cq->ibcq.cqe)
-		tail = (u32) cq->ibcq.cqe;
-	if (head < tail)
-		n = cq->ibcq.cqe + 1 + head - tail;
-	else
-		n = head - tail;
-	if (unlikely((u32)cqe < n)) {
-		ret = -EINVAL;
-		goto bail_unlock;
-	}
-	for (n = 0; tail != head; n++) {
-		if (cq->ip)
-			wc->uqueue[n] = old_wc->uqueue[tail];
-		else
-			wc->kqueue[n] = old_wc->kqueue[tail];
-		if (tail == (u32) cq->ibcq.cqe)
-			tail = 0;
-		else
-			tail++;
-	}
-	cq->ibcq.cqe = cqe;
-	wc->head = n;
-	wc->tail = 0;
-	cq->queue = wc;
-	spin_unlock_irq(&cq->lock);
-
-	vfree(old_wc);
-
-	if (cq->ip) {
-		struct ipath_ibdev *dev = to_idev(ibcq->device);
-		struct ipath_mmap_info *ip = cq->ip;
-
-		ipath_update_mmap_info(dev, ip, sz, wc);
-
-		/*
-		 * Return the offset to mmap.
-		 * See ipath_mmap() for details.
-		 */
-		if (udata && udata->outlen >= sizeof(__u64)) {
-			ret = ib_copy_to_udata(udata, &ip->offset,
-					       sizeof(ip->offset));
-			if (ret)
-				goto bail;
-		}
-
-		spin_lock_irq(&dev->pending_lock);
-		if (list_empty(&ip->pending_mmaps))
-			list_add(&ip->pending_mmaps, &dev->pending_mmaps);
-		spin_unlock_irq(&dev->pending_lock);
-	}
-
-	ret = 0;
-	goto bail;
-
-bail_unlock:
-	spin_unlock_irq(&cq->lock);
-bail_free:
-	vfree(wc);
-bail:
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h
deleted file mode 100644
index 65926cd..0000000
--- a/drivers/infiniband/hw/ipath/ipath_debug.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_DEBUG_H
-#define _IPATH_DEBUG_H
-
-#ifndef _IPATH_DEBUGGING	/* debugging enabled or not */
-#define _IPATH_DEBUGGING 1
-#endif
-
-#if _IPATH_DEBUGGING
-
-/*
- * Mask values for debugging.  The scheme allows us to compile out any
- * of the debug tracing stuff, and if compiled in, to enable or disable
- * dynamically.  This can be set at modprobe time also:
- *      modprobe infinipath.ko infinipath_debug=7
- */
-
-#define __IPATH_INFO        0x1	/* generic low verbosity stuff */
-#define __IPATH_DBG         0x2	/* generic debug */
-#define __IPATH_TRSAMPLE    0x8	/* generate trace buffer sample entries */
-/* leave some low verbosity spots open */
-#define __IPATH_VERBDBG     0x40	/* very verbose debug */
-#define __IPATH_PKTDBG      0x80	/* print packet data */
-/* print process startup (init)/exit messages */
-#define __IPATH_PROCDBG     0x100
-/* print mmap/fault stuff, not using VDBG any more */
-#define __IPATH_MMDBG       0x200
-#define __IPATH_ERRPKTDBG   0x400
-#define __IPATH_USER_SEND   0x1000	/* use user mode send */
-#define __IPATH_KERNEL_SEND 0x2000	/* use kernel mode send */
-#define __IPATH_EPKTDBG     0x4000	/* print ethernet packet data */
-#define __IPATH_IPATHDBG    0x10000	/* Ethernet (IPATH) gen debug */
-#define __IPATH_IPATHWARN   0x20000	/* Ethernet (IPATH) warnings */
-#define __IPATH_IPATHERR    0x40000	/* Ethernet (IPATH) errors */
-#define __IPATH_IPATHPD     0x80000	/* Ethernet (IPATH) packet dump */
-#define __IPATH_IPATHTABLE  0x100000	/* Ethernet (IPATH) table dump */
-#define __IPATH_LINKVERBDBG 0x200000	/* very verbose linkchange debug */
-
-#else				/* _IPATH_DEBUGGING */
-
-/*
- * define all of these even with debugging off, for the few places that do
- * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the
- * compiler eliminate the code
- */
-
-#define __IPATH_INFO      0x0	/* generic low verbosity stuff */
-#define __IPATH_DBG       0x0	/* generic debug */
-#define __IPATH_TRSAMPLE  0x0	/* generate trace buffer sample entries */
-#define __IPATH_VERBDBG   0x0	/* very verbose debug */
-#define __IPATH_PKTDBG    0x0	/* print packet data */
-#define __IPATH_PROCDBG   0x0	/* process startup (init)/exit messages */
-/* print mmap/fault stuff, not using VDBG any more */
-#define __IPATH_MMDBG     0x0
-#define __IPATH_EPKTDBG   0x0	/* print ethernet packet data */
-#define __IPATH_IPATHDBG  0x0	/* Ethernet (IPATH) table dump on */
-#define __IPATH_IPATHWARN 0x0	/* Ethernet (IPATH) warnings on   */
-#define __IPATH_IPATHERR  0x0	/* Ethernet (IPATH) errors on   */
-#define __IPATH_IPATHPD   0x0	/* Ethernet (IPATH) packet dump on   */
-#define __IPATH_IPATHTABLE 0x0	/* Ethernet (IPATH) packet dump on   */
-#define __IPATH_LINKVERBDBG 0x0	/* very verbose linkchange debug */
-
-#endif				/* _IPATH_DEBUGGING */
-
-#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
-
-#endif				/* _IPATH_DEBUG_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c
deleted file mode 100644
index 45802e9..0000000
--- a/drivers/infiniband/hw/ipath/ipath_diag.c
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file contains support for diagnostic functions.  It is accessed by
- * opening the ipath_diag device, normally minor number 129.  Diagnostic use
- * of the InfiniPath chip may render the chip or board unusable until the
- * driver is unloaded, or in some cases, until the system is rebooted.
- *
- * Accesses to the chip through this interface are not similar to going
- * through the /sys/bus/pci resource mmap interface.
- */
-
-#include <linux/io.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/export.h>
-#include <asm/uaccess.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-
-int ipath_diag_inuse;
-static int diag_set_link;
-
-static int ipath_diag_open(struct inode *in, struct file *fp);
-static int ipath_diag_release(struct inode *in, struct file *fp);
-static ssize_t ipath_diag_read(struct file *fp, char __user *data,
-			       size_t count, loff_t *off);
-static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
-				size_t count, loff_t *off);
-
-static const struct file_operations diag_file_ops = {
-	.owner = THIS_MODULE,
-	.write = ipath_diag_write,
-	.read = ipath_diag_read,
-	.open = ipath_diag_open,
-	.release = ipath_diag_release,
-	.llseek = default_llseek,
-};
-
-static ssize_t ipath_diagpkt_write(struct file *fp,
-				   const char __user *data,
-				   size_t count, loff_t *off);
-
-static const struct file_operations diagpkt_file_ops = {
-	.owner = THIS_MODULE,
-	.write = ipath_diagpkt_write,
-	.llseek = noop_llseek,
-};
-
-static atomic_t diagpkt_count = ATOMIC_INIT(0);
-static struct cdev *diagpkt_cdev;
-static struct device *diagpkt_dev;
-
-int ipath_diag_add(struct ipath_devdata *dd)
-{
-	char name[16];
-	int ret = 0;
-
-	if (atomic_inc_return(&diagpkt_count) == 1) {
-		ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR,
-				      "ipath_diagpkt", &diagpkt_file_ops,
-				      &diagpkt_cdev, &diagpkt_dev);
-
-		if (ret) {
-			ipath_dev_err(dd, "Couldn't create ipath_diagpkt "
-				      "device: %d", ret);
-			goto done;
-		}
-	}
-
-	snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit);
-
-	ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name,
-			      &diag_file_ops, &dd->diag_cdev,
-			      &dd->diag_dev);
-	if (ret)
-		ipath_dev_err(dd, "Couldn't create %s device: %d",
-			      name, ret);
-
-done:
-	return ret;
-}
-
-void ipath_diag_remove(struct ipath_devdata *dd)
-{
-	if (atomic_dec_and_test(&diagpkt_count))
-		ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev);
-
-	ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev);
-}
-
-/**
- * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
- * @dd: the infinipath device
- * @uaddr: the location to store the data in user memory
- * @caddr: the source chip address (full pointer, not offset)
- * @count: number of bytes to copy (multiple of 32 bits)
- *
- * This function also localizes all chip memory accesses.
- * The copy should be written such that we read full cacheline packets
- * from the chip.  This is usually used for a single qword
- *
- * NOTE:  This assumes the chip address is 64-bit aligned.
- */
-static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
-			     const void __iomem *caddr, size_t count)
-{
-	const u64 __iomem *reg_addr = caddr;
-	const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
-	int ret;
-
-	/* not very efficient, but it works for now */
-	if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	while (reg_addr < reg_end) {
-		u64 data = readq(reg_addr);
-		if (copy_to_user(uaddr, &data, sizeof(u64))) {
-			ret = -EFAULT;
-			goto bail;
-		}
-		reg_addr++;
-		uaddr += sizeof(u64);
-	}
-	ret = 0;
-bail:
-	return ret;
-}
-
-/**
- * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
- * @dd: the infinipath device
- * @caddr: the destination chip address (full pointer, not offset)
- * @uaddr: the source of the data in user memory
- * @count: the number of bytes to copy (multiple of 32 bits)
- *
- * This is usually used for a single qword
- * NOTE:  This assumes the chip address is 64-bit aligned.
- */
-
-static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
-			      const void __user *uaddr, size_t count)
-{
-	u64 __iomem *reg_addr = caddr;
-	const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
-	int ret;
-
-	/* not very efficient, but it works for now */
-	if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	while (reg_addr < reg_end) {
-		u64 data;
-		if (copy_from_user(&data, uaddr, sizeof(data))) {
-			ret = -EFAULT;
-			goto bail;
-		}
-		writeq(data, reg_addr);
-
-		reg_addr++;
-		uaddr += sizeof(u64);
-	}
-	ret = 0;
-bail:
-	return ret;
-}
-
-/**
- * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
- * @dd: the infinipath device
- * @uaddr: the location to store the data in user memory
- * @caddr: the source chip address (full pointer, not offset)
- * @count: number of bytes to copy
- *
- * read 32 bit values, not 64 bit; for memories that only
- * support 32 bit reads; usually a single dword.
- */
-static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
-			     const void __iomem *caddr, size_t count)
-{
-	const u32 __iomem *reg_addr = caddr;
-	const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
-	int ret;
-
-	if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
-	    reg_end > (u32 __iomem *) dd->ipath_kregend) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	/* not very efficient, but it works for now */
-	while (reg_addr < reg_end) {
-		u32 data = readl(reg_addr);
-		if (copy_to_user(uaddr, &data, sizeof(data))) {
-			ret = -EFAULT;
-			goto bail;
-		}
-
-		reg_addr++;
-		uaddr += sizeof(u32);
-
-	}
-	ret = 0;
-bail:
-	return ret;
-}
-
-/**
- * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
- * @dd: the infinipath device
- * @caddr: the destination chip address (full pointer, not offset)
- * @uaddr: the source of the data in user memory
- * @count: number of bytes to copy
- *
- * write 32 bit values, not 64 bit; for memories that only
- * support 32 bit write; usually a single dword.
- */
-
-static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
-			      const void __user *uaddr, size_t count)
-{
-	u32 __iomem *reg_addr = caddr;
-	const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
-	int ret;
-
-	if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
-	    reg_end > (u32 __iomem *) dd->ipath_kregend) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	while (reg_addr < reg_end) {
-		u32 data;
-		if (copy_from_user(&data, uaddr, sizeof(data))) {
-			ret = -EFAULT;
-			goto bail;
-		}
-		writel(data, reg_addr);
-
-		reg_addr++;
-		uaddr += sizeof(u32);
-	}
-	ret = 0;
-bail:
-	return ret;
-}
-
-static int ipath_diag_open(struct inode *in, struct file *fp)
-{
-	int unit = iminor(in) - IPATH_DIAG_MINOR_BASE;
-	struct ipath_devdata *dd;
-	int ret;
-
-	mutex_lock(&ipath_mutex);
-
-	if (ipath_diag_inuse) {
-		ret = -EBUSY;
-		goto bail;
-	}
-
-	dd = ipath_lookup(unit);
-
-	if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) ||
-	    !dd->ipath_kregbase) {
-		ret = -ENODEV;
-		goto bail;
-	}
-
-	fp->private_data = dd;
-	ipath_diag_inuse = -2;
-	diag_set_link = 0;
-	ret = 0;
-
-	/* Only expose a way to reset the device if we
-	   make it into diag mode. */
-	ipath_expose_reset(&dd->pcidev->dev);
-
-bail:
-	mutex_unlock(&ipath_mutex);
-
-	return ret;
-}
-
-/**
- * ipath_diagpkt_write - write an IB packet
- * @fp: the diag data device file pointer
- * @data: ipath_diag_pkt structure saying where to get the packet
- * @count: size of data to write
- * @off: unused by this code
- */
-static ssize_t ipath_diagpkt_write(struct file *fp,
-				   const char __user *data,
-				   size_t count, loff_t *off)
-{
-	u32 __iomem *piobuf;
-	u32 plen, pbufn, maxlen_reserve;
-	struct ipath_diag_pkt odp;
-	struct ipath_diag_xpkt dp;
-	u32 *tmpbuf = NULL;
-	struct ipath_devdata *dd;
-	ssize_t ret = 0;
-	u64 val;
-	u32 l_state, lt_state; /* LinkState, LinkTrainingState */
-
-
-	if (count == sizeof(dp)) {
-		if (copy_from_user(&dp, data, sizeof(dp))) {
-			ret = -EFAULT;
-			goto bail;
-		}
-	} else if (count == sizeof(odp)) {
-		if (copy_from_user(&odp, data, sizeof(odp))) {
-			ret = -EFAULT;
-			goto bail;
-		}
-		dp.len = odp.len;
-		dp.unit = odp.unit;
-		dp.data = odp.data;
-		dp.pbc_wd = 0;
-	} else {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/* send count must be an exact number of dwords */
-	if (dp.len & 3) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	plen = dp.len >> 2;
-
-	dd = ipath_lookup(dp.unit);
-	if (!dd || !(dd->ipath_flags & IPATH_PRESENT) ||
-	    !dd->ipath_kregbase) {
-		ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n",
-			   dp.unit);
-		ret = -ENODEV;
-		goto bail;
-	}
-
-	if (ipath_diag_inuse && !diag_set_link &&
-	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-		diag_set_link = 1;
-		ipath_cdbg(VERBOSE, "Trying to set to set link active for "
-			   "diag pkt\n");
-		ipath_set_linkstate(dd, IPATH_IB_LINKARM);
-		ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
-	}
-
-	if (!(dd->ipath_flags & IPATH_INITTED)) {
-		/* no hardware, freeze, etc. */
-		ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit);
-		ret = -ENODEV;
-		goto bail;
-	}
-	/*
-	 * Want to skip check for l_state if using custom PBC,
-	 * because we might be trying to force an SM packet out.
-	 * first-cut, skip _all_ state checking in that case.
-	 */
-	val = ipath_ib_state(dd, dd->ipath_lastibcstat);
-	lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
-	l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
-	if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP ||
-	    (val != dd->ib_init && val != dd->ib_arm &&
-	    val != dd->ib_active))) {
-		ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n",
-			   dd->ipath_unit, (unsigned long long) val);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/*
-	 * need total length before first word written, plus 2 Dwords. One Dword
-	 * is for padding so we get the full user data when not aligned on
-	 * a word boundary. The other Dword is to make sure we have room for the
-	 * ICRC which gets tacked on later.
-	 */
-	maxlen_reserve = 2 * sizeof(u32);
-	if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {
-		ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n",
-			  dp.len, dd->ipath_ibmaxlen);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	plen = sizeof(u32) + dp.len;
-
-	tmpbuf = vmalloc(plen);
-	if (!tmpbuf) {
-		dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, "
-			 "failing\n");
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	if (copy_from_user(tmpbuf,
-			   (const void __user *) (unsigned long) dp.data,
-			   dp.len)) {
-		ret = -EFAULT;
-		goto bail;
-	}
-
-	plen >>= 2;		/* in dwords */
-
-	piobuf = ipath_getpiobuf(dd, plen, &pbufn);
-	if (!piobuf) {
-		ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n",
-			   dd->ipath_unit);
-		ret = -EBUSY;
-		goto bail;
-	}
-	/* disarm it just to be extra sure */
-	ipath_disarm_piobufs(dd, pbufn, 1);
-
-	if (ipath_debug & __IPATH_PKTDBG)
-		ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n",
-			   dd->ipath_unit, plen - 1, pbufn);
-
-	if (dp.pbc_wd == 0)
-		dp.pbc_wd = plen;
-	writeq(dp.pbc_wd, piobuf);
-	/*
-	 * Copy all by the trigger word, then flush, so it's written
-	 * to chip before trigger word, then write trigger word, then
-	 * flush again, so packet is sent.
-	 */
-	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
-		ipath_flush_wc();
-		__iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);
-		ipath_flush_wc();
-		__raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
-	} else
-		__iowrite32_copy(piobuf + 2, tmpbuf, plen);
-
-	ipath_flush_wc();
-
-	ret = sizeof(dp);
-
-bail:
-	vfree(tmpbuf);
-	return ret;
-}
-
-static int ipath_diag_release(struct inode *in, struct file *fp)
-{
-	mutex_lock(&ipath_mutex);
-	ipath_diag_inuse = 0;
-	fp->private_data = NULL;
-	mutex_unlock(&ipath_mutex);
-	return 0;
-}
-
-static ssize_t ipath_diag_read(struct file *fp, char __user *data,
-			       size_t count, loff_t *off)
-{
-	struct ipath_devdata *dd = fp->private_data;
-	void __iomem *kreg_base;
-	ssize_t ret;
-
-	kreg_base = dd->ipath_kregbase;
-
-	if (count == 0)
-		ret = 0;
-	else if ((count % 4) || (*off % 4))
-		/* address or length is not 32-bit aligned, hence invalid */
-		ret = -EINVAL;
-	else if (ipath_diag_inuse < 1 && (*off || count != 8))
-		ret = -EINVAL;  /* prevent cat /dev/ipath_diag* */
-	else if ((count % 8) || (*off % 8))
-		/* address or length not 64-bit aligned; do 32-bit reads */
-		ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
-	else
-		ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
-
-	if (ret >= 0) {
-		*off += count;
-		ret = count;
-		if (ipath_diag_inuse == -2)
-			ipath_diag_inuse++;
-	}
-
-	return ret;
-}
-
-static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
-				size_t count, loff_t *off)
-{
-	struct ipath_devdata *dd = fp->private_data;
-	void __iomem *kreg_base;
-	ssize_t ret;
-
-	kreg_base = dd->ipath_kregbase;
-
-	if (count == 0)
-		ret = 0;
-	else if ((count % 4) || (*off % 4))
-		/* address or length is not 32-bit aligned, hence invalid */
-		ret = -EINVAL;
-	else if ((ipath_diag_inuse == -1 && (*off || count != 8)) ||
-		 ipath_diag_inuse == -2)  /* read qw off 0, write qw off 0 */
-		ret = -EINVAL;  /* before any other write allowed */
-	else if ((count % 8) || (*off % 8))
-		/* address or length not 64-bit aligned; do 32-bit writes */
-		ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
-	else
-		ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
-
-	if (ret >= 0) {
-		*off += count;
-		ret = count;
-		if (ipath_diag_inuse == -1)
-			ipath_diag_inuse = 1; /* all read/write OK now */
-	}
-
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c
deleted file mode 100644
index 123a8c0..0000000
--- a/drivers/infiniband/hw/ipath/ipath_dma.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2006 QLogic, Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/scatterlist.h>
-#include <linux/gfp.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_verbs.h"
-
-#define BAD_DMA_ADDRESS ((u64) 0)
-
-/*
- * The following functions implement driver specific replacements
- * for the ib_dma_*() functions.
- *
- * These functions return kernel virtual addresses instead of
- * device bus addresses since the driver uses the CPU to copy
- * data instead of using hardware DMA.
- */
-
-static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
-	return dma_addr == BAD_DMA_ADDRESS;
-}
-
-static u64 ipath_dma_map_single(struct ib_device *dev,
-			        void *cpu_addr, size_t size,
-			        enum dma_data_direction direction)
-{
-	BUG_ON(!valid_dma_direction(direction));
-	return (u64) cpu_addr;
-}
-
-static void ipath_dma_unmap_single(struct ib_device *dev,
-				   u64 addr, size_t size,
-				   enum dma_data_direction direction)
-{
-	BUG_ON(!valid_dma_direction(direction));
-}
-
-static u64 ipath_dma_map_page(struct ib_device *dev,
-			      struct page *page,
-			      unsigned long offset,
-			      size_t size,
-			      enum dma_data_direction direction)
-{
-	u64 addr;
-
-	BUG_ON(!valid_dma_direction(direction));
-
-	if (offset + size > PAGE_SIZE) {
-		addr = BAD_DMA_ADDRESS;
-		goto done;
-	}
-
-	addr = (u64) page_address(page);
-	if (addr)
-		addr += offset;
-	/* TODO: handle highmem pages */
-
-done:
-	return addr;
-}
-
-static void ipath_dma_unmap_page(struct ib_device *dev,
-				 u64 addr, size_t size,
-				 enum dma_data_direction direction)
-{
-	BUG_ON(!valid_dma_direction(direction));
-}
-
-static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
-			int nents, enum dma_data_direction direction)
-{
-	struct scatterlist *sg;
-	u64 addr;
-	int i;
-	int ret = nents;
-
-	BUG_ON(!valid_dma_direction(direction));
-
-	for_each_sg(sgl, sg, nents, i) {
-		addr = (u64) page_address(sg_page(sg));
-		/* TODO: handle highmem pages */
-		if (!addr) {
-			ret = 0;
-			break;
-		}
-		sg->dma_address = addr + sg->offset;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-		sg->dma_length = sg->length;
-#endif
-	}
-	return ret;
-}
-
-static void ipath_unmap_sg(struct ib_device *dev,
-			   struct scatterlist *sg, int nents,
-			   enum dma_data_direction direction)
-{
-	BUG_ON(!valid_dma_direction(direction));
-}
-
-static void ipath_sync_single_for_cpu(struct ib_device *dev,
-				      u64 addr,
-				      size_t size,
-				      enum dma_data_direction dir)
-{
-}
-
-static void ipath_sync_single_for_device(struct ib_device *dev,
-					 u64 addr,
-					 size_t size,
-					 enum dma_data_direction dir)
-{
-}
-
-static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size,
-				      u64 *dma_handle, gfp_t flag)
-{
-	struct page *p;
-	void *addr = NULL;
-
-	p = alloc_pages(flag, get_order(size));
-	if (p)
-		addr = page_address(p);
-	if (dma_handle)
-		*dma_handle = (u64) addr;
-	return addr;
-}
-
-static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,
-				    void *cpu_addr, u64 dma_handle)
-{
-	free_pages((unsigned long) cpu_addr, get_order(size));
-}
-
-struct ib_dma_mapping_ops ipath_dma_mapping_ops = {
-	.mapping_error = ipath_mapping_error,
-	.map_single = ipath_dma_map_single,
-	.unmap_single = ipath_dma_unmap_single,
-	.map_page = ipath_dma_map_page,
-	.unmap_page = ipath_dma_unmap_page,
-	.map_sg = ipath_map_sg,
-	.unmap_sg = ipath_unmap_sg,
-	.sync_single_for_cpu = ipath_sync_single_for_cpu,
-	.sync_single_for_device = ipath_sync_single_for_device,
-	.alloc_coherent = ipath_dma_alloc_coherent,
-	.free_coherent = ipath_dma_free_coherent
-};
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
deleted file mode 100644
index 871dbe5..0000000
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ /dev/null
@@ -1,2789 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/netdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#ifdef CONFIG_X86_64
-#include <asm/pat.h>
-#endif
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-
-static void ipath_update_pio_bufs(struct ipath_devdata *);
-
-const char *ipath_get_unit_name(int unit)
-{
-	static char iname[16];
-	snprintf(iname, sizeof iname, "infinipath%u", unit);
-	return iname;
-}
-
-#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
-#define PFX IPATH_DRV_NAME ": "
-
-/*
- * The size has to be longer than this string, so we can append
- * board/chip information to it in the init code.
- */
-const char ib_ipath_version[] = IPATH_IDSTR "\n";
-
-static struct idr unit_table;
-DEFINE_SPINLOCK(ipath_devs_lock);
-LIST_HEAD(ipath_dev_list);
-
-wait_queue_head_t ipath_state_wait;
-
-unsigned ipath_debug = __IPATH_INFO;
-
-module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(debug, "mask for debug prints");
-EXPORT_SYMBOL_GPL(ipath_debug);
-
-unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
-module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
-MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
-
-static unsigned ipath_hol_timeout_ms = 13000;
-module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
-MODULE_PARM_DESC(hol_timeout_ms,
-	"duration of user app suspension after link failure");
-
-unsigned ipath_linkrecovery = 1;
-module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("QLogic <support@qlogic.com>");
-MODULE_DESCRIPTION("QLogic InfiniPath driver");
-
-/*
- * Table to translate the LINKTRAININGSTATE portion of
- * IBCStatus to a human-readable form.
- */
-const char *ipath_ibcstatus_str[] = {
-	"Disabled",
-	"LinkUp",
-	"PollActive",
-	"PollQuiet",
-	"SleepDelay",
-	"SleepQuiet",
-	"LState6",		/* unused */
-	"LState7",		/* unused */
-	"CfgDebounce",
-	"CfgRcvfCfg",
-	"CfgWaitRmt",
-	"CfgIdle",
-	"RecovRetrain",
-	"CfgTxRevLane",		/* unused before IBA7220 */
-	"RecovWaitRmt",
-	"RecovIdle",
-	/* below were added for IBA7220 */
-	"CfgEnhanced",
-	"CfgTest",
-	"CfgWaitRmtTest",
-	"CfgWaitCfgEnhanced",
-	"SendTS_T",
-	"SendTstIdles",
-	"RcvTS_T",
-	"SendTst_TS1s",
-	"LTState18", "LTState19", "LTState1A", "LTState1B",
-	"LTState1C", "LTState1D", "LTState1E", "LTState1F"
-};
-
-static void ipath_remove_one(struct pci_dev *);
-static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
-
-/* Only needed for registration, nothing else needs this info */
-#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
-#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
-
-/* Number of seconds before our card status check...  */
-#define STATUS_TIMEOUT 60
-
-static const struct pci_device_id ipath_pci_tbl[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
-	{ 0, }
-};
-
-MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
-
-static struct pci_driver ipath_driver = {
-	.name = IPATH_DRV_NAME,
-	.probe = ipath_init_one,
-	.remove = ipath_remove_one,
-	.id_table = ipath_pci_tbl,
-	.driver = {
-		.groups = ipath_driver_attr_groups,
-	},
-};
-
-static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
-			     u32 *bar0, u32 *bar1)
-{
-	int ret;
-
-	ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
-	if (ret)
-		ipath_dev_err(dd, "failed to read bar0 before enable: "
-			      "error %d\n", -ret);
-
-	ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
-	if (ret)
-		ipath_dev_err(dd, "failed to read bar1 before enable: "
-			      "error %d\n", -ret);
-
-	ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
-}
-
-static void ipath_free_devdata(struct pci_dev *pdev,
-			       struct ipath_devdata *dd)
-{
-	unsigned long flags;
-
-	pci_set_drvdata(pdev, NULL);
-
-	if (dd->ipath_unit != -1) {
-		spin_lock_irqsave(&ipath_devs_lock, flags);
-		idr_remove(&unit_table, dd->ipath_unit);
-		list_del(&dd->ipath_list);
-		spin_unlock_irqrestore(&ipath_devs_lock, flags);
-	}
-	vfree(dd);
-}
-
-static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
-{
-	unsigned long flags;
-	struct ipath_devdata *dd;
-	int ret;
-
-	dd = vzalloc(sizeof(*dd));
-	if (!dd) {
-		dd = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
-	dd->ipath_unit = -1;
-
-	idr_preload(GFP_KERNEL);
-	spin_lock_irqsave(&ipath_devs_lock, flags);
-
-	ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
-	if (ret < 0) {
-		printk(KERN_ERR IPATH_DRV_NAME
-		       ": Could not allocate unit ID: error %d\n", -ret);
-		ipath_free_devdata(pdev, dd);
-		dd = ERR_PTR(ret);
-		goto bail_unlock;
-	}
-	dd->ipath_unit = ret;
-
-	dd->pcidev = pdev;
-	pci_set_drvdata(pdev, dd);
-
-	list_add(&dd->ipath_list, &ipath_dev_list);
-
-bail_unlock:
-	spin_unlock_irqrestore(&ipath_devs_lock, flags);
-	idr_preload_end();
-bail:
-	return dd;
-}
-
-static inline struct ipath_devdata *__ipath_lookup(int unit)
-{
-	return idr_find(&unit_table, unit);
-}
-
-struct ipath_devdata *ipath_lookup(int unit)
-{
-	struct ipath_devdata *dd;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ipath_devs_lock, flags);
-	dd = __ipath_lookup(unit);
-	spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-	return dd;
-}
-
-int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
-{
-	int nunits, npresent, nup;
-	struct ipath_devdata *dd;
-	unsigned long flags;
-	int maxports;
-
-	nunits = npresent = nup = maxports = 0;
-
-	spin_lock_irqsave(&ipath_devs_lock, flags);
-
-	list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
-		nunits++;
-		if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
-			npresent++;
-		if (dd->ipath_lid &&
-		    !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
-					 | IPATH_LINKUNK)))
-			nup++;
-		if (dd->ipath_cfgports > maxports)
-			maxports = dd->ipath_cfgports;
-	}
-
-	spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-	if (npresentp)
-		*npresentp = npresent;
-	if (nupp)
-		*nupp = nup;
-	if (maxportsp)
-		*maxportsp = maxports;
-
-	return nunits;
-}
-
-/*
- * These next two routines are placeholders in case we don't have per-arch
- * code for controlling write combining.  If explicit control of write
- * combining is not available, performance will probably be awful.
- */
-
-int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
-{
-	return -EOPNOTSUPP;
-}
-
-void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
-{
-}
-
-/*
- * Perform a PIO buffer bandwidth write test, to verify proper system
- * configuration.  Even when all the setup calls work, occasionally
- * BIOS or other issues can prevent write combining from working, or
- * can cause other bandwidth problems to the chip.
- *
- * This test simply writes the same buffer over and over again, and
- * measures close to the peak bandwidth to the chip (not testing
- * data bandwidth to the wire).   On chips that use an address-based
- * trigger to send packets to the wire, this is easy.  On chips that
- * use a count to trigger, we want to make sure that the packet doesn't
- * go out on the wire, or trigger flow control checks.
- */
-static void ipath_verify_pioperf(struct ipath_devdata *dd)
-{
-	u32 pbnum, cnt, lcnt;
-	u32 __iomem *piobuf;
-	u32 *addr;
-	u64 msecs, emsecs;
-
-	piobuf = ipath_getpiobuf(dd, 0, &pbnum);
-	if (!piobuf) {
-		dev_info(&dd->pcidev->dev,
-			"No PIObufs for checking perf, skipping\n");
-		return;
-	}
-
-	/*
-	 * Enough to give us a reasonable test, less than piobuf size, and
-	 * likely multiple of store buffer length.
-	 */
-	cnt = 1024;
-
-	addr = vmalloc(cnt);
-	if (!addr) {
-		dev_info(&dd->pcidev->dev,
-			"Couldn't get memory for checking PIO perf,"
-			" skipping\n");
-		goto done;
-	}
-
-	preempt_disable();  /* we want reasonably accurate elapsed time */
-	msecs = 1 + jiffies_to_msecs(jiffies);
-	for (lcnt = 0; lcnt < 10000U; lcnt++) {
-		/* wait until we cross msec boundary */
-		if (jiffies_to_msecs(jiffies) >= msecs)
-			break;
-		udelay(1);
-	}
-
-	ipath_disable_armlaunch(dd);
-
-	/*
-	 * length 0, no dwords actually sent, and mark as VL15
-	 * on chips where that may matter (due to IB flowcontrol)
-	 */
-	if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
-		writeq(1UL << 63, piobuf);
-	else
-		writeq(0, piobuf);
-	ipath_flush_wc();
-
-	/*
-	 * this is only roughly accurate, since even with preempt we
-	 * still take interrupts that could take a while.   Running for
-	 * >= 5 msec seems to get us "close enough" to accurate values
-	 */
-	msecs = jiffies_to_msecs(jiffies);
-	for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
-		__iowrite32_copy(piobuf + 64, addr, cnt >> 2);
-		emsecs = jiffies_to_msecs(jiffies) - msecs;
-	}
-
-	/* 1 GiB/sec, slightly over IB SDR line rate */
-	if (lcnt < (emsecs * 1024U))
-		ipath_dev_err(dd,
-			"Performance problem: bandwidth to PIO buffers is "
-			"only %u MiB/sec\n",
-			lcnt / (u32) emsecs);
-	else
-		ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
-			lcnt / (u32) emsecs);
-
-	preempt_enable();
-
-	vfree(addr);
-
-done:
-	/* disarm piobuf, so it's available again */
-	ipath_disarm_piobufs(dd, pbnum, 1);
-	ipath_enable_armlaunch(dd);
-}
-
-static void cleanup_device(struct ipath_devdata *dd);
-
-static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-	int ret, len, j;
-	struct ipath_devdata *dd;
-	unsigned long long addr;
-	u32 bar0 = 0, bar1 = 0;
-
-#ifdef CONFIG_X86_64
-	if (pat_enabled()) {
-		pr_warn("ipath needs PAT disabled, boot with nopat kernel parameter\n");
-		ret = -ENODEV;
-		goto bail;
-	}
-#endif
-
-	dd = ipath_alloc_devdata(pdev);
-	if (IS_ERR(dd)) {
-		ret = PTR_ERR(dd);
-		printk(KERN_ERR IPATH_DRV_NAME
-		       ": Could not allocate devdata: error %d\n", -ret);
-		goto bail;
-	}
-
-	ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
-
-	ret = pci_enable_device(pdev);
-	if (ret) {
-		/* This can happen iff:
-		 *
-		 * We did a chip reset, and then failed to reprogram the
-		 * BAR, or the chip reset due to an internal error.  We then
-		 * unloaded the driver and reloaded it.
-		 *
-		 * Both reset cases set the BAR back to initial state.  For
-		 * the latter case, the AER sticky error bit at offset 0x718
-		 * should be set, but the Linux kernel doesn't yet know
-		 * about that, it appears.  If the original BAR was retained
-		 * in the kernel data structures, this may be OK.
-		 */
-		ipath_dev_err(dd, "enable unit %d failed: error %d\n",
-			      dd->ipath_unit, -ret);
-		goto bail_devdata;
-	}
-	addr = pci_resource_start(pdev, 0);
-	len = pci_resource_len(pdev, 0);
-	ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
-		   "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
-		   ent->device, ent->driver_data);
-
-	read_bars(dd, pdev, &bar0, &bar1);
-
-	if (!bar1 && !(bar0 & ~0xf)) {
-		if (addr) {
-			dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
-				 "rewriting as %llx\n", addr);
-			ret = pci_write_config_dword(
-				pdev, PCI_BASE_ADDRESS_0, addr);
-			if (ret) {
-				ipath_dev_err(dd, "rewrite of BAR0 "
-					      "failed: err %d\n", -ret);
-				goto bail_disable;
-			}
-			ret = pci_write_config_dword(
-				pdev, PCI_BASE_ADDRESS_1, addr >> 32);
-			if (ret) {
-				ipath_dev_err(dd, "rewrite of BAR1 "
-					      "failed: err %d\n", -ret);
-				goto bail_disable;
-			}
-		} else {
-			ipath_dev_err(dd, "BAR is 0 (probable RESET), "
-				      "not usable until reboot\n");
-			ret = -ENODEV;
-			goto bail_disable;
-		}
-	}
-
-	ret = pci_request_regions(pdev, IPATH_DRV_NAME);
-	if (ret) {
-		dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
-			 "err %d\n", dd->ipath_unit, -ret);
-		goto bail_disable;
-	}
-
-	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (ret) {
-		/*
-		 * if the 64 bit setup fails, try 32 bit.  Some systems
-		 * do not setup 64 bit maps on systems with 2GB or less
-		 * memory installed.
-		 */
-		ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-		if (ret) {
-			dev_info(&pdev->dev,
-				"Unable to set DMA mask for unit %u: %d\n",
-				dd->ipath_unit, ret);
-			goto bail_regions;
-		}
-		else {
-			ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
-			ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-			if (ret)
-				dev_info(&pdev->dev,
-					"Unable to set DMA consistent mask "
-					"for unit %u: %d\n",
-					dd->ipath_unit, ret);
-
-		}
-	}
-	else {
-		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-		if (ret)
-			dev_info(&pdev->dev,
-				"Unable to set DMA consistent mask "
-				"for unit %u: %d\n",
-				dd->ipath_unit, ret);
-	}
-
-	pci_set_master(pdev);
-
-	/*
-	 * Save BARs to rewrite after device reset.  Save all 64 bits of
-	 * BAR, just in case.
-	 */
-	dd->ipath_pcibar0 = addr;
-	dd->ipath_pcibar1 = addr >> 32;
-	dd->ipath_deviceid = ent->device;	/* save for later use */
-	dd->ipath_vendorid = ent->vendor;
-
-	/* setup the chip-specific functions, as early as possible. */
-	switch (ent->device) {
-	case PCI_DEVICE_ID_INFINIPATH_HT:
-		ipath_init_iba6110_funcs(dd);
-		break;
-
-	default:
-		ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
-			      "failing\n", ent->device);
-		return -ENODEV;
-	}
-
-	for (j = 0; j < 6; j++) {
-		if (!pdev->resource[j].start)
-			continue;
-		ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
-			   j, &pdev->resource[j],
-			   (unsigned long long)pci_resource_len(pdev, j));
-	}
-
-	if (!addr) {
-		ipath_dev_err(dd, "No valid address in BAR 0!\n");
-		ret = -ENODEV;
-		goto bail_regions;
-	}
-
-	dd->ipath_pcirev = pdev->revision;
-
-#if defined(__powerpc__)
-	/* There isn't a generic way to specify writethrough mappings */
-	dd->ipath_kregbase = __ioremap(addr, len,
-		(_PAGE_NO_CACHE|_PAGE_WRITETHRU));
-#else
-	/* XXX: split this properly to enable on PAT */
-	dd->ipath_kregbase = ioremap_nocache(addr, len);
-#endif
-
-	if (!dd->ipath_kregbase) {
-		ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
-			  addr);
-		ret = -ENOMEM;
-		goto bail_iounmap;
-	}
-	dd->ipath_kregend = (u64 __iomem *)
-		((void __iomem *)dd->ipath_kregbase + len);
-	dd->ipath_physaddr = addr;	/* used for io_remap, etc. */
-	/* for user mmap */
-	ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
-		   addr, dd->ipath_kregbase);
-
-	if (dd->ipath_f_bus(dd, pdev))
-		ipath_dev_err(dd, "Failed to setup config space; "
-			      "continuing anyway\n");
-
-	/*
-	 * set up our interrupt handler; IRQF_SHARED probably not needed,
-	 * since MSI interrupts shouldn't be shared but won't  hurt for now.
-	 * check 0 irq after we return from chip-specific bus setup, since
-	 * that can affect this due to setup
-	 */
-	if (!dd->ipath_irq)
-		ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
-			      "work\n");
-	else {
-		ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
-				  IPATH_DRV_NAME, dd);
-		if (ret) {
-			ipath_dev_err(dd, "Couldn't setup irq handler, "
-				      "irq=%d: %d\n", dd->ipath_irq, ret);
-			goto bail_iounmap;
-		}
-	}
-
-	ret = ipath_init_chip(dd, 0);	/* do the chip-specific init */
-	if (ret)
-		goto bail_irqsetup;
-
-	ret = ipath_enable_wc(dd);
-
-	if (ret)
-		ret = 0;
-
-	ipath_verify_pioperf(dd);
-
-	ipath_device_create_group(&pdev->dev, dd);
-	ipathfs_add_device(dd);
-	ipath_user_add(dd);
-	ipath_diag_add(dd);
-	ipath_register_ib_device(dd);
-
-	goto bail;
-
-bail_irqsetup:
-	cleanup_device(dd);
-
-	if (dd->ipath_irq)
-		dd->ipath_f_free_irq(dd);
-
-	if (dd->ipath_f_cleanup)
-		dd->ipath_f_cleanup(dd);
-
-bail_iounmap:
-	iounmap((volatile void __iomem *) dd->ipath_kregbase);
-
-bail_regions:
-	pci_release_regions(pdev);
-
-bail_disable:
-	pci_disable_device(pdev);
-
-bail_devdata:
-	ipath_free_devdata(pdev, dd);
-
-bail:
-	return ret;
-}
-
-static void cleanup_device(struct ipath_devdata *dd)
-{
-	int port;
-	struct ipath_portdata **tmp;
-	unsigned long flags;
-
-	if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
-		/* can't do anything more with chip; needs re-init */
-		*dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
-		if (dd->ipath_kregbase) {
-			/*
-			 * if we haven't already cleaned up before these are
-			 * to ensure any register reads/writes "fail" until
-			 * re-init
-			 */
-			dd->ipath_kregbase = NULL;
-			dd->ipath_uregbase = 0;
-			dd->ipath_sregbase = 0;
-			dd->ipath_cregbase = 0;
-			dd->ipath_kregsize = 0;
-		}
-		ipath_disable_wc(dd);
-	}
-
-	if (dd->ipath_spectriggerhit)
-		dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
-			 dd->ipath_spectriggerhit);
-
-	if (dd->ipath_pioavailregs_dma) {
-		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-				  (void *) dd->ipath_pioavailregs_dma,
-				  dd->ipath_pioavailregs_phys);
-		dd->ipath_pioavailregs_dma = NULL;
-	}
-	if (dd->ipath_dummy_hdrq) {
-		dma_free_coherent(&dd->pcidev->dev,
-			dd->ipath_pd[0]->port_rcvhdrq_size,
-			dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
-		dd->ipath_dummy_hdrq = NULL;
-	}
-
-	if (dd->ipath_pageshadow) {
-		struct page **tmpp = dd->ipath_pageshadow;
-		dma_addr_t *tmpd = dd->ipath_physshadow;
-		int i, cnt = 0;
-
-		ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
-			   "locked\n");
-		for (port = 0; port < dd->ipath_cfgports; port++) {
-			int port_tidbase = port * dd->ipath_rcvtidcnt;
-			int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
-			for (i = port_tidbase; i < maxtid; i++) {
-				if (!tmpp[i])
-					continue;
-				pci_unmap_page(dd->pcidev, tmpd[i],
-					PAGE_SIZE, PCI_DMA_FROMDEVICE);
-				ipath_release_user_pages(&tmpp[i], 1);
-				tmpp[i] = NULL;
-				cnt++;
-			}
-		}
-		if (cnt) {
-			ipath_stats.sps_pageunlocks += cnt;
-			ipath_cdbg(VERBOSE, "There were still %u expTID "
-				   "entries locked\n", cnt);
-		}
-		if (ipath_stats.sps_pagelocks ||
-		    ipath_stats.sps_pageunlocks)
-			ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
-				   "unlocked via ipath_m{un}lock\n",
-				   (unsigned long long)
-				   ipath_stats.sps_pagelocks,
-				   (unsigned long long)
-				   ipath_stats.sps_pageunlocks);
-
-		ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
-			   dd->ipath_pageshadow);
-		tmpp = dd->ipath_pageshadow;
-		dd->ipath_pageshadow = NULL;
-		vfree(tmpp);
-
-		dd->ipath_egrtidbase = NULL;
-	}
-
-	/*
-	 * free any resources still in use (usually just kernel ports)
-	 * at unload; we do for portcnt, because that's what we allocate.
-	 * We acquire lock to be really paranoid that ipath_pd isn't being
-	 * accessed from some interrupt-related code (that should not happen,
-	 * but best to be sure).
-	 */
-	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-	tmp = dd->ipath_pd;
-	dd->ipath_pd = NULL;
-	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-	for (port = 0; port < dd->ipath_portcnt; port++) {
-		struct ipath_portdata *pd = tmp[port];
-		tmp[port] = NULL; /* debugging paranoia */
-		ipath_free_pddata(dd, pd);
-	}
-	kfree(tmp);
-}
-
-static void ipath_remove_one(struct pci_dev *pdev)
-{
-	struct ipath_devdata *dd = pci_get_drvdata(pdev);
-
-	ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
-
-	/*
-	 * disable the IB link early, to be sure no new packets arrive, which
-	 * complicates the shutdown process
-	 */
-	ipath_shutdown_device(dd);
-
-	flush_workqueue(ib_wq);
-
-	if (dd->verbs_dev)
-		ipath_unregister_ib_device(dd->verbs_dev);
-
-	ipath_diag_remove(dd);
-	ipath_user_remove(dd);
-	ipathfs_remove_device(dd);
-	ipath_device_remove_group(&pdev->dev, dd);
-
-	ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
-		   "unit %u\n", dd, (u32) dd->ipath_unit);
-
-	cleanup_device(dd);
-
-	/*
-	 * turn off rcv, send, and interrupts for all ports, all drivers
-	 * should also hard reset the chip here?
-	 * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
-	 * for all versions of the driver, if they were allocated
-	 */
-	if (dd->ipath_irq) {
-		ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
-			   dd->ipath_unit, dd->ipath_irq);
-		dd->ipath_f_free_irq(dd);
-	} else
-		ipath_dbg("irq is 0, not doing free_irq "
-			  "for unit %u\n", dd->ipath_unit);
-	/*
-	 * we check for NULL here, because it's outside
-	 * the kregbase check, and we need to call it
-	 * after the free_irq.	Thus it's possible that
-	 * the function pointers were never initialized.
-	 */
-	if (dd->ipath_f_cleanup)
-		/* clean up chip-specific stuff */
-		dd->ipath_f_cleanup(dd);
-
-	ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
-	iounmap((volatile void __iomem *) dd->ipath_kregbase);
-	pci_release_regions(pdev);
-	ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
-	pci_disable_device(pdev);
-
-	ipath_free_devdata(pdev, dd);
-}
-
-/* general driver use */
-DEFINE_MUTEX(ipath_mutex);
-
-static DEFINE_SPINLOCK(ipath_pioavail_lock);
-
-/**
- * ipath_disarm_piobufs - cancel a range of PIO buffers
- * @dd: the infinipath device
- * @first: the first PIO buffer to cancel
- * @cnt: the number of PIO buffers to cancel
- *
- * cancel a range of PIO buffers, used when they might be armed, but
- * not triggered.  Used at init to ensure buffer state, and also user
- * process close, in case it died while writing to a PIO buffer
- * Also after errors.
- */
-void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
-			  unsigned cnt)
-{
-	unsigned i, last = first + cnt;
-	unsigned long flags;
-
-	ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
-	for (i = first; i < last; i++) {
-		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-		/*
-		 * The disarm-related bits are write-only, so it
-		 * is ok to OR them in with our copy of sendctrl
-		 * while we hold the lock.
-		 */
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			dd->ipath_sendctrl | INFINIPATH_S_DISARM |
-			(i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
-		/* can't disarm bufs back-to-back per iba7220 spec */
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-	}
-	/* on some older chips, update may not happen after cancel */
-	ipath_force_pio_avail_update(dd);
-}
-
-/**
- * ipath_wait_linkstate - wait for an IB link state change to occur
- * @dd: the infinipath device
- * @state: the state to wait for
- * @msecs: the number of milliseconds to wait
- *
- * wait up to msecs milliseconds for IB link state change to occur for
- * now, take the easy polling route.  Currently used only by
- * ipath_set_linkstate.  Returns 0 if state reached, otherwise
- * -ETIMEDOUT state can have multiple states set, for any of several
- * transitions.
- */
-int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
-{
-	dd->ipath_state_wanted = state;
-	wait_event_interruptible_timeout(ipath_state_wait,
-					 (dd->ipath_flags & state),
-					 msecs_to_jiffies(msecs));
-	dd->ipath_state_wanted = 0;
-
-	if (!(dd->ipath_flags & state)) {
-		u64 val;
-		ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
-			   " ms\n",
-			   /* test INIT ahead of DOWN, both can be set */
-			   (state & IPATH_LINKINIT) ? "INIT" :
-			   ((state & IPATH_LINKDOWN) ? "DOWN" :
-			    ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
-			   msecs);
-		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-		ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
-			   (unsigned long long) ipath_read_kreg64(
-				   dd, dd->ipath_kregs->kr_ibcctrl),
-			   (unsigned long long) val,
-			   ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
-	}
-	return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
-}
-
-static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
-	char *buf, size_t blen)
-{
-	static const struct {
-		ipath_err_t err;
-		const char *msg;
-	} errs[] = {
-		{ INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
-		{ INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
-		{ INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
-		{ INFINIPATH_E_SDMABASE, "SDmaBase" },
-		{ INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
-		{ INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
-		{ INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
-		{ INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
-		{ INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
-		{ INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
-		{ INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
-		{ INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
-	};
-	int i;
-	int expected;
-	size_t bidx = 0;
-
-	for (i = 0; i < ARRAY_SIZE(errs); i++) {
-		expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
-			test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-		if ((err & errs[i].err) && !expected)
-			bidx += snprintf(buf + bidx, blen - bidx,
-					 "%s ", errs[i].msg);
-	}
-}
-
-/*
- * Decode the error status into strings, deciding whether to always
- * print * it or not depending on "normal packet errors" vs everything
- * else.   Return 1 if "real" errors, otherwise 0 if only packet
- * errors, so caller can decide what to print with the string.
- */
-int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
-	ipath_err_t err)
-{
-	int iserr = 1;
-	*buf = '\0';
-	if (err & INFINIPATH_E_PKTERRS) {
-		if (!(err & ~INFINIPATH_E_PKTERRS))
-			iserr = 0; // if only packet errors.
-		if (ipath_debug & __IPATH_ERRPKTDBG) {
-			if (err & INFINIPATH_E_REBP)
-				strlcat(buf, "EBP ", blen);
-			if (err & INFINIPATH_E_RVCRC)
-				strlcat(buf, "VCRC ", blen);
-			if (err & INFINIPATH_E_RICRC) {
-				strlcat(buf, "CRC ", blen);
-				// clear for check below, so only once
-				err &= INFINIPATH_E_RICRC;
-			}
-			if (err & INFINIPATH_E_RSHORTPKTLEN)
-				strlcat(buf, "rshortpktlen ", blen);
-			if (err & INFINIPATH_E_SDROPPEDDATAPKT)
-				strlcat(buf, "sdroppeddatapkt ", blen);
-			if (err & INFINIPATH_E_SPKTLEN)
-				strlcat(buf, "spktlen ", blen);
-		}
-		if ((err & INFINIPATH_E_RICRC) &&
-			!(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
-			strlcat(buf, "CRC ", blen);
-		if (!iserr)
-			goto done;
-	}
-	if (err & INFINIPATH_E_RHDRLEN)
-		strlcat(buf, "rhdrlen ", blen);
-	if (err & INFINIPATH_E_RBADTID)
-		strlcat(buf, "rbadtid ", blen);
-	if (err & INFINIPATH_E_RBADVERSION)
-		strlcat(buf, "rbadversion ", blen);
-	if (err & INFINIPATH_E_RHDR)
-		strlcat(buf, "rhdr ", blen);
-	if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
-		strlcat(buf, "sendspecialtrigger ", blen);
-	if (err & INFINIPATH_E_RLONGPKTLEN)
-		strlcat(buf, "rlongpktlen ", blen);
-	if (err & INFINIPATH_E_RMAXPKTLEN)
-		strlcat(buf, "rmaxpktlen ", blen);
-	if (err & INFINIPATH_E_RMINPKTLEN)
-		strlcat(buf, "rminpktlen ", blen);
-	if (err & INFINIPATH_E_SMINPKTLEN)
-		strlcat(buf, "sminpktlen ", blen);
-	if (err & INFINIPATH_E_RFORMATERR)
-		strlcat(buf, "rformaterr ", blen);
-	if (err & INFINIPATH_E_RUNSUPVL)
-		strlcat(buf, "runsupvl ", blen);
-	if (err & INFINIPATH_E_RUNEXPCHAR)
-		strlcat(buf, "runexpchar ", blen);
-	if (err & INFINIPATH_E_RIBFLOW)
-		strlcat(buf, "ribflow ", blen);
-	if (err & INFINIPATH_E_SUNDERRUN)
-		strlcat(buf, "sunderrun ", blen);
-	if (err & INFINIPATH_E_SPIOARMLAUNCH)
-		strlcat(buf, "spioarmlaunch ", blen);
-	if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
-		strlcat(buf, "sunexperrpktnum ", blen);
-	if (err & INFINIPATH_E_SDROPPEDSMPPKT)
-		strlcat(buf, "sdroppedsmppkt ", blen);
-	if (err & INFINIPATH_E_SMAXPKTLEN)
-		strlcat(buf, "smaxpktlen ", blen);
-	if (err & INFINIPATH_E_SUNSUPVL)
-		strlcat(buf, "sunsupVL ", blen);
-	if (err & INFINIPATH_E_INVALIDADDR)
-		strlcat(buf, "invalidaddr ", blen);
-	if (err & INFINIPATH_E_RRCVEGRFULL)
-		strlcat(buf, "rcvegrfull ", blen);
-	if (err & INFINIPATH_E_RRCVHDRFULL)
-		strlcat(buf, "rcvhdrfull ", blen);
-	if (err & INFINIPATH_E_IBSTATUSCHANGED)
-		strlcat(buf, "ibcstatuschg ", blen);
-	if (err & INFINIPATH_E_RIBLOSTLINK)
-		strlcat(buf, "riblostlink ", blen);
-	if (err & INFINIPATH_E_HARDWARE)
-		strlcat(buf, "hardware ", blen);
-	if (err & INFINIPATH_E_RESET)
-		strlcat(buf, "reset ", blen);
-	if (err & INFINIPATH_E_SDMAERRS)
-		decode_sdma_errs(dd, err, buf, blen);
-	if (err & INFINIPATH_E_INVALIDEEPCMD)
-		strlcat(buf, "invalideepromcmd ", blen);
-done:
-	return iserr;
-}
-
-/**
- * get_rhf_errstring - decode RHF errors
- * @err: the err number
- * @msg: the output buffer
- * @len: the length of the output buffer
- *
- * only used one place now, may want more later
- */
-static void get_rhf_errstring(u32 err, char *msg, size_t len)
-{
-	/* if no errors, and so don't need to check what's first */
-	*msg = '\0';
-
-	if (err & INFINIPATH_RHF_H_ICRCERR)
-		strlcat(msg, "icrcerr ", len);
-	if (err & INFINIPATH_RHF_H_VCRCERR)
-		strlcat(msg, "vcrcerr ", len);
-	if (err & INFINIPATH_RHF_H_PARITYERR)
-		strlcat(msg, "parityerr ", len);
-	if (err & INFINIPATH_RHF_H_LENERR)
-		strlcat(msg, "lenerr ", len);
-	if (err & INFINIPATH_RHF_H_MTUERR)
-		strlcat(msg, "mtuerr ", len);
-	if (err & INFINIPATH_RHF_H_IHDRERR)
-		/* infinipath hdr checksum error */
-		strlcat(msg, "ipathhdrerr ", len);
-	if (err & INFINIPATH_RHF_H_TIDERR)
-		strlcat(msg, "tiderr ", len);
-	if (err & INFINIPATH_RHF_H_MKERR)
-		/* bad port, offset, etc. */
-		strlcat(msg, "invalid ipathhdr ", len);
-	if (err & INFINIPATH_RHF_H_IBERR)
-		strlcat(msg, "iberr ", len);
-	if (err & INFINIPATH_RHF_L_SWA)
-		strlcat(msg, "swA ", len);
-	if (err & INFINIPATH_RHF_L_SWB)
-		strlcat(msg, "swB ", len);
-}
-
-/**
- * ipath_get_egrbuf - get an eager buffer
- * @dd: the infinipath device
- * @bufnum: the eager buffer to get
- *
- * must only be called if ipath_pd[port] is known to be allocated
- */
-static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
-{
-	return dd->ipath_port0_skbinfo ?
-		(void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
-}
-
-/**
- * ipath_alloc_skb - allocate an skb and buffer with possible constraints
- * @dd: the infinipath device
- * @gfp_mask: the sk_buff SFP mask
- */
-struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
-				gfp_t gfp_mask)
-{
-	struct sk_buff *skb;
-	u32 len;
-
-	/*
-	 * Only fully supported way to handle this is to allocate lots
-	 * extra, align as needed, and then do skb_reserve().  That wastes
-	 * a lot of memory...  I'll have to hack this into infinipath_copy
-	 * also.
-	 */
-
-	/*
-	 * We need 2 extra bytes for ipath_ether data sent in the
-	 * key header.  In order to keep everything dword aligned,
-	 * we'll reserve 4 bytes.
-	 */
-	len = dd->ipath_ibmaxlen + 4;
-
-	if (dd->ipath_flags & IPATH_4BYTE_TID) {
-		/* We need a 2KB multiple alignment, and there is no way
-		 * to do it except to allocate extra and then skb_reserve
-		 * enough to bring it up to the right alignment.
-		 */
-		len += 2047;
-	}
-
-	skb = __dev_alloc_skb(len, gfp_mask);
-	if (!skb) {
-		ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
-			      len);
-		goto bail;
-	}
-
-	skb_reserve(skb, 4);
-
-	if (dd->ipath_flags & IPATH_4BYTE_TID) {
-		u32 una = (unsigned long)skb->data & 2047;
-		if (una)
-			skb_reserve(skb, 2048 - una);
-	}
-
-bail:
-	return skb;
-}
-
-static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
-			     u32 eflags,
-			     u32 l,
-			     u32 etail,
-			     __le32 *rhf_addr,
-			     struct ipath_message_header *hdr)
-{
-	char emsg[128];
-
-	get_rhf_errstring(eflags, emsg, sizeof emsg);
-	ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
-		   "tlen=%x opcode=%x egridx=%x: %s\n",
-		   eflags, l,
-		   ipath_hdrget_rcv_type(rhf_addr),
-		   ipath_hdrget_length_in_bytes(rhf_addr),
-		   be32_to_cpu(hdr->bth[0]) >> 24,
-		   etail, emsg);
-
-	/* Count local link integrity errors. */
-	if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
-		u8 n = (dd->ipath_ibcctrl >>
-			INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-			INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-
-		if (++dd->ipath_lli_counter > n) {
-			dd->ipath_lli_counter = 0;
-			dd->ipath_lli_errors++;
-		}
-	}
-}
-
-/*
- * ipath_kreceive - receive a packet
- * @pd: the infinipath port
- *
- * called from interrupt handler for errors or receive interrupt
- */
-void ipath_kreceive(struct ipath_portdata *pd)
-{
-	struct ipath_devdata *dd = pd->port_dd;
-	__le32 *rhf_addr;
-	void *ebuf;
-	const u32 rsize = dd->ipath_rcvhdrentsize;	/* words */
-	const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize;	/* words */
-	u32 etail = -1, l, hdrqtail;
-	struct ipath_message_header *hdr;
-	u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
-	static u64 totcalls;	/* stats, may eventually remove */
-	int last;
-
-	l = pd->port_head;
-	rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
-	if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-		u32 seq = ipath_hdrget_seq(rhf_addr);
-
-		if (seq != pd->port_seq_cnt)
-			goto bail;
-		hdrqtail = 0;
-	} else {
-		hdrqtail = ipath_get_rcvhdrtail(pd);
-		if (l == hdrqtail)
-			goto bail;
-		smp_rmb();
-	}
-
-reloop:
-	for (last = 0, i = 1; !last; i += !last) {
-		hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
-		eflags = ipath_hdrget_err_flags(rhf_addr);
-		etype = ipath_hdrget_rcv_type(rhf_addr);
-		/* total length */
-		tlen = ipath_hdrget_length_in_bytes(rhf_addr);
-		ebuf = NULL;
-		if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
-		    ipath_hdrget_use_egr_buf(rhf_addr) :
-		    (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
-			/*
-			 * It turns out that the chip uses an eager buffer
-			 * for all non-expected packets, whether it "needs"
-			 * one or not.  So always get the index, but don't
-			 * set ebuf (so we try to copy data) unless the
-			 * length requires it.
-			 */
-			etail = ipath_hdrget_index(rhf_addr);
-			updegr = 1;
-			if (tlen > sizeof(*hdr) ||
-			    etype == RCVHQ_RCV_TYPE_NON_KD)
-				ebuf = ipath_get_egrbuf(dd, etail);
-		}
-
-		/*
-		 * both tiderr and ipathhdrerr are set for all plain IB
-		 * packets; only ipathhdrerr should be set.
-		 */
-
-		if (etype != RCVHQ_RCV_TYPE_NON_KD &&
-		    etype != RCVHQ_RCV_TYPE_ERROR &&
-		    ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
-		    IPS_PROTO_VERSION)
-			ipath_cdbg(PKT, "Bad InfiniPath protocol version "
-				   "%x\n", etype);
-
-		if (unlikely(eflags))
-			ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
-		else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
-			ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
-			if (dd->ipath_lli_counter)
-				dd->ipath_lli_counter--;
-		} else if (etype == RCVHQ_RCV_TYPE_EAGER) {
-			u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
-			u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
-			ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
-				   "qp=%x), len %x; ignored\n",
-				   etype, opcode, qp, tlen);
-		}
-		else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
-			ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
-				  be32_to_cpu(hdr->bth[0]) >> 24);
-		else {
-			/*
-			 * error packet, type of error unknown.
-			 * Probably type 3, but we don't know, so don't
-			 * even try to print the opcode, etc.
-			 * Usually caused by a "bad packet", that has no
-			 * BTH, when the LRH says it should.
-			 */
-			ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
-				  " %x, len %x hdrq+%x rhf: %Lx\n",
-				  etail, tlen, l, (unsigned long long)
-				  le64_to_cpu(*(__le64 *) rhf_addr));
-			if (ipath_debug & __IPATH_ERRPKTDBG) {
-				u32 j, *d, dw = rsize-2;
-				if (rsize > (tlen>>2))
-					dw = tlen>>2;
-				d = (u32 *)hdr;
-				printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
-					dw);
-				for (j = 0; j < dw; j++)
-					printk(KERN_DEBUG "%8x%s", d[j],
-						(j%8) == 7 ? "\n" : " ");
-				printk(KERN_DEBUG ".\n");
-			}
-		}
-		l += rsize;
-		if (l >= maxcnt)
-			l = 0;
-		rhf_addr = (__le32 *) pd->port_rcvhdrq +
-			l + dd->ipath_rhf_offset;
-		if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-			u32 seq = ipath_hdrget_seq(rhf_addr);
-
-			if (++pd->port_seq_cnt > 13)
-				pd->port_seq_cnt = 1;
-			if (seq != pd->port_seq_cnt)
-				last = 1;
-		} else if (l == hdrqtail)
-			last = 1;
-		/*
-		 * update head regs on last packet, and every 16 packets.
-		 * Reduce bus traffic, while still trying to prevent
-		 * rcvhdrq overflows, for when the queue is nearly full
-		 */
-		if (last || !(i & 0xf)) {
-			u64 lval = l;
-
-			/* request IBA6120 and 7220 interrupt only on last */
-			if (last)
-				lval |= dd->ipath_rhdrhead_intr_off;
-			ipath_write_ureg(dd, ur_rcvhdrhead, lval,
-				pd->port_port);
-			if (updegr) {
-				ipath_write_ureg(dd, ur_rcvegrindexhead,
-						 etail, pd->port_port);
-				updegr = 0;
-			}
-		}
-	}
-
-	if (!dd->ipath_rhdrhead_intr_off && !reloop &&
-	    !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-		/* IBA6110 workaround; we can have a race clearing chip
-		 * interrupt with another interrupt about to be delivered,
-		 * and can clear it before it is delivered on the GPIO
-		 * workaround.  By doing the extra check here for the
-		 * in-memory tail register updating while we were doing
-		 * earlier packets, we "almost" guarantee we have covered
-		 * that case.
-		 */
-		u32 hqtail = ipath_get_rcvhdrtail(pd);
-		if (hqtail != hdrqtail) {
-			hdrqtail = hqtail;
-			reloop = 1; /* loop 1 extra time at most */
-			goto reloop;
-		}
-	}
-
-	pkttot += i;
-
-	pd->port_head = l;
-
-	if (pkttot > ipath_stats.sps_maxpkts_call)
-		ipath_stats.sps_maxpkts_call = pkttot;
-	ipath_stats.sps_port0pkts += pkttot;
-	ipath_stats.sps_avgpkts_call =
-		ipath_stats.sps_port0pkts / ++totcalls;
-
-bail:;
-}
-
-/**
- * ipath_update_pio_bufs - update shadow copy of the PIO availability map
- * @dd: the infinipath device
- *
- * called whenever our local copy indicates we have run out of send buffers
- * NOTE: This can be called from interrupt context by some code
- * and from non-interrupt context by ipath_getpiobuf().
- */
-
-static void ipath_update_pio_bufs(struct ipath_devdata *dd)
-{
-	unsigned long flags;
-	int i;
-	const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
-
-	/* If the generation (check) bits have changed, then we update the
-	 * busy bit for the corresponding PIO buffer.  This algorithm will
-	 * modify positions to the value they already have in some cases
-	 * (i.e., no change), but it's faster than changing only the bits
-	 * that have changed.
-	 *
-	 * We would like to do this atomicly, to avoid spinlocks in the
-	 * critical send path, but that's not really possible, given the
-	 * type of changes, and that this routine could be called on
-	 * multiple cpu's simultaneously, so we lock in this routine only,
-	 * to avoid conflicting updates; all we change is the shadow, and
-	 * it's a single 64 bit memory location, so by definition the update
-	 * is atomic in terms of what other cpu's can see in testing the
-	 * bits.  The spin_lock overhead isn't too bad, since it only
-	 * happens when all buffers are in use, so only cpu overhead, not
-	 * latency or bandwidth is affected.
-	 */
-	if (!dd->ipath_pioavailregs_dma) {
-		ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
-		return;
-	}
-	if (ipath_debug & __IPATH_VERBDBG) {
-		/* only if packet debug and verbose */
-		volatile __le64 *dma = dd->ipath_pioavailregs_dma;
-		unsigned long *shadow = dd->ipath_pioavailshadow;
-
-		ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
-			   "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
-			   "s3=%lx\n",
-			   (unsigned long long) le64_to_cpu(dma[0]),
-			   shadow[0],
-			   (unsigned long long) le64_to_cpu(dma[1]),
-			   shadow[1],
-			   (unsigned long long) le64_to_cpu(dma[2]),
-			   shadow[2],
-			   (unsigned long long) le64_to_cpu(dma[3]),
-			   shadow[3]);
-		if (piobregs > 4)
-			ipath_cdbg(
-				PKT, "2nd group, dma4=%llx shad4=%lx, "
-				"d5=%llx s5=%lx, d6=%llx s6=%lx, "
-				"d7=%llx s7=%lx\n",
-				(unsigned long long) le64_to_cpu(dma[4]),
-				shadow[4],
-				(unsigned long long) le64_to_cpu(dma[5]),
-				shadow[5],
-				(unsigned long long) le64_to_cpu(dma[6]),
-				shadow[6],
-				(unsigned long long) le64_to_cpu(dma[7]),
-				shadow[7]);
-	}
-	spin_lock_irqsave(&ipath_pioavail_lock, flags);
-	for (i = 0; i < piobregs; i++) {
-		u64 pchbusy, pchg, piov, pnew;
-		/*
-		 * Chip Errata: bug 6641; even and odd qwords>3 are swapped
-		 */
-		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
-			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
-		else
-			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
-		pchg = dd->ipath_pioavailkernel[i] &
-			~(dd->ipath_pioavailshadow[i] ^ piov);
-		pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
-		if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
-			pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
-			pnew |= piov & pchbusy;
-			dd->ipath_pioavailshadow[i] = pnew;
-		}
-	}
-	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-}
-
-/*
- * used to force update of pioavailshadow if we can't get a pio buffer.
- * Needed primarily due to exitting freeze mode after recovering
- * from errors.  Done lazily, because it's safer (known to not
- * be writing pio buffers).
- */
-static void ipath_reset_availshadow(struct ipath_devdata *dd)
-{
-	int i, im;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ipath_pioavail_lock, flags);
-	for (i = 0; i < dd->ipath_pioavregs; i++) {
-		u64 val, oldval;
-		/* deal with 6110 chip bug on high register #s */
-		im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
-			i ^ 1 : i;
-		val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
-		/*
-		 * busy out the buffers not in the kernel avail list,
-		 * without changing the generation bits.
-		 */
-		oldval = dd->ipath_pioavailshadow[i];
-		dd->ipath_pioavailshadow[i] = val |
-			((~dd->ipath_pioavailkernel[i] <<
-			INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
-			0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
-		if (oldval != dd->ipath_pioavailshadow[i])
-			ipath_dbg("shadow[%d] was %Lx, now %lx\n",
-				i, (unsigned long long) oldval,
-				dd->ipath_pioavailshadow[i]);
-	}
-	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-}
-
-/**
- * ipath_setrcvhdrsize - set the receive header size
- * @dd: the infinipath device
- * @rhdrsize: the receive header size
- *
- * called from user init code, and also layered driver init
- */
-int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
-{
-	int ret = 0;
-
-	if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
-		if (dd->ipath_rcvhdrsize != rhdrsize) {
-			dev_info(&dd->pcidev->dev,
-				 "Error: can't set protocol header "
-				 "size %u, already %u\n",
-				 rhdrsize, dd->ipath_rcvhdrsize);
-			ret = -EAGAIN;
-		} else
-			ipath_cdbg(VERBOSE, "Reuse same protocol header "
-				   "size %u\n", dd->ipath_rcvhdrsize);
-	} else if (rhdrsize > (dd->ipath_rcvhdrentsize -
-			       (sizeof(u64) / sizeof(u32)))) {
-		ipath_dbg("Error: can't set protocol header size %u "
-			  "(> max %u)\n", rhdrsize,
-			  dd->ipath_rcvhdrentsize -
-			  (u32) (sizeof(u64) / sizeof(u32)));
-		ret = -EOVERFLOW;
-	} else {
-		dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
-		dd->ipath_rcvhdrsize = rhdrsize;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
-				 dd->ipath_rcvhdrsize);
-		ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
-			   dd->ipath_rcvhdrsize);
-	}
-	return ret;
-}
-
-/*
- * debugging code and stats updates if no pio buffers available.
- */
-static noinline void no_pio_bufs(struct ipath_devdata *dd)
-{
-	unsigned long *shadow = dd->ipath_pioavailshadow;
-	__le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
-
-	dd->ipath_upd_pio_shadow = 1;
-
-	/*
-	 * not atomic, but if we lose a stat count in a while, that's OK
-	 */
-	ipath_stats.sps_nopiobufs++;
-	if (!(++dd->ipath_consec_nopiobuf % 100000)) {
-		ipath_force_pio_avail_update(dd); /* at start */
-		ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
-			"%llx %llx %llx %llx\n"
-			"ipath  shadow:  %lx %lx %lx %lx\n",
-			dd->ipath_consec_nopiobuf,
-			(unsigned long)get_cycles(),
-			(unsigned long long) le64_to_cpu(dma[0]),
-			(unsigned long long) le64_to_cpu(dma[1]),
-			(unsigned long long) le64_to_cpu(dma[2]),
-			(unsigned long long) le64_to_cpu(dma[3]),
-			shadow[0], shadow[1], shadow[2], shadow[3]);
-		/*
-		 * 4 buffers per byte, 4 registers above, cover rest
-		 * below
-		 */
-		if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
-		    (sizeof(shadow[0]) * 4 * 4))
-			ipath_dbg("2nd group: dmacopy: "
-				  "%llx %llx %llx %llx\n"
-				  "ipath  shadow:  %lx %lx %lx %lx\n",
-				  (unsigned long long)le64_to_cpu(dma[4]),
-				  (unsigned long long)le64_to_cpu(dma[5]),
-				  (unsigned long long)le64_to_cpu(dma[6]),
-				  (unsigned long long)le64_to_cpu(dma[7]),
-				  shadow[4], shadow[5], shadow[6], shadow[7]);
-
-		/* at end, so update likely happened */
-		ipath_reset_availshadow(dd);
-	}
-}
-
-/*
- * common code for normal driver pio buffer allocation, and reserved
- * allocation.
- *
- * do appropriate marking as busy, etc.
- * returns buffer number if one found (>=0), negative number is error.
- */
-static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
-	u32 *pbufnum, u32 first, u32 last, u32 firsti)
-{
-	int i, j, updated = 0;
-	unsigned piobcnt;
-	unsigned long flags;
-	unsigned long *shadow = dd->ipath_pioavailshadow;
-	u32 __iomem *buf;
-
-	piobcnt = last - first;
-	if (dd->ipath_upd_pio_shadow) {
-		/*
-		 * Minor optimization.  If we had no buffers on last call,
-		 * start out by doing the update; continue and do scan even
-		 * if no buffers were updated, to be paranoid
-		 */
-		ipath_update_pio_bufs(dd);
-		updated++;
-		i = first;
-	} else
-		i = firsti;
-rescan:
-	/*
-	 * while test_and_set_bit() is atomic, we do that and then the
-	 * change_bit(), and the pair is not.  See if this is the cause
-	 * of the remaining armlaunch errors.
-	 */
-	spin_lock_irqsave(&ipath_pioavail_lock, flags);
-	for (j = 0; j < piobcnt; j++, i++) {
-		if (i >= last)
-			i = first;
-		if (__test_and_set_bit((2 * i) + 1, shadow))
-			continue;
-		/* flip generation bit */
-		__change_bit(2 * i, shadow);
-		break;
-	}
-	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-
-	if (j == piobcnt) {
-		if (!updated) {
-			/*
-			 * first time through; shadow exhausted, but may be
-			 * buffers available, try an update and then rescan.
-			 */
-			ipath_update_pio_bufs(dd);
-			updated++;
-			i = first;
-			goto rescan;
-		} else if (updated == 1 && piobcnt <=
-			((dd->ipath_sendctrl
-			>> INFINIPATH_S_UPDTHRESH_SHIFT) &
-			INFINIPATH_S_UPDTHRESH_MASK)) {
-			/*
-			 * for chips supporting and using the update
-			 * threshold we need to force an update of the
-			 * in-memory copy if the count is less than the
-			 * thershold, then check one more time.
-			 */
-			ipath_force_pio_avail_update(dd);
-			ipath_update_pio_bufs(dd);
-			updated++;
-			i = first;
-			goto rescan;
-		}
-
-		no_pio_bufs(dd);
-		buf = NULL;
-	} else {
-		if (i < dd->ipath_piobcnt2k)
-			buf = (u32 __iomem *) (dd->ipath_pio2kbase +
-					       i * dd->ipath_palign);
-		else
-			buf = (u32 __iomem *)
-				(dd->ipath_pio4kbase +
-				 (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
-		if (pbufnum)
-			*pbufnum = i;
-	}
-
-	return buf;
-}
-
-/**
- * ipath_getpiobuf - find an available pio buffer
- * @dd: the infinipath device
- * @plen: the size of the PIO buffer needed in 32-bit words
- * @pbufnum: the buffer number is placed here
- */
-u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
-{
-	u32 __iomem *buf;
-	u32 pnum, nbufs;
-	u32 first, lasti;
-
-	if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
-		first = dd->ipath_piobcnt2k;
-		lasti = dd->ipath_lastpioindexl;
-	} else {
-		first = 0;
-		lasti = dd->ipath_lastpioindex;
-	}
-	nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-	buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
-
-	if (buf) {
-		/*
-		 * Set next starting place.  It's just an optimization,
-		 * it doesn't matter who wins on this, so no locking
-		 */
-		if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
-			dd->ipath_lastpioindexl = pnum + 1;
-		else
-			dd->ipath_lastpioindex = pnum + 1;
-		if (dd->ipath_upd_pio_shadow)
-			dd->ipath_upd_pio_shadow = 0;
-		if (dd->ipath_consec_nopiobuf)
-			dd->ipath_consec_nopiobuf = 0;
-		ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
-			   pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
-		if (pbufnum)
-			*pbufnum = pnum;
-
-	}
-	return buf;
-}
-
-/**
- * ipath_chg_pioavailkernel - change which send buffers are available for kernel
- * @dd: the infinipath device
- * @start: the starting send buffer number
- * @len: the number of send buffers
- * @avail: true if the buffers are available for kernel use, false otherwise
- */
-void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
-			      unsigned len, int avail)
-{
-	unsigned long flags;
-	unsigned end, cnt = 0;
-
-	/* There are two bits per send buffer (busy and generation) */
-	start *= 2;
-	end = start + len * 2;
-
-	spin_lock_irqsave(&ipath_pioavail_lock, flags);
-	/* Set or clear the busy bit in the shadow. */
-	while (start < end) {
-		if (avail) {
-			unsigned long dma;
-			int i, im;
-			/*
-			 * the BUSY bit will never be set, because we disarm
-			 * the user buffers before we hand them back to the
-			 * kernel.  We do have to make sure the generation
-			 * bit is set correctly in shadow, since it could
-			 * have changed many times while allocated to user.
-			 * We can't use the bitmap functions on the full
-			 * dma array because it is always little-endian, so
-			 * we have to flip to host-order first.
-			 * BITS_PER_LONG is slightly wrong, since it's
-			 * always 64 bits per register in chip...
-			 * We only work on 64 bit kernels, so that's OK.
-			 */
-			/* deal with 6110 chip bug on high register #s */
-			i = start / BITS_PER_LONG;
-			im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
-				i ^ 1 : i;
-			__clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
-				+ start, dd->ipath_pioavailshadow);
-			dma = (unsigned long) le64_to_cpu(
-				dd->ipath_pioavailregs_dma[im]);
-			if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-				+ start) % BITS_PER_LONG, &dma))
-				__set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-					+ start, dd->ipath_pioavailshadow);
-			else
-				__clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-					+ start, dd->ipath_pioavailshadow);
-			__set_bit(start, dd->ipath_pioavailkernel);
-		} else {
-			__set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
-				dd->ipath_pioavailshadow);
-			__clear_bit(start, dd->ipath_pioavailkernel);
-		}
-		start += 2;
-	}
-
-	if (dd->ipath_pioupd_thresh) {
-		end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
-		cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
-	}
-	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-
-	/*
-	 * When moving buffers from kernel to user, if number assigned to
-	 * the user is less than the pio update threshold, and threshold
-	 * is supported (cnt was computed > 0), drop the update threshold
-	 * so we update at least once per allocated number of buffers.
-	 * In any case, if the kernel buffers are less than the threshold,
-	 * drop the threshold.  We don't bother increasing it, having once
-	 * decreased it, since it would typically just cycle back and forth.
-	 * If we don't decrease below buffers in use, we can wait a long
-	 * time for an update, until some other context uses PIO buffers.
-	 */
-	if (!avail && len < cnt)
-		cnt = len;
-	if (cnt < dd->ipath_pioupd_thresh) {
-		dd->ipath_pioupd_thresh = cnt;
-		ipath_dbg("Decreased pio update threshold to %u\n",
-			dd->ipath_pioupd_thresh);
-		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-		dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
-			<< INFINIPATH_S_UPDTHRESH_SHIFT);
-		dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
-			<< INFINIPATH_S_UPDTHRESH_SHIFT;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			dd->ipath_sendctrl);
-		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-	}
-}
-
-/**
- * ipath_create_rcvhdrq - create a receive header queue
- * @dd: the infinipath device
- * @pd: the port data
- *
- * this must be contiguous memory (from an i/o perspective), and must be
- * DMA'able (which means for some systems, it will go through an IOMMU,
- * or be forced into a low address range).
- */
-int ipath_create_rcvhdrq(struct ipath_devdata *dd,
-			 struct ipath_portdata *pd)
-{
-	int ret = 0;
-
-	if (!pd->port_rcvhdrq) {
-		dma_addr_t phys_hdrqtail;
-		gfp_t gfp_flags = GFP_USER | __GFP_COMP;
-		int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
-				sizeof(u32), PAGE_SIZE);
-
-		pd->port_rcvhdrq = dma_alloc_coherent(
-			&dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
-			gfp_flags);
-
-		if (!pd->port_rcvhdrq) {
-			ipath_dev_err(dd, "attempt to allocate %d bytes "
-				      "for port %u rcvhdrq failed\n",
-				      amt, pd->port_port);
-			ret = -ENOMEM;
-			goto bail;
-		}
-
-		if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-			pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
-				&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
-				GFP_KERNEL);
-			if (!pd->port_rcvhdrtail_kvaddr) {
-				ipath_dev_err(dd, "attempt to allocate 1 page "
-					"for port %u rcvhdrqtailaddr "
-					"failed\n", pd->port_port);
-				ret = -ENOMEM;
-				dma_free_coherent(&dd->pcidev->dev, amt,
-					pd->port_rcvhdrq,
-					pd->port_rcvhdrq_phys);
-				pd->port_rcvhdrq = NULL;
-				goto bail;
-			}
-			pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
-			ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
-				   "physical\n", pd->port_port,
-				   (unsigned long long) phys_hdrqtail);
-		}
-
-		pd->port_rcvhdrq_size = amt;
-
-		ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
-			   "for port %u rcvhdr Q\n",
-			   amt >> PAGE_SHIFT, pd->port_rcvhdrq,
-			   (unsigned long) pd->port_rcvhdrq_phys,
-			   (unsigned long) pd->port_rcvhdrq_size,
-			   pd->port_port);
-	}
-	else
-		ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
-			   "hdrtailaddr@%p %llx physical\n",
-			   pd->port_port, pd->port_rcvhdrq,
-			   (unsigned long long) pd->port_rcvhdrq_phys,
-			   pd->port_rcvhdrtail_kvaddr, (unsigned long long)
-			   pd->port_rcvhdrqtailaddr_phys);
-
-	/* clear for security and sanity on each use */
-	memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
-	if (pd->port_rcvhdrtail_kvaddr)
-		memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
-
-	/*
-	 * tell chip each time we init it, even if we are re-using previous
-	 * memory (we zero the register at process close)
-	 */
-	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
-			      pd->port_port, pd->port_rcvhdrqtailaddr_phys);
-	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
-			      pd->port_port, pd->port_rcvhdrq_phys);
-
-bail:
-	return ret;
-}
-
-
-/*
- * Flush all sends that might be in the ready to send state, as well as any
- * that are in the process of being sent.   Used whenever we need to be
- * sure the send side is idle.  Cleans up all buffer state by canceling
- * all pio buffers, and issuing an abort, which cleans up anything in the
- * launch fifo.  The cancel is superfluous on some chip versions, but
- * it's safer to always do it.
- * PIOAvail bits are updated by the chip as if normal send had happened.
- */
-void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
-{
-	unsigned long flags;
-
-	if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
-		ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
-		goto bail;
-	}
-	/*
-	 * If we have SDMA, and it's not disabled, we have to kick off the
-	 * abort state machine, provided we aren't already aborting.
-	 * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
-	 * we skip the rest of this routine. It is already "in progress"
-	 */
-	if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
-		int skip_cancel;
-		unsigned long *statp = &dd->ipath_sdma_status;
-
-		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-		skip_cancel =
-			test_and_set_bit(IPATH_SDMA_ABORTING, statp)
-			&& !test_bit(IPATH_SDMA_DISABLED, statp);
-		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-		if (skip_cancel)
-			goto bail;
-	}
-
-	ipath_dbg("Cancelling all in-progress send buffers\n");
-
-	/* skip armlaunch errs for a while */
-	dd->ipath_lastcancel = jiffies + HZ / 2;
-
-	/*
-	 * The abort bit is auto-clearing.  We also don't want pioavail
-	 * update happening during this, and we don't want any other
-	 * sends going out, so turn those off for the duration.  We read
-	 * the scratch register to be sure that cancels and the abort
-	 * have taken effect in the chip.  Otherwise two parts are same
-	 * as ipath_force_pio_avail_update()
-	 */
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
-		| INFINIPATH_S_PIOENABLE);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-		dd->ipath_sendctrl | INFINIPATH_S_ABORT);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-	/* disarm all send buffers */
-	ipath_disarm_piobufs(dd, 0,
-		dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
-
-	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-		set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
-
-	if (restore_sendctrl) {
-		/* else done by caller later if needed */
-		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-		dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
-			INFINIPATH_S_PIOENABLE;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			dd->ipath_sendctrl);
-		/* and again, be sure all have hit the chip */
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-	}
-
-	if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
-	    !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
-	    test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
-		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-		/* only wait so long for intr */
-		dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
-		dd->ipath_sdma_reset_wait = 200;
-		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-			tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-	}
-bail:;
-}
-
-/*
- * Force an update of in-memory copy of the pioavail registers, when
- * needed for any of a variety of reasons.  We read the scratch register
- * to make it highly likely that the update will have happened by the
- * time we return.  If already off (as in cancel_sends above), this
- * routine is a nop, on the assumption that the caller will "do the
- * right thing".
- */
-void ipath_force_pio_avail_update(struct ipath_devdata *dd)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			dd->ipath_sendctrl);
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	}
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-}
-
-static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
-				int linitcmd)
-{
-	u64 mod_wd;
-	static const char *what[4] = {
-		[0] = "NOP",
-		[INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
-		[INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
-		[INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
-	};
-
-	if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
-		/*
-		 * If we are told to disable, note that so link-recovery
-		 * code does not attempt to bring us back up.
-		 */
-		preempt_disable();
-		dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
-		preempt_enable();
-	} else if (linitcmd) {
-		/*
-		 * Any other linkinitcmd will lead to LINKDOWN and then
-		 * to INIT (if all is well), so clear flag to let
-		 * link-recovery code attempt to bring us back up.
-		 */
-		preempt_disable();
-		dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
-		preempt_enable();
-	}
-
-	mod_wd = (linkcmd << dd->ibcc_lc_shift) |
-		(linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
-	ipath_cdbg(VERBOSE,
-		"Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
-		dd->ipath_unit, what[linkcmd], linitcmd,
-		ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
-			ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-			 dd->ipath_ibcctrl | mod_wd);
-	/* read from chip so write is flushed */
-	(void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-}
-
-int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
-{
-	u32 lstate;
-	int ret;
-
-	switch (newstate) {
-	case IPATH_IB_LINKDOWN_ONLY:
-		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
-		/* don't wait */
-		ret = 0;
-		goto bail;
-
-	case IPATH_IB_LINKDOWN:
-		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-					INFINIPATH_IBCC_LINKINITCMD_POLL);
-		/* don't wait */
-		ret = 0;
-		goto bail;
-
-	case IPATH_IB_LINKDOWN_SLEEP:
-		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-					INFINIPATH_IBCC_LINKINITCMD_SLEEP);
-		/* don't wait */
-		ret = 0;
-		goto bail;
-
-	case IPATH_IB_LINKDOWN_DISABLE:
-		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-					INFINIPATH_IBCC_LINKINITCMD_DISABLE);
-		/* don't wait */
-		ret = 0;
-		goto bail;
-
-	case IPATH_IB_LINKARM:
-		if (dd->ipath_flags & IPATH_LINKARMED) {
-			ret = 0;
-			goto bail;
-		}
-		if (!(dd->ipath_flags &
-		      (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
-			ret = -EINVAL;
-			goto bail;
-		}
-		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
-
-		/*
-		 * Since the port can transition to ACTIVE by receiving
-		 * a non VL 15 packet, wait for either state.
-		 */
-		lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
-		break;
-
-	case IPATH_IB_LINKACTIVE:
-		if (dd->ipath_flags & IPATH_LINKACTIVE) {
-			ret = 0;
-			goto bail;
-		}
-		if (!(dd->ipath_flags & IPATH_LINKARMED)) {
-			ret = -EINVAL;
-			goto bail;
-		}
-		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
-		lstate = IPATH_LINKACTIVE;
-		break;
-
-	case IPATH_IB_LINK_LOOPBACK:
-		dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
-		dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-				 dd->ipath_ibcctrl);
-
-		/* turn heartbeat off, as it causes loopback to fail */
-		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-				       IPATH_IB_HRTBT_OFF);
-		/* don't wait */
-		ret = 0;
-		goto bail;
-
-	case IPATH_IB_LINK_EXTERNAL:
-		dev_info(&dd->pcidev->dev,
-			"Disabling IB local loopback (normal)\n");
-		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-				       IPATH_IB_HRTBT_ON);
-		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-				 dd->ipath_ibcctrl);
-		/* don't wait */
-		ret = 0;
-		goto bail;
-
-	/*
-	 * Heartbeat can be explicitly enabled by the user via
-	 * "hrtbt_enable" "file", and if disabled, trying to enable here
-	 * will have no effect.  Implicit changes (heartbeat off when
-	 * loopback on, and vice versa) are included to ease testing.
-	 */
-	case IPATH_IB_LINK_HRTBT:
-		ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-			IPATH_IB_HRTBT_ON);
-		goto bail;
-
-	case IPATH_IB_LINK_NO_HRTBT:
-		ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-			IPATH_IB_HRTBT_OFF);
-		goto bail;
-
-	default:
-		ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
-		ret = -EINVAL;
-		goto bail;
-	}
-	ret = ipath_wait_linkstate(dd, lstate, 2000);
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_set_mtu - set the MTU
- * @dd: the infinipath device
- * @arg: the new MTU
- *
- * we can handle "any" incoming size, the issue here is whether we
- * need to restrict our outgoing size.   For now, we don't do any
- * sanity checking on this, and we don't deal with what happens to
- * programs that are already running when the size changes.
- * NOTE: changing the MTU will usually cause the IBC to go back to
- * link INIT state...
- */
-int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
-{
-	u32 piosize;
-	int changed = 0;
-	int ret;
-
-	/*
-	 * mtu is IB data payload max.  It's the largest power of 2 less
-	 * than piosize (or even larger, since it only really controls the
-	 * largest we can receive; we can send the max of the mtu and
-	 * piosize).  We check that it's one of the valid IB sizes.
-	 */
-	if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
-	    (arg != 4096 || !ipath_mtu4096)) {
-		ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
-		ret = -EINVAL;
-		goto bail;
-	}
-	if (dd->ipath_ibmtu == arg) {
-		ret = 0;        /* same as current */
-		goto bail;
-	}
-
-	piosize = dd->ipath_ibmaxlen;
-	dd->ipath_ibmtu = arg;
-
-	if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
-		/* Only if it's not the initial value (or reset to it) */
-		if (piosize != dd->ipath_init_ibmaxlen) {
-			if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
-				piosize = dd->ipath_init_ibmaxlen;
-			dd->ipath_ibmaxlen = piosize;
-			changed = 1;
-		}
-	} else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
-		piosize = arg + IPATH_PIO_MAXIBHDR;
-		ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
-			   "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
-			   arg);
-		dd->ipath_ibmaxlen = piosize;
-		changed = 1;
-	}
-
-	if (changed) {
-		u64 ibc = dd->ipath_ibcctrl, ibdw;
-		/*
-		 * update our housekeeping variables, and set IBC max
-		 * size, same as init code; max IBC is max we allow in
-		 * buffer, less the qword pbc, plus 1 for ICRC, in dwords
-		 */
-		dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
-		ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
-		ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
-			 dd->ibcc_mpl_shift);
-		ibc |= ibdw << dd->ibcc_mpl_shift;
-		dd->ipath_ibcctrl = ibc;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-				 dd->ipath_ibcctrl);
-		dd->ipath_f_tidtemplate(dd);
-	}
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
-{
-	dd->ipath_lid = lid;
-	dd->ipath_lmc = lmc;
-
-	dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
-		(~((1U << lmc) - 1)) << 16);
-
-	dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
-
-	return 0;
-}
-
-
-/**
- * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
- * @dd: the infinipath device
- * @regno: the register number to write
- * @port: the port containing the register
- * @value: the value to write
- *
- * Registers that vary with the chip implementation constants (port)
- * use this routine.
- */
-void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
-			  unsigned port, u64 value)
-{
-	u16 where;
-
-	if (port < dd->ipath_portcnt &&
-	    (regno == dd->ipath_kregs->kr_rcvhdraddr ||
-	     regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
-		where = regno + port;
-	else
-		where = -1;
-
-	ipath_write_kreg(dd, where, value);
-}
-
-/*
- * Following deal with the "obviously simple" task of overriding the state
- * of the LEDS, which normally indicate link physical and logical status.
- * The complications arise in dealing with different hardware mappings
- * and the board-dependent routine being called from interrupts.
- * and then there's the requirement to _flash_ them.
- */
-#define LED_OVER_FREQ_SHIFT 8
-#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
-/* Below is "non-zero" to force override, but both actual LEDs are off */
-#define LED_OVER_BOTH_OFF (8)
-
-static void ipath_run_led_override(unsigned long opaque)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-	int timeoff;
-	int pidx;
-	u64 lstate, ltstate, val;
-
-	if (!(dd->ipath_flags & IPATH_INITTED))
-		return;
-
-	pidx = dd->ipath_led_override_phase++ & 1;
-	dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
-	timeoff = dd->ipath_led_override_timeoff;
-
-	/*
-	 * below potentially restores the LED values per current status,
-	 * should also possibly setup the traffic-blink register,
-	 * but leave that to per-chip functions.
-	 */
-	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-	ltstate = ipath_ib_linktrstate(dd, val);
-	lstate = ipath_ib_linkstate(dd, val);
-
-	dd->ipath_f_setextled(dd, lstate, ltstate);
-	mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
-}
-
-void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
-{
-	int timeoff, freq;
-
-	if (!(dd->ipath_flags & IPATH_INITTED))
-		return;
-
-	/* First check if we are blinking. If not, use 1HZ polling */
-	timeoff = HZ;
-	freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
-
-	if (freq) {
-		/* For blink, set each phase from one nybble of val */
-		dd->ipath_led_override_vals[0] = val & 0xF;
-		dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
-		timeoff = (HZ << 4)/freq;
-	} else {
-		/* Non-blink set both phases the same. */
-		dd->ipath_led_override_vals[0] = val & 0xF;
-		dd->ipath_led_override_vals[1] = val & 0xF;
-	}
-	dd->ipath_led_override_timeoff = timeoff;
-
-	/*
-	 * If the timer has not already been started, do so. Use a "quick"
-	 * timeout so the function will be called soon, to look at our request.
-	 */
-	if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
-		/* Need to start timer */
-		init_timer(&dd->ipath_led_override_timer);
-		dd->ipath_led_override_timer.function =
-						 ipath_run_led_override;
-		dd->ipath_led_override_timer.data = (unsigned long) dd;
-		dd->ipath_led_override_timer.expires = jiffies + 1;
-		add_timer(&dd->ipath_led_override_timer);
-	} else
-		atomic_dec(&dd->ipath_led_override_timer_active);
-}
-
-/**
- * ipath_shutdown_device - shut down a device
- * @dd: the infinipath device
- *
- * This is called to make the device quiet when we are about to
- * unload the driver, and also when the device is administratively
- * disabled.   It does not free any data structures.
- * Everything it does has to be setup again by ipath_init_chip(dd,1)
- */
-void ipath_shutdown_device(struct ipath_devdata *dd)
-{
-	unsigned long flags;
-
-	ipath_dbg("Shutting down the device\n");
-
-	ipath_hol_up(dd); /* make sure user processes aren't suspended */
-
-	dd->ipath_flags |= IPATH_LINKUNK;
-	dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
-			     IPATH_LINKINIT | IPATH_LINKARMED |
-			     IPATH_LINKACTIVE);
-	*dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
-				IPATH_STATUS_IB_READY);
-
-	/* mask interrupts, but not errors */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-
-	dd->ipath_rcvctrl = 0;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-			 dd->ipath_rcvctrl);
-
-	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-		teardown_sdma(dd);
-
-	/*
-	 * gracefully stop all sends allowing any in progress to trickle out
-	 * first.
-	 */
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	dd->ipath_sendctrl = 0;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-	/* flush it */
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-	/*
-	 * enough for anything that's going to trickle out to have actually
-	 * done so.
-	 */
-	udelay(5);
-
-	dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
-
-	ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
-	ipath_cancel_sends(dd, 0);
-
-	/*
-	 * we are shutting down, so tell components that care.  We don't do
-	 * this on just a link state change, much like ethernet, a cable
-	 * unplug, etc. doesn't change driver state
-	 */
-	signal_ib_event(dd, IB_EVENT_PORT_ERR);
-
-	/* disable IBC */
-	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-			 dd->ipath_control | INFINIPATH_C_FREEZEMODE);
-
-	/*
-	 * clear SerdesEnable and turn the leds off; do this here because
-	 * we are unloading, so don't count on interrupts to move along
-	 * Turn the LEDs off explicitly for the same reason.
-	 */
-	dd->ipath_f_quiet_serdes(dd);
-
-	/* stop all the timers that might still be running */
-	del_timer_sync(&dd->ipath_hol_timer);
-	if (dd->ipath_stats_timer_active) {
-		del_timer_sync(&dd->ipath_stats_timer);
-		dd->ipath_stats_timer_active = 0;
-	}
-	if (dd->ipath_intrchk_timer.data) {
-		del_timer_sync(&dd->ipath_intrchk_timer);
-		dd->ipath_intrchk_timer.data = 0;
-	}
-	if (atomic_read(&dd->ipath_led_override_timer_active)) {
-		del_timer_sync(&dd->ipath_led_override_timer);
-		atomic_set(&dd->ipath_led_override_timer_active, 0);
-	}
-
-	/*
-	 * clear all interrupts and errors, so that the next time the driver
-	 * is loaded or device is enabled, we know that whatever is set
-	 * happened while we were unloaded
-	 */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-			 ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
-
-	ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
-	ipath_update_eeprom_log(dd);
-}
-
-/**
- * ipath_free_pddata - free a port's allocated data
- * @dd: the infinipath device
- * @pd: the portdata structure
- *
- * free up any allocated data for a port
- * This should not touch anything that would affect a simultaneous
- * re-allocation of port data, because it is called after ipath_mutex
- * is released (and can be called from reinit as well).
- * It should never change any chip state, or global driver state.
- * (The only exception to global state is freeing the port0 port0_skbs.)
- */
-void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
-{
-	if (!pd)
-		return;
-
-	if (pd->port_rcvhdrq) {
-		ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
-			   "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
-			   (unsigned long) pd->port_rcvhdrq_size);
-		dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
-				  pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
-		pd->port_rcvhdrq = NULL;
-		if (pd->port_rcvhdrtail_kvaddr) {
-			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-					 pd->port_rcvhdrtail_kvaddr,
-					 pd->port_rcvhdrqtailaddr_phys);
-			pd->port_rcvhdrtail_kvaddr = NULL;
-		}
-	}
-	if (pd->port_port && pd->port_rcvegrbuf) {
-		unsigned e;
-
-		for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
-			void *base = pd->port_rcvegrbuf[e];
-			size_t size = pd->port_rcvegrbuf_size;
-
-			ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
-				   "chunk %u/%u\n", base,
-				   (unsigned long) size,
-				   e, pd->port_rcvegrbuf_chunks);
-			dma_free_coherent(&dd->pcidev->dev, size,
-				base, pd->port_rcvegrbuf_phys[e]);
-		}
-		kfree(pd->port_rcvegrbuf);
-		pd->port_rcvegrbuf = NULL;
-		kfree(pd->port_rcvegrbuf_phys);
-		pd->port_rcvegrbuf_phys = NULL;
-		pd->port_rcvegrbuf_chunks = 0;
-	} else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
-		unsigned e;
-		struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
-
-		dd->ipath_port0_skbinfo = NULL;
-		ipath_cdbg(VERBOSE, "free closed port %d "
-			   "ipath_port0_skbinfo @ %p\n", pd->port_port,
-			   skbinfo);
-		for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
-			if (skbinfo[e].skb) {
-				pci_unmap_single(dd->pcidev, skbinfo[e].phys,
-						 dd->ipath_ibmaxlen,
-						 PCI_DMA_FROMDEVICE);
-				dev_kfree_skb(skbinfo[e].skb);
-			}
-		vfree(skbinfo);
-	}
-	kfree(pd->port_tid_pg_list);
-	vfree(pd->subport_uregbase);
-	vfree(pd->subport_rcvegrbuf);
-	vfree(pd->subport_rcvhdr_base);
-	kfree(pd);
-}
-
-static int __init infinipath_init(void)
-{
-	int ret;
-
-	if (ipath_debug & __IPATH_DBG)
-		printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
-
-	/*
-	 * These must be called before the driver is registered with
-	 * the PCI subsystem.
-	 */
-	idr_init(&unit_table);
-
-	ret = pci_register_driver(&ipath_driver);
-	if (ret < 0) {
-		printk(KERN_ERR IPATH_DRV_NAME
-		       ": Unable to register driver: error %d\n", -ret);
-		goto bail_unit;
-	}
-
-	ret = ipath_init_ipathfs();
-	if (ret < 0) {
-		printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
-		       "ipathfs: error %d\n", -ret);
-		goto bail_pci;
-	}
-
-	goto bail;
-
-bail_pci:
-	pci_unregister_driver(&ipath_driver);
-
-bail_unit:
-	idr_destroy(&unit_table);
-
-bail:
-	return ret;
-}
-
-static void __exit infinipath_cleanup(void)
-{
-	ipath_exit_ipathfs();
-
-	ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
-	pci_unregister_driver(&ipath_driver);
-
-	idr_destroy(&unit_table);
-}
-
-/**
- * ipath_reset_device - reset the chip if possible
- * @unit: the device to reset
- *
- * Whether or not reset is successful, we attempt to re-initialize the chip
- * (that is, much like a driver unload/reload).  We clear the INITTED flag
- * so that the various entry points will fail until we reinitialize.  For
- * now, we only allow this if no user ports are open that use chip resources
- */
-int ipath_reset_device(int unit)
-{
-	int ret, i;
-	struct ipath_devdata *dd = ipath_lookup(unit);
-	unsigned long flags;
-
-	if (!dd) {
-		ret = -ENODEV;
-		goto bail;
-	}
-
-	if (atomic_read(&dd->ipath_led_override_timer_active)) {
-		/* Need to stop LED timer, _then_ shut off LEDs */
-		del_timer_sync(&dd->ipath_led_override_timer);
-		atomic_set(&dd->ipath_led_override_timer_active, 0);
-	}
-
-	/* Shut off LEDs after we are sure timer is not running */
-	dd->ipath_led_override = LED_OVER_BOTH_OFF;
-	dd->ipath_f_setextled(dd, 0, 0);
-
-	dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
-
-	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
-		dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
-			 "not initialized or not present\n", unit);
-		ret = -ENXIO;
-		goto bail;
-	}
-
-	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-	if (dd->ipath_pd)
-		for (i = 1; i < dd->ipath_cfgports; i++) {
-			if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
-				continue;
-			spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-			ipath_dbg("unit %u port %d is in use "
-				  "(PID %u cmd %s), can't reset\n",
-				  unit, i,
-				  pid_nr(dd->ipath_pd[i]->port_pid),
-				  dd->ipath_pd[i]->port_comm);
-			ret = -EBUSY;
-			goto bail;
-		}
-	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-
-	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-		teardown_sdma(dd);
-
-	dd->ipath_flags &= ~IPATH_INITTED;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-	ret = dd->ipath_f_reset(dd);
-	if (ret == 1) {
-		ipath_dbg("Reinitializing unit %u after reset attempt\n",
-			  unit);
-		ret = ipath_init_chip(dd, 1);
-	} else
-		ret = -EAGAIN;
-	if (ret)
-		ipath_dev_err(dd, "Reinitialize unit %u after "
-			      "reset failed with %d\n", unit, ret);
-	else
-		dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
-			 "resetting\n", unit);
-
-bail:
-	return ret;
-}
-
-/*
- * send a signal to all the processes that have the driver open
- * through the normal interfaces (i.e., everything other than diags
- * interface).  Returns number of signalled processes.
- */
-static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
-{
-	int i, sub, any = 0;
-	struct pid *pid;
-	unsigned long flags;
-
-	if (!dd->ipath_pd)
-		return 0;
-
-	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-	for (i = 1; i < dd->ipath_cfgports; i++) {
-		if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
-			continue;
-		pid = dd->ipath_pd[i]->port_pid;
-		if (!pid)
-			continue;
-
-		dev_info(&dd->pcidev->dev, "context %d in use "
-			  "(PID %u), sending signal %d\n",
-			  i, pid_nr(pid), sig);
-		kill_pid(pid, sig, 1);
-		any++;
-		for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
-			pid = dd->ipath_pd[i]->port_subpid[sub];
-			if (!pid)
-				continue;
-			dev_info(&dd->pcidev->dev, "sub-context "
-				"%d:%d in use (PID %u), sending "
-				"signal %d\n", i, sub, pid_nr(pid), sig);
-			kill_pid(pid, sig, 1);
-			any++;
-		}
-	}
-	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-	return any;
-}
-
-static void ipath_hol_signal_down(struct ipath_devdata *dd)
-{
-	if (ipath_signal_procs(dd, SIGSTOP))
-		ipath_dbg("Stopped some processes\n");
-	ipath_cancel_sends(dd, 1);
-}
-
-
-static void ipath_hol_signal_up(struct ipath_devdata *dd)
-{
-	if (ipath_signal_procs(dd, SIGCONT))
-		ipath_dbg("Continued some processes\n");
-}
-
-/*
- * link is down, stop any users processes, and flush pending sends
- * to prevent HoL blocking, then start the HoL timer that
- * periodically continues, then stop procs, so they can detect
- * link down if they want, and do something about it.
- * Timer may already be running, so use mod_timer, not add_timer.
- */
-void ipath_hol_down(struct ipath_devdata *dd)
-{
-	dd->ipath_hol_state = IPATH_HOL_DOWN;
-	ipath_hol_signal_down(dd);
-	dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
-	dd->ipath_hol_timer.expires = jiffies +
-		msecs_to_jiffies(ipath_hol_timeout_ms);
-	mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
-}
-
-/*
- * link is up, continue any user processes, and ensure timer
- * is a nop, if running.  Let timer keep running, if set; it
- * will nop when it sees the link is up
- */
-void ipath_hol_up(struct ipath_devdata *dd)
-{
-	ipath_hol_signal_up(dd);
-	dd->ipath_hol_state = IPATH_HOL_UP;
-}
-
-/*
- * toggle the running/not running state of user proceses
- * to prevent HoL blocking on chip resources, but still allow
- * user processes to do link down special case handling.
- * Should only be called via the timer
- */
-void ipath_hol_event(unsigned long opaque)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-	if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
-		&& dd->ipath_hol_state != IPATH_HOL_UP) {
-		dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
-		ipath_dbg("Stopping processes\n");
-		ipath_hol_signal_down(dd);
-	} else { /* may do "extra" if also in ipath_hol_up() */
-		dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
-		ipath_dbg("Continuing processes\n");
-		ipath_hol_signal_up(dd);
-	}
-	if (dd->ipath_hol_state == IPATH_HOL_UP)
-		ipath_dbg("link's up, don't resched timer\n");
-	else {
-		dd->ipath_hol_timer.expires = jiffies +
-			msecs_to_jiffies(ipath_hol_timeout_ms);
-		mod_timer(&dd->ipath_hol_timer,
-			dd->ipath_hol_timer.expires);
-	}
-}
-
-int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
-{
-	u64 val;
-
-	if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
-		return -1;
-	if (dd->ipath_rx_pol_inv != new_pol_inv) {
-		dd->ipath_rx_pol_inv = new_pol_inv;
-		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-		val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
-			 INFINIPATH_XGXS_RX_POL_SHIFT);
-		val |= ((u64)dd->ipath_rx_pol_inv) <<
-			INFINIPATH_XGXS_RX_POL_SHIFT;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-	}
-	return 0;
-}
-
-/*
- * Disable and enable the armlaunch error.  Used for PIO bandwidth testing on
- * the 7220, which is count-based, rather than trigger-based.  Safe for the
- * driver check, since it's at init.   Not completely safe when used for
- * user-mode checking, since some error checking can be lost, but not
- * particularly risky, and only has problematic side-effects in the face of
- * very buggy user code.  There is no reference counting, but that's also
- * fine, given the intended use.
- */
-void ipath_enable_armlaunch(struct ipath_devdata *dd)
-{
-	dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-		INFINIPATH_E_SPIOARMLAUNCH);
-	dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-		dd->ipath_errormask);
-}
-
-void ipath_disable_armlaunch(struct ipath_devdata *dd)
-{
-	/* so don't re-enable if already set */
-	dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
-	dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-		dd->ipath_errormask);
-}
-
-module_init(infinipath_init);
-module_exit(infinipath_cleanup);
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c
deleted file mode 100644
index fc71819..0000000
--- a/drivers/infiniband/hw/ipath/ipath_eeprom.c
+++ /dev/null
@@ -1,1183 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_kernel.h"
-
-/*
- * InfiniPath I2C driver for a serial eeprom.  This is not a generic
- * I2C interface.  For a start, the device we're using (Atmel AT24C11)
- * doesn't work like a regular I2C device.  It looks like one
- * electrically, but not logically.  Normal I2C devices have a single
- * 7-bit or 10-bit I2C address that they respond to.  Valid 7-bit
- * addresses range from 0x03 to 0x77.  Addresses 0x00 to 0x02 and 0x78
- * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
- * call" address.)  The Atmel device, on the other hand, responds to ALL
- * 7-bit addresses.  It's designed to be the only device on a given I2C
- * bus.  A 7-bit address corresponds to the memory address within the
- * Atmel device itself.
- *
- * Also, the timing requirements mean more than simple software
- * bitbanging, with readbacks from chip to ensure timing (simple udelay
- * is not enough).
- *
- * This all means that accessing the device is specialized enough
- * that using the standard kernel I2C bitbanging interface would be
- * impossible.  For example, the core I2C eeprom driver expects to find
- * a device at one or more of a limited set of addresses only.  It doesn't
- * allow writing to an eeprom.  It also doesn't provide any means of
- * accessing eeprom contents from within the kernel, only via sysfs.
- */
-
-/* Added functionality for IBA7220-based cards */
-#define IPATH_EEPROM_DEV_V1 0xA0
-#define IPATH_EEPROM_DEV_V2 0xA2
-#define IPATH_TEMP_DEV 0x98
-#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2)
-#define IPATH_NO_DEV (0xFF)
-
-/*
- * The number of I2C chains is proliferating. Table below brings
- * some order to the madness. The basic principle is that the
- * table is scanned from the top, and a "probe" is made to the
- * device probe_dev. If that succeeds, the chain is considered
- * to be of that type, and dd->i2c_chain_type is set to the index+1
- * of the entry.
- * The +1 is so static initialization can mean "unknown, do probe."
- */
-static struct i2c_chain_desc {
-	u8 probe_dev;	/* If seen at probe, chain is this type */
-	u8 eeprom_dev;	/* Dev addr (if any) for EEPROM */
-	u8 temp_dev;	/* Dev Addr (if any) for Temp-sense */
-} i2c_chains[] = {
-	{ IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */
-	{ IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */
-	{ IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */
-	{ IPATH_NO_DEV }
-};
-
-enum i2c_type {
-	i2c_line_scl = 0,
-	i2c_line_sda
-};
-
-enum i2c_state {
-	i2c_line_low = 0,
-	i2c_line_high
-};
-
-#define READ_CMD 1
-#define WRITE_CMD 0
-
-/**
- * i2c_gpio_set - set a GPIO line
- * @dd: the infinipath device
- * @line: the line to set
- * @new_line_state: the state to set
- *
- * Returns 0 if the line was set to the new state successfully, non-zero
- * on error.
- */
-static int i2c_gpio_set(struct ipath_devdata *dd,
-			enum i2c_type line,
-			enum i2c_state new_line_state)
-{
-	u64 out_mask, dir_mask, *gpioval;
-	unsigned long flags = 0;
-
-	gpioval = &dd->ipath_gpio_out;
-
-	if (line == i2c_line_scl) {
-		dir_mask = dd->ipath_gpio_scl;
-		out_mask = (1UL << dd->ipath_gpio_scl_num);
-	} else {
-		dir_mask = dd->ipath_gpio_sda;
-		out_mask = (1UL << dd->ipath_gpio_sda_num);
-	}
-
-	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-	if (new_line_state == i2c_line_high) {
-		/* tri-state the output rather than force high */
-		dd->ipath_extctrl &= ~dir_mask;
-	} else {
-		/* config line to be an output */
-		dd->ipath_extctrl |= dir_mask;
-	}
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
-
-	/* set output as well (no real verify) */
-	if (new_line_state == i2c_line_high)
-		*gpioval |= out_mask;
-	else
-		*gpioval &= ~out_mask;
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
-	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-	return 0;
-}
-
-/**
- * i2c_gpio_get - get a GPIO line state
- * @dd: the infinipath device
- * @line: the line to get
- * @curr_statep: where to put the line state
- *
- * Returns 0 if the line was set to the new state successfully, non-zero
- * on error.  curr_state is not set on error.
- */
-static int i2c_gpio_get(struct ipath_devdata *dd,
-			enum i2c_type line,
-			enum i2c_state *curr_statep)
-{
-	u64 read_val, mask;
-	int ret;
-	unsigned long flags = 0;
-
-	/* check args */
-	if (curr_statep == NULL) {
-		ret = 1;
-		goto bail;
-	}
-
-	/* config line to be an input */
-	if (line == i2c_line_scl)
-		mask = dd->ipath_gpio_scl;
-	else
-		mask = dd->ipath_gpio_sda;
-
-	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-	dd->ipath_extctrl &= ~mask;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
-	/*
-	 * Below is very unlikely to reflect true input state if Output
-	 * Enable actually changed.
-	 */
-	read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
-	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-	if (read_val & mask)
-		*curr_statep = i2c_line_high;
-	else
-		*curr_statep = i2c_line_low;
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * i2c_wait_for_writes - wait for a write
- * @dd: the infinipath device
- *
- * We use this instead of udelay directly, so we can make sure
- * that previous register writes have been flushed all the way
- * to the chip.  Since we are delaying anyway, the cost doesn't
- * hurt, and makes the bit twiddling more regular
- */
-static void i2c_wait_for_writes(struct ipath_devdata *dd)
-{
-	(void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
-	rmb();
-}
-
-static void scl_out(struct ipath_devdata *dd, u8 bit)
-{
-	udelay(1);
-	i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
-
-	i2c_wait_for_writes(dd);
-}
-
-static void sda_out(struct ipath_devdata *dd, u8 bit)
-{
-	i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
-
-	i2c_wait_for_writes(dd);
-}
-
-static u8 sda_in(struct ipath_devdata *dd, int wait)
-{
-	enum i2c_state bit;
-
-	if (i2c_gpio_get(dd, i2c_line_sda, &bit))
-		ipath_dbg("get bit failed!\n");
-
-	if (wait)
-		i2c_wait_for_writes(dd);
-
-	return bit == i2c_line_high ? 1U : 0;
-}
-
-/**
- * i2c_ackrcv - see if ack following write is true
- * @dd: the infinipath device
- */
-static int i2c_ackrcv(struct ipath_devdata *dd)
-{
-	u8 ack_received;
-
-	/* AT ENTRY SCL = LOW */
-	/* change direction, ignore data */
-	ack_received = sda_in(dd, 1);
-	scl_out(dd, i2c_line_high);
-	ack_received = sda_in(dd, 1) == 0;
-	scl_out(dd, i2c_line_low);
-	return ack_received;
-}
-
-/**
- * rd_byte - read a byte, leaving ACK, STOP, etc up to caller
- * @dd: the infinipath device
- *
- * Returns byte shifted out of device
- */
-static int rd_byte(struct ipath_devdata *dd)
-{
-	int bit_cntr, data;
-
-	data = 0;
-
-	for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
-		data <<= 1;
-		scl_out(dd, i2c_line_high);
-		data |= sda_in(dd, 0);
-		scl_out(dd, i2c_line_low);
-	}
-	return data;
-}
-
-/**
- * wr_byte - write a byte, one bit at a time
- * @dd: the infinipath device
- * @data: the byte to write
- *
- * Returns 0 if we got the following ack, otherwise 1
- */
-static int wr_byte(struct ipath_devdata *dd, u8 data)
-{
-	int bit_cntr;
-	u8 bit;
-
-	for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
-		bit = (data >> bit_cntr) & 1;
-		sda_out(dd, bit);
-		scl_out(dd, i2c_line_high);
-		scl_out(dd, i2c_line_low);
-	}
-	return (!i2c_ackrcv(dd)) ? 1 : 0;
-}
-
-static void send_ack(struct ipath_devdata *dd)
-{
-	sda_out(dd, i2c_line_low);
-	scl_out(dd, i2c_line_high);
-	scl_out(dd, i2c_line_low);
-	sda_out(dd, i2c_line_high);
-}
-
-/**
- * i2c_startcmd - transmit the start condition, followed by address/cmd
- * @dd: the infinipath device
- * @offset_dir: direction byte
- *
- *      (both clock/data high, clock high, data low while clock is high)
- */
-static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
-{
-	int res;
-
-	/* issue start sequence */
-	sda_out(dd, i2c_line_high);
-	scl_out(dd, i2c_line_high);
-	sda_out(dd, i2c_line_low);
-	scl_out(dd, i2c_line_low);
-
-	/* issue length and direction byte */
-	res = wr_byte(dd, offset_dir);
-
-	if (res)
-		ipath_cdbg(VERBOSE, "No ack to complete start\n");
-
-	return res;
-}
-
-/**
- * stop_cmd - transmit the stop condition
- * @dd: the infinipath device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_cmd(struct ipath_devdata *dd)
-{
-	scl_out(dd, i2c_line_low);
-	sda_out(dd, i2c_line_low);
-	scl_out(dd, i2c_line_high);
-	sda_out(dd, i2c_line_high);
-	udelay(2);
-}
-
-/**
- * eeprom_reset - reset I2C communication
- * @dd: the infinipath device
- */
-
-static int eeprom_reset(struct ipath_devdata *dd)
-{
-	int clock_cycles_left = 9;
-	u64 *gpioval = &dd->ipath_gpio_out;
-	int ret;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-	/* Make sure shadows are consistent */
-	dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
-	*gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
-	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-	ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
-		   "is %llx\n", (unsigned long long) *gpioval);
-
-	/*
-	 * This is to get the i2c into a known state, by first going low,
-	 * then tristate sda (and then tristate scl as first thing
-	 * in loop)
-	 */
-	scl_out(dd, i2c_line_low);
-	sda_out(dd, i2c_line_high);
-
-	/* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */
-	while (clock_cycles_left--) {
-		scl_out(dd, i2c_line_high);
-
-		/* SDA seen high, issue START by dropping it while SCL high */
-		if (sda_in(dd, 0)) {
-			sda_out(dd, i2c_line_low);
-			scl_out(dd, i2c_line_low);
-			/* ATMEL spec says must be followed by STOP. */
-			scl_out(dd, i2c_line_high);
-			sda_out(dd, i2c_line_high);
-			ret = 0;
-			goto bail;
-		}
-
-		scl_out(dd, i2c_line_low);
-	}
-
-	ret = 1;
-
-bail:
-	return ret;
-}
-
-/*
- * Probe for I2C device at specified address. Returns 0 for "success"
- * to match rest of this file.
- * Leave bus in "reasonable" state for further commands.
- */
-static int i2c_probe(struct ipath_devdata *dd, int devaddr)
-{
-	int ret = 0;
-
-	ret = eeprom_reset(dd);
-	if (ret) {
-		ipath_dev_err(dd, "Failed reset probing device 0x%02X\n",
-			      devaddr);
-		return ret;
-	}
-	/*
-	 * Reset no longer leaves bus in start condition, so normal
-	 * i2c_startcmd() will do.
-	 */
-	ret = i2c_startcmd(dd, devaddr | READ_CMD);
-	if (ret)
-		ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n",
-			   devaddr);
-	else {
-		/*
-		 * Device did respond. Complete a single-byte read, because some
-		 * devices apparently cannot handle STOP immediately after they
-		 * ACK the start-cmd.
-		 */
-		int data;
-		data = rd_byte(dd);
-		stop_cmd(dd);
-		ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr);
-	}
-	return ret;
-}
-
-/*
- * Returns the "i2c type". This is a pointer to a struct that describes
- * the I2C chain on this board. To minimize impact on struct ipath_devdata,
- * the (small integer) index into the table is actually memoized, rather
- * then the pointer.
- * Memoization is because the type is determined on the first call per chip.
- * An alternative would be to move type determination to early
- * init code.
- */
-static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd)
-{
-	int idx;
-
-	/* Get memoized index, from previous successful probes */
-	idx = dd->ipath_i2c_chain_type - 1;
-	if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1))
-		goto done;
-
-	idx = 0;
-	while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) {
-		/* if probe succeeds, this is type */
-		if (!i2c_probe(dd, i2c_chains[idx].probe_dev))
-			break;
-		++idx;
-	}
-
-	/*
-	 * Old EEPROM (first entry) may require a reset after probe,
-	 * rather than being able to "start" after "stop"
-	 */
-	if (idx == 0)
-		eeprom_reset(dd);
-
-	if (i2c_chains[idx].probe_dev == IPATH_NO_DEV)
-		idx = -1;
-	else
-		dd->ipath_i2c_chain_type = idx + 1;
-done:
-	return (idx >= 0) ? i2c_chains + idx : NULL;
-}
-
-static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
-					u8 eeprom_offset, void *buffer, int len)
-{
-	int ret;
-	struct i2c_chain_desc *icd;
-	u8 *bp = buffer;
-
-	ret = 1;
-	icd = ipath_i2c_type(dd);
-	if (!icd)
-		goto bail;
-
-	if (icd->eeprom_dev == IPATH_NO_DEV) {
-		/* legacy not-really-I2C */
-		ipath_cdbg(VERBOSE, "Start command only address\n");
-		eeprom_offset = (eeprom_offset << 1) | READ_CMD;
-		ret = i2c_startcmd(dd, eeprom_offset);
-	} else {
-		/* Actual I2C */
-		ipath_cdbg(VERBOSE, "Start command uses devaddr\n");
-		if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
-			ipath_dbg("Failed EEPROM startcmd\n");
-			stop_cmd(dd);
-			ret = 1;
-			goto bail;
-		}
-		ret = wr_byte(dd, eeprom_offset);
-		stop_cmd(dd);
-		if (ret) {
-			ipath_dev_err(dd, "Failed to write EEPROM address\n");
-			ret = 1;
-			goto bail;
-		}
-		ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD);
-	}
-	if (ret) {
-		ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev);
-		stop_cmd(dd);
-		ret = 1;
-		goto bail;
-	}
-
-	/*
-	 * eeprom keeps clocking data out as long as we ack, automatically
-	 * incrementing the address.
-	 */
-	while (len-- > 0) {
-		/* get and store data */
-		*bp++ = rd_byte(dd);
-		/* send ack if not the last byte */
-		if (len)
-			send_ack(dd);
-	}
-
-	stop_cmd(dd);
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
-				       const void *buffer, int len)
-{
-	int sub_len;
-	const u8 *bp = buffer;
-	int max_wait_time, i;
-	int ret;
-	struct i2c_chain_desc *icd;
-
-	ret = 1;
-	icd = ipath_i2c_type(dd);
-	if (!icd)
-		goto bail;
-
-	while (len > 0) {
-		if (icd->eeprom_dev == IPATH_NO_DEV) {
-			if (i2c_startcmd(dd,
-					 (eeprom_offset << 1) | WRITE_CMD)) {
-				ipath_dbg("Failed to start cmd offset %u\n",
-					eeprom_offset);
-				goto failed_write;
-			}
-		} else {
-			/* Real I2C */
-			if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
-				ipath_dbg("Failed EEPROM startcmd\n");
-				goto failed_write;
-			}
-			ret = wr_byte(dd, eeprom_offset);
-			if (ret) {
-				ipath_dev_err(dd, "Failed to write EEPROM "
-					      "address\n");
-				goto failed_write;
-			}
-		}
-
-		sub_len = min(len, 4);
-		eeprom_offset += sub_len;
-		len -= sub_len;
-
-		for (i = 0; i < sub_len; i++) {
-			if (wr_byte(dd, *bp++)) {
-				ipath_dbg("no ack after byte %u/%u (%u "
-					  "total remain)\n", i, sub_len,
-					  len + sub_len - i);
-				goto failed_write;
-			}
-		}
-
-		stop_cmd(dd);
-
-		/*
-		 * wait for write complete by waiting for a successful
-		 * read (the chip replies with a zero after the write
-		 * cmd completes, and before it writes to the eeprom.
-		 * The startcmd for the read will fail the ack until
-		 * the writes have completed.   We do this inline to avoid
-		 * the debug prints that are in the real read routine
-		 * if the startcmd fails.
-		 * We also use the proper device address, so it doesn't matter
-		 * whether we have real eeprom_dev. legacy likes any address.
-		 */
-		max_wait_time = 100;
-		while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) {
-			stop_cmd(dd);
-			if (!--max_wait_time) {
-				ipath_dbg("Did not get successful read to "
-					  "complete write\n");
-				goto failed_write;
-			}
-		}
-		/* now read (and ignore) the resulting byte */
-		rd_byte(dd);
-		stop_cmd(dd);
-	}
-
-	ret = 0;
-	goto bail;
-
-failed_write:
-	stop_cmd(dd);
-	ret = 1;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_eeprom_read - receives bytes from the eeprom via I2C
- * @dd: the infinipath device
- * @eeprom_offset: address to read from
- * @buffer: where to store result
- * @len: number of bytes to receive
- */
-int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
-			void *buff, int len)
-{
-	int ret;
-
-	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-	if (!ret) {
-		ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
-		mutex_unlock(&dd->ipath_eep_lock);
-	}
-
-	return ret;
-}
-
-/**
- * ipath_eeprom_write - writes data to the eeprom via I2C
- * @dd: the infinipath device
- * @eeprom_offset: where to place data
- * @buffer: data to write
- * @len: number of bytes to write
- */
-int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
-			const void *buff, int len)
-{
-	int ret;
-
-	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-	if (!ret) {
-		ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
-		mutex_unlock(&dd->ipath_eep_lock);
-	}
-
-	return ret;
-}
-
-static u8 flash_csum(struct ipath_flash *ifp, int adjust)
-{
-	u8 *ip = (u8 *) ifp;
-	u8 csum = 0, len;
-
-	/*
-	 * Limit length checksummed to max length of actual data.
-	 * Checksum of erased eeprom will still be bad, but we avoid
-	 * reading past the end of the buffer we were passed.
-	 */
-	len = ifp->if_length;
-	if (len > sizeof(struct ipath_flash))
-		len = sizeof(struct ipath_flash);
-	while (len--)
-		csum += *ip++;
-	csum -= ifp->if_csum;
-	csum = ~csum;
-	if (adjust)
-		ifp->if_csum = csum;
-
-	return csum;
-}
-
-/**
- * ipath_get_guid - get the GUID from the i2c device
- * @dd: the infinipath device
- *
- * We have the capability to use the ipath_nguid field, and get
- * the guid from the first chip's flash, to use for all of them.
- */
-void ipath_get_eeprom_info(struct ipath_devdata *dd)
-{
-	void *buf;
-	struct ipath_flash *ifp;
-	__be64 guid;
-	int len, eep_stat;
-	u8 csum, *bguid;
-	int t = dd->ipath_unit;
-	struct ipath_devdata *dd0 = ipath_lookup(0);
-
-	if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
-		u8 oguid;
-		dd->ipath_guid = dd0->ipath_guid;
-		bguid = (u8 *) & dd->ipath_guid;
-
-		oguid = bguid[7];
-		bguid[7] += t;
-		if (oguid > bguid[7]) {
-			if (bguid[6] == 0xff) {
-				if (bguid[5] == 0xff) {
-					ipath_dev_err(
-						dd,
-						"Can't set %s GUID from "
-						"base, wraps to OUI!\n",
-						ipath_get_unit_name(t));
-					dd->ipath_guid = 0;
-					goto bail;
-				}
-				bguid[5]++;
-			}
-			bguid[6]++;
-		}
-		dd->ipath_nguid = 1;
-
-		ipath_dbg("nguid %u, so adding %u to device 0 guid, "
-			  "for %llx\n",
-			  dd0->ipath_nguid, t,
-			  (unsigned long long) be64_to_cpu(dd->ipath_guid));
-		goto bail;
-	}
-
-	/*
-	 * read full flash, not just currently used part, since it may have
-	 * been written with a newer definition
-	 * */
-	len = sizeof(struct ipath_flash);
-	buf = vmalloc(len);
-	if (!buf) {
-		ipath_dev_err(dd, "Couldn't allocate memory to read %u "
-			      "bytes from eeprom for GUID\n", len);
-		goto bail;
-	}
-
-	mutex_lock(&dd->ipath_eep_lock);
-	eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
-	mutex_unlock(&dd->ipath_eep_lock);
-
-	if (eep_stat) {
-		ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
-		goto done;
-	}
-	ifp = (struct ipath_flash *)buf;
-
-	csum = flash_csum(ifp, 0);
-	if (csum != ifp->if_csum) {
-		dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
-			 "0x%x, not 0x%x\n", csum, ifp->if_csum);
-		goto done;
-	}
-	if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
-	    *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
-		ipath_dev_err(dd, "Invalid GUID %llx from flash; "
-			      "ignoring\n",
-			      *(unsigned long long *) ifp->if_guid);
-		/* don't allow GUID if all 0 or all 1's */
-		goto done;
-	}
-
-	/* complain, but allow it */
-	if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
-		dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
-			 "default, probably not correct!\n",
-			 *(unsigned long long *) ifp->if_guid);
-
-	bguid = ifp->if_guid;
-	if (!bguid[0] && !bguid[1] && !bguid[2]) {
-		/* original incorrect GUID format in flash; fix in
-		 * core copy, by shifting up 2 octets; don't need to
-		 * change top octet, since both it and shifted are
-		 * 0.. */
-		bguid[1] = bguid[3];
-		bguid[2] = bguid[4];
-		bguid[3] = bguid[4] = 0;
-		guid = *(__be64 *) ifp->if_guid;
-		ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
-			   "shifting 2 octets\n");
-	} else
-		guid = *(__be64 *) ifp->if_guid;
-	dd->ipath_guid = guid;
-	dd->ipath_nguid = ifp->if_numguid;
-	/*
-	 * Things are slightly complicated by the desire to transparently
-	 * support both the Pathscale 10-digit serial number and the QLogic
-	 * 13-character version.
-	 */
-	if ((ifp->if_fversion > 1) && ifp->if_sprefix[0]
-		&& ((u8 *)ifp->if_sprefix)[0] != 0xFF) {
-		/* This board has a Serial-prefix, which is stored
-		 * elsewhere for backward-compatibility.
-		 */
-		char *snp = dd->ipath_serial;
-		memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
-		snp[sizeof ifp->if_sprefix] = '\0';
-		len = strlen(snp);
-		snp += len;
-		len = (sizeof dd->ipath_serial) - len;
-		if (len > sizeof ifp->if_serial) {
-			len = sizeof ifp->if_serial;
-		}
-		memcpy(snp, ifp->if_serial, len);
-	} else
-		memcpy(dd->ipath_serial, ifp->if_serial,
-		       sizeof ifp->if_serial);
-	if (!strstr(ifp->if_comment, "Tested successfully"))
-		ipath_dev_err(dd, "Board SN %s did not pass functional "
-			"test: %s\n", dd->ipath_serial,
-			ifp->if_comment);
-
-	ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
-		   (unsigned long long) be64_to_cpu(dd->ipath_guid));
-
-	memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
-	/*
-	 * Power-on (actually "active") hours are kept as little-endian value
-	 * in EEPROM, but as seconds in a (possibly as small as 24-bit)
-	 * atomic_t while running.
-	 */
-	atomic_set(&dd->ipath_active_time, 0);
-	dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
-
-done:
-	vfree(buf);
-
-bail:;
-}
-
-/**
- * ipath_update_eeprom_log - copy active-time and error counters to eeprom
- * @dd: the infinipath device
- *
- * Although the time is kept as seconds in the ipath_devdata struct, it is
- * rounded to hours for re-write, as we have only 16 bits in EEPROM.
- * First-cut code reads whole (expected) struct ipath_flash, modifies,
- * re-writes. Future direction: read/write only what we need, assuming
- * that the EEPROM had to have been "good enough" for driver init, and
- * if not, we aren't making it worse.
- *
- */
-
-int ipath_update_eeprom_log(struct ipath_devdata *dd)
-{
-	void *buf;
-	struct ipath_flash *ifp;
-	int len, hi_water;
-	uint32_t new_time, new_hrs;
-	u8 csum;
-	int ret, idx;
-	unsigned long flags;
-
-	/* first, check if we actually need to do anything. */
-	ret = 0;
-	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-		if (dd->ipath_eep_st_new_errs[idx]) {
-			ret = 1;
-			break;
-		}
-	}
-	new_time = atomic_read(&dd->ipath_active_time);
-
-	if (ret == 0 && new_time < 3600)
-		return 0;
-
-	/*
-	 * The quick-check above determined that there is something worthy
-	 * of logging, so get current contents and do a more detailed idea.
-	 * read full flash, not just currently used part, since it may have
-	 * been written with a newer definition
-	 */
-	len = sizeof(struct ipath_flash);
-	buf = vmalloc(len);
-	ret = 1;
-	if (!buf) {
-		ipath_dev_err(dd, "Couldn't allocate memory to read %u "
-				"bytes from eeprom for logging\n", len);
-		goto bail;
-	}
-
-	/* Grab semaphore and read current EEPROM. If we get an
-	 * error, let go, but if not, keep it until we finish write.
-	 */
-	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-	if (ret) {
-		ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
-		goto free_bail;
-	}
-	ret = ipath_eeprom_internal_read(dd, 0, buf, len);
-	if (ret) {
-		mutex_unlock(&dd->ipath_eep_lock);
-		ipath_dev_err(dd, "Unable read EEPROM for logging\n");
-		goto free_bail;
-	}
-	ifp = (struct ipath_flash *)buf;
-
-	csum = flash_csum(ifp, 0);
-	if (csum != ifp->if_csum) {
-		mutex_unlock(&dd->ipath_eep_lock);
-		ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
-				csum, ifp->if_csum);
-		ret = 1;
-		goto free_bail;
-	}
-	hi_water = 0;
-	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-		int new_val = dd->ipath_eep_st_new_errs[idx];
-		if (new_val) {
-			/*
-			 * If we have seen any errors, add to EEPROM values
-			 * We need to saturate at 0xFF (255) and we also
-			 * would need to adjust the checksum if we were
-			 * trying to minimize EEPROM traffic
-			 * Note that we add to actual current count in EEPROM,
-			 * in case it was altered while we were running.
-			 */
-			new_val += ifp->if_errcntp[idx];
-			if (new_val > 0xFF)
-				new_val = 0xFF;
-			if (ifp->if_errcntp[idx] != new_val) {
-				ifp->if_errcntp[idx] = new_val;
-				hi_water = offsetof(struct ipath_flash,
-						if_errcntp) + idx;
-			}
-			/*
-			 * update our shadow (used to minimize EEPROM
-			 * traffic), to match what we are about to write.
-			 */
-			dd->ipath_eep_st_errs[idx] = new_val;
-			dd->ipath_eep_st_new_errs[idx] = 0;
-		}
-	}
-	/*
-	 * now update active-time. We would like to round to the nearest hour
-	 * but unless atomic_t are sure to be proper signed ints we cannot,
-	 * because we need to account for what we "transfer" to EEPROM and
-	 * if we log an hour at 31 minutes, then we would need to set
-	 * active_time to -29 to accurately count the _next_ hour.
-	 */
-	if (new_time >= 3600) {
-		new_hrs = new_time / 3600;
-		atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
-		new_hrs += dd->ipath_eep_hrs;
-		if (new_hrs > 0xFFFF)
-			new_hrs = 0xFFFF;
-		dd->ipath_eep_hrs = new_hrs;
-		if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
-			ifp->if_powerhour[0] = new_hrs & 0xFF;
-			hi_water = offsetof(struct ipath_flash, if_powerhour);
-		}
-		if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
-			ifp->if_powerhour[1] = new_hrs >> 8;
-			hi_water = offsetof(struct ipath_flash, if_powerhour)
-					+ 1;
-		}
-	}
-	/*
-	 * There is a tiny possibility that we could somehow fail to write
-	 * the EEPROM after updating our shadows, but problems from holding
-	 * the spinlock too long are a much bigger issue.
-	 */
-	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-	if (hi_water) {
-		/* we made some change to the data, uopdate cksum and write */
-		csum = flash_csum(ifp, 1);
-		ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
-	}
-	mutex_unlock(&dd->ipath_eep_lock);
-	if (ret)
-		ipath_dev_err(dd, "Failed updating EEPROM\n");
-
-free_bail:
-	vfree(buf);
-bail:
-	return ret;
-
-}
-
-/**
- * ipath_inc_eeprom_err - increment one of the four error counters
- * that are logged to EEPROM.
- * @dd: the infinipath device
- * @eidx: 0..3, the counter to increment
- * @incr: how much to add
- *
- * Each counter is 8-bits, and saturates at 255 (0xFF). They
- * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
- * is called, but it can only be called in a context that allows sleep.
- * This function can be called even at interrupt level.
- */
-
-void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
-{
-	uint new_val;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-	new_val = dd->ipath_eep_st_new_errs[eidx] + incr;
-	if (new_val > 255)
-		new_val = 255;
-	dd->ipath_eep_st_new_errs[eidx] = new_val;
-	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-	return;
-}
-
-static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum)
-{
-	int ret;
-	struct i2c_chain_desc *icd;
-
-	ret = -ENOENT;
-
-	icd = ipath_i2c_type(dd);
-	if (!icd)
-		goto bail;
-
-	if (icd->temp_dev == IPATH_NO_DEV) {
-		/* tempsense only exists on new, real-I2C boards */
-		ret = -ENXIO;
-		goto bail;
-	}
-
-	if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
-		ipath_dbg("Failed tempsense startcmd\n");
-		stop_cmd(dd);
-		ret = -ENXIO;
-		goto bail;
-	}
-	ret = wr_byte(dd, regnum);
-	stop_cmd(dd);
-	if (ret) {
-		ipath_dev_err(dd, "Failed tempsense WR command %02X\n",
-			      regnum);
-		ret = -ENXIO;
-		goto bail;
-	}
-	if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) {
-		ipath_dbg("Failed tempsense RD startcmd\n");
-		stop_cmd(dd);
-		ret = -ENXIO;
-		goto bail;
-	}
-	/*
-	 * We can only clock out one byte per command, sensibly
-	 */
-	ret = rd_byte(dd);
-	stop_cmd(dd);
-
-bail:
-	return ret;
-}
-
-#define VALID_TS_RD_REG_MASK 0xBF
-
-/**
- * ipath_tempsense_read - read register of temp sensor via I2C
- * @dd: the infinipath device
- * @regnum: register to read from
- *
- * returns reg contents (0..255) or < 0 for error
- */
-int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum)
-{
-	int ret;
-
-	if (regnum > 7)
-		return -EINVAL;
-
-	/* return a bogus value for (the one) register we do not have */
-	if (!((1 << regnum) & VALID_TS_RD_REG_MASK))
-		return 0;
-
-	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-	if (!ret) {
-		ret = ipath_tempsense_internal_read(dd, regnum);
-		mutex_unlock(&dd->ipath_eep_lock);
-	}
-
-	/*
-	 * There are three possibilities here:
-	 * ret is actual value (0..255)
-	 * ret is -ENXIO or -EINVAL from code in this file
-	 * ret is -EINTR from mutex_lock_interruptible.
-	 */
-	return ret;
-}
-
-static int ipath_tempsense_internal_write(struct ipath_devdata *dd,
-					  u8 regnum, u8 data)
-{
-	int ret = -ENOENT;
-	struct i2c_chain_desc *icd;
-
-	icd = ipath_i2c_type(dd);
-	if (!icd)
-		goto bail;
-
-	if (icd->temp_dev == IPATH_NO_DEV) {
-		/* tempsense only exists on new, real-I2C boards */
-		ret = -ENXIO;
-		goto bail;
-	}
-	if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
-		ipath_dbg("Failed tempsense startcmd\n");
-		stop_cmd(dd);
-		ret = -ENXIO;
-		goto bail;
-	}
-	ret = wr_byte(dd, regnum);
-	if (ret) {
-		stop_cmd(dd);
-		ipath_dev_err(dd, "Failed to write tempsense command %02X\n",
-			      regnum);
-		ret = -ENXIO;
-		goto bail;
-	}
-	ret = wr_byte(dd, data);
-	stop_cmd(dd);
-	ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD);
-	if (ret) {
-		ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n",
-			      regnum);
-		ret = -ENXIO;
-	}
-
-bail:
-	return ret;
-}
-
-#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD))
-
-/**
- * ipath_tempsense_write - write register of temp sensor via I2C
- * @dd: the infinipath device
- * @regnum: register to write
- * @data: data to write
- *
- * returns 0 for success or < 0 for error
- */
-int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data)
-{
-	int ret;
-
-	if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK))
-		return -EINVAL;
-
-	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-	if (!ret) {
-		ret = ipath_tempsense_internal_write(dd, regnum, data);
-		mutex_unlock(&dd->ipath_eep_lock);
-	}
-
-	/*
-	 * There are three possibilities here:
-	 * ret is 0 for success
-	 * ret is -ENXIO or -EINVAL from code in this file
-	 * ret is -EINTR from mutex_lock_interruptible.
-	 */
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
deleted file mode 100644
index 450d159..0000000
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ /dev/null
@@ -1,2620 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/poll.h>
-#include <linux/cdev.h>
-#include <linux/swap.h>
-#include <linux/export.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/io.h>
-#include <linux/jiffies.h>
-#include <linux/cpu.h>
-#include <linux/uio.h>
-#include <asm/pgtable.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-#include "ipath_user_sdma.h"
-
-static int ipath_open(struct inode *, struct file *);
-static int ipath_close(struct inode *, struct file *);
-static ssize_t ipath_write(struct file *, const char __user *, size_t,
-			   loff_t *);
-static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from);
-static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
-static int ipath_mmap(struct file *, struct vm_area_struct *);
-
-/*
- * This is really, really weird shit - write() and writev() here
- * have completely unrelated semantics.  Sucky userland ABI,
- * film at 11.
- */
-static const struct file_operations ipath_file_ops = {
-	.owner = THIS_MODULE,
-	.write = ipath_write,
-	.write_iter = ipath_write_iter,
-	.open = ipath_open,
-	.release = ipath_close,
-	.poll = ipath_poll,
-	.mmap = ipath_mmap,
-	.llseek = noop_llseek,
-};
-
-/*
- * Convert kernel virtual addresses to physical addresses so they don't
- * potentially conflict with the chip addresses used as mmap offsets.
- * It doesn't really matter what mmap offset we use as long as we can
- * interpret it correctly.
- */
-static u64 cvt_kvaddr(void *p)
-{
-	struct page *page;
-	u64 paddr = 0;
-
-	page = vmalloc_to_page(p);
-	if (page)
-		paddr = page_to_pfn(page) << PAGE_SHIFT;
-
-	return paddr;
-}
-
-static int ipath_get_base_info(struct file *fp,
-			       void __user *ubase, size_t ubase_size)
-{
-	struct ipath_portdata *pd = port_fp(fp);
-	int ret = 0;
-	struct ipath_base_info *kinfo = NULL;
-	struct ipath_devdata *dd = pd->port_dd;
-	unsigned subport_cnt;
-	int shared, master;
-	size_t sz;
-
-	subport_cnt = pd->port_subport_cnt;
-	if (!subport_cnt) {
-		shared = 0;
-		master = 0;
-		subport_cnt = 1;
-	} else {
-		shared = 1;
-		master = !subport_fp(fp);
-	}
-
-	sz = sizeof(*kinfo);
-	/* If port sharing is not requested, allow the old size structure */
-	if (!shared)
-		sz -= 7 * sizeof(u64);
-	if (ubase_size < sz) {
-		ipath_cdbg(PROC,
-			   "Base size %zu, need %zu (version mismatch?)\n",
-			   ubase_size, sz);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
-	if (kinfo == NULL) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	ret = dd->ipath_f_get_base_info(pd, kinfo);
-	if (ret < 0)
-		goto bail;
-
-	kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
-	kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
-	kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
-	kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
-	/*
-	 * have to mmap whole thing
-	 */
-	kinfo->spi_rcv_egrbuftotlen =
-		pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
-	kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
-	kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
-		pd->port_rcvegrbuf_chunks;
-	kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
-	if (master)
-		kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
-	/*
-	 * for this use, may be ipath_cfgports summed over all chips that
-	 * are are configured and present
-	 */
-	kinfo->spi_nports = dd->ipath_cfgports;
-	/* unit (chip/board) our port is on */
-	kinfo->spi_unit = dd->ipath_unit;
-	/* for now, only a single page */
-	kinfo->spi_tid_maxsize = PAGE_SIZE;
-
-	/*
-	 * Doing this per port, and based on the skip value, etc.  This has
-	 * to be the actual buffer size, since the protocol code treats it
-	 * as an array.
-	 *
-	 * These have to be set to user addresses in the user code via mmap.
-	 * These values are used on return to user code for the mmap target
-	 * addresses only.  For 32 bit, same 44 bit address problem, so use
-	 * the physical address, not virtual.  Before 2.6.11, using the
-	 * page_address() macro worked, but in 2.6.11, even that returns the
-	 * full 64 bit address (upper bits all 1's).  So far, using the
-	 * physical addresses (or chip offsets, for chip mapping) works, but
-	 * no doubt some future kernel release will change that, and we'll be
-	 * on to yet another method of dealing with this.
-	 */
-	kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
-	kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
-	kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
-	kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
-	kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
-		(void *) dd->ipath_statusp -
-		(void *) dd->ipath_pioavailregs_dma;
-	if (!shared) {
-		kinfo->spi_piocnt = pd->port_piocnt;
-		kinfo->spi_piobufbase = (u64) pd->port_piobufs;
-		kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
-			dd->ipath_ureg_align * pd->port_port;
-	} else if (master) {
-		kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
-				    (pd->port_piocnt % subport_cnt);
-		/* Master's PIO buffers are after all the slave's */
-		kinfo->spi_piobufbase = (u64) pd->port_piobufs +
-			dd->ipath_palign *
-			(pd->port_piocnt - kinfo->spi_piocnt);
-	} else {
-		unsigned slave = subport_fp(fp) - 1;
-
-		kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
-		kinfo->spi_piobufbase = (u64) pd->port_piobufs +
-			dd->ipath_palign * kinfo->spi_piocnt * slave;
-	}
-
-	if (shared) {
-		kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
-			dd->ipath_ureg_align * pd->port_port;
-		kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
-		kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
-		kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
-
-		kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
-			PAGE_SIZE * subport_fp(fp));
-
-		kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
-			pd->port_rcvhdrq_size * subport_fp(fp));
-		kinfo->spi_rcvhdr_tailaddr = 0;
-		kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
-			pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
-			subport_fp(fp));
-
-		kinfo->spi_subport_uregbase =
-			cvt_kvaddr(pd->subport_uregbase);
-		kinfo->spi_subport_rcvegrbuf =
-			cvt_kvaddr(pd->subport_rcvegrbuf);
-		kinfo->spi_subport_rcvhdr_base =
-			cvt_kvaddr(pd->subport_rcvhdr_base);
-		ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
-			kinfo->spi_port, kinfo->spi_runtime_flags,
-			(unsigned long long) kinfo->spi_subport_uregbase,
-			(unsigned long long) kinfo->spi_subport_rcvegrbuf,
-			(unsigned long long) kinfo->spi_subport_rcvhdr_base);
-	}
-
-	/*
-	 * All user buffers are 2KB buffers.  If we ever support
-	 * giving 4KB buffers to user processes, this will need some
-	 * work.
-	 */
-	kinfo->spi_pioindex = (kinfo->spi_piobufbase -
-		(dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
-	kinfo->spi_pioalign = dd->ipath_palign;
-
-	kinfo->spi_qpair = IPATH_KD_QP;
-	/*
-	 * user mode PIO buffers are always 2KB, even when 4KB can
-	 * be received, and sent via the kernel; this is ibmaxlen
-	 * for 2K MTU.
-	 */
-	kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
-	kinfo->spi_mtu = dd->ipath_ibmaxlen;	/* maxlen, not ibmtu */
-	kinfo->spi_port = pd->port_port;
-	kinfo->spi_subport = subport_fp(fp);
-	kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
-	kinfo->spi_hw_version = dd->ipath_revision;
-
-	if (master) {
-		kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
-	}
-
-	sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
-	if (copy_to_user(ubase, kinfo, sz))
-		ret = -EFAULT;
-
-bail:
-	kfree(kinfo);
-	return ret;
-}
-
-/**
- * ipath_tid_update - update a port TID
- * @pd: the port
- * @fp: the ipath device file
- * @ti: the TID information
- *
- * The new implementation as of Oct 2004 is that the driver assigns
- * the tid and returns it to the caller.   To make it easier to
- * catch bugs, and to reduce search time, we keep a cursor for
- * each port, walking the shadow tid array to find one that's not
- * in use.
- *
- * For now, if we can't allocate the full list, we fail, although
- * in the long run, we'll allocate as many as we can, and the
- * caller will deal with that by trying the remaining pages later.
- * That means that when we fail, we have to mark the tids as not in
- * use again, in our shadow copy.
- *
- * It's up to the caller to free the tids when they are done.
- * We'll unlock the pages as they free them.
- *
- * Also, right now we are locking one page at a time, but since
- * the intended use of this routine is for a single group of
- * virtually contiguous pages, that should change to improve
- * performance.
- */
-static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
-			    const struct ipath_tid_info *ti)
-{
-	int ret = 0, ntids;
-	u32 tid, porttid, cnt, i, tidcnt, tidoff;
-	u16 *tidlist;
-	struct ipath_devdata *dd = pd->port_dd;
-	u64 physaddr;
-	unsigned long vaddr;
-	u64 __iomem *tidbase;
-	unsigned long tidmap[8];
-	struct page **pagep = NULL;
-	unsigned subport = subport_fp(fp);
-
-	if (!dd->ipath_pageshadow) {
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	cnt = ti->tidcnt;
-	if (!cnt) {
-		ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
-			  (unsigned long long) ti->tidlist);
-		/*
-		 * Should we treat as success?  likely a bug
-		 */
-		ret = -EFAULT;
-		goto done;
-	}
-	porttid = pd->port_port * dd->ipath_rcvtidcnt;
-	if (!pd->port_subport_cnt) {
-		tidcnt = dd->ipath_rcvtidcnt;
-		tid = pd->port_tidcursor;
-		tidoff = 0;
-	} else if (!subport) {
-		tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
-			 (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
-		tidoff = dd->ipath_rcvtidcnt - tidcnt;
-		porttid += tidoff;
-		tid = tidcursor_fp(fp);
-	} else {
-		tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
-		tidoff = tidcnt * (subport - 1);
-		porttid += tidoff;
-		tid = tidcursor_fp(fp);
-	}
-	if (cnt > tidcnt) {
-		/* make sure it all fits in port_tid_pg_list */
-		dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
-			 "TIDs, only trying max (%u)\n", cnt, tidcnt);
-		cnt = tidcnt;
-	}
-	pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
-	tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
-
-	memset(tidmap, 0, sizeof(tidmap));
-	/* before decrement; chip actual # */
-	ntids = tidcnt;
-	tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
-				   dd->ipath_rcvtidbase +
-				   porttid * sizeof(*tidbase));
-
-	ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
-		   pd->port_port, cnt, tid, tidbase);
-
-	/* virtual address of first page in transfer */
-	vaddr = ti->tidvaddr;
-	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
-		       cnt * PAGE_SIZE)) {
-		ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
-			  (void *)vaddr, cnt);
-		ret = -EFAULT;
-		goto done;
-	}
-	ret = ipath_get_user_pages(vaddr, cnt, pagep);
-	if (ret) {
-		if (ret == -EBUSY) {
-			ipath_dbg("Failed to lock addr %p, %u pages "
-				  "(already locked)\n",
-				  (void *) vaddr, cnt);
-			/*
-			 * for now, continue, and see what happens but with
-			 * the new implementation, this should never happen,
-			 * unless perhaps the user has mpin'ed the pages
-			 * themselves (something we need to test)
-			 */
-			ret = 0;
-		} else {
-			dev_info(&dd->pcidev->dev,
-				 "Failed to lock addr %p, %u pages: "
-				 "errno %d\n", (void *) vaddr, cnt, -ret);
-			goto done;
-		}
-	}
-	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
-		for (; ntids--; tid++) {
-			if (tid == tidcnt)
-				tid = 0;
-			if (!dd->ipath_pageshadow[porttid + tid])
-				break;
-		}
-		if (ntids < 0) {
-			/*
-			 * oops, wrapped all the way through their TIDs,
-			 * and didn't have enough free; see comments at
-			 * start of routine
-			 */
-			ipath_dbg("Not enough free TIDs for %u pages "
-				  "(index %d), failing\n", cnt, i);
-			i--;	/* last tidlist[i] not filled in */
-			ret = -ENOMEM;
-			break;
-		}
-		tidlist[i] = tid + tidoff;
-		ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
-			   "vaddr %lx\n", i, tid + tidoff, vaddr);
-		/* we "know" system pages and TID pages are same size */
-		dd->ipath_pageshadow[porttid + tid] = pagep[i];
-		dd->ipath_physshadow[porttid + tid] = ipath_map_page(
-			dd->pcidev, pagep[i], 0, PAGE_SIZE,
-			PCI_DMA_FROMDEVICE);
-		/*
-		 * don't need atomic or it's overhead
-		 */
-		__set_bit(tid, tidmap);
-		physaddr = dd->ipath_physshadow[porttid + tid];
-		ipath_stats.sps_pagelocks++;
-		ipath_cdbg(VERBOSE,
-			   "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
-			   tid, vaddr, (unsigned long long) physaddr,
-			   pagep[i]);
-		dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
-				    physaddr);
-		/*
-		 * don't check this tid in ipath_portshadow, since we
-		 * just filled it in; start with the next one.
-		 */
-		tid++;
-	}
-
-	if (ret) {
-		u32 limit;
-	cleanup:
-		/* jump here if copy out of updated info failed... */
-		ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
-			  -ret, i, cnt);
-		/* same code that's in ipath_free_tid() */
-		limit = sizeof(tidmap) * BITS_PER_BYTE;
-		if (limit > tidcnt)
-			/* just in case size changes in future */
-			limit = tidcnt;
-		tid = find_first_bit((const unsigned long *)tidmap, limit);
-		for (; tid < limit; tid++) {
-			if (!test_bit(tid, tidmap))
-				continue;
-			if (dd->ipath_pageshadow[porttid + tid]) {
-				ipath_cdbg(VERBOSE, "Freeing TID %u\n",
-					   tid);
-				dd->ipath_f_put_tid(dd, &tidbase[tid],
-						    RCVHQ_RCV_TYPE_EXPECTED,
-						    dd->ipath_tidinvalid);
-				pci_unmap_page(dd->pcidev,
-					dd->ipath_physshadow[porttid + tid],
-					PAGE_SIZE, PCI_DMA_FROMDEVICE);
-				dd->ipath_pageshadow[porttid + tid] = NULL;
-				ipath_stats.sps_pageunlocks++;
-			}
-		}
-		ipath_release_user_pages(pagep, cnt);
-	} else {
-		/*
-		 * Copy the updated array, with ipath_tid's filled in, back
-		 * to user.  Since we did the copy in already, this "should
-		 * never fail" If it does, we have to clean up...
-		 */
-		if (copy_to_user((void __user *)
-				 (unsigned long) ti->tidlist,
-				 tidlist, cnt * sizeof(*tidlist))) {
-			ret = -EFAULT;
-			goto cleanup;
-		}
-		if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
-				 tidmap, sizeof tidmap)) {
-			ret = -EFAULT;
-			goto cleanup;
-		}
-		if (tid == tidcnt)
-			tid = 0;
-		if (!pd->port_subport_cnt)
-			pd->port_tidcursor = tid;
-		else
-			tidcursor_fp(fp) = tid;
-	}
-
-done:
-	if (ret)
-		ipath_dbg("Failed to map %u TID pages, failing with %d\n",
-			  ti->tidcnt, -ret);
-	return ret;
-}
-
-/**
- * ipath_tid_free - free a port TID
- * @pd: the port
- * @subport: the subport
- * @ti: the TID info
- *
- * right now we are unlocking one page at a time, but since
- * the intended use of this routine is for a single group of
- * virtually contiguous pages, that should change to improve
- * performance.  We check that the TID is in range for this port
- * but otherwise don't check validity; if user has an error and
- * frees the wrong tid, it's only their own data that can thereby
- * be corrupted.  We do check that the TID was in use, for sanity
- * We always use our idea of the saved address, not the address that
- * they pass in to us.
- */
-
-static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
-			  const struct ipath_tid_info *ti)
-{
-	int ret = 0;
-	u32 tid, porttid, cnt, limit, tidcnt;
-	struct ipath_devdata *dd = pd->port_dd;
-	u64 __iomem *tidbase;
-	unsigned long tidmap[8];
-
-	if (!dd->ipath_pageshadow) {
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
-			   sizeof tidmap)) {
-		ret = -EFAULT;
-		goto done;
-	}
-
-	porttid = pd->port_port * dd->ipath_rcvtidcnt;
-	if (!pd->port_subport_cnt)
-		tidcnt = dd->ipath_rcvtidcnt;
-	else if (!subport) {
-		tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
-			 (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
-		porttid += dd->ipath_rcvtidcnt - tidcnt;
-	} else {
-		tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
-		porttid += tidcnt * (subport - 1);
-	}
-	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-				   dd->ipath_rcvtidbase +
-				   porttid * sizeof(*tidbase));
-
-	limit = sizeof(tidmap) * BITS_PER_BYTE;
-	if (limit > tidcnt)
-		/* just in case size changes in future */
-		limit = tidcnt;
-	tid = find_first_bit(tidmap, limit);
-	ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
-		   "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
-		   limit, tid, porttid);
-	for (cnt = 0; tid < limit; tid++) {
-		/*
-		 * small optimization; if we detect a run of 3 or so without
-		 * any set, use find_first_bit again.  That's mainly to
-		 * accelerate the case where we wrapped, so we have some at
-		 * the beginning, and some at the end, and a big gap
-		 * in the middle.
-		 */
-		if (!test_bit(tid, tidmap))
-			continue;
-		cnt++;
-		if (dd->ipath_pageshadow[porttid + tid]) {
-			struct page *p;
-			p = dd->ipath_pageshadow[porttid + tid];
-			dd->ipath_pageshadow[porttid + tid] = NULL;
-			ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
-				   pid_nr(pd->port_pid), tid);
-			dd->ipath_f_put_tid(dd, &tidbase[tid],
-					    RCVHQ_RCV_TYPE_EXPECTED,
-					    dd->ipath_tidinvalid);
-			pci_unmap_page(dd->pcidev,
-				dd->ipath_physshadow[porttid + tid],
-				PAGE_SIZE, PCI_DMA_FROMDEVICE);
-			ipath_release_user_pages(&p, 1);
-			ipath_stats.sps_pageunlocks++;
-		} else
-			ipath_dbg("Unused tid %u, ignoring\n", tid);
-	}
-	if (cnt != ti->tidcnt)
-		ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
-			  ti->tidcnt, cnt);
-done:
-	if (ret)
-		ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
-			  ti->tidcnt, -ret);
-	return ret;
-}
-
-/**
- * ipath_set_part_key - set a partition key
- * @pd: the port
- * @key: the key
- *
- * We can have up to 4 active at a time (other than the default, which is
- * always allowed).  This is somewhat tricky, since multiple ports may set
- * the same key, so we reference count them, and clean up at exit.  All 4
- * partition keys are packed into a single infinipath register.  It's an
- * error for a process to set the same pkey multiple times.  We provide no
- * mechanism to de-allocate a pkey at this time, we may eventually need to
- * do that.  I've used the atomic operations, and no locking, and only make
- * a single pass through what's available.  This should be more than
- * adequate for some time. I'll think about spinlocks or the like if and as
- * it's necessary.
- */
-static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
-{
-	struct ipath_devdata *dd = pd->port_dd;
-	int i, any = 0, pidx = -1;
-	u16 lkey = key & 0x7FFF;
-	int ret;
-
-	if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
-		/* nothing to do; this key always valid */
-		ret = 0;
-		goto bail;
-	}
-
-	ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
-		   "%hx:%x %hx:%x %hx:%x %hx:%x\n",
-		   pd->port_port, key, dd->ipath_pkeys[0],
-		   atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
-		   atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
-		   atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
-		   atomic_read(&dd->ipath_pkeyrefs[3]));
-
-	if (!lkey) {
-		ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
-			   pd->port_port);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/*
-	 * Set the full membership bit, because it has to be
-	 * set in the register or the packet, and it seems
-	 * cleaner to set in the register than to force all
-	 * callers to set it. (see bug 4331)
-	 */
-	key |= 0x8000;
-
-	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-		if (!pd->port_pkeys[i] && pidx == -1)
-			pidx = i;
-		if (pd->port_pkeys[i] == key) {
-			ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
-				   "(%x) more than once\n",
-				   pd->port_port, key);
-			ret = -EEXIST;
-			goto bail;
-		}
-	}
-	if (pidx == -1) {
-		ipath_dbg("All pkeys for port %u already in use, "
-			  "can't set %x\n", pd->port_port, key);
-		ret = -EBUSY;
-		goto bail;
-	}
-	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-		if (!dd->ipath_pkeys[i]) {
-			any++;
-			continue;
-		}
-		if (dd->ipath_pkeys[i] == key) {
-			atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
-
-			if (atomic_inc_return(pkrefs) > 1) {
-				pd->port_pkeys[pidx] = key;
-				ipath_cdbg(VERBOSE, "p%u set key %x "
-					   "matches #%d, count now %d\n",
-					   pd->port_port, key, i,
-					   atomic_read(pkrefs));
-				ret = 0;
-				goto bail;
-			} else {
-				/*
-				 * lost race, decrement count, catch below
-				 */
-				atomic_dec(pkrefs);
-				ipath_cdbg(VERBOSE, "Lost race, count was "
-					   "0, after dec, it's %d\n",
-					   atomic_read(pkrefs));
-				any++;
-			}
-		}
-		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
-			/*
-			 * It makes no sense to have both the limited and
-			 * full membership PKEY set at the same time since
-			 * the unlimited one will disable the limited one.
-			 */
-			ret = -EEXIST;
-			goto bail;
-		}
-	}
-	if (!any) {
-		ipath_dbg("port %u, all pkeys already in use, "
-			  "can't set %x\n", pd->port_port, key);
-		ret = -EBUSY;
-		goto bail;
-	}
-	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-		if (!dd->ipath_pkeys[i] &&
-		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
-			u64 pkey;
-
-			/* for ipathstats, etc. */
-			ipath_stats.sps_pkeys[i] = lkey;
-			pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
-			pkey =
-				(u64) dd->ipath_pkeys[0] |
-				((u64) dd->ipath_pkeys[1] << 16) |
-				((u64) dd->ipath_pkeys[2] << 32) |
-				((u64) dd->ipath_pkeys[3] << 48);
-			ipath_cdbg(PROC, "p%u set key %x in #%d, "
-				   "portidx %d, new pkey reg %llx\n",
-				   pd->port_port, key, i, pidx,
-				   (unsigned long long) pkey);
-			ipath_write_kreg(
-				dd, dd->ipath_kregs->kr_partitionkey, pkey);
-
-			ret = 0;
-			goto bail;
-		}
-	}
-	ipath_dbg("port %u, all pkeys already in use 2nd pass, "
-		  "can't set %x\n", pd->port_port, key);
-	ret = -EBUSY;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_manage_rcvq - manage a port's receive queue
- * @pd: the port
- * @subport: the subport
- * @start_stop: action to carry out
- *
- * start_stop == 0 disables receive on the port, for use in queue
- * overflow conditions.  start_stop==1 re-enables, to be used to
- * re-init the software copy of the head register
- */
-static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
-			     int start_stop)
-{
-	struct ipath_devdata *dd = pd->port_dd;
-
-	ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
-		   start_stop ? "en" : "dis", dd->ipath_unit,
-		   pd->port_port, subport);
-	if (subport)
-		goto bail;
-	/* atomically clear receive enable port. */
-	if (start_stop) {
-		/*
-		 * On enable, force in-memory copy of the tail register to
-		 * 0, so that protocol code doesn't have to worry about
-		 * whether or not the chip has yet updated the in-memory
-		 * copy or not on return from the system call. The chip
-		 * always resets it's tail register back to 0 on a
-		 * transition from disabled to enabled.  This could cause a
-		 * problem if software was broken, and did the enable w/o
-		 * the disable, but eventually the in-memory copy will be
-		 * updated and correct itself, even in the face of software
-		 * bugs.
-		 */
-		if (pd->port_rcvhdrtail_kvaddr)
-			ipath_clear_rcvhdrtail(pd);
-		set_bit(dd->ipath_r_portenable_shift + pd->port_port,
-			&dd->ipath_rcvctrl);
-	} else
-		clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
-			  &dd->ipath_rcvctrl);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-			 dd->ipath_rcvctrl);
-	/* now be sure chip saw it before we return */
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	if (start_stop) {
-		/*
-		 * And try to be sure that tail reg update has happened too.
-		 * This should in theory interlock with the RXE changes to
-		 * the tail register.  Don't assign it to the tail register
-		 * in memory copy, since we could overwrite an update by the
-		 * chip if we did.
-		 */
-		ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
-	}
-	/* always; new head should be equal to new tail; see above */
-bail:
-	return 0;
-}
-
-static void ipath_clean_part_key(struct ipath_portdata *pd,
-				 struct ipath_devdata *dd)
-{
-	int i, j, pchanged = 0;
-	u64 oldpkey;
-
-	/* for debugging only */
-	oldpkey = (u64) dd->ipath_pkeys[0] |
-		((u64) dd->ipath_pkeys[1] << 16) |
-		((u64) dd->ipath_pkeys[2] << 32) |
-		((u64) dd->ipath_pkeys[3] << 48);
-
-	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-		if (!pd->port_pkeys[i])
-			continue;
-		ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
-			   pd->port_pkeys[i]);
-		for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
-			/* check for match independent of the global bit */
-			if ((dd->ipath_pkeys[j] & 0x7fff) !=
-			    (pd->port_pkeys[i] & 0x7fff))
-				continue;
-			if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
-				ipath_cdbg(VERBOSE, "p%u clear key "
-					   "%x matches #%d\n",
-					   pd->port_port,
-					   pd->port_pkeys[i], j);
-				ipath_stats.sps_pkeys[j] =
-					dd->ipath_pkeys[j] = 0;
-				pchanged++;
-			}
-			else ipath_cdbg(
-				VERBOSE, "p%u key %x matches #%d, "
-				"but ref still %d\n", pd->port_port,
-				pd->port_pkeys[i], j,
-				atomic_read(&dd->ipath_pkeyrefs[j]));
-			break;
-		}
-		pd->port_pkeys[i] = 0;
-	}
-	if (pchanged) {
-		u64 pkey = (u64) dd->ipath_pkeys[0] |
-			((u64) dd->ipath_pkeys[1] << 16) |
-			((u64) dd->ipath_pkeys[2] << 32) |
-			((u64) dd->ipath_pkeys[3] << 48);
-		ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
-			   "new pkey reg %llx\n", pd->port_port,
-			   (unsigned long long) oldpkey,
-			   (unsigned long long) pkey);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
-				 pkey);
-	}
-}
-
-/*
- * Initialize the port data with the receive buffer sizes
- * so this can be done while the master port is locked.
- * Otherwise, there is a race with a slave opening the port
- * and seeing these fields uninitialized.
- */
-static void init_user_egr_sizes(struct ipath_portdata *pd)
-{
-	struct ipath_devdata *dd = pd->port_dd;
-	unsigned egrperchunk, egrcnt, size;
-
-	/*
-	 * to avoid wasting a lot of memory, we allocate 32KB chunks of
-	 * physically contiguous memory, advance through it until used up
-	 * and then allocate more.  Of course, we need memory to store those
-	 * extra pointers, now.  Started out with 256KB, but under heavy
-	 * memory pressure (creating large files and then copying them over
-	 * NFS while doing lots of MPI jobs), we hit some allocation
-	 * failures, even though we can sleep...  (2.6.10) Still get
-	 * failures at 64K.  32K is the lowest we can go without wasting
-	 * additional memory.
-	 */
-	size = 0x8000;
-	egrperchunk = size / dd->ipath_rcvegrbufsize;
-	egrcnt = dd->ipath_rcvegrcnt;
-	pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
-	pd->port_rcvegrbufs_perchunk = egrperchunk;
-	pd->port_rcvegrbuf_size = size;
-}
-
-/**
- * ipath_create_user_egr - allocate eager TID buffers
- * @pd: the port to allocate TID buffers for
- *
- * This routine is now quite different for user and kernel, because
- * the kernel uses skb's, for the accelerated network performance
- * This is the user port version
- *
- * Allocate the eager TID buffers and program them into infinipath
- * They are no longer completely contiguous, we do multiple allocation
- * calls.
- */
-static int ipath_create_user_egr(struct ipath_portdata *pd)
-{
-	struct ipath_devdata *dd = pd->port_dd;
-	unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
-	size_t size;
-	int ret;
-	gfp_t gfp_flags;
-
-	/*
-	 * GFP_USER, but without GFP_FS, so buffer cache can be
-	 * coalesced (we hope); otherwise, even at order 4,
-	 * heavy filesystem activity makes these fail, and we can
-	 * use compound pages.
-	 */
-	gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
-
-	egrcnt = dd->ipath_rcvegrcnt;
-	/* TID number offset for this port */
-	egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
-	egrsize = dd->ipath_rcvegrbufsize;
-	ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
-		   "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
-
-	chunk = pd->port_rcvegrbuf_chunks;
-	egrperchunk = pd->port_rcvegrbufs_perchunk;
-	size = pd->port_rcvegrbuf_size;
-	pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
-				     GFP_KERNEL);
-	if (!pd->port_rcvegrbuf) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-	pd->port_rcvegrbuf_phys =
-		kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
-			GFP_KERNEL);
-	if (!pd->port_rcvegrbuf_phys) {
-		ret = -ENOMEM;
-		goto bail_rcvegrbuf;
-	}
-	for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
-
-		pd->port_rcvegrbuf[e] = dma_alloc_coherent(
-			&dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
-			gfp_flags);
-
-		if (!pd->port_rcvegrbuf[e]) {
-			ret = -ENOMEM;
-			goto bail_rcvegrbuf_phys;
-		}
-	}
-
-	pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
-
-	for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
-		dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
-		unsigned i;
-
-		for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
-			dd->ipath_f_put_tid(dd, e + egroff +
-					    (u64 __iomem *)
-					    ((char __iomem *)
-					     dd->ipath_kregbase +
-					     dd->ipath_rcvegrbase),
-					    RCVHQ_RCV_TYPE_EAGER, pa);
-			pa += egrsize;
-		}
-		cond_resched();	/* don't hog the cpu */
-	}
-
-	ret = 0;
-	goto bail;
-
-bail_rcvegrbuf_phys:
-	for (e = 0; e < pd->port_rcvegrbuf_chunks &&
-		pd->port_rcvegrbuf[e]; e++) {
-		dma_free_coherent(&dd->pcidev->dev, size,
-				  pd->port_rcvegrbuf[e],
-				  pd->port_rcvegrbuf_phys[e]);
-
-	}
-	kfree(pd->port_rcvegrbuf_phys);
-	pd->port_rcvegrbuf_phys = NULL;
-bail_rcvegrbuf:
-	kfree(pd->port_rcvegrbuf);
-	pd->port_rcvegrbuf = NULL;
-bail:
-	return ret;
-}
-
-
-/* common code for the mappings on dma_alloc_coherent mem */
-static int ipath_mmap_mem(struct vm_area_struct *vma,
-	struct ipath_portdata *pd, unsigned len, int write_ok,
-	void *kvaddr, char *what)
-{
-	struct ipath_devdata *dd = pd->port_dd;
-	unsigned long pfn;
-	int ret;
-
-	if ((vma->vm_end - vma->vm_start) > len) {
-		dev_info(&dd->pcidev->dev,
-		         "FAIL on %s: len %lx > %x\n", what,
-			 vma->vm_end - vma->vm_start, len);
-		ret = -EFAULT;
-		goto bail;
-	}
-
-	if (!write_ok) {
-		if (vma->vm_flags & VM_WRITE) {
-			dev_info(&dd->pcidev->dev,
-				 "%s must be mapped readonly\n", what);
-			ret = -EPERM;
-			goto bail;
-		}
-
-		/* don't allow them to later change with mprotect */
-		vma->vm_flags &= ~VM_MAYWRITE;
-	}
-
-	pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
-	ret = remap_pfn_range(vma, vma->vm_start, pfn,
-			      len, vma->vm_page_prot);
-	if (ret)
-		dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
-			 "bytes r%c failed: %d\n", what, pd->port_port,
-			 pfn, len, write_ok?'w':'o', ret);
-	else
-		ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
-			   "r%c\n", what, pd->port_port, pfn, len,
-			   write_ok?'w':'o');
-bail:
-	return ret;
-}
-
-static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
-		     u64 ureg)
-{
-	unsigned long phys;
-	int ret;
-
-	/*
-	 * This is real hardware, so use io_remap.  This is the mechanism
-	 * for the user process to update the head registers for their port
-	 * in the chip.
-	 */
-	if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
-		dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
-			 "%lx > PAGE\n", vma->vm_end - vma->vm_start);
-		ret = -EFAULT;
-	} else {
-		phys = dd->ipath_physaddr + ureg;
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-		ret = io_remap_pfn_range(vma, vma->vm_start,
-					 phys >> PAGE_SHIFT,
-					 vma->vm_end - vma->vm_start,
-					 vma->vm_page_prot);
-	}
-	return ret;
-}
-
-static int mmap_piobufs(struct vm_area_struct *vma,
-			struct ipath_devdata *dd,
-			struct ipath_portdata *pd,
-			unsigned piobufs, unsigned piocnt)
-{
-	unsigned long phys;
-	int ret;
-
-	/*
-	 * When we map the PIO buffers in the chip, we want to map them as
-	 * writeonly, no read possible.   This prevents access to previous
-	 * process data, and catches users who might try to read the i/o
-	 * space due to a bug.
-	 */
-	if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
-		dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
-			 "reqlen %lx > PAGE\n",
-			 vma->vm_end - vma->vm_start);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	phys = dd->ipath_physaddr + piobufs;
-
-#if defined(__powerpc__)
-	/* There isn't a generic way to specify writethrough mappings */
-	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
-	pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
-	pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
-#endif
-
-	/*
-	 * don't allow them to later change to readable with mprotect (for when
-	 * not initially mapped readable, as is normally the case)
-	 */
-	vma->vm_flags &= ~VM_MAYREAD;
-	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-
-	ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
-				 vma->vm_end - vma->vm_start,
-				 vma->vm_page_prot);
-bail:
-	return ret;
-}
-
-static int mmap_rcvegrbufs(struct vm_area_struct *vma,
-			   struct ipath_portdata *pd)
-{
-	struct ipath_devdata *dd = pd->port_dd;
-	unsigned long start, size;
-	size_t total_size, i;
-	unsigned long pfn;
-	int ret;
-
-	size = pd->port_rcvegrbuf_size;
-	total_size = pd->port_rcvegrbuf_chunks * size;
-	if ((vma->vm_end - vma->vm_start) > total_size) {
-		dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
-			 "reqlen %lx > actual %lx\n",
-			 vma->vm_end - vma->vm_start,
-			 (unsigned long) total_size);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	if (vma->vm_flags & VM_WRITE) {
-		dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
-			 "writable (flags=%lx)\n", vma->vm_flags);
-		ret = -EPERM;
-		goto bail;
-	}
-	/* don't allow them to later change to writeable with mprotect */
-	vma->vm_flags &= ~VM_MAYWRITE;
-
-	start = vma->vm_start;
-
-	for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
-		pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
-		ret = remap_pfn_range(vma, start, pfn, size,
-				      vma->vm_page_prot);
-		if (ret < 0)
-			goto bail;
-	}
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/*
- * ipath_file_vma_fault - handle a VMA page fault.
- */
-static int ipath_file_vma_fault(struct vm_area_struct *vma,
-					struct vm_fault *vmf)
-{
-	struct page *page;
-
-	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
-	if (!page)
-		return VM_FAULT_SIGBUS;
-	get_page(page);
-	vmf->page = page;
-
-	return 0;
-}
-
-static const struct vm_operations_struct ipath_file_vm_ops = {
-	.fault = ipath_file_vma_fault,
-};
-
-static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
-		       struct ipath_portdata *pd, unsigned subport)
-{
-	unsigned long len;
-	struct ipath_devdata *dd;
-	void *addr;
-	size_t size;
-	int ret = 0;
-
-	/* If the port is not shared, all addresses should be physical */
-	if (!pd->port_subport_cnt)
-		goto bail;
-
-	dd = pd->port_dd;
-	size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
-
-	/*
-	 * Each process has all the subport uregbase, rcvhdrq, and
-	 * rcvegrbufs mmapped - as an array for all the processes,
-	 * and also separately for this process.
-	 */
-	if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
-		addr = pd->subport_uregbase;
-		size = PAGE_SIZE * pd->port_subport_cnt;
-	} else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
-		addr = pd->subport_rcvhdr_base;
-		size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
-	} else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
-		addr = pd->subport_rcvegrbuf;
-		size *= pd->port_subport_cnt;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
-                                        PAGE_SIZE * subport)) {
-                addr = pd->subport_uregbase + PAGE_SIZE * subport;
-                size = PAGE_SIZE;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
-                                pd->port_rcvhdrq_size * subport)) {
-                addr = pd->subport_rcvhdr_base +
-                        pd->port_rcvhdrq_size * subport;
-                size = pd->port_rcvhdrq_size;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
-                               size * subport)) {
-                addr = pd->subport_rcvegrbuf + size * subport;
-                /* rcvegrbufs are read-only on the slave */
-                if (vma->vm_flags & VM_WRITE) {
-                        dev_info(&dd->pcidev->dev,
-                                 "Can't map eager buffers as "
-                                 "writable (flags=%lx)\n", vma->vm_flags);
-                        ret = -EPERM;
-                        goto bail;
-                }
-                /*
-                 * Don't allow permission to later change to writeable
-                 * with mprotect.
-                 */
-                vma->vm_flags &= ~VM_MAYWRITE;
-	} else {
-		goto bail;
-	}
-	len = vma->vm_end - vma->vm_start;
-	if (len > size) {
-		ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
-	vma->vm_ops = &ipath_file_vm_ops;
-	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
-	ret = 1;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_mmap - mmap various structures into user space
- * @fp: the file pointer
- * @vma: the VM area
- *
- * We use this to have a shared buffer between the kernel and the user code
- * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
- * buffers in the chip.  We have the open and close entries so we can bump
- * the ref count and keep the driver from being unloaded while still mapped.
- */
-static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
-{
-	struct ipath_portdata *pd;
-	struct ipath_devdata *dd;
-	u64 pgaddr, ureg;
-	unsigned piobufs, piocnt;
-	int ret;
-
-	pd = port_fp(fp);
-	if (!pd) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	dd = pd->port_dd;
-
-	/*
-	 * This is the ipath_do_user_init() code, mapping the shared buffers
-	 * into the user process. The address referred to by vm_pgoff is the
-	 * file offset passed via mmap().  For shared ports, this is the
-	 * kernel vmalloc() address of the pages to share with the master.
-	 * For non-shared or master ports, this is a physical address.
-	 * We only do one mmap for each space mapped.
-	 */
-	pgaddr = vma->vm_pgoff << PAGE_SHIFT;
-
-	/*
-	 * Check for 0 in case one of the allocations failed, but user
-	 * called mmap anyway.
-	 */
-	if (!pgaddr)  {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
-		   (unsigned long long) pgaddr, vma->vm_start,
-		   vma->vm_end - vma->vm_start, dd->ipath_unit,
-		   pd->port_port, subport_fp(fp));
-
-	/*
-	 * Physical addresses must fit in 40 bits for our hardware.
-	 * Check for kernel virtual addresses first, anything else must
-	 * match a HW or memory address.
-	 */
-	ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
-	if (ret) {
-		if (ret > 0)
-			ret = 0;
-		goto bail;
-	}
-
-	ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
-	if (!pd->port_subport_cnt) {
-		/* port is not shared */
-		piocnt = pd->port_piocnt;
-		piobufs = pd->port_piobufs;
-	} else if (!subport_fp(fp)) {
-		/* caller is the master */
-		piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
-			 (pd->port_piocnt % pd->port_subport_cnt);
-		piobufs = pd->port_piobufs +
-			dd->ipath_palign * (pd->port_piocnt - piocnt);
-	} else {
-		unsigned slave = subport_fp(fp) - 1;
-
-		/* caller is a slave */
-		piocnt = pd->port_piocnt / pd->port_subport_cnt;
-		piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
-	}
-
-	if (pgaddr == ureg)
-		ret = mmap_ureg(vma, dd, ureg);
-	else if (pgaddr == piobufs)
-		ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
-	else if (pgaddr == dd->ipath_pioavailregs_phys)
-		/* in-memory copy of pioavail registers */
-		ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
-			      	     (void *) dd->ipath_pioavailregs_dma,
-				     "pioavail registers");
-	else if (pgaddr == pd->port_rcvegr_phys)
-		ret = mmap_rcvegrbufs(vma, pd);
-	else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
-		/*
-		 * The rcvhdrq itself; readonly except on HT (so have
-		 * to allow writable mapping), multiple pages, contiguous
-		 * from an i/o perspective.
-		 */
-		ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
-				     pd->port_rcvhdrq,
-				     "rcvhdrq");
-	else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
-		/* in-memory copy of rcvhdrq tail register */
-		ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
-				     pd->port_rcvhdrtail_kvaddr,
-				     "rcvhdrq tail");
-	else
-		ret = -EINVAL;
-
-	vma->vm_private_data = NULL;
-
-	if (ret < 0)
-		dev_info(&dd->pcidev->dev,
-			 "Failure %d on off %llx len %lx\n",
-			 -ret, (unsigned long long)pgaddr,
-			 vma->vm_end - vma->vm_start);
-bail:
-	return ret;
-}
-
-static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
-{
-	unsigned pollflag = 0;
-
-	if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
-	    pd->port_hdrqfull != pd->port_hdrqfull_poll) {
-		pollflag |= POLLIN | POLLRDNORM;
-		pd->port_hdrqfull_poll = pd->port_hdrqfull;
-	}
-
-	return pollflag;
-}
-
-static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
-				      struct file *fp,
-				      struct poll_table_struct *pt)
-{
-	unsigned pollflag = 0;
-	struct ipath_devdata *dd;
-
-	dd = pd->port_dd;
-
-	/* variable access in ipath_poll_hdrqfull() needs this */
-	rmb();
-	pollflag = ipath_poll_hdrqfull(pd);
-
-	if (pd->port_urgent != pd->port_urgent_poll) {
-		pollflag |= POLLIN | POLLRDNORM;
-		pd->port_urgent_poll = pd->port_urgent;
-	}
-
-	if (!pollflag) {
-		/* this saves a spin_lock/unlock in interrupt handler... */
-		set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
-		/* flush waiting flag so don't miss an event... */
-		wmb();
-		poll_wait(fp, &pd->port_wait, pt);
-	}
-
-	return pollflag;
-}
-
-static unsigned int ipath_poll_next(struct ipath_portdata *pd,
-				    struct file *fp,
-				    struct poll_table_struct *pt)
-{
-	u32 head;
-	u32 tail;
-	unsigned pollflag = 0;
-	struct ipath_devdata *dd;
-
-	dd = pd->port_dd;
-
-	/* variable access in ipath_poll_hdrqfull() needs this */
-	rmb();
-	pollflag = ipath_poll_hdrqfull(pd);
-
-	head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
-	if (pd->port_rcvhdrtail_kvaddr)
-		tail = ipath_get_rcvhdrtail(pd);
-	else
-		tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
-
-	if (head != tail)
-		pollflag |= POLLIN | POLLRDNORM;
-	else {
-		/* this saves a spin_lock/unlock in interrupt handler */
-		set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
-		/* flush waiting flag so we don't miss an event */
-		wmb();
-
-		set_bit(pd->port_port + dd->ipath_r_intravail_shift,
-			&dd->ipath_rcvctrl);
-
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-				 dd->ipath_rcvctrl);
-
-		if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
-			ipath_write_ureg(dd, ur_rcvhdrhead,
-					 dd->ipath_rhdrhead_intr_off | head,
-					 pd->port_port);
-
-		poll_wait(fp, &pd->port_wait, pt);
-	}
-
-	return pollflag;
-}
-
-static unsigned int ipath_poll(struct file *fp,
-			       struct poll_table_struct *pt)
-{
-	struct ipath_portdata *pd;
-	unsigned pollflag;
-
-	pd = port_fp(fp);
-	if (!pd)
-		pollflag = 0;
-	else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
-		pollflag = ipath_poll_urgent(pd, fp, pt);
-	else
-		pollflag = ipath_poll_next(pd, fp, pt);
-
-	return pollflag;
-}
-
-static int ipath_supports_subports(int user_swmajor, int user_swminor)
-{
-	/* no subport implementation prior to software version 1.3 */
-	return (user_swmajor > 1) || (user_swminor >= 3);
-}
-
-static int ipath_compatible_subports(int user_swmajor, int user_swminor)
-{
-	/* this code is written long-hand for clarity */
-	if (IPATH_USER_SWMAJOR != user_swmajor) {
-		/* no promise of compatibility if major mismatch */
-		return 0;
-	}
-	if (IPATH_USER_SWMAJOR == 1) {
-		switch (IPATH_USER_SWMINOR) {
-		case 0:
-		case 1:
-		case 2:
-			/* no subport implementation so cannot be compatible */
-			return 0;
-		case 3:
-			/* 3 is only compatible with itself */
-			return user_swminor == 3;
-		default:
-			/* >= 4 are compatible (or are expected to be) */
-			return user_swminor >= 4;
-		}
-	}
-	/* make no promises yet for future major versions */
-	return 0;
-}
-
-static int init_subports(struct ipath_devdata *dd,
-			 struct ipath_portdata *pd,
-			 const struct ipath_user_info *uinfo)
-{
-	int ret = 0;
-	unsigned num_subports;
-	size_t size;
-
-	/*
-	 * If the user is requesting zero subports,
-	 * skip the subport allocation.
-	 */
-	if (uinfo->spu_subport_cnt <= 0)
-		goto bail;
-
-	/* Self-consistency check for ipath_compatible_subports() */
-	if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
-	    !ipath_compatible_subports(IPATH_USER_SWMAJOR,
-				       IPATH_USER_SWMINOR)) {
-		dev_info(&dd->pcidev->dev,
-			 "Inconsistent ipath_compatible_subports()\n");
-		goto bail;
-	}
-
-	/* Check for subport compatibility */
-	if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
-				       uinfo->spu_userversion & 0xffff)) {
-		dev_info(&dd->pcidev->dev,
-			 "Mismatched user version (%d.%d) and driver "
-			 "version (%d.%d) while port sharing. Ensure "
-                         "that driver and library are from the same "
-                         "release.\n",
-			 (int) (uinfo->spu_userversion >> 16),
-                         (int) (uinfo->spu_userversion & 0xffff),
-			 IPATH_USER_SWMAJOR,
-	                 IPATH_USER_SWMINOR);
-		goto bail;
-	}
-	if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	num_subports = uinfo->spu_subport_cnt;
-	pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
-	if (!pd->subport_uregbase) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-	/* Note: pd->port_rcvhdrq_size isn't initialized yet. */
-	size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
-		     sizeof(u32), PAGE_SIZE) * num_subports;
-	pd->subport_rcvhdr_base = vzalloc(size);
-	if (!pd->subport_rcvhdr_base) {
-		ret = -ENOMEM;
-		goto bail_ureg;
-	}
-
-	pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
-					pd->port_rcvegrbuf_size *
-					num_subports);
-	if (!pd->subport_rcvegrbuf) {
-		ret = -ENOMEM;
-		goto bail_rhdr;
-	}
-
-	pd->port_subport_cnt = uinfo->spu_subport_cnt;
-	pd->port_subport_id = uinfo->spu_subport_id;
-	pd->active_slaves = 1;
-	set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
-	goto bail;
-
-bail_rhdr:
-	vfree(pd->subport_rcvhdr_base);
-bail_ureg:
-	vfree(pd->subport_uregbase);
-	pd->subport_uregbase = NULL;
-bail:
-	return ret;
-}
-
-static int try_alloc_port(struct ipath_devdata *dd, int port,
-			  struct file *fp,
-			  const struct ipath_user_info *uinfo)
-{
-	struct ipath_portdata *pd;
-	int ret;
-
-	if (!(pd = dd->ipath_pd[port])) {
-		void *ptmp;
-
-		pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
-
-		/*
-		 * Allocate memory for use in ipath_tid_update() just once
-		 * at open, not per call.  Reduces cost of expected send
-		 * setup.
-		 */
-		ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
-			       dd->ipath_rcvtidcnt * sizeof(struct page **),
-			       GFP_KERNEL);
-		if (!pd || !ptmp) {
-			ipath_dev_err(dd, "Unable to allocate portdata "
-				      "memory, failing open\n");
-			ret = -ENOMEM;
-			kfree(pd);
-			kfree(ptmp);
-			goto bail;
-		}
-		dd->ipath_pd[port] = pd;
-		dd->ipath_pd[port]->port_port = port;
-		dd->ipath_pd[port]->port_dd = dd;
-		dd->ipath_pd[port]->port_tid_pg_list = ptmp;
-		init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
-	}
-	if (!pd->port_cnt) {
-		pd->userversion = uinfo->spu_userversion;
-		init_user_egr_sizes(pd);
-		if ((ret = init_subports(dd, pd, uinfo)) != 0)
-			goto bail;
-		ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
-			   current->comm, current->pid, dd->ipath_unit,
-			   port);
-		pd->port_cnt = 1;
-		port_fp(fp) = pd;
-		pd->port_pid = get_pid(task_pid(current));
-		strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
-		ipath_stats.sps_ports++;
-		ret = 0;
-	} else
-		ret = -EBUSY;
-
-bail:
-	return ret;
-}
-
-static inline int usable(struct ipath_devdata *dd)
-{
-	return dd &&
-		(dd->ipath_flags & IPATH_PRESENT) &&
-		dd->ipath_kregbase &&
-		dd->ipath_lid &&
-		!(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
-				     | IPATH_LINKUNK));
-}
-
-static int find_free_port(int unit, struct file *fp,
-			  const struct ipath_user_info *uinfo)
-{
-	struct ipath_devdata *dd = ipath_lookup(unit);
-	int ret, i;
-
-	if (!dd) {
-		ret = -ENODEV;
-		goto bail;
-	}
-
-	if (!usable(dd)) {
-		ret = -ENETDOWN;
-		goto bail;
-	}
-
-	for (i = 1; i < dd->ipath_cfgports; i++) {
-		ret = try_alloc_port(dd, i, fp, uinfo);
-		if (ret != -EBUSY)
-			goto bail;
-	}
-	ret = -EBUSY;
-
-bail:
-	return ret;
-}
-
-static int find_best_unit(struct file *fp,
-			  const struct ipath_user_info *uinfo)
-{
-	int ret = 0, i, prefunit = -1, devmax;
-	int maxofallports, npresent, nup;
-	int ndev;
-
-	devmax = ipath_count_units(&npresent, &nup, &maxofallports);
-
-	/*
-	 * This code is present to allow a knowledgeable person to
-	 * specify the layout of processes to processors before opening
-	 * this driver, and then we'll assign the process to the "closest"
-	 * InfiniPath chip to that processor (we assume reasonable connectivity,
-	 * for now).  This code assumes that if affinity has been set
-	 * before this point, that at most one cpu is set; for now this
-	 * is reasonable.  I check for both cpumask_empty() and cpumask_full(),
-	 * in case some kernel variant sets none of the bits when no
-	 * affinity is set.  2.6.11 and 12 kernels have all present
-	 * cpus set.  Some day we'll have to fix it up further to handle
-	 * a cpu subset.  This algorithm fails for two HT chips connected
-	 * in tunnel fashion.  Eventually this needs real topology
-	 * information.  There may be some issues with dual core numbering
-	 * as well.  This needs more work prior to release.
-	 */
-	if (!cpumask_empty(tsk_cpus_allowed(current)) &&
-	    !cpumask_full(tsk_cpus_allowed(current))) {
-		int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
-		get_online_cpus();
-		for_each_online_cpu(i)
-			if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
-				ipath_cdbg(PROC, "%s[%u] affinity set for "
-					   "cpu %d/%d\n", current->comm,
-					   current->pid, i, ncpus);
-				curcpu = i;
-				nset++;
-			}
-		put_online_cpus();
-		if (curcpu != -1 && nset != ncpus) {
-			if (npresent) {
-				prefunit = curcpu / (ncpus / npresent);
-				ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
-					  "%d cpus/chip, select unit %d\n",
-					  current->comm, current->pid,
-					  npresent, ncpus, ncpus / npresent,
-					  prefunit);
-			}
-		}
-	}
-
-	/*
-	 * user ports start at 1, kernel port is 0
-	 * For now, we do round-robin access across all chips
-	 */
-
-	if (prefunit != -1)
-		devmax = prefunit + 1;
-recheck:
-	for (i = 1; i < maxofallports; i++) {
-		for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
-		     ndev++) {
-			struct ipath_devdata *dd = ipath_lookup(ndev);
-
-			if (!usable(dd))
-				continue; /* can't use this unit */
-			if (i >= dd->ipath_cfgports)
-				/*
-				 * Maxed out on users of this unit. Try
-				 * next.
-				 */
-				continue;
-			ret = try_alloc_port(dd, i, fp, uinfo);
-			if (!ret)
-				goto done;
-		}
-	}
-
-	if (npresent) {
-		if (nup == 0) {
-			ret = -ENETDOWN;
-			ipath_dbg("No ports available (none initialized "
-				  "and ready)\n");
-		} else {
-			if (prefunit > 0) {
-				/* if started above 0, retry from 0 */
-				ipath_cdbg(PROC,
-					   "%s[%u] no ports on prefunit "
-					   "%d, clear and re-check\n",
-					   current->comm, current->pid,
-					   prefunit);
-				devmax = ipath_count_units(NULL, NULL,
-							   NULL);
-				prefunit = -1;
-				goto recheck;
-			}
-			ret = -EBUSY;
-			ipath_dbg("No ports available\n");
-		}
-	} else {
-		ret = -ENXIO;
-		ipath_dbg("No boards found\n");
-	}
-
-done:
-	return ret;
-}
-
-static int find_shared_port(struct file *fp,
-			    const struct ipath_user_info *uinfo)
-{
-	int devmax, ndev, i;
-	int ret = 0;
-
-	devmax = ipath_count_units(NULL, NULL, NULL);
-
-	for (ndev = 0; ndev < devmax; ndev++) {
-		struct ipath_devdata *dd = ipath_lookup(ndev);
-
-		if (!usable(dd))
-			continue;
-		for (i = 1; i < dd->ipath_cfgports; i++) {
-			struct ipath_portdata *pd = dd->ipath_pd[i];
-
-			/* Skip ports which are not yet open */
-			if (!pd || !pd->port_cnt)
-				continue;
-			/* Skip port if it doesn't match the requested one */
-			if (pd->port_subport_id != uinfo->spu_subport_id)
-				continue;
-			/* Verify the sharing process matches the master */
-			if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
-			    pd->userversion != uinfo->spu_userversion ||
-			    pd->port_cnt >= pd->port_subport_cnt) {
-				ret = -EINVAL;
-				goto done;
-			}
-			port_fp(fp) = pd;
-			subport_fp(fp) = pd->port_cnt++;
-			pd->port_subpid[subport_fp(fp)] =
-				get_pid(task_pid(current));
-			tidcursor_fp(fp) = 0;
-			pd->active_slaves |= 1 << subport_fp(fp);
-			ipath_cdbg(PROC,
-				   "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
-				   current->comm, current->pid,
-				   subport_fp(fp),
-				   pd->port_comm, pid_nr(pd->port_pid),
-				   dd->ipath_unit, pd->port_port);
-			ret = 1;
-			goto done;
-		}
-	}
-
-done:
-	return ret;
-}
-
-static int ipath_open(struct inode *in, struct file *fp)
-{
-	/* The real work is performed later in ipath_assign_port() */
-	fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
-	return fp->private_data ? 0 : -ENOMEM;
-}
-
-/* Get port early, so can set affinity prior to memory allocation */
-static int ipath_assign_port(struct file *fp,
-			      const struct ipath_user_info *uinfo)
-{
-	int ret;
-	int i_minor;
-	unsigned swmajor, swminor;
-
-	/* Check to be sure we haven't already initialized this file */
-	if (port_fp(fp)) {
-		ret = -EINVAL;
-		goto done;
-	}
-
-	/* for now, if major version is different, bail */
-	swmajor = uinfo->spu_userversion >> 16;
-	if (swmajor != IPATH_USER_SWMAJOR) {
-		ipath_dbg("User major version %d not same as driver "
-			  "major %d\n", uinfo->spu_userversion >> 16,
-			  IPATH_USER_SWMAJOR);
-		ret = -ENODEV;
-		goto done;
-	}
-
-	swminor = uinfo->spu_userversion & 0xffff;
-	if (swminor != IPATH_USER_SWMINOR)
-		ipath_dbg("User minor version %d not same as driver "
-			  "minor %d\n", swminor, IPATH_USER_SWMINOR);
-
-	mutex_lock(&ipath_mutex);
-
-	if (ipath_compatible_subports(swmajor, swminor) &&
-	    uinfo->spu_subport_cnt &&
-	    (ret = find_shared_port(fp, uinfo))) {
-		if (ret > 0)
-			ret = 0;
-		goto done_chk_sdma;
-	}
-
-	i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE;
-	ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
-		   (long)file_inode(fp)->i_rdev, i_minor);
-
-	if (i_minor)
-		ret = find_free_port(i_minor - 1, fp, uinfo);
-	else
-		ret = find_best_unit(fp, uinfo);
-
-done_chk_sdma:
-	if (!ret) {
-		struct ipath_filedata *fd = fp->private_data;
-		const struct ipath_portdata *pd = fd->pd;
-		const struct ipath_devdata *dd = pd->port_dd;
-
-		fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
-						      dd->ipath_unit,
-						      pd->port_port,
-						      fd->subport);
-
-		if (!fd->pq)
-			ret = -ENOMEM;
-	}
-
-	mutex_unlock(&ipath_mutex);
-
-done:
-	return ret;
-}
-
-
-static int ipath_do_user_init(struct file *fp,
-			      const struct ipath_user_info *uinfo)
-{
-	int ret;
-	struct ipath_portdata *pd = port_fp(fp);
-	struct ipath_devdata *dd;
-	u32 head32;
-
-	/* Subports don't need to initialize anything since master did it. */
-	if (subport_fp(fp)) {
-		ret = wait_event_interruptible(pd->port_wait,
-			!test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
-		goto done;
-	}
-
-	dd = pd->port_dd;
-
-	if (uinfo->spu_rcvhdrsize) {
-		ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
-		if (ret)
-			goto done;
-	}
-
-	/* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
-
-	/* some ports may get extra buffers, calculate that here */
-	if (pd->port_port <= dd->ipath_ports_extrabuf)
-		pd->port_piocnt = dd->ipath_pbufsport + 1;
-	else
-		pd->port_piocnt = dd->ipath_pbufsport;
-
-	/* for right now, kernel piobufs are at end, so port 1 is at 0 */
-	if (pd->port_port <= dd->ipath_ports_extrabuf)
-		pd->port_pio_base = (dd->ipath_pbufsport + 1)
-			* (pd->port_port - 1);
-	else
-		pd->port_pio_base = dd->ipath_ports_extrabuf +
-			dd->ipath_pbufsport * (pd->port_port - 1);
-	pd->port_piobufs = dd->ipath_piobufbase +
-		pd->port_pio_base * dd->ipath_palign;
-	ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
-		" first pio %u\n", pd->port_port, pd->port_piobufs,
-		pd->port_piocnt, pd->port_pio_base);
-	ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
-
-	/*
-	 * Now allocate the rcvhdr Q and eager TIDs; skip the TID
-	 * array for time being.  If pd->port_port > chip-supported,
-	 * we need to do extra stuff here to handle by handling overflow
-	 * through port 0, someday
-	 */
-	ret = ipath_create_rcvhdrq(dd, pd);
-	if (!ret)
-		ret = ipath_create_user_egr(pd);
-	if (ret)
-		goto done;
-
-	/*
-	 * set the eager head register for this port to the current values
-	 * of the tail pointers, since we don't know if they were
-	 * updated on last use of the port.
-	 */
-	head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
-	ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
-	pd->port_lastrcvhdrqtail = -1;
-	ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
-		pd->port_port, head32);
-	pd->port_tidcursor = 0;	/* start at beginning after open */
-
-	/* initialize poll variables... */
-	pd->port_urgent = 0;
-	pd->port_urgent_poll = 0;
-	pd->port_hdrqfull_poll = pd->port_hdrqfull;
-
-	/*
-	 * Now enable the port for receive.
-	 * For chips that are set to DMA the tail register to memory
-	 * when they change (and when the update bit transitions from
-	 * 0 to 1.  So for those chips, we turn it off and then back on.
-	 * This will (very briefly) affect any other open ports, but the
-	 * duration is very short, and therefore isn't an issue.  We
-	 * explicitly set the in-memory tail copy to 0 beforehand, so we
-	 * don't have to wait to be sure the DMA update has happened
-	 * (chip resets head/tail to 0 on transition to enable).
-	 */
-	set_bit(dd->ipath_r_portenable_shift + pd->port_port,
-		&dd->ipath_rcvctrl);
-	if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-		if (pd->port_rcvhdrtail_kvaddr)
-			ipath_clear_rcvhdrtail(pd);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-			dd->ipath_rcvctrl &
-			~(1ULL << dd->ipath_r_tailupd_shift));
-	}
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-			 dd->ipath_rcvctrl);
-	/* Notify any waiting slaves */
-	if (pd->port_subport_cnt) {
-		clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
-		wake_up(&pd->port_wait);
-	}
-done:
-	return ret;
-}
-
-/**
- * unlock_exptid - unlock any expected TID entries port still had in use
- * @pd: port
- *
- * We don't actually update the chip here, because we do a bulk update
- * below, using ipath_f_clear_tids.
- */
-static void unlock_expected_tids(struct ipath_portdata *pd)
-{
-	struct ipath_devdata *dd = pd->port_dd;
-	int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
-	int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
-
-	ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
-		   pd->port_port);
-	for (i = port_tidbase; i < maxtid; i++) {
-		struct page *ps = dd->ipath_pageshadow[i];
-
-		if (!ps)
-			continue;
-
-		dd->ipath_pageshadow[i] = NULL;
-		pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
-			PAGE_SIZE, PCI_DMA_FROMDEVICE);
-		ipath_release_user_pages_on_close(&ps, 1);
-		cnt++;
-		ipath_stats.sps_pageunlocks++;
-	}
-	if (cnt)
-		ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
-			   pd->port_port, cnt);
-
-	if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
-		ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
-			   (unsigned long long) ipath_stats.sps_pagelocks,
-			   (unsigned long long)
-			   ipath_stats.sps_pageunlocks);
-}
-
-static int ipath_close(struct inode *in, struct file *fp)
-{
-	int ret = 0;
-	struct ipath_filedata *fd;
-	struct ipath_portdata *pd;
-	struct ipath_devdata *dd;
-	unsigned long flags;
-	unsigned port;
-	struct pid *pid;
-
-	ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
-		   (long)in->i_rdev, fp->private_data);
-
-	mutex_lock(&ipath_mutex);
-
-	fd = fp->private_data;
-	fp->private_data = NULL;
-	pd = fd->pd;
-	if (!pd) {
-		mutex_unlock(&ipath_mutex);
-		goto bail;
-	}
-
-	dd = pd->port_dd;
-
-	/* drain user sdma queue */
-	ipath_user_sdma_queue_drain(dd, fd->pq);
-	ipath_user_sdma_queue_destroy(fd->pq);
-
-	if (--pd->port_cnt) {
-		/*
-		 * XXX If the master closes the port before the slave(s),
-		 * revoke the mmap for the eager receive queue so
-		 * the slave(s) don't wait for receive data forever.
-		 */
-		pd->active_slaves &= ~(1 << fd->subport);
-		put_pid(pd->port_subpid[fd->subport]);
-		pd->port_subpid[fd->subport] = NULL;
-		mutex_unlock(&ipath_mutex);
-		goto bail;
-	}
-	/* early; no interrupt users after this */
-	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-	port = pd->port_port;
-	dd->ipath_pd[port] = NULL;
-	pid = pd->port_pid;
-	pd->port_pid = NULL;
-	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-
-	if (pd->port_rcvwait_to || pd->port_piowait_to
-	    || pd->port_rcvnowait || pd->port_pionowait) {
-		ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
-			   "%u rcv %u, pio already\n",
-			   pd->port_port, pd->port_rcvwait_to,
-			   pd->port_piowait_to, pd->port_rcvnowait,
-			   pd->port_pionowait);
-		pd->port_rcvwait_to = pd->port_piowait_to =
-			pd->port_rcvnowait = pd->port_pionowait = 0;
-	}
-	if (pd->port_flag) {
-		ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
-			  pd->port_port, pd->port_flag);
-		pd->port_flag = 0;
-	}
-
-	if (dd->ipath_kregbase) {
-		/* atomically clear receive enable port and intr avail. */
-		clear_bit(dd->ipath_r_portenable_shift + port,
-			  &dd->ipath_rcvctrl);
-		clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
-			  &dd->ipath_rcvctrl);
-		ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
-			dd->ipath_rcvctrl);
-		/* and read back from chip to be sure that nothing
-		 * else is in flight when we do the rest */
-		(void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-		/* clean up the pkeys for this port user */
-		ipath_clean_part_key(pd, dd);
-		/*
-		 * be paranoid, and never write 0's to these, just use an
-		 * unused part of the port 0 tail page.  Of course,
-		 * rcvhdraddr points to a large chunk of memory, so this
-		 * could still trash things, but at least it won't trash
-		 * page 0, and by disabling the port, it should stop "soon",
-		 * even if a packet or two is in already in flight after we
-		 * disabled the port.
-		 */
-		ipath_write_kreg_port(dd,
-		        dd->ipath_kregs->kr_rcvhdrtailaddr, port,
-			dd->ipath_dummy_hdrq_phys);
-		ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
-			pd->port_port, dd->ipath_dummy_hdrq_phys);
-
-		ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
-		ipath_chg_pioavailkernel(dd, pd->port_pio_base,
-			pd->port_piocnt, 1);
-
-		dd->ipath_f_clear_tids(dd, pd->port_port);
-
-		if (dd->ipath_pageshadow)
-			unlock_expected_tids(pd);
-		ipath_stats.sps_ports--;
-		ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
-			   pd->port_comm, pid_nr(pid),
-			   dd->ipath_unit, port);
-	}
-
-	put_pid(pid);
-	mutex_unlock(&ipath_mutex);
-	ipath_free_pddata(dd, pd); /* after releasing the mutex */
-
-bail:
-	kfree(fd);
-	return ret;
-}
-
-static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
-			   struct ipath_port_info __user *uinfo)
-{
-	struct ipath_port_info info;
-	int nup;
-	int ret;
-	size_t sz;
-
-	(void) ipath_count_units(NULL, &nup, NULL);
-	info.num_active = nup;
-	info.unit = pd->port_dd->ipath_unit;
-	info.port = pd->port_port;
-	info.subport = subport;
-	/* Don't return new fields if old library opened the port. */
-	if (ipath_supports_subports(pd->userversion >> 16,
-				    pd->userversion & 0xffff)) {
-		/* Number of user ports available for this device. */
-		info.num_ports = pd->port_dd->ipath_cfgports - 1;
-		info.num_subports = pd->port_subport_cnt;
-		sz = sizeof(info);
-	} else
-		sz = sizeof(info) - 2 * sizeof(u16);
-
-	if (copy_to_user(uinfo, &info, sz)) {
-		ret = -EFAULT;
-		goto bail;
-	}
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-static int ipath_get_slave_info(struct ipath_portdata *pd,
-				void __user *slave_mask_addr)
-{
-	int ret = 0;
-
-	if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
-		ret = -EFAULT;
-	return ret;
-}
-
-static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
-				   u32 __user *inflightp)
-{
-	const u32 val = ipath_user_sdma_inflight_counter(pq);
-
-	if (put_user(val, inflightp))
-		return -EFAULT;
-
-	return 0;
-}
-
-static int ipath_sdma_get_complete(struct ipath_devdata *dd,
-				   struct ipath_user_sdma_queue *pq,
-				   u32 __user *completep)
-{
-	u32 val;
-	int err;
-
-	err = ipath_user_sdma_make_progress(dd, pq);
-	if (err < 0)
-		return err;
-
-	val = ipath_user_sdma_complete_counter(pq);
-	if (put_user(val, completep))
-		return -EFAULT;
-
-	return 0;
-}
-
-static ssize_t ipath_write(struct file *fp, const char __user *data,
-			   size_t count, loff_t *off)
-{
-	const struct ipath_cmd __user *ucmd;
-	struct ipath_portdata *pd;
-	const void __user *src;
-	size_t consumed, copy;
-	struct ipath_cmd cmd;
-	ssize_t ret = 0;
-	void *dest;
-
-	if (count < sizeof(cmd.type)) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	ucmd = (const struct ipath_cmd __user *) data;
-
-	if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
-		ret = -EFAULT;
-		goto bail;
-	}
-
-	consumed = sizeof(cmd.type);
-
-	switch (cmd.type) {
-	case IPATH_CMD_ASSIGN_PORT:
-	case __IPATH_CMD_USER_INIT:
-	case IPATH_CMD_USER_INIT:
-		copy = sizeof(cmd.cmd.user_info);
-		dest = &cmd.cmd.user_info;
-		src = &ucmd->cmd.user_info;
-		break;
-	case IPATH_CMD_RECV_CTRL:
-		copy = sizeof(cmd.cmd.recv_ctrl);
-		dest = &cmd.cmd.recv_ctrl;
-		src = &ucmd->cmd.recv_ctrl;
-		break;
-	case IPATH_CMD_PORT_INFO:
-		copy = sizeof(cmd.cmd.port_info);
-		dest = &cmd.cmd.port_info;
-		src = &ucmd->cmd.port_info;
-		break;
-	case IPATH_CMD_TID_UPDATE:
-	case IPATH_CMD_TID_FREE:
-		copy = sizeof(cmd.cmd.tid_info);
-		dest = &cmd.cmd.tid_info;
-		src = &ucmd->cmd.tid_info;
-		break;
-	case IPATH_CMD_SET_PART_KEY:
-		copy = sizeof(cmd.cmd.part_key);
-		dest = &cmd.cmd.part_key;
-		src = &ucmd->cmd.part_key;
-		break;
-	case __IPATH_CMD_SLAVE_INFO:
-		copy = sizeof(cmd.cmd.slave_mask_addr);
-		dest = &cmd.cmd.slave_mask_addr;
-		src = &ucmd->cmd.slave_mask_addr;
-		break;
-	case IPATH_CMD_PIOAVAILUPD:	// force an update of PIOAvail reg
-		copy = 0;
-		src = NULL;
-		dest = NULL;
-		break;
-	case IPATH_CMD_POLL_TYPE:
-		copy = sizeof(cmd.cmd.poll_type);
-		dest = &cmd.cmd.poll_type;
-		src = &ucmd->cmd.poll_type;
-		break;
-	case IPATH_CMD_ARMLAUNCH_CTRL:
-		copy = sizeof(cmd.cmd.armlaunch_ctrl);
-		dest = &cmd.cmd.armlaunch_ctrl;
-		src = &ucmd->cmd.armlaunch_ctrl;
-		break;
-	case IPATH_CMD_SDMA_INFLIGHT:
-		copy = sizeof(cmd.cmd.sdma_inflight);
-		dest = &cmd.cmd.sdma_inflight;
-		src = &ucmd->cmd.sdma_inflight;
-		break;
-	case IPATH_CMD_SDMA_COMPLETE:
-		copy = sizeof(cmd.cmd.sdma_complete);
-		dest = &cmd.cmd.sdma_complete;
-		src = &ucmd->cmd.sdma_complete;
-		break;
-	default:
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	if (copy) {
-		if ((count - consumed) < copy) {
-			ret = -EINVAL;
-			goto bail;
-		}
-
-		if (copy_from_user(dest, src, copy)) {
-			ret = -EFAULT;
-			goto bail;
-		}
-
-		consumed += copy;
-	}
-
-	pd = port_fp(fp);
-	if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
-		cmd.type != IPATH_CMD_ASSIGN_PORT) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	switch (cmd.type) {
-	case IPATH_CMD_ASSIGN_PORT:
-		ret = ipath_assign_port(fp, &cmd.cmd.user_info);
-		if (ret)
-			goto bail;
-		break;
-	case __IPATH_CMD_USER_INIT:
-		/* backwards compatibility, get port first */
-		ret = ipath_assign_port(fp, &cmd.cmd.user_info);
-		if (ret)
-			goto bail;
-		/* and fall through to current version. */
-	case IPATH_CMD_USER_INIT:
-		ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
-		if (ret)
-			goto bail;
-		ret = ipath_get_base_info(
-			fp, (void __user *) (unsigned long)
-			cmd.cmd.user_info.spu_base_info,
-			cmd.cmd.user_info.spu_base_info_size);
-		break;
-	case IPATH_CMD_RECV_CTRL:
-		ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
-		break;
-	case IPATH_CMD_PORT_INFO:
-		ret = ipath_port_info(pd, subport_fp(fp),
-				      (struct ipath_port_info __user *)
-				      (unsigned long) cmd.cmd.port_info);
-		break;
-	case IPATH_CMD_TID_UPDATE:
-		ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
-		break;
-	case IPATH_CMD_TID_FREE:
-		ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
-		break;
-	case IPATH_CMD_SET_PART_KEY:
-		ret = ipath_set_part_key(pd, cmd.cmd.part_key);
-		break;
-	case __IPATH_CMD_SLAVE_INFO:
-		ret = ipath_get_slave_info(pd,
-					   (void __user *) (unsigned long)
-					   cmd.cmd.slave_mask_addr);
-		break;
-	case IPATH_CMD_PIOAVAILUPD:
-		ipath_force_pio_avail_update(pd->port_dd);
-		break;
-	case IPATH_CMD_POLL_TYPE:
-		pd->poll_type = cmd.cmd.poll_type;
-		break;
-	case IPATH_CMD_ARMLAUNCH_CTRL:
-		if (cmd.cmd.armlaunch_ctrl)
-			ipath_enable_armlaunch(pd->port_dd);
-		else
-			ipath_disable_armlaunch(pd->port_dd);
-		break;
-	case IPATH_CMD_SDMA_INFLIGHT:
-		ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
-					      (u32 __user *) (unsigned long)
-					      cmd.cmd.sdma_inflight);
-		break;
-	case IPATH_CMD_SDMA_COMPLETE:
-		ret = ipath_sdma_get_complete(pd->port_dd,
-					      user_sdma_queue_fp(fp),
-					      (u32 __user *) (unsigned long)
-					      cmd.cmd.sdma_complete);
-		break;
-	}
-
-	if (ret >= 0)
-		ret = consumed;
-
-bail:
-	return ret;
-}
-
-static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-	struct file *filp = iocb->ki_filp;
-	struct ipath_filedata *fp = filp->private_data;
-	struct ipath_portdata *pd = port_fp(filp);
-	struct ipath_user_sdma_queue *pq = fp->pq;
-
-	if (!iter_is_iovec(from) || !from->nr_segs)
-		return -EINVAL;
-
-	return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs);
-}
-
-static struct class *ipath_class;
-
-static int init_cdev(int minor, char *name, const struct file_operations *fops,
-		     struct cdev **cdevp, struct device **devp)
-{
-	const dev_t dev = MKDEV(IPATH_MAJOR, minor);
-	struct cdev *cdev = NULL;
-	struct device *device = NULL;
-	int ret;
-
-	cdev = cdev_alloc();
-	if (!cdev) {
-		printk(KERN_ERR IPATH_DRV_NAME
-		       ": Could not allocate cdev for minor %d, %s\n",
-		       minor, name);
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	cdev->owner = THIS_MODULE;
-	cdev->ops = fops;
-	kobject_set_name(&cdev->kobj, name);
-
-	ret = cdev_add(cdev, dev, 1);
-	if (ret < 0) {
-		printk(KERN_ERR IPATH_DRV_NAME
-		       ": Could not add cdev for minor %d, %s (err %d)\n",
-		       minor, name, -ret);
-		goto err_cdev;
-	}
-
-	device = device_create(ipath_class, NULL, dev, NULL, name);
-
-	if (IS_ERR(device)) {
-		ret = PTR_ERR(device);
-		printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
-		       "device for minor %d, %s (err %d)\n",
-		       minor, name, -ret);
-		goto err_cdev;
-	}
-
-	goto done;
-
-err_cdev:
-	cdev_del(cdev);
-	cdev = NULL;
-
-done:
-	if (ret >= 0) {
-		*cdevp = cdev;
-		*devp = device;
-	} else {
-		*cdevp = NULL;
-		*devp = NULL;
-	}
-
-	return ret;
-}
-
-int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
-		    struct cdev **cdevp, struct device **devp)
-{
-	return init_cdev(minor, name, fops, cdevp, devp);
-}
-
-static void cleanup_cdev(struct cdev **cdevp,
-			 struct device **devp)
-{
-	struct device *dev = *devp;
-
-	if (dev) {
-		device_unregister(dev);
-		*devp = NULL;
-	}
-
-	if (*cdevp) {
-		cdev_del(*cdevp);
-		*cdevp = NULL;
-	}
-}
-
-void ipath_cdev_cleanup(struct cdev **cdevp,
-			struct device **devp)
-{
-	cleanup_cdev(cdevp, devp);
-}
-
-static struct cdev *wildcard_cdev;
-static struct device *wildcard_dev;
-
-static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
-
-static int user_init(void)
-{
-	int ret;
-
-	ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
-	if (ret < 0) {
-		printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
-		       "chrdev region (err %d)\n", -ret);
-		goto done;
-	}
-
-	ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
-
-	if (IS_ERR(ipath_class)) {
-		ret = PTR_ERR(ipath_class);
-		printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
-		       "device class (err %d)\n", -ret);
-		goto bail;
-	}
-
-	goto done;
-bail:
-	unregister_chrdev_region(dev, IPATH_NMINORS);
-done:
-	return ret;
-}
-
-static void user_cleanup(void)
-{
-	if (ipath_class) {
-		class_destroy(ipath_class);
-		ipath_class = NULL;
-	}
-
-	unregister_chrdev_region(dev, IPATH_NMINORS);
-}
-
-static atomic_t user_count = ATOMIC_INIT(0);
-static atomic_t user_setup = ATOMIC_INIT(0);
-
-int ipath_user_add(struct ipath_devdata *dd)
-{
-	char name[10];
-	int ret;
-
-	if (atomic_inc_return(&user_count) == 1) {
-		ret = user_init();
-		if (ret < 0) {
-			ipath_dev_err(dd, "Unable to set up user support: "
-				      "error %d\n", -ret);
-			goto bail;
-		}
-		ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
-				&wildcard_dev);
-		if (ret < 0) {
-			ipath_dev_err(dd, "Could not create wildcard "
-				      "minor: error %d\n", -ret);
-			goto bail_user;
-		}
-
-		atomic_set(&user_setup, 1);
-	}
-
-	snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
-
-	ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
-			&dd->user_cdev, &dd->user_dev);
-	if (ret < 0)
-		ipath_dev_err(dd, "Could not create user minor %d, %s\n",
-			      dd->ipath_unit + 1, name);
-
-	goto bail;
-
-bail_user:
-	user_cleanup();
-bail:
-	return ret;
-}
-
-void ipath_user_remove(struct ipath_devdata *dd)
-{
-	cleanup_cdev(&dd->user_cdev, &dd->user_dev);
-
-	if (atomic_dec_return(&user_count) == 0) {
-		if (atomic_read(&user_setup) == 0)
-			goto bail;
-
-		cleanup_cdev(&wildcard_cdev, &wildcard_dev);
-		user_cleanup();
-
-		atomic_set(&user_setup, 0);
-	}
-bail:
-	return;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
deleted file mode 100644
index 25422a3..0000000
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/namei.h>
-#include <linux/slab.h>
-
-#include "ipath_kernel.h"
-
-#define IPATHFS_MAGIC 0x726a77
-
-static struct super_block *ipath_super;
-
-static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
-			 umode_t mode, const struct file_operations *fops,
-			 void *data)
-{
-	int error;
-	struct inode *inode = new_inode(dir->i_sb);
-
-	if (!inode) {
-		error = -EPERM;
-		goto bail;
-	}
-
-	inode->i_ino = get_next_ino();
-	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	inode->i_private = data;
-	if (S_ISDIR(mode)) {
-		inode->i_op = &simple_dir_inode_operations;
-		inc_nlink(inode);
-		inc_nlink(dir);
-	}
-
-	inode->i_fop = fops;
-
-	d_instantiate(dentry, inode);
-	error = 0;
-
-bail:
-	return error;
-}
-
-static int create_file(const char *name, umode_t mode,
-		       struct dentry *parent, struct dentry **dentry,
-		       const struct file_operations *fops, void *data)
-{
-	int error;
-
-	mutex_lock(&d_inode(parent)->i_mutex);
-	*dentry = lookup_one_len(name, parent, strlen(name));
-	if (!IS_ERR(*dentry))
-		error = ipathfs_mknod(d_inode(parent), *dentry,
-				      mode, fops, data);
-	else
-		error = PTR_ERR(*dentry);
-	mutex_unlock(&d_inode(parent)->i_mutex);
-
-	return error;
-}
-
-static ssize_t atomic_stats_read(struct file *file, char __user *buf,
-				 size_t count, loff_t *ppos)
-{
-	return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
-				       sizeof ipath_stats);
-}
-
-static const struct file_operations atomic_stats_ops = {
-	.read = atomic_stats_read,
-	.llseek = default_llseek,
-};
-
-static ssize_t atomic_counters_read(struct file *file, char __user *buf,
-				    size_t count, loff_t *ppos)
-{
-	struct infinipath_counters counters;
-	struct ipath_devdata *dd;
-
-	dd = file_inode(file)->i_private;
-	dd->ipath_f_read_counters(dd, &counters);
-
-	return simple_read_from_buffer(buf, count, ppos, &counters,
-				       sizeof counters);
-}
-
-static const struct file_operations atomic_counters_ops = {
-	.read = atomic_counters_read,
-	.llseek = default_llseek,
-};
-
-static ssize_t flash_read(struct file *file, char __user *buf,
-			  size_t count, loff_t *ppos)
-{
-	struct ipath_devdata *dd;
-	ssize_t ret;
-	loff_t pos;
-	char *tmp;
-
-	pos = *ppos;
-
-	if ( pos < 0) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	if (pos >= sizeof(struct ipath_flash)) {
-		ret = 0;
-		goto bail;
-	}
-
-	if (count > sizeof(struct ipath_flash) - pos)
-		count = sizeof(struct ipath_flash) - pos;
-
-	tmp = kmalloc(count, GFP_KERNEL);
-	if (!tmp) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	dd = file_inode(file)->i_private;
-	if (ipath_eeprom_read(dd, pos, tmp, count)) {
-		ipath_dev_err(dd, "failed to read from flash\n");
-		ret = -ENXIO;
-		goto bail_tmp;
-	}
-
-	if (copy_to_user(buf, tmp, count)) {
-		ret = -EFAULT;
-		goto bail_tmp;
-	}
-
-	*ppos = pos + count;
-	ret = count;
-
-bail_tmp:
-	kfree(tmp);
-
-bail:
-	return ret;
-}
-
-static ssize_t flash_write(struct file *file, const char __user *buf,
-			   size_t count, loff_t *ppos)
-{
-	struct ipath_devdata *dd;
-	ssize_t ret;
-	loff_t pos;
-	char *tmp;
-
-	pos = *ppos;
-
-	if (pos != 0) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	if (count != sizeof(struct ipath_flash)) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	tmp = kmalloc(count, GFP_KERNEL);
-	if (!tmp) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	if (copy_from_user(tmp, buf, count)) {
-		ret = -EFAULT;
-		goto bail_tmp;
-	}
-
-	dd = file_inode(file)->i_private;
-	if (ipath_eeprom_write(dd, pos, tmp, count)) {
-		ret = -ENXIO;
-		ipath_dev_err(dd, "failed to write to flash\n");
-		goto bail_tmp;
-	}
-
-	*ppos = pos + count;
-	ret = count;
-
-bail_tmp:
-	kfree(tmp);
-
-bail:
-	return ret;
-}
-
-static const struct file_operations flash_ops = {
-	.read = flash_read,
-	.write = flash_write,
-	.llseek = default_llseek,
-};
-
-static int create_device_files(struct super_block *sb,
-			       struct ipath_devdata *dd)
-{
-	struct dentry *dir, *tmp;
-	char unit[10];
-	int ret;
-
-	snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
-	ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
-			  &simple_dir_operations, dd);
-	if (ret) {
-		printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
-		goto bail;
-	}
-
-	ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
-			  &atomic_counters_ops, dd);
-	if (ret) {
-		printk(KERN_ERR "create_file(%s/atomic_counters) "
-		       "failed: %d\n", unit, ret);
-		goto bail;
-	}
-
-	ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
-			  &flash_ops, dd);
-	if (ret) {
-		printk(KERN_ERR "create_file(%s/flash) "
-		       "failed: %d\n", unit, ret);
-		goto bail;
-	}
-
-bail:
-	return ret;
-}
-
-static int remove_file(struct dentry *parent, char *name)
-{
-	struct dentry *tmp;
-	int ret;
-
-	tmp = lookup_one_len(name, parent, strlen(name));
-
-	if (IS_ERR(tmp)) {
-		ret = PTR_ERR(tmp);
-		goto bail;
-	}
-
-	spin_lock(&tmp->d_lock);
-	if (simple_positive(tmp)) {
-		dget_dlock(tmp);
-		__d_drop(tmp);
-		spin_unlock(&tmp->d_lock);
-		simple_unlink(d_inode(parent), tmp);
-	} else
-		spin_unlock(&tmp->d_lock);
-
-	ret = 0;
-bail:
-	/*
-	 * We don't expect clients to care about the return value, but
-	 * it's there if they need it.
-	 */
-	return ret;
-}
-
-static int remove_device_files(struct super_block *sb,
-			       struct ipath_devdata *dd)
-{
-	struct dentry *dir, *root;
-	char unit[10];
-	int ret;
-
-	root = dget(sb->s_root);
-	mutex_lock(&d_inode(root)->i_mutex);
-	snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
-	dir = lookup_one_len(unit, root, strlen(unit));
-
-	if (IS_ERR(dir)) {
-		ret = PTR_ERR(dir);
-		printk(KERN_ERR "Lookup of %s failed\n", unit);
-		goto bail;
-	}
-
-	remove_file(dir, "flash");
-	remove_file(dir, "atomic_counters");
-	d_delete(dir);
-	ret = simple_rmdir(d_inode(root), dir);
-
-bail:
-	mutex_unlock(&d_inode(root)->i_mutex);
-	dput(root);
-	return ret;
-}
-
-static int ipathfs_fill_super(struct super_block *sb, void *data,
-			      int silent)
-{
-	struct ipath_devdata *dd, *tmp;
-	unsigned long flags;
-	int ret;
-
-	static struct tree_descr files[] = {
-		[2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
-		{""},
-	};
-
-	ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
-	if (ret) {
-		printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
-		goto bail;
-	}
-
-	spin_lock_irqsave(&ipath_devs_lock, flags);
-
-	list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
-		spin_unlock_irqrestore(&ipath_devs_lock, flags);
-		ret = create_device_files(sb, dd);
-		if (ret)
-			goto bail;
-		spin_lock_irqsave(&ipath_devs_lock, flags);
-	}
-
-	spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-bail:
-	return ret;
-}
-
-static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
-			int flags, const char *dev_name, void *data)
-{
-	struct dentry *ret;
-	ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
-	if (!IS_ERR(ret))
-		ipath_super = ret->d_sb;
-	return ret;
-}
-
-static void ipathfs_kill_super(struct super_block *s)
-{
-	kill_litter_super(s);
-	ipath_super = NULL;
-}
-
-int ipathfs_add_device(struct ipath_devdata *dd)
-{
-	int ret;
-
-	if (ipath_super == NULL) {
-		ret = 0;
-		goto bail;
-	}
-
-	ret = create_device_files(ipath_super, dd);
-
-bail:
-	return ret;
-}
-
-int ipathfs_remove_device(struct ipath_devdata *dd)
-{
-	int ret;
-
-	if (ipath_super == NULL) {
-		ret = 0;
-		goto bail;
-	}
-
-	ret = remove_device_files(ipath_super, dd);
-
-bail:
-	return ret;
-}
-
-static struct file_system_type ipathfs_fs_type = {
-	.owner =	THIS_MODULE,
-	.name =		"ipathfs",
-	.mount =	ipathfs_mount,
-	.kill_sb =	ipathfs_kill_super,
-};
-MODULE_ALIAS_FS("ipathfs");
-
-int __init ipath_init_ipathfs(void)
-{
-	return register_filesystem(&ipathfs_fs_type);
-}
-
-void __exit ipath_exit_ipathfs(void)
-{
-	unregister_filesystem(&ipathfs_fs_type);
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c
deleted file mode 100644
index 7cc3054..0000000
--- a/drivers/infiniband/hw/ipath/ipath_iba6110.c
+++ /dev/null
@@ -1,1940 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file contains all of the code that is specific to the InfiniPath
- * HT chip.
- */
-
-#include <linux/vmalloc.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/htirq.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_kernel.h"
-#include "ipath_registers.h"
-
-static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
-
-
-/*
- * This lists the InfiniPath registers, in the actual chip layout.
- * This structure should never be directly accessed.
- *
- * The names are in InterCap form because they're taken straight from
- * the chip specification.  Since they're only used in this file, they
- * don't pollute the rest of the source.
-*/
-
-struct _infinipath_do_not_use_kernel_regs {
-	unsigned long long Revision;
-	unsigned long long Control;
-	unsigned long long PageAlign;
-	unsigned long long PortCnt;
-	unsigned long long DebugPortSelect;
-	unsigned long long DebugPort;
-	unsigned long long SendRegBase;
-	unsigned long long UserRegBase;
-	unsigned long long CounterRegBase;
-	unsigned long long Scratch;
-	unsigned long long ReservedMisc1;
-	unsigned long long InterruptConfig;
-	unsigned long long IntBlocked;
-	unsigned long long IntMask;
-	unsigned long long IntStatus;
-	unsigned long long IntClear;
-	unsigned long long ErrorMask;
-	unsigned long long ErrorStatus;
-	unsigned long long ErrorClear;
-	unsigned long long HwErrMask;
-	unsigned long long HwErrStatus;
-	unsigned long long HwErrClear;
-	unsigned long long HwDiagCtrl;
-	unsigned long long MDIO;
-	unsigned long long IBCStatus;
-	unsigned long long IBCCtrl;
-	unsigned long long ExtStatus;
-	unsigned long long ExtCtrl;
-	unsigned long long GPIOOut;
-	unsigned long long GPIOMask;
-	unsigned long long GPIOStatus;
-	unsigned long long GPIOClear;
-	unsigned long long RcvCtrl;
-	unsigned long long RcvBTHQP;
-	unsigned long long RcvHdrSize;
-	unsigned long long RcvHdrCnt;
-	unsigned long long RcvHdrEntSize;
-	unsigned long long RcvTIDBase;
-	unsigned long long RcvTIDCnt;
-	unsigned long long RcvEgrBase;
-	unsigned long long RcvEgrCnt;
-	unsigned long long RcvBufBase;
-	unsigned long long RcvBufSize;
-	unsigned long long RxIntMemBase;
-	unsigned long long RxIntMemSize;
-	unsigned long long RcvPartitionKey;
-	unsigned long long ReservedRcv[10];
-	unsigned long long SendCtrl;
-	unsigned long long SendPIOBufBase;
-	unsigned long long SendPIOSize;
-	unsigned long long SendPIOBufCnt;
-	unsigned long long SendPIOAvailAddr;
-	unsigned long long TxIntMemBase;
-	unsigned long long TxIntMemSize;
-	unsigned long long ReservedSend[9];
-	unsigned long long SendBufferError;
-	unsigned long long SendBufferErrorCONT1;
-	unsigned long long SendBufferErrorCONT2;
-	unsigned long long SendBufferErrorCONT3;
-	unsigned long long ReservedSBE[4];
-	unsigned long long RcvHdrAddr0;
-	unsigned long long RcvHdrAddr1;
-	unsigned long long RcvHdrAddr2;
-	unsigned long long RcvHdrAddr3;
-	unsigned long long RcvHdrAddr4;
-	unsigned long long RcvHdrAddr5;
-	unsigned long long RcvHdrAddr6;
-	unsigned long long RcvHdrAddr7;
-	unsigned long long RcvHdrAddr8;
-	unsigned long long ReservedRHA[7];
-	unsigned long long RcvHdrTailAddr0;
-	unsigned long long RcvHdrTailAddr1;
-	unsigned long long RcvHdrTailAddr2;
-	unsigned long long RcvHdrTailAddr3;
-	unsigned long long RcvHdrTailAddr4;
-	unsigned long long RcvHdrTailAddr5;
-	unsigned long long RcvHdrTailAddr6;
-	unsigned long long RcvHdrTailAddr7;
-	unsigned long long RcvHdrTailAddr8;
-	unsigned long long ReservedRHTA[7];
-	unsigned long long Sync;	/* Software only */
-	unsigned long long Dump;	/* Software only */
-	unsigned long long SimVer;	/* Software only */
-	unsigned long long ReservedSW[5];
-	unsigned long long SerdesConfig0;
-	unsigned long long SerdesConfig1;
-	unsigned long long SerdesStatus;
-	unsigned long long XGXSConfig;
-	unsigned long long ReservedSW2[4];
-};
-
-struct _infinipath_do_not_use_counters {
-	__u64 LBIntCnt;
-	__u64 LBFlowStallCnt;
-	__u64 Reserved1;
-	__u64 TxUnsupVLErrCnt;
-	__u64 TxDataPktCnt;
-	__u64 TxFlowPktCnt;
-	__u64 TxDwordCnt;
-	__u64 TxLenErrCnt;
-	__u64 TxMaxMinLenErrCnt;
-	__u64 TxUnderrunCnt;
-	__u64 TxFlowStallCnt;
-	__u64 TxDroppedPktCnt;
-	__u64 RxDroppedPktCnt;
-	__u64 RxDataPktCnt;
-	__u64 RxFlowPktCnt;
-	__u64 RxDwordCnt;
-	__u64 RxLenErrCnt;
-	__u64 RxMaxMinLenErrCnt;
-	__u64 RxICRCErrCnt;
-	__u64 RxVCRCErrCnt;
-	__u64 RxFlowCtrlErrCnt;
-	__u64 RxBadFormatCnt;
-	__u64 RxLinkProblemCnt;
-	__u64 RxEBPCnt;
-	__u64 RxLPCRCErrCnt;
-	__u64 RxBufOvflCnt;
-	__u64 RxTIDFullErrCnt;
-	__u64 RxTIDValidErrCnt;
-	__u64 RxPKeyMismatchCnt;
-	__u64 RxP0HdrEgrOvflCnt;
-	__u64 RxP1HdrEgrOvflCnt;
-	__u64 RxP2HdrEgrOvflCnt;
-	__u64 RxP3HdrEgrOvflCnt;
-	__u64 RxP4HdrEgrOvflCnt;
-	__u64 RxP5HdrEgrOvflCnt;
-	__u64 RxP6HdrEgrOvflCnt;
-	__u64 RxP7HdrEgrOvflCnt;
-	__u64 RxP8HdrEgrOvflCnt;
-	__u64 Reserved6;
-	__u64 Reserved7;
-	__u64 IBStatusChangeCnt;
-	__u64 IBLinkErrRecoveryCnt;
-	__u64 IBLinkDownedCnt;
-	__u64 IBSymbolErrCnt;
-};
-
-#define IPATH_KREG_OFFSET(field) (offsetof( \
-	struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
-#define IPATH_CREG_OFFSET(field) (offsetof( \
-	struct _infinipath_do_not_use_counters, field) / sizeof(u64))
-
-static const struct ipath_kregs ipath_ht_kregs = {
-	.kr_control = IPATH_KREG_OFFSET(Control),
-	.kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
-	.kr_debugport = IPATH_KREG_OFFSET(DebugPort),
-	.kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
-	.kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
-	.kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
-	.kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
-	.kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
-	.kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
-	.kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
-	.kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
-	.kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
-	.kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
-	.kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
-	.kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
-	.kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
-	.kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
-	.kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
-	.kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
-	.kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
-	.kr_intclear = IPATH_KREG_OFFSET(IntClear),
-	.kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
-	.kr_intmask = IPATH_KREG_OFFSET(IntMask),
-	.kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
-	.kr_mdio = IPATH_KREG_OFFSET(MDIO),
-	.kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
-	.kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
-	.kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
-	.kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
-	.kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
-	.kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
-	.kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
-	.kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
-	.kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
-	.kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
-	.kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
-	.kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
-	.kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
-	.kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
-	.kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
-	.kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
-	.kr_revision = IPATH_KREG_OFFSET(Revision),
-	.kr_scratch = IPATH_KREG_OFFSET(Scratch),
-	.kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
-	.kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
-	.kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
-	.kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
-	.kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
-	.kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
-	.kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
-	.kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
-	.kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
-	.kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
-	.kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
-	.kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
-	.kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
-	.kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
-	/*
-	 * These should not be used directly via ipath_write_kreg64(),
-	 * use them with ipath_write_kreg64_port(),
-	 */
-	.kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
-	.kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
-};
-
-static const struct ipath_cregs ipath_ht_cregs = {
-	.cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
-	.cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
-	.cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
-	.cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
-	.cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
-	.cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
-	.cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
-	.cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
-	.cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
-	.cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
-	.cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
-	.cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
-	/* calc from Reg_CounterRegBase + offset */
-	.cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
-	.cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
-	.cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
-	.cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
-	.cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
-	.cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
-	.cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
-	.cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
-	.cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
-	.cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
-	.cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
-	.cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
-	.cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
-	.cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
-	.cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
-	.cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
-	.cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
-	.cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
-	.cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
-	.cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
-	.cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
-};
-
-/* kr_intstatus, kr_intclear, kr_intmask bits */
-#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
-#define INFINIPATH_I_RCVURG_SHIFT 0
-#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
-#define INFINIPATH_I_RCVAVAIL_SHIFT 12
-
-/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
-#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
-#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
-#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR   0x0000000000800000ULL
-#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR   0x0000000001000000ULL
-#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR   0x0000000002000000ULL
-#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR   0x0000000004000000ULL
-#define INFINIPATH_HWE_HTCMISCERR4          0x0000000008000000ULL
-#define INFINIPATH_HWE_HTCMISCERR5          0x0000000010000000ULL
-#define INFINIPATH_HWE_HTCMISCERR6          0x0000000020000000ULL
-#define INFINIPATH_HWE_HTCMISCERR7          0x0000000040000000ULL
-#define INFINIPATH_HWE_HTCBUSTREQPARITYERR  0x0000000080000000ULL
-#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
-#define INFINIPATH_HWE_HTCBUSIREQPARITYERR  0x0000000200000000ULL
-#define INFINIPATH_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
-#define INFINIPATH_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
-#define INFINIPATH_HWE_HTBPLL_FBSLIP        0x0200000000000000ULL
-#define INFINIPATH_HWE_HTBPLL_RFSLIP        0x0400000000000000ULL
-#define INFINIPATH_HWE_HTAPLL_FBSLIP        0x0800000000000000ULL
-#define INFINIPATH_HWE_HTAPLL_RFSLIP        0x1000000000000000ULL
-#define INFINIPATH_HWE_SERDESPLLFAILED      0x2000000000000000ULL
-
-#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
-#define IBA6110_IBCS_LINKSTATE_SHIFT 4
-
-/* kr_extstatus bits */
-#define INFINIPATH_EXTS_FREQSEL 0x2
-#define INFINIPATH_EXTS_SERDESSEL 0x4
-#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
-#define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000
-
-
-/* TID entries (memory), HT-only */
-#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL	/* 40 bits valid */
-#define INFINIPATH_RT_VALID 0x8000000000000000ULL
-#define INFINIPATH_RT_ADDR_SHIFT 0
-#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
-#define INFINIPATH_RT_BUFSIZE_SHIFT 48
-
-#define INFINIPATH_R_INTRAVAIL_SHIFT 16
-#define INFINIPATH_R_TAILUPD_SHIFT 31
-
-/* kr_xgxsconfig bits */
-#define INFINIPATH_XGXS_RESET          0x7ULL
-
-/*
- * masks and bits that are different in different chips, or present only
- * in one
- */
-static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
-    INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
-static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
-    INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
-
-static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
-    INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
-    INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
-    INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
-    INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
-
-#define _IPATH_GPIO_SDA_NUM 1
-#define _IPATH_GPIO_SCL_NUM 0
-
-#define IPATH_GPIO_SDA \
-	(1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
-#define IPATH_GPIO_SCL \
-	(1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
-
-/* keep the code below somewhat more readable; not used elsewhere */
-#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |	\
-				infinipath_hwe_htclnkabyte1crcerr)
-#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr |	\
-				infinipath_hwe_htclnkbbyte1crcerr)
-#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |	\
-				infinipath_hwe_htclnkbbyte0crcerr)
-#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr |	\
-				infinipath_hwe_htclnkbbyte1crcerr)
-
-static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
-			  char *msg, size_t msgl)
-{
-	char bitsmsg[64];
-	ipath_err_t crcbits = hwerrs &
-		(_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
-	/* don't check if 8bit HT */
-	if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
-		crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
-	/* don't check if 8bit HT */
-	if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
-		crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
-	/*
-	 * we'll want to ignore link errors on link that is
-	 * not in use, if any.  For now, complain about both
-	 */
-	if (crcbits) {
-		u16 ctrl0, ctrl1;
-		snprintf(bitsmsg, sizeof bitsmsg,
-			 "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
-			 !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
-			 "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
-				    ? "1 (B)" : "0+1 (A+B)"),
-			 !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
-			 : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
-			    "0+1"), (unsigned long long) crcbits);
-		strlcat(msg, bitsmsg, msgl);
-
-		/*
-		 * print extra info for debugging.  slave/primary
-		 * config word 4, 8 (link control 0, 1)
-		 */
-
-		if (pci_read_config_word(dd->pcidev,
-					 dd->ipath_ht_slave_off + 0x4,
-					 &ctrl0))
-			dev_info(&dd->pcidev->dev, "Couldn't read "
-				 "linkctrl0 of slave/primary "
-				 "config block\n");
-		else if (!(ctrl0 & 1 << 6))
-			/* not if EOC bit set */
-			ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
-				  ((ctrl0 >> 8) & 7) ? " CRC" : "",
-				  ((ctrl0 >> 4) & 1) ? "linkfail" :
-				  "");
-		if (pci_read_config_word(dd->pcidev,
-					 dd->ipath_ht_slave_off + 0x8,
-					 &ctrl1))
-			dev_info(&dd->pcidev->dev, "Couldn't read "
-				 "linkctrl1 of slave/primary "
-				 "config block\n");
-		else if (!(ctrl1 & 1 << 6))
-			/* not if EOC bit set */
-			ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
-				  ((ctrl1 >> 8) & 7) ? " CRC" : "",
-				  ((ctrl1 >> 4) & 1) ? "linkfail" :
-				  "");
-
-		/* disable until driver reloaded */
-		dd->ipath_hwerrmask &= ~crcbits;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-				 dd->ipath_hwerrmask);
-		ipath_dbg("HT crc errs: %s\n", msg);
-	} else
-		ipath_dbg("ignoring HT crc errors 0x%llx, "
-			  "not in use\n", (unsigned long long)
-			  (hwerrs & (_IPATH_HTLINK0_CRCBITS |
-				     _IPATH_HTLINK1_CRCBITS)));
-}
-
-/* 6110 specific hardware errors... */
-static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
-	INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
-	INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
-	INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
-	INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
-	INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
-	INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
-	INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
-	INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
-};
-
-#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
-		        INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
-		        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
-#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
-			  << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
-
-static void ipath_ht_txe_recover(struct ipath_devdata *dd)
-{
-	++ipath_stats.sps_txeparity;
-	dev_info(&dd->pcidev->dev,
-		"Recovering from TXE PIO parity error\n");
-}
-
-
-/**
- * ipath_ht_handle_hwerrors - display hardware errors.
- * @dd: the infinipath device
- * @msg: the output buffer
- * @msgl: the size of the output buffer
- *
- * Use same msg buffer as regular errors to avoid excessive stack
- * use.  Most hardware errors are catastrophic, but for right now,
- * we'll print them and continue.  We reuse the same message buffer as
- * ipath_handle_errors() to avoid excessive stack usage.
- */
-static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
-				     size_t msgl)
-{
-	ipath_err_t hwerrs;
-	u32 bits, ctrl;
-	int isfatal = 0;
-	char bitsmsg[64];
-	int log_idx;
-
-	hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
-
-	if (!hwerrs) {
-		ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
-		/*
-		 * better than printing cofusing messages
-		 * This seems to be related to clearing the crc error, or
-		 * the pll error during init.
-		 */
-		goto bail;
-	} else if (hwerrs == -1LL) {
-		ipath_dev_err(dd, "Read of hardware error status failed "
-			      "(all bits set); ignoring\n");
-		goto bail;
-	}
-	ipath_stats.sps_hwerrs++;
-
-	/* Always clear the error status register, except MEMBISTFAIL,
-	 * regardless of whether we continue or stop using the chip.
-	 * We want that set so we know it failed, even across driver reload.
-	 * We'll still ignore it in the hwerrmask.  We do this partly for
-	 * diagnostics, but also for support */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-			 hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
-
-	hwerrs &= dd->ipath_hwerrmask;
-
-	/* We log some errors to EEPROM, check if we have any of those. */
-	for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
-		if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
-			ipath_inc_eeprom_err(dd, log_idx, 1);
-
-	/*
-	 * make sure we get this much out, unless told to be quiet,
-	 * it's a parity error we may recover from,
-	 * or it's occurred within the last 5 seconds
-	 */
-	if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
-		RXE_EAGER_PARITY)) ||
-		(ipath_debug & __IPATH_VERBDBG))
-		dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
-			 "(cleared)\n", (unsigned long long) hwerrs);
-	dd->ipath_lasthwerror |= hwerrs;
-
-	if (hwerrs & ~dd->ipath_hwe_bitsextant)
-		ipath_dev_err(dd, "hwerror interrupt with unknown errors "
-			      "%llx set\n", (unsigned long long)
-			      (hwerrs & ~dd->ipath_hwe_bitsextant));
-
-	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
-	if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
-		/*
-		 * parity errors in send memory are recoverable,
-		 * just cancel the send (if indicated in * sendbuffererror),
-		 * count the occurrence, unfreeze (if no other handled
-		 * hardware error bits are set), and continue. They can
-		 * occur if a processor speculative read is done to the PIO
-		 * buffer while we are sending a packet, for example.
-		 */
-		if (hwerrs & TXE_PIO_PARITY) {
-			ipath_ht_txe_recover(dd);
-			hwerrs &= ~TXE_PIO_PARITY;
-		}
-
-		if (!hwerrs) {
-			ipath_dbg("Clearing freezemode on ignored or "
-				  "recovered hardware error\n");
-			ipath_clear_freeze(dd);
-		}
-	}
-
-	*msg = '\0';
-
-	/*
-	 * may someday want to decode into which bits are which
-	 * functional area for parity errors, etc.
-	 */
-	if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
-		      << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
-		bits = (u32) ((hwerrs >>
-			       INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
-			      INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
-		snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
-			 bits);
-		strlcat(msg, bitsmsg, msgl);
-	}
-
-	ipath_format_hwerrors(hwerrs,
-			      ipath_6110_hwerror_msgs,
-			      ARRAY_SIZE(ipath_6110_hwerror_msgs),
-			      msg, msgl);
-
-	if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
-		hwerr_crcbits(dd, hwerrs, msg, msgl);
-
-	if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
-		strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
-			msgl);
-		/* ignore from now on, so disable until driver reloaded */
-		dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-				 dd->ipath_hwerrmask);
-	}
-#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |	\
-			 INFINIPATH_HWE_COREPLL_RFSLIP |	\
-			 INFINIPATH_HWE_HTBPLL_FBSLIP |		\
-			 INFINIPATH_HWE_HTBPLL_RFSLIP |		\
-			 INFINIPATH_HWE_HTAPLL_FBSLIP |		\
-			 INFINIPATH_HWE_HTAPLL_RFSLIP)
-
-	if (hwerrs & _IPATH_PLL_FAIL) {
-		snprintf(bitsmsg, sizeof bitsmsg,
-			 "[PLL failed (%llx), InfiniPath hardware unusable]",
-			 (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
-		strlcat(msg, bitsmsg, msgl);
-		/* ignore from now on, so disable until driver reloaded */
-		dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-				 dd->ipath_hwerrmask);
-	}
-
-	if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
-		/*
-		 * If it occurs, it is left masked since the eternal
-		 * interface is unused
-		 */
-		dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-				 dd->ipath_hwerrmask);
-	}
-
-	if (hwerrs) {
-		/*
-		 * if any set that we aren't ignoring; only
-		 * make the complaint once, in case it's stuck
-		 * or recurring, and we get here multiple
-		 * times.
-		 * force link down, so switch knows, and
-		 * LEDs are turned off
-		 */
-		if (dd->ipath_flags & IPATH_INITTED) {
-			ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
-			ipath_setup_ht_setextled(dd,
-				INFINIPATH_IBCS_L_STATE_DOWN,
-				INFINIPATH_IBCS_LT_STATE_DISABLED);
-			ipath_dev_err(dd, "Fatal Hardware Error (freeze "
-					  "mode), no longer usable, SN %.16s\n",
-					  dd->ipath_serial);
-			isfatal = 1;
-		}
-		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-		/* mark as having had error */
-		*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
-		/*
-		 * mark as not usable, at a minimum until driver
-		 * is reloaded, probably until reboot, since no
-		 * other reset is possible.
-		 */
-		dd->ipath_flags &= ~IPATH_INITTED;
-	}
-	else
-		*msg = 0; /* recovered from all of them */
-	if (*msg)
-		ipath_dev_err(dd, "%s hardware error\n", msg);
-	if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
-		/*
-		 * for status file; if no trailing brace is copied,
-		 * we'll know it was truncated.
-		 */
-		snprintf(dd->ipath_freezemsg,
-			 dd->ipath_freezelen, "{%s}", msg);
-
-bail:;
-}
-
-/**
- * ipath_ht_boardname - fill in the board name
- * @dd: the infinipath device
- * @name: the output buffer
- * @namelen: the size of the output buffer
- *
- * fill in the board name, based on the board revision register
- */
-static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
-			      size_t namelen)
-{
-	char *n = NULL;
-	u8 boardrev = dd->ipath_boardrev;
-	int ret = 0;
-
-	switch (boardrev) {
-	case 5:
-		/*
-		 * original production board; two production levels, with
-		 * different serial number ranges.   See ipath_ht_early_init() for
-		 * case where we enable IPATH_GPIO_INTR for later serial # range.
-		 * Original 112* serial number is no longer supported.
-		 */
-		n = "InfiniPath_QHT7040";
-		break;
-	case 7:
-		/* small form factor production board */
-		n = "InfiniPath_QHT7140";
-		break;
-	default:		/* don't know, just print the number */
-		ipath_dev_err(dd, "Don't yet know about board "
-			      "with ID %u\n", boardrev);
-		snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
-			 boardrev);
-		break;
-	}
-	if (n)
-		snprintf(name, namelen, "%s", n);
-
-	if (ret) {
-		ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
-		goto bail;
-	}
-	if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
-		dd->ipath_minrev > 4)) {
-		/*
-		 * This version of the driver only supports Rev 3.2 - 3.4
-		 */
-		ipath_dev_err(dd,
-			      "Unsupported InfiniPath hardware revision %u.%u!\n",
-			      dd->ipath_majrev, dd->ipath_minrev);
-		ret = 1;
-		goto bail;
-	}
-	/*
-	 * pkt/word counters are 32 bit, and therefore wrap fast enough
-	 * that we snapshot them from a timer, and maintain 64 bit shadow
-	 * copies
-	 */
-	dd->ipath_flags |= IPATH_32BITCOUNTERS;
-	dd->ipath_flags |= IPATH_GPIO_INTR;
-	if (dd->ipath_lbus_speed != 800)
-		ipath_dev_err(dd,
-			      "Incorrectly configured for HT @ %uMHz\n",
-			      dd->ipath_lbus_speed);
-
-	/*
-	 * set here, not in ipath_init_*_funcs because we have to do
-	 * it after we can read chip registers.
-	 */
-	dd->ipath_ureg_align =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
-
-bail:
-	return ret;
-}
-
-static void ipath_check_htlink(struct ipath_devdata *dd)
-{
-	u8 linkerr, link_off, i;
-
-	for (i = 0; i < 2; i++) {
-		link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
-		if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
-			dev_info(&dd->pcidev->dev, "Couldn't read "
-				 "linkerror%d of HT slave/primary block\n",
-				 i);
-		else if (linkerr & 0xf0) {
-			ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
-				   "clearing\n", linkerr >> 4, i);
-			/*
-			 * writing the linkerr bits that are set should
-			 * clear them
-			 */
-			if (pci_write_config_byte(dd->pcidev, link_off,
-						  linkerr))
-				ipath_dbg("Failed write to clear HT "
-					  "linkerror%d\n", i);
-			if (pci_read_config_byte(dd->pcidev, link_off,
-						 &linkerr))
-				dev_info(&dd->pcidev->dev,
-					 "Couldn't reread linkerror%d of "
-					 "HT slave/primary block\n", i);
-			else if (linkerr & 0xf0)
-				dev_info(&dd->pcidev->dev,
-					 "HT linkerror%d bits 0x%x "
-					 "couldn't be cleared\n",
-					 i, linkerr >> 4);
-		}
-	}
-}
-
-static int ipath_setup_ht_reset(struct ipath_devdata *dd)
-{
-	ipath_dbg("No reset possible for this InfiniPath hardware\n");
-	return 0;
-}
-
-#define HT_INTR_DISC_CONFIG  0x80	/* HT interrupt and discovery cap */
-#define HT_INTR_REG_INDEX    2	/* intconfig requires indirect accesses */
-
-/*
- * Bits 13-15 of command==0 is slave/primary block.  Clear any HT CRC
- * errors.  We only bother to do this at load time, because it's OK if
- * it happened before we were loaded (first time after boot/reset),
- * but any time after that, it's fatal anyway.  Also need to not check
- * for upper byte errors if we are in 8 bit mode, so figure out
- * our width.  For now, at least, also complain if it's 8 bit.
- */
-static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
-			     int pos, u8 cap_type)
-{
-	u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
-	u16 linkctrl = 0;
-	int i;
-
-	dd->ipath_ht_slave_off = pos;
-	/* command word, master_host bit */
-	/* master host || slave */
-	if ((cap_type >> 2) & 1)
-		link_a_b_off = 4;
-	else
-		link_a_b_off = 0;
-	ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
-		   link_a_b_off ? 1 : 0,
-		   link_a_b_off ? 'B' : 'A');
-
-	link_a_b_off += pos;
-
-	/*
-	 * check both link control registers; clear both HT CRC sets if
-	 * necessary.
-	 */
-	for (i = 0; i < 2; i++) {
-		link_off = pos + i * 4 + 0x4;
-		if (pci_read_config_word(pdev, link_off, &linkctrl))
-			ipath_dev_err(dd, "Couldn't read HT link control%d "
-				      "register\n", i);
-		else if (linkctrl & (0xf << 8)) {
-			ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
-				   "bits %x\n", i, linkctrl & (0xf << 8));
-			/*
-			 * now write them back to clear the error.
-			 */
-			pci_write_config_word(pdev, link_off,
-					      linkctrl & (0xf << 8));
-		}
-	}
-
-	/*
-	 * As with HT CRC bits, same for protocol errors that might occur
-	 * during boot.
-	 */
-	for (i = 0; i < 2; i++) {
-		link_off = pos + i * 4 + 0xd;
-		if (pci_read_config_byte(pdev, link_off, &linkerr))
-			dev_info(&pdev->dev, "Couldn't read linkerror%d "
-				 "of HT slave/primary block\n", i);
-		else if (linkerr & 0xf0) {
-			ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
-				   "clearing\n", linkerr >> 4, i);
-			/*
-			 * writing the linkerr bits that are set will clear
-			 * them
-			 */
-			if (pci_write_config_byte
-			    (pdev, link_off, linkerr))
-				ipath_dbg("Failed write to clear HT "
-					  "linkerror%d\n", i);
-			if (pci_read_config_byte(pdev, link_off, &linkerr))
-				dev_info(&pdev->dev, "Couldn't reread "
-					 "linkerror%d of HT slave/primary "
-					 "block\n", i);
-			else if (linkerr & 0xf0)
-				dev_info(&pdev->dev, "HT linkerror%d bits "
-					 "0x%x couldn't be cleared\n",
-					 i, linkerr >> 4);
-		}
-	}
-
-	/*
-	 * this is just for our link to the host, not devices connected
-	 * through tunnel.
-	 */
-
-	if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
-		ipath_dev_err(dd, "Couldn't read HT link width "
-			      "config register\n");
-	else {
-		u32 width;
-		switch (linkwidth & 7) {
-		case 5:
-			width = 4;
-			break;
-		case 4:
-			width = 2;
-			break;
-		case 3:
-			width = 32;
-			break;
-		case 1:
-			width = 16;
-			break;
-		case 0:
-		default:	/* if wrong, assume 8 bit */
-			width = 8;
-			break;
-		}
-
-		dd->ipath_lbus_width = width;
-
-		if (linkwidth != 0x11) {
-			ipath_dev_err(dd, "Not configured for 16 bit HT "
-				      "(%x)\n", linkwidth);
-			if (!(linkwidth & 0xf)) {
-				ipath_dbg("Will ignore HT lane1 errors\n");
-				dd->ipath_flags |= IPATH_8BIT_IN_HT0;
-			}
-		}
-	}
-
-	/*
-	 * this is just for our link to the host, not devices connected
-	 * through tunnel.
-	 */
-	if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
-		ipath_dev_err(dd, "Couldn't read HT link frequency "
-			      "config register\n");
-	else {
-		u32 speed;
-		switch (linkwidth & 0xf) {
-		case 6:
-			speed = 1000;
-			break;
-		case 5:
-			speed = 800;
-			break;
-		case 4:
-			speed = 600;
-			break;
-		case 3:
-			speed = 500;
-			break;
-		case 2:
-			speed = 400;
-			break;
-		case 1:
-			speed = 300;
-			break;
-		default:
-			/*
-			 * assume reserved and vendor-specific are 200...
-			 */
-		case 0:
-			speed = 200;
-			break;
-		}
-		dd->ipath_lbus_speed = speed;
-	}
-
-	snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
-		"HyperTransport,%uMHz,x%u\n",
-		dd->ipath_lbus_speed,
-		dd->ipath_lbus_width);
-}
-
-static int ipath_ht_intconfig(struct ipath_devdata *dd)
-{
-	int ret;
-
-	if (dd->ipath_intconfig) {
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
-				 dd->ipath_intconfig);	/* interrupt address */
-		ret = 0;
-	} else {
-		ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
-			      "interrupt address\n");
-		ret = -EINVAL;
-	}
-
-	return ret;
-}
-
-static void ipath_ht_irq_update(struct pci_dev *dev, int irq,
-				struct ht_irq_msg *msg)
-{
-	struct ipath_devdata *dd = pci_get_drvdata(dev);
-	u64 prev_intconfig = dd->ipath_intconfig;
-
-	dd->ipath_intconfig = msg->address_lo;
-	dd->ipath_intconfig |= ((u64) msg->address_hi) << 32;
-
-	/*
-	 * If the previous value of dd->ipath_intconfig is zero, we're
-	 * getting configured for the first time, and must not program the
-	 * intconfig register here (it will be programmed later, when the
-	 * hardware is ready).  Otherwise, we should.
-	 */
-	if (prev_intconfig)
-		ipath_ht_intconfig(dd);
-}
-
-/**
- * ipath_setup_ht_config - setup the interruptconfig register
- * @dd: the infinipath device
- * @pdev: the PCI device
- *
- * setup the interruptconfig register from the HT config info.
- * Also clear CRC errors in HT linkcontrol, if necessary.
- * This is done only for the real hardware.  It is done before
- * chip address space is initted, so can't touch infinipath registers
- */
-static int ipath_setup_ht_config(struct ipath_devdata *dd,
-				 struct pci_dev *pdev)
-{
-	int pos, ret;
-
-	ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update);
-	if (ret < 0) {
-		ipath_dev_err(dd, "Couldn't create interrupt handler: "
-			      "err %d\n", ret);
-		goto bail;
-	}
-	dd->ipath_irq = ret;
-	ret = 0;
-
-	/*
-	 * Handle clearing CRC errors in linkctrl register if necessary.  We
-	 * do this early, before we ever enable errors or hardware errors,
-	 * mostly to avoid causing the chip to enter freeze mode.
-	 */
-	pos = pci_find_capability(pdev, PCI_CAP_ID_HT);
-	if (!pos) {
-		ipath_dev_err(dd, "Couldn't find HyperTransport "
-			      "capability; no interrupts\n");
-		ret = -ENODEV;
-		goto bail;
-	}
-	do {
-		u8 cap_type;
-
-		/*
-		 * The HT capability type byte is 3 bytes after the
-		 * capability byte.
-		 */
-		if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
-			dev_info(&pdev->dev, "Couldn't read config "
-				 "command @ %d\n", pos);
-			continue;
-		}
-		if (!(cap_type & 0xE0))
-			slave_or_pri_blk(dd, pdev, pos, cap_type);
-	} while ((pos = pci_find_next_capability(pdev, pos,
-						 PCI_CAP_ID_HT)));
-
-	dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff
- * @dd: the infinipath device
- *
- * Called during driver unload.
- * This is currently a nop for the HT chip, not for all chips
- */
-static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
-{
-}
-
-/**
- * ipath_setup_ht_setextled - set the state of the two external LEDs
- * @dd: the infinipath device
- * @lst: the L state
- * @ltst: the LT state
- *
- * Set the state of the two external LEDs, to indicate physical and
- * logical state of IB link.   For this chip (at least with recommended
- * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
- * (logical state)
- *
- * Note:  We try to match the Mellanox HCA LED behavior as best
- * we can.  Green indicates physical link state is OK (something is
- * plugged in, and we can train).
- * Amber indicates the link is logically up (ACTIVE).
- * Mellanox further blinks the amber LED to indicate data packet
- * activity, but we have no hardware support for that, so it would
- * require waking up every 10-20 msecs and checking the counters
- * on the chip, and then turning the LED off if appropriate.  That's
- * visible overhead, so not something we will do.
- *
- */
-static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
-				     u64 lst, u64 ltst)
-{
-	u64 extctl;
-	unsigned long flags = 0;
-
-	/* the diags use the LED to indicate diag info, so we leave
-	 * the external LED alone when the diags are running */
-	if (ipath_diag_inuse)
-		return;
-
-	/* Allow override of LED display for, e.g. Locating system in rack */
-	if (dd->ipath_led_override) {
-		ltst = (dd->ipath_led_override & IPATH_LED_PHYS)
-			? INFINIPATH_IBCS_LT_STATE_LINKUP
-			: INFINIPATH_IBCS_LT_STATE_DISABLED;
-		lst = (dd->ipath_led_override & IPATH_LED_LOG)
-			? INFINIPATH_IBCS_L_STATE_ACTIVE
-			: INFINIPATH_IBCS_L_STATE_DOWN;
-	}
-
-	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-	/*
-	 * start by setting both LED control bits to off, then turn
-	 * on the appropriate bit(s).
-	 */
-	if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
-		/*
-		 * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
-		 * is inverted,  because it is normally used to indicate
-		 * a hardware fault at reset, if there were errors
-		 */
-		extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
-			| INFINIPATH_EXTC_LEDGBLERR_OFF;
-		if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
-			extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
-		if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
-			extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
-	}
-	else {
-		extctl = dd->ipath_extctrl &
-			~(INFINIPATH_EXTC_LED1PRIPORT_ON |
-			  INFINIPATH_EXTC_LED2PRIPORT_ON);
-		if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
-			extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
-		if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
-			extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
-	}
-	dd->ipath_extctrl = extctl;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
-	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-}
-
-static void ipath_init_ht_variables(struct ipath_devdata *dd)
-{
-	/*
-	 * setup the register offsets, since they are different for each
-	 * chip
-	 */
-	dd->ipath_kregs = &ipath_ht_kregs;
-	dd->ipath_cregs = &ipath_ht_cregs;
-
-	dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
-	dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
-	dd->ipath_gpio_sda = IPATH_GPIO_SDA;
-	dd->ipath_gpio_scl = IPATH_GPIO_SCL;
-
-	/*
-	 * Fill in data for field-values that change in newer chips.
-	 * We dynamically specify only the mask for LINKTRAININGSTATE
-	 * and only the shift for LINKSTATE, as they are the only ones
-	 * that change.  Also precalculate the 3 link states of interest
-	 * and the combined mask.
-	 */
-	dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
-	dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
-	dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
-		dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
-	dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-		(INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
-	dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-		(INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
-	dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-		(INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
-
-	/*
-	 * Fill in data for ibcc field-values that change in newer chips.
-	 * We dynamically specify only the mask for LINKINITCMD
-	 * and only the shift for LINKCMD and MAXPKTLEN, as they are
-	 * the only ones that change.
-	 */
-	dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
-	dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
-	dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
-
-	/* Fill in shifts for RcvCtrl. */
-	dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
-	dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
-	dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
-	dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
-
-	dd->ipath_i_bitsextant =
-		(INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
-		(INFINIPATH_I_RCVAVAIL_MASK <<
-		 INFINIPATH_I_RCVAVAIL_SHIFT) |
-		INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
-		INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
-
-	dd->ipath_e_bitsextant =
-		INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
-		INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
-		INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
-		INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
-		INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
-		INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
-		INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-		INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
-		INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
-		INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
-		INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
-		INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
-		INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
-		INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
-		INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
-		INFINIPATH_E_HARDWARE;
-
-	dd->ipath_hwe_bitsextant =
-		(INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
-		 INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
-		(INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
-		 INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
-		(INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
-		 INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
-		INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
-		INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
-		INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
-		INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
-		INFINIPATH_HWE_HTCMISCERR4 |
-		INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
-		INFINIPATH_HWE_HTCMISCERR7 |
-		INFINIPATH_HWE_HTCBUSTREQPARITYERR |
-		INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
-		INFINIPATH_HWE_HTCBUSIREQPARITYERR |
-		INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
-		INFINIPATH_HWE_MEMBISTFAILED |
-		INFINIPATH_HWE_COREPLL_FBSLIP |
-		INFINIPATH_HWE_COREPLL_RFSLIP |
-		INFINIPATH_HWE_HTBPLL_FBSLIP |
-		INFINIPATH_HWE_HTBPLL_RFSLIP |
-		INFINIPATH_HWE_HTAPLL_FBSLIP |
-		INFINIPATH_HWE_HTAPLL_RFSLIP |
-		INFINIPATH_HWE_SERDESPLLFAILED |
-		INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
-		INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
-
-	dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
-	dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
-	dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
-	dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
-
-	/*
-	 * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
-	 * 2 is Some Misc, 3 is reserved for future.
-	 */
-	dd->ipath_eep_st_masks[0].hwerrs_to_log =
-		INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
-		INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
-
-	dd->ipath_eep_st_masks[1].hwerrs_to_log =
-		INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
-		INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
-
-	dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
-
-	dd->delay_mult = 2; /* SDR, 4X, can't change */
-
-	dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
-	dd->ipath_link_speed_supported = IPATH_IB_SDR;
-	dd->ipath_link_width_enabled = IB_WIDTH_4X;
-	dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
-	/* these can't change for this chip, so set once */
-	dd->ipath_link_width_active = dd->ipath_link_width_enabled;
-	dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
-}
-
-/**
- * ipath_ht_init_hwerrors - enable hardware errors
- * @dd: the infinipath device
- *
- * now that we have finished initializing everything that might reasonably
- * cause a hardware error, and cleared those errors bits as they occur,
- * we can enable hardware errors in the mask (potentially enabling
- * freeze mode), and enable hardware errors as errors (along with
- * everything else) in errormask
- */
-static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
-{
-	ipath_err_t val;
-	u64 extsval;
-
-	extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
-
-	if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
-		ipath_dev_err(dd, "MemBIST did not complete!\n");
-	if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
-		ipath_dbg("MemBIST corrected\n");
-
-	ipath_check_htlink(dd);
-
-	/* barring bugs, all hwerrors become interrupts, which can */
-	val = -1LL;
-	/* don't look at crc lane1 if 8 bit */
-	if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
-		val &= ~infinipath_hwe_htclnkabyte1crcerr;
-	/* don't look at crc lane1 if 8 bit */
-	if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
-		val &= ~infinipath_hwe_htclnkbbyte1crcerr;
-
-	/*
-	 * disable RXDSYNCMEMPARITY because external serdes is unused,
-	 * and therefore the logic will never be used or initialized,
-	 * and uninitialized state will normally result in this error
-	 * being asserted.  Similarly for the external serdess pll
-	 * lock signal.
-	 */
-	val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
-		 INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
-
-	/*
-	 * Disable MISCERR4 because of an inversion in the HT core
-	 * logic checking for errors that cause this bit to be set.
-	 * The errata can also cause the protocol error bit to be set
-	 * in the HT config space linkerror register(s).
-	 */
-	val &= ~INFINIPATH_HWE_HTCMISCERR4;
-
-	/*
-	 * PLL ignored because unused MDIO interface has a logic problem
-	 */
-	if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
-		val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
-	dd->ipath_hwerrmask = val;
-}
-
-
-
-
-/**
- * ipath_ht_bringup_serdes - bring up the serdes
- * @dd: the infinipath device
- */
-static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
-{
-	u64 val, config1;
-	int ret = 0, change = 0;
-
-	ipath_dbg("Trying to bringup serdes\n");
-
-	if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
-	    INFINIPATH_HWE_SERDESPLLFAILED)
-	{
-		ipath_dbg("At start, serdes PLL failed bit set in "
-			  "hwerrstatus, clearing and continuing\n");
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-				 INFINIPATH_HWE_SERDESPLLFAILED);
-	}
-
-	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-	config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
-
-	ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
-		   "config1=%llx, sstatus=%llx xgxs %llx\n",
-		   (unsigned long long) val, (unsigned long long) config1,
-		   (unsigned long long)
-		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
-		   (unsigned long long)
-		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
-
-	/* force reset on */
-	val |= INFINIPATH_SERDC0_RESET_PLL
-		/* | INFINIPATH_SERDC0_RESET_MASK */
-		;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
-	udelay(15);		/* need pll reset set at least for a bit */
-
-	if (val & INFINIPATH_SERDC0_RESET_PLL) {
-		u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
-		/* set lane resets, and tx idle, during pll reset */
-		val2 |= INFINIPATH_SERDC0_RESET_MASK |
-			INFINIPATH_SERDC0_TXIDLE;
-		ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
-			   "%llx)\n", (unsigned long long) val2);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
-				 val2);
-		/*
-		 * be sure chip saw it
-		 */
-		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-		/*
-		 * need pll reset clear at least 11 usec before lane
-		 * resets cleared; give it a few more
-		 */
-		udelay(15);
-		val = val2;	/* for check below */
-	}
-
-	if (val & (INFINIPATH_SERDC0_RESET_PLL |
-		   INFINIPATH_SERDC0_RESET_MASK |
-		   INFINIPATH_SERDC0_TXIDLE)) {
-		val &= ~(INFINIPATH_SERDC0_RESET_PLL |
-			 INFINIPATH_SERDC0_RESET_MASK |
-			 INFINIPATH_SERDC0_TXIDLE);
-		/* clear them */
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
-				 val);
-	}
-
-	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-	if (val & INFINIPATH_XGXS_RESET) {
-		/* normally true after boot */
-		val &= ~INFINIPATH_XGXS_RESET;
-		change = 1;
-	}
-	if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
-	     INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) {
-		/* need to compensate for Tx inversion in partner */
-		val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
-		         INFINIPATH_XGXS_RX_POL_SHIFT);
-		val |= dd->ipath_rx_pol_inv <<
-			INFINIPATH_XGXS_RX_POL_SHIFT;
-		change = 1;
-	}
-	if (change)
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-
-	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-
-	/* clear current and de-emphasis bits */
-	config1 &= ~0x0ffffffff00ULL;
-	/* set current to 20ma */
-	config1 |= 0x00000000000ULL;
-	/* set de-emphasis to -5.68dB */
-	config1 |= 0x0cccc000000ULL;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
-
-	ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
-		   "config1=%llx, sstatus=%llx xgxs %llx\n",
-		   (unsigned long long) val, (unsigned long long) config1,
-		   (unsigned long long)
-		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
-		   (unsigned long long)
-		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
-
-	return ret;		/* for now, say we always succeeded */
-}
-
-/**
- * ipath_ht_quiet_serdes - set serdes to txidle
- * @dd: the infinipath device
- * driver is being unloaded
- */
-static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
-{
-	u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-
-	val |= INFINIPATH_SERDC0_TXIDLE;
-	ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
-		  (unsigned long long) val);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
-}
-
-/**
- * ipath_pe_put_tid - write a TID in chip
- * @dd: the infinipath device
- * @tidptr: pointer to the expected TID (in chip) to update
- * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected
- * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
- *
- * This exists as a separate routine to allow for special locking etc.
- * It's used for both the full cleanup on exit, as well as the normal
- * setup and teardown.
- */
-static void ipath_ht_put_tid(struct ipath_devdata *dd,
-			     u64 __iomem *tidptr, u32 type,
-			     unsigned long pa)
-{
-	if (!dd->ipath_kregbase)
-		return;
-
-	if (pa != dd->ipath_tidinvalid) {
-		if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
-			dev_info(&dd->pcidev->dev,
-				 "physaddr %lx has more than "
-				 "40 bits, using only 40!!!\n", pa);
-			pa &= INFINIPATH_RT_ADDR_MASK;
-		}
-		if (type == RCVHQ_RCV_TYPE_EAGER)
-			pa |= dd->ipath_tidtemplate;
-		else {
-			/* in words (fixed, full page).  */
-			u64 lenvalid = PAGE_SIZE >> 2;
-			lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
-			pa |= lenvalid | INFINIPATH_RT_VALID;
-		}
-	}
-
-	writeq(pa, tidptr);
-}
-
-
-/**
- * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager
- * @dd: the infinipath device
- * @port: the port
- *
- * Used from ipath_close(), and at chip initialization.
- */
-static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
-{
-	u64 __iomem *tidbase;
-	int i;
-
-	if (!dd->ipath_kregbase)
-		return;
-
-	ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
-
-	/*
-	 * need to invalidate all of the expected TID entries for this
-	 * port, so we don't have valid entries that might somehow get
-	 * used (early in next use of this port, or through some bug)
-	 */
-	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-				   dd->ipath_rcvtidbase +
-				   port * dd->ipath_rcvtidcnt *
-				   sizeof(*tidbase));
-	for (i = 0; i < dd->ipath_rcvtidcnt; i++)
-		ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
-				 dd->ipath_tidinvalid);
-
-	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-				   dd->ipath_rcvegrbase +
-				   port * dd->ipath_rcvegrcnt *
-				   sizeof(*tidbase));
-
-	for (i = 0; i < dd->ipath_rcvegrcnt; i++)
-		ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
-				 dd->ipath_tidinvalid);
-}
-
-/**
- * ipath_ht_tidtemplate - setup constants for TID updates
- * @dd: the infinipath device
- *
- * We setup stuff that we use a lot, to avoid calculating each time
- */
-static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
-{
-	dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
-	dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
-	dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
-
-	/*
-	 * work around chip errata bug 7358, by marking invalid tids
-	 * as having max length
-	 */
-	dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
-		INFINIPATH_RT_BUFSIZE_SHIFT;
-}
-
-static int ipath_ht_early_init(struct ipath_devdata *dd)
-{
-	u32 __iomem *piobuf;
-	u32 pioincr, val32;
-	int i;
-
-	/*
-	 * one cache line; long IB headers will spill over into received
-	 * buffer
-	 */
-	dd->ipath_rcvhdrentsize = 16;
-	dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
-
-	/*
-	 * For HT, we allocate a somewhat overly large eager buffer,
-	 * such that we can guarantee that we can receive the largest
-	 * packet that we can send out.  To truly support a 4KB MTU,
-	 * we need to bump this to a large value.  To date, other than
-	 * testing, we have never encountered an HCA that can really
-	 * send 4KB MTU packets, so we do not handle that (we'll get
-	 * errors interrupts if we ever see one).
-	 */
-	dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
-
-	/*
-	 * the min() check here is currently a nop, but it may not
-	 * always be, depending on just how we do ipath_rcvegrbufsize
-	 */
-	dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
-				 dd->ipath_rcvegrbufsize);
-	dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
-	ipath_ht_tidtemplate(dd);
-
-	/*
-	 * zero all the TID entries at startup.  We do this for sanity,
-	 * in case of a previous driver crash of some kind, and also
-	 * because the chip powers up with these memories in an unknown
-	 * state.  Use portcnt, not cfgports, since this is for the
-	 * full chip, not for current (possibly different) configuration
-	 * value.
-	 * Chip Errata bug 6447
-	 */
-	for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
-		ipath_ht_clear_tids(dd, val32);
-
-	/*
-	 * write the pbc of each buffer, to be sure it's initialized, then
-	 * cancel all the buffers, and also abort any packets that might
-	 * have been in flight for some reason (the latter is for driver
-	 * unload/reload, but isn't a bad idea at first init).	PIO send
-	 * isn't enabled at this point, so there is no danger of sending
-	 * these out on the wire.
-	 * Chip Errata bug 6610
-	 */
-	piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
-				  dd->ipath_piobufbase);
-	pioincr = dd->ipath_palign / sizeof(*piobuf);
-	for (i = 0; i < dd->ipath_piobcnt2k; i++) {
-		/*
-		 * reasonable word count, just to init pbc
-		 */
-		writel(16, piobuf);
-		piobuf += pioincr;
-	}
-
-	ipath_get_eeprom_info(dd);
-	if (dd->ipath_boardrev == 5) {
-		/*
-		 * Later production QHT7040 has same changes as QHT7140, so
-		 * can use GPIO interrupts.  They have serial #'s starting
-		 * with 128, rather than 112.
-		 */
-		if (dd->ipath_serial[0] == '1' &&
-		    dd->ipath_serial[1] == '2' &&
-		    dd->ipath_serial[2] == '8')
-			dd->ipath_flags |= IPATH_GPIO_INTR;
-		else {
-			ipath_dev_err(dd, "Unsupported InfiniPath board "
-				"(serial number %.16s)!\n",
-				dd->ipath_serial);
-			return 1;
-		}
-	}
-
-	if (dd->ipath_minrev >= 4) {
-		/* Rev4+ reports extra errors via internal GPIO pins */
-		dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
-		dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-				 dd->ipath_gpio_mask);
-	}
-
-	return 0;
-}
-
-
-/**
- * ipath_init_ht_get_base_info - set chip-specific flags for user code
- * @dd: the infinipath device
- * @kbase: ipath_base_info pointer
- *
- * We set the PCIE flag because the lower bandwidth on PCIe vs
- * HyperTransport can affect some user packet algorithms.
- */
-static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
-{
-	struct ipath_base_info *kinfo = kbase;
-
-	kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
-		IPATH_RUNTIME_PIO_REGSWAPPED;
-
-	if (pd->port_dd->ipath_minrev < 4)
-		kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
-
-	return 0;
-}
-
-static void ipath_ht_free_irq(struct ipath_devdata *dd)
-{
-	free_irq(dd->ipath_irq, dd);
-	ht_destroy_irq(dd->ipath_irq);
-	dd->ipath_irq = 0;
-	dd->ipath_intconfig = 0;
-}
-
-static struct ipath_message_header *
-ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
-{
-	return (struct ipath_message_header *)
-		&rhf_addr[sizeof(u64) / sizeof(u32)];
-}
-
-static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
-{
-	dd->ipath_portcnt =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
-	dd->ipath_p0_rcvegrcnt =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-}
-
-static void ipath_ht_read_counters(struct ipath_devdata *dd,
-				   struct infinipath_counters *cntrs)
-{
-	cntrs->LBIntCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
-	cntrs->LBFlowStallCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
-	cntrs->TxSDmaDescCnt = 0;
-	cntrs->TxUnsupVLErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
-	cntrs->TxDataPktCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
-	cntrs->TxFlowPktCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
-	cntrs->TxDwordCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
-	cntrs->TxLenErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
-	cntrs->TxMaxMinLenErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
-	cntrs->TxUnderrunCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
-	cntrs->TxFlowStallCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
-	cntrs->TxDroppedPktCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
-	cntrs->RxDroppedPktCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
-	cntrs->RxDataPktCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
-	cntrs->RxFlowPktCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
-	cntrs->RxDwordCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
-	cntrs->RxLenErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
-	cntrs->RxMaxMinLenErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
-	cntrs->RxICRCErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
-	cntrs->RxVCRCErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
-	cntrs->RxFlowCtrlErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
-	cntrs->RxBadFormatCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
-	cntrs->RxLinkProblemCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
-	cntrs->RxEBPCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
-	cntrs->RxLPCRCErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
-	cntrs->RxBufOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
-	cntrs->RxTIDFullErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
-	cntrs->RxTIDValidErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
-	cntrs->RxPKeyMismatchCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
-	cntrs->RxP0HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
-	cntrs->RxP1HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
-	cntrs->RxP2HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
-	cntrs->RxP3HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
-	cntrs->RxP4HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
-	cntrs->RxP5HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
-	cntrs->RxP6HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
-	cntrs->RxP7HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
-	cntrs->RxP8HdrEgrOvflCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
-	cntrs->RxP9HdrEgrOvflCnt = 0;
-	cntrs->RxP10HdrEgrOvflCnt = 0;
-	cntrs->RxP11HdrEgrOvflCnt = 0;
-	cntrs->RxP12HdrEgrOvflCnt = 0;
-	cntrs->RxP13HdrEgrOvflCnt = 0;
-	cntrs->RxP14HdrEgrOvflCnt = 0;
-	cntrs->RxP15HdrEgrOvflCnt = 0;
-	cntrs->RxP16HdrEgrOvflCnt = 0;
-	cntrs->IBStatusChangeCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
-	cntrs->IBLinkErrRecoveryCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
-	cntrs->IBLinkDownedCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
-	cntrs->IBSymbolErrCnt =
-		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
-	cntrs->RxVL15DroppedPktCnt = 0;
-	cntrs->RxOtherLocalPhyErrCnt = 0;
-	cntrs->PcieRetryBufDiagQwordCnt = 0;
-	cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
-	cntrs->LocalLinkIntegrityErrCnt =
-		(dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
-		dd->ipath_lli_errs : dd->ipath_lli_errors;
-	cntrs->RxVlErrCnt = 0;
-	cntrs->RxDlidFltrCnt = 0;
-}
-
-
-/* no interrupt fallback for these chips */
-static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
-{
-	return 0;
-}
-
-
-/*
- * reset the XGXS (between serdes and IBC).  Slightly less intrusive
- * than resetting the IBC or external link state, and useful in some
- * cases to cause some retraining.  To do this right, we reset IBC
- * as well.
- */
-static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
-{
-	u64 val, prev_val;
-
-	prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-	val = prev_val | INFINIPATH_XGXS_RESET;
-	prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-			 dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-	ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-			 dd->ipath_control);
-}
-
-
-static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
-{
-	int ret;
-
-	switch (which) {
-	case IPATH_IB_CFG_LWID:
-		ret = dd->ipath_link_width_active;
-		break;
-	case IPATH_IB_CFG_SPD:
-		ret = dd->ipath_link_speed_active;
-		break;
-	case IPATH_IB_CFG_LWID_ENB:
-		ret = dd->ipath_link_width_enabled;
-		break;
-	case IPATH_IB_CFG_SPD_ENB:
-		ret = dd->ipath_link_speed_enabled;
-		break;
-	default:
-		ret =  -ENOTSUPP;
-		break;
-	}
-	return ret;
-}
-
-
-/* we assume range checking is already done, if needed */
-static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
-{
-	int ret = 0;
-
-	if (which == IPATH_IB_CFG_LWID_ENB)
-		dd->ipath_link_width_enabled = val;
-	else if (which == IPATH_IB_CFG_SPD_ENB)
-		dd->ipath_link_speed_enabled = val;
-	else
-		ret = -ENOTSUPP;
-	return ret;
-}
-
-
-static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
-{
-}
-
-
-static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
-{
-	ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
-		ipath_ib_linktrstate(dd, ibcs));
-	return 0;
-}
-
-
-/**
- * ipath_init_iba6110_funcs - set up the chip-specific function pointers
- * @dd: the infinipath device
- *
- * This is global, and is called directly at init to set up the
- * chip-specific function pointers for later use.
- */
-void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
-{
-	dd->ipath_f_intrsetup = ipath_ht_intconfig;
-	dd->ipath_f_bus = ipath_setup_ht_config;
-	dd->ipath_f_reset = ipath_setup_ht_reset;
-	dd->ipath_f_get_boardname = ipath_ht_boardname;
-	dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
-	dd->ipath_f_early_init = ipath_ht_early_init;
-	dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
-	dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
-	dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
-	dd->ipath_f_clear_tids = ipath_ht_clear_tids;
-	dd->ipath_f_put_tid = ipath_ht_put_tid;
-	dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
-	dd->ipath_f_setextled = ipath_setup_ht_setextled;
-	dd->ipath_f_get_base_info = ipath_ht_get_base_info;
-	dd->ipath_f_free_irq = ipath_ht_free_irq;
-	dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
-	dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
-	dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
-	dd->ipath_f_config_ports = ipath_ht_config_ports;
-	dd->ipath_f_read_counters = ipath_ht_read_counters;
-	dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
-	dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
-	dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
-	dd->ipath_f_config_jint = ipath_ht_config_jint;
-	dd->ipath_f_ib_updown = ipath_ht_ib_updown;
-
-	/*
-	 * initialize chip-specific variables
-	 */
-	ipath_init_ht_variables(dd);
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c
deleted file mode 100644
index be2a60e..0000000
--- a/drivers/infiniband/hw/ipath/ipath_init_chip.c
+++ /dev/null
@@ -1,1066 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/moduleparam.h>
-#include <linux/slab.h>
-#include <linux/stat.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-
-/*
- * min buffers we want to have per port, after driver
- */
-#define IPATH_MIN_USER_PORT_BUFCNT 7
-
-/*
- * Number of ports we are configured to use (to allow for more pio
- * buffers per port, etc.)  Zero means use chip value.
- */
-static ushort ipath_cfgports;
-
-module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
-MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
-
-/*
- * Number of buffers reserved for driver (verbs and layered drivers.)
- * Initialized based on number of PIO buffers if not set via module interface.
- * The problem with this is that it's global, but we'll use different
- * numbers for different chip types.
- */
-static ushort ipath_kpiobufs;
-
-static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
-
-module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort,
-		  &ipath_kpiobufs, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
-
-/**
- * create_port0_egr - allocate the eager TID buffers
- * @dd: the infinipath device
- *
- * This code is now quite different for user and kernel, because
- * the kernel uses skb's, for the accelerated network performance.
- * This is the kernel (port0) version.
- *
- * Allocate the eager TID buffers and program them into infinipath.
- * We use the network layer alloc_skb() allocator to allocate the
- * memory, and either use the buffers as is for things like verbs
- * packets, or pass the buffers up to the ipath layered driver and
- * thence the network layer, replacing them as we do so (see
- * ipath_rcv_layer()).
- */
-static int create_port0_egr(struct ipath_devdata *dd)
-{
-	unsigned e, egrcnt;
-	struct ipath_skbinfo *skbinfo;
-	int ret;
-
-	egrcnt = dd->ipath_p0_rcvegrcnt;
-
-	skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
-	if (skbinfo == NULL) {
-		ipath_dev_err(dd, "allocation error for eager TID "
-			      "skb array\n");
-		ret = -ENOMEM;
-		goto bail;
-	}
-	for (e = 0; e < egrcnt; e++) {
-		/*
-		 * This is a bit tricky in that we allocate extra
-		 * space for 2 bytes of the 14 byte ethernet header.
-		 * These two bytes are passed in the ipath header so
-		 * the rest of the data is word aligned.  We allocate
-		 * 4 bytes so that the data buffer stays word aligned.
-		 * See ipath_kreceive() for more details.
-		 */
-		skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
-		if (!skbinfo[e].skb) {
-			ipath_dev_err(dd, "SKB allocation error for "
-				      "eager TID %u\n", e);
-			while (e != 0)
-				dev_kfree_skb(skbinfo[--e].skb);
-			vfree(skbinfo);
-			ret = -ENOMEM;
-			goto bail;
-		}
-	}
-	/*
-	 * After loop above, so we can test non-NULL to see if ready
-	 * to use at receive, etc.
-	 */
-	dd->ipath_port0_skbinfo = skbinfo;
-
-	for (e = 0; e < egrcnt; e++) {
-		dd->ipath_port0_skbinfo[e].phys =
-		  ipath_map_single(dd->pcidev,
-				   dd->ipath_port0_skbinfo[e].skb->data,
-				   dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
-		dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
-				    ((char __iomem *) dd->ipath_kregbase +
-				     dd->ipath_rcvegrbase),
-				    RCVHQ_RCV_TYPE_EAGER,
-				    dd->ipath_port0_skbinfo[e].phys);
-	}
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-static int bringup_link(struct ipath_devdata *dd)
-{
-	u64 val, ibc;
-	int ret = 0;
-
-	/* hold IBC in reset */
-	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-			 dd->ipath_control);
-
-	/*
-	 * set initial max size pkt IBC will send, including ICRC; it's the
-	 * PIO buffer size in dwords, less 1; also see ipath_set_mtu()
-	 */
-	val = (dd->ipath_ibmaxlen >> 2) + 1;
-	ibc = val << dd->ibcc_mpl_shift;
-
-	/* flowcontrolwatermark is in units of KBytes */
-	ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
-	/*
-	 * How often flowctrl sent.  More or less in usecs; balance against
-	 * watermark value, so that in theory senders always get a flow
-	 * control update in time to not let the IB link go idle.
-	 */
-	ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
-	/* max error tolerance */
-	ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
-	/* use "real" buffer space for */
-	ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
-	/* IB credit flow control. */
-	ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
-	/* initially come up waiting for TS1, without sending anything. */
-	dd->ipath_ibcctrl = ibc;
-	/*
-	 * Want to start out with both LINKCMD and LINKINITCMD in NOP
-	 * (0 and 0).  Don't put linkinitcmd in ipath_ibcctrl, want that
-	 * to stay a NOP. Flag that we are disabled, for the (unlikely)
-	 * case that some recovery path is trying to bring the link up
-	 * before we are ready.
-	 */
-	ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
-		INFINIPATH_IBCC_LINKINITCMD_SHIFT;
-	dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
-	ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
-		   (unsigned long long) ibc);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
-
-	// be sure chip saw it
-	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-	ret = dd->ipath_f_bringup_serdes(dd);
-
-	if (ret)
-		dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
-			 "not usable\n");
-	else {
-		/* enable IBC */
-		dd->ipath_control |= INFINIPATH_C_LINKENABLE;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-				 dd->ipath_control);
-	}
-
-	return ret;
-}
-
-static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
-{
-	struct ipath_portdata *pd = NULL;
-
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (pd) {
-		pd->port_dd = dd;
-		pd->port_cnt = 1;
-		/* The port 0 pkey table is used by the layer interface. */
-		pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY;
-		pd->port_seq_cnt = 1;
-	}
-	return pd;
-}
-
-static int init_chip_first(struct ipath_devdata *dd)
-{
-	struct ipath_portdata *pd;
-	int ret = 0;
-	u64 val;
-
-	spin_lock_init(&dd->ipath_kernel_tid_lock);
-	spin_lock_init(&dd->ipath_user_tid_lock);
-	spin_lock_init(&dd->ipath_sendctrl_lock);
-	spin_lock_init(&dd->ipath_uctxt_lock);
-	spin_lock_init(&dd->ipath_sdma_lock);
-	spin_lock_init(&dd->ipath_gpio_lock);
-	spin_lock_init(&dd->ipath_eep_st_lock);
-	spin_lock_init(&dd->ipath_sdepb_lock);
-	mutex_init(&dd->ipath_eep_lock);
-
-	/*
-	 * skip cfgports stuff because we are not allocating memory,
-	 * and we don't want problems if the portcnt changed due to
-	 * cfgports.  We do still check and report a difference, if
-	 * not same (should be impossible).
-	 */
-	dd->ipath_f_config_ports(dd, ipath_cfgports);
-	if (!ipath_cfgports)
-		dd->ipath_cfgports = dd->ipath_portcnt;
-	else if (ipath_cfgports <= dd->ipath_portcnt) {
-		dd->ipath_cfgports = ipath_cfgports;
-		ipath_dbg("Configured to use %u ports out of %u in chip\n",
-			  dd->ipath_cfgports, ipath_read_kreg32(dd,
-			  dd->ipath_kregs->kr_portcnt));
-	} else {
-		dd->ipath_cfgports = dd->ipath_portcnt;
-		ipath_dbg("Tried to configured to use %u ports; chip "
-			  "only supports %u\n", ipath_cfgports,
-			  ipath_read_kreg32(dd,
-				  dd->ipath_kregs->kr_portcnt));
-	}
-	/*
-	 * Allocate full portcnt array, rather than just cfgports, because
-	 * cleanup iterates across all possible ports.
-	 */
-	dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt,
-			       GFP_KERNEL);
-
-	if (!dd->ipath_pd) {
-		ipath_dev_err(dd, "Unable to allocate portdata array, "
-			      "failing\n");
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	pd = create_portdata0(dd);
-	if (!pd) {
-		ipath_dev_err(dd, "Unable to allocate portdata for port "
-			      "0, failing\n");
-		ret = -ENOMEM;
-		goto done;
-	}
-	dd->ipath_pd[0] = pd;
-
-	dd->ipath_rcvtidcnt =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
-	dd->ipath_rcvtidbase =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
-	dd->ipath_rcvegrcnt =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-	dd->ipath_rcvegrbase =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
-	dd->ipath_palign =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
-	dd->ipath_piobufbase =
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
-	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
-	dd->ipath_piosize2k = val & ~0U;
-	dd->ipath_piosize4k = val >> 32;
-	if (dd->ipath_piosize4k == 0 && ipath_mtu4096)
-		ipath_mtu4096 = 0; /* 4KB not supported by this chip */
-	dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048;
-	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
-	dd->ipath_piobcnt2k = val & ~0U;
-	dd->ipath_piobcnt4k = val >> 32;
-	dd->ipath_pio2kbase =
-		(u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
-				 (dd->ipath_piobufbase & 0xffffffff));
-	if (dd->ipath_piobcnt4k) {
-		dd->ipath_pio4kbase = (u32 __iomem *)
-			(((char __iomem *) dd->ipath_kregbase) +
-			 (dd->ipath_piobufbase >> 32));
-		/*
-		 * 4K buffers take 2 pages; we use roundup just to be
-		 * paranoid; we calculate it once here, rather than on
-		 * ever buf allocate
-		 */
-		dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
-					  dd->ipath_palign);
-		ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
-			  "(%x aligned)\n",
-			  dd->ipath_piobcnt2k, dd->ipath_piosize2k,
-			  dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
-			  dd->ipath_piosize4k, dd->ipath_pio4kbase,
-			  dd->ipath_4kalign);
-	}
-	else ipath_dbg("%u 2k piobufs @ %p\n",
-		       dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
-
-done:
-	return ret;
-}
-
-/**
- * init_chip_reset - re-initialize after a reset, or enable
- * @dd: the infinipath device
- *
- * sanity check at least some of the values after reset, and
- * ensure no receive or transmit (explicitly, in case reset
- * failed
- */
-static int init_chip_reset(struct ipath_devdata *dd)
-{
-	u32 rtmp;
-	int i;
-	unsigned long flags;
-
-	/*
-	 * ensure chip does no sends or receives, tail updates, or
-	 * pioavail updates while we re-initialize
-	 */
-	dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift);
-	for (i = 0; i < dd->ipath_portcnt; i++) {
-		clear_bit(dd->ipath_r_portenable_shift + i,
-			  &dd->ipath_rcvctrl);
-		clear_bit(dd->ipath_r_intravail_shift + i,
-			  &dd->ipath_rcvctrl);
-	}
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-		dd->ipath_rcvctrl);
-
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	dd->ipath_sendctrl = 0U; /* no sdma, etc */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
-
-	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
-	if (rtmp != dd->ipath_rcvtidcnt)
-		dev_info(&dd->pcidev->dev, "tidcnt was %u before "
-			 "reset, now %u, using original\n",
-			 dd->ipath_rcvtidcnt, rtmp);
-	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
-	if (rtmp != dd->ipath_rcvtidbase)
-		dev_info(&dd->pcidev->dev, "tidbase was %u before "
-			 "reset, now %u, using original\n",
-			 dd->ipath_rcvtidbase, rtmp);
-	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-	if (rtmp != dd->ipath_rcvegrcnt)
-		dev_info(&dd->pcidev->dev, "egrcnt was %u before "
-			 "reset, now %u, using original\n",
-			 dd->ipath_rcvegrcnt, rtmp);
-	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
-	if (rtmp != dd->ipath_rcvegrbase)
-		dev_info(&dd->pcidev->dev, "egrbase was %u before "
-			 "reset, now %u, using original\n",
-			 dd->ipath_rcvegrbase, rtmp);
-
-	return 0;
-}
-
-static int init_pioavailregs(struct ipath_devdata *dd)
-{
-	int ret;
-
-	dd->ipath_pioavailregs_dma = dma_alloc_coherent(
-		&dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
-		GFP_KERNEL);
-	if (!dd->ipath_pioavailregs_dma) {
-		ipath_dev_err(dd, "failed to allocate PIOavail reg area "
-			      "in memory\n");
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	/*
-	 * we really want L2 cache aligned, but for current CPUs of
-	 * interest, they are the same.
-	 */
-	dd->ipath_statusp = (u64 *)
-		((char *)dd->ipath_pioavailregs_dma +
-		 ((2 * L1_CACHE_BYTES +
-		   dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
-	/* copy the current value now that it's really allocated */
-	*dd->ipath_statusp = dd->_ipath_status;
-	/*
-	 * setup buffer to hold freeze msg, accessible to apps,
-	 * following statusp
-	 */
-	dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
-	/* and its length */
-	dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
-
-	ret = 0;
-
-done:
-	return ret;
-}
-
-/**
- * init_shadow_tids - allocate the shadow TID array
- * @dd: the infinipath device
- *
- * allocate the shadow TID array, so we can ipath_munlock previous
- * entries.  It may make more sense to move the pageshadow to the
- * port data structure, so we only allocate memory for ports actually
- * in use, since we at 8k per port, now.
- */
-static void init_shadow_tids(struct ipath_devdata *dd)
-{
-	struct page **pages;
-	dma_addr_t *addrs;
-
-	pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
-			sizeof(struct page *));
-	if (!pages) {
-		ipath_dev_err(dd, "failed to allocate shadow page * "
-			      "array, no expected sends!\n");
-		dd->ipath_pageshadow = NULL;
-		return;
-	}
-
-	addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
-			sizeof(dma_addr_t));
-	if (!addrs) {
-		ipath_dev_err(dd, "failed to allocate shadow dma handle "
-			      "array, no expected sends!\n");
-		vfree(pages);
-		dd->ipath_pageshadow = NULL;
-		return;
-	}
-
-	dd->ipath_pageshadow = pages;
-	dd->ipath_physshadow = addrs;
-}
-
-static void enable_chip(struct ipath_devdata *dd, int reinit)
-{
-	u32 val;
-	u64 rcvmask;
-	unsigned long flags;
-	int i;
-
-	if (!reinit)
-		init_waitqueue_head(&ipath_state_wait);
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-			 dd->ipath_rcvctrl);
-
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	/* Enable PIO send, and update of PIOavail regs to memory. */
-	dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
-		INFINIPATH_S_PIOBUFAVAILUPD;
-
-	/*
-	 * Set the PIO avail update threshold to host memory
-	 * on chips that support it.
-	 */
-	if (dd->ipath_pioupd_thresh)
-		dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
-			<< INFINIPATH_S_UPDTHRESH_SHIFT;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-	/*
-	 * Enable kernel ports' receive and receive interrupt.
-	 * Other ports done as user opens and inits them.
-	 */
-	rcvmask = 1ULL;
-	dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) |
-		(rcvmask << dd->ipath_r_intravail_shift);
-	if (!(dd->ipath_flags & IPATH_NODMA_RTAIL))
-		dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift);
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-			 dd->ipath_rcvctrl);
-
-	/*
-	 * now ready for use.  this should be cleared whenever we
-	 * detect a reset, or initiate one.
-	 */
-	dd->ipath_flags |= IPATH_INITTED;
-
-	/*
-	 * Init our shadow copies of head from tail values,
-	 * and write head values to match.
-	 */
-	val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
-	ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
-
-	/* Initialize so we interrupt on next packet received */
-	ipath_write_ureg(dd, ur_rcvhdrhead,
-			 dd->ipath_rhdrhead_intr_off |
-			 dd->ipath_pd[0]->port_head, 0);
-
-	/*
-	 * by now pioavail updates to memory should have occurred, so
-	 * copy them into our working/shadow registers; this is in
-	 * case something went wrong with abort, but mostly to get the
-	 * initial values of the generation bit correct.
-	 */
-	for (i = 0; i < dd->ipath_pioavregs; i++) {
-		__le64 pioavail;
-
-		/*
-		 * Chip Errata bug 6641; even and odd qwords>3 are swapped.
-		 */
-		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
-			pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
-		else
-			pioavail = dd->ipath_pioavailregs_dma[i];
-		/*
-		 * don't need to worry about ipath_pioavailkernel here
-		 * because we will call ipath_chg_pioavailkernel() later
-		 * in initialization, to busy out buffers as needed
-		 */
-		dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
-	}
-	/* can get counters, stats, etc. */
-	dd->ipath_flags |= IPATH_PRESENT;
-}
-
-static int init_housekeeping(struct ipath_devdata *dd, int reinit)
-{
-	char boardn[40];
-	int ret = 0;
-
-	/*
-	 * have to clear shadow copies of registers at init that are
-	 * not otherwise set here, or all kinds of bizarre things
-	 * happen with driver on chip reset
-	 */
-	dd->ipath_rcvhdrsize = 0;
-
-	/*
-	 * Don't clear ipath_flags as 8bit mode was set before
-	 * entering this func. However, we do set the linkstate to
-	 * unknown, so we can watch for a transition.
-	 * PRESENT is set because we want register reads to work,
-	 * and the kernel infrastructure saw it in config space;
-	 * We clear it if we have failures.
-	 */
-	dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT;
-	dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
-			     IPATH_LINKDOWN | IPATH_LINKINIT);
-
-	ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
-	dd->ipath_revision =
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
-
-	/*
-	 * set up fundamental info we need to use the chip; we assume
-	 * if the revision reg and these regs are OK, we don't need to
-	 * special case the rest
-	 */
-	dd->ipath_sregbase =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
-	dd->ipath_cregbase =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
-	dd->ipath_uregbase =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
-	ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
-		   "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
-		   dd->ipath_uregbase, dd->ipath_cregbase);
-	if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
-	    || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
-	    || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
-	    || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
-		ipath_dev_err(dd, "Register read failures from chip, "
-			      "giving up initialization\n");
-		dd->ipath_flags &= ~IPATH_PRESENT;
-		ret = -ENODEV;
-		goto done;
-	}
-
-
-	/* clear diagctrl register, in case diags were running and crashed */
-	ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
-
-	/* clear the initial reset flag, in case first driver load */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-			 INFINIPATH_E_RESET);
-
-	ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n",
-		   (unsigned long long) dd->ipath_revision,
-		   dd->ipath_pcirev);
-
-	if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
-	     INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
-		ipath_dev_err(dd, "Driver only handles version %d, "
-			      "chip swversion is %d (%llx), failng\n",
-			      IPATH_CHIP_SWVERSION,
-			      (int)(dd->ipath_revision >>
-				    INFINIPATH_R_SOFTWARE_SHIFT) &
-			      INFINIPATH_R_SOFTWARE_MASK,
-			      (unsigned long long) dd->ipath_revision);
-		ret = -ENOSYS;
-		goto done;
-	}
-	dd->ipath_majrev = (u8) ((dd->ipath_revision >>
-				  INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
-				 INFINIPATH_R_CHIPREVMAJOR_MASK);
-	dd->ipath_minrev = (u8) ((dd->ipath_revision >>
-				  INFINIPATH_R_CHIPREVMINOR_SHIFT) &
-				 INFINIPATH_R_CHIPREVMINOR_MASK);
-	dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
-				    INFINIPATH_R_BOARDID_SHIFT) &
-				   INFINIPATH_R_BOARDID_MASK);
-
-	ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
-
-	snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
-		 "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
-		 "SW Compat %u\n",
-		 IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
-		 (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
-		 INFINIPATH_R_ARCH_MASK,
-		 dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
-		 (unsigned)(dd->ipath_revision >>
-			    INFINIPATH_R_SOFTWARE_SHIFT) &
-		 INFINIPATH_R_SOFTWARE_MASK);
-
-	ipath_dbg("%s", dd->ipath_boardversion);
-
-	if (ret)
-		goto done;
-
-	if (reinit)
-		ret = init_chip_reset(dd);
-	else
-		ret = init_chip_first(dd);
-
-done:
-	return ret;
-}
-
-static void verify_interrupt(unsigned long opaque)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-
-	if (!dd)
-		return; /* being torn down */
-
-	/*
-	 * If we don't have any interrupts, let the user know and
-	 * don't bother checking again.
-	 */
-	if (dd->ipath_int_counter == 0) {
-		if (!dd->ipath_f_intr_fallback(dd))
-			dev_err(&dd->pcidev->dev, "No interrupts detected, "
-				"not usable.\n");
-		else /* re-arm the timer to see if fallback works */
-			mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2);
-	} else
-		ipath_cdbg(VERBOSE, "%u interrupts at timer check\n",
-			dd->ipath_int_counter);
-}
-
-/**
- * ipath_init_chip - do the actual initialization sequence on the chip
- * @dd: the infinipath device
- * @reinit: reinitializing, so don't allocate new memory
- *
- * Do the actual initialization sequence on the chip.  This is done
- * both from the init routine called from the PCI infrastructure, and
- * when we reset the chip, or detect that it was reset internally,
- * or it's administratively re-enabled.
- *
- * Memory allocation here and in called routines is only done in
- * the first case (reinit == 0).  We have to be careful, because even
- * without memory allocation, we need to re-write all the chip registers
- * TIDs, etc. after the reset or enable has completed.
- */
-int ipath_init_chip(struct ipath_devdata *dd, int reinit)
-{
-	int ret = 0;
-	u32 kpiobufs, defkbufs;
-	u32 piobufs, uports;
-	u64 val;
-	struct ipath_portdata *pd;
-	gfp_t gfp_flags = GFP_USER | __GFP_COMP;
-
-	ret = init_housekeeping(dd, reinit);
-	if (ret)
-		goto done;
-
-	/*
-	 * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
-	 * but then it no longer nicely fits power of two, and since
-	 * we now use routines that backend onto __get_free_pages, the
-	 * rest would be wasted.
-	 */
-	dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
-			 dd->ipath_rcvhdrcnt);
-
-	/*
-	 * Set up the shadow copies of the piobufavail registers,
-	 * which we compare against the chip registers for now, and
-	 * the in memory DMA'ed copies of the registers.  This has to
-	 * be done early, before we calculate lastport, etc.
-	 */
-	piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-	/*
-	 * calc number of pioavail registers, and save it; we have 2
-	 * bits per buffer.
-	 */
-	dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
-		/ (sizeof(u64) * BITS_PER_BYTE / 2);
-	uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
-	if (piobufs > 144)
-		defkbufs = 32 + dd->ipath_pioreserved;
-	else
-		defkbufs = 16 + dd->ipath_pioreserved;
-
-	if (ipath_kpiobufs && (ipath_kpiobufs +
-		(uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
-		int i = (int) piobufs -
-			(int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
-		if (i < 1)
-			i = 1;
-		dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
-			 "%d for kernel leaves too few for %d user ports "
-			 "(%d each); using %u\n", ipath_kpiobufs,
-			 piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
-		/*
-		 * shouldn't change ipath_kpiobufs, because could be
-		 * different for different devices...
-		 */
-		kpiobufs = i;
-	} else if (ipath_kpiobufs)
-		kpiobufs = ipath_kpiobufs;
-	else
-		kpiobufs = defkbufs;
-	dd->ipath_lastport_piobuf = piobufs - kpiobufs;
-	dd->ipath_pbufsport =
-		uports ? dd->ipath_lastport_piobuf / uports : 0;
-	/* if not an even divisor, some user ports get extra buffers */
-	dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
-		(dd->ipath_pbufsport * uports);
-	if (dd->ipath_ports_extrabuf)
-		ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to "
-			"ports <= %u\n", dd->ipath_pbufsport,
-			dd->ipath_ports_extrabuf);
-	dd->ipath_lastpioindex = 0;
-	dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
-	/* ipath_pioavailshadow initialized earlier */
-	ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
-		   "each for %u user ports\n", kpiobufs,
-		   piobufs, dd->ipath_pbufsport, uports);
-	ret = dd->ipath_f_early_init(dd);
-	if (ret) {
-		ipath_dev_err(dd, "Early initialization failure\n");
-		goto done;
-	}
-
-	/*
-	 * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
-	 * done after early_init.
-	 */
-	dd->ipath_hdrqlast =
-		dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
-			 dd->ipath_rcvhdrentsize);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
-			 dd->ipath_rcvhdrsize);
-
-	if (!reinit) {
-		ret = init_pioavailregs(dd);
-		init_shadow_tids(dd);
-		if (ret)
-			goto done;
-	}
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
-			 dd->ipath_pioavailregs_phys);
-
-	/*
-	 * this is to detect s/w errors, which the h/w works around by
-	 * ignoring the low 6 bits of address, if it wasn't aligned.
-	 */
-	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
-	if (val != dd->ipath_pioavailregs_phys) {
-		ipath_dev_err(dd, "Catastrophic software error, "
-			      "SendPIOAvailAddr written as %lx, "
-			      "read back as %llx\n",
-			      (unsigned long) dd->ipath_pioavailregs_phys,
-			      (unsigned long long) val);
-		ret = -EINVAL;
-		goto done;
-	}
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
-
-	/*
-	 * make sure we are not in freeze, and PIO send enabled, so
-	 * writes to pbc happen
-	 */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-			 ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
-
-	/*
-	 * before error clears, since we expect serdes pll errors during
-	 * this, the first time after reset
-	 */
-	if (bringup_link(dd)) {
-		dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n");
-		ret = -ENETDOWN;
-		goto done;
-	}
-
-	/*
-	 * clear any "expected" hwerrs from reset and/or initialization
-	 * clear any that aren't enabled (at least this once), and then
-	 * set the enable mask
-	 */
-	dd->ipath_f_init_hwerrors(dd);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-			 ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-			 dd->ipath_hwerrmask);
-
-	/* clear all */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
-	/* enable errors that are masked, at least this first time. */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-			 ~dd->ipath_maskederrs);
-	dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */
-	dd->ipath_errormask =
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
-	/* clear any interrupts up to this point (ints still not enabled) */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
-
-	dd->ipath_f_tidtemplate(dd);
-
-	/*
-	 * Set up the port 0 (kernel) rcvhdr q and egr TIDs.  If doing
-	 * re-init, the simplest way to handle this is to free
-	 * existing, and re-allocate.
-	 * Need to re-create rest of port 0 portdata as well.
-	 */
-	pd = dd->ipath_pd[0];
-	if (reinit) {
-		struct ipath_portdata *npd;
-
-		/*
-		 * Alloc and init new ipath_portdata for port0,
-		 * Then free old pd. Could lead to fragmentation, but also
-		 * makes later support for hot-swap easier.
-		 */
-		npd = create_portdata0(dd);
-		if (npd) {
-			ipath_free_pddata(dd, pd);
-			dd->ipath_pd[0] = npd;
-			pd = npd;
-		} else {
-			ipath_dev_err(dd, "Unable to allocate portdata"
-				      " for port 0, failing\n");
-			ret = -ENOMEM;
-			goto done;
-		}
-	}
-	ret = ipath_create_rcvhdrq(dd, pd);
-	if (!ret)
-		ret = create_port0_egr(dd);
-	if (ret) {
-		ipath_dev_err(dd, "failed to allocate kernel port's "
-			      "rcvhdrq and/or egr bufs\n");
-		goto done;
-	}
-	else
-		enable_chip(dd, reinit);
-
-	/* after enable_chip, so pioavailshadow setup */
-	ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
-
-	/*
-	 * Cancel any possible active sends from early driver load.
-	 * Follows early_init because some chips have to initialize
-	 * PIO buffers in early_init to avoid false parity errors.
-	 * After enable and ipath_chg_pioavailkernel so we can safely
-	 * enable pioavail updates and PIOENABLE; packets are now
-	 * ready to go out.
-	 */
-	ipath_cancel_sends(dd, 1);
-
-	if (!reinit) {
-		/*
-		 * Used when we close a port, for DMA already in flight
-		 * at close.
-		 */
-		dd->ipath_dummy_hdrq = dma_alloc_coherent(
-			&dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size,
-			&dd->ipath_dummy_hdrq_phys,
-			gfp_flags);
-		if (!dd->ipath_dummy_hdrq) {
-			dev_info(&dd->pcidev->dev,
-				"Couldn't allocate 0x%lx bytes for dummy hdrq\n",
-				dd->ipath_pd[0]->port_rcvhdrq_size);
-			/* fallback to just 0'ing */
-			dd->ipath_dummy_hdrq_phys = 0UL;
-		}
-	}
-
-	/*
-	 * cause retrigger of pending interrupts ignored during init,
-	 * even if we had errors
-	 */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-
-	if (!dd->ipath_stats_timer_active) {
-		/*
-		 * first init, or after an admin disable/enable
-		 * set up stats retrieval timer, even if we had errors
-		 * in last portion of setup
-		 */
-		init_timer(&dd->ipath_stats_timer);
-		dd->ipath_stats_timer.function = ipath_get_faststats;
-		dd->ipath_stats_timer.data = (unsigned long) dd;
-		/* every 5 seconds; */
-		dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
-		/* takes ~16 seconds to overflow at full IB 4x bandwdith */
-		add_timer(&dd->ipath_stats_timer);
-		dd->ipath_stats_timer_active = 1;
-	}
-
-	/* Set up SendDMA if chip supports it */
-	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-		ret = setup_sdma(dd);
-
-	/* Set up HoL state */
-	init_timer(&dd->ipath_hol_timer);
-	dd->ipath_hol_timer.function = ipath_hol_event;
-	dd->ipath_hol_timer.data = (unsigned long)dd;
-	dd->ipath_hol_state = IPATH_HOL_UP;
-
-done:
-	if (!ret) {
-		*dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
-		if (!dd->ipath_f_intrsetup(dd)) {
-			/* now we can enable all interrupts from the chip */
-			ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
-					 -1LL);
-			/* force re-interrupt of any pending interrupts. */
-			ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
-					 0ULL);
-			/* chip is usable; mark it as initialized */
-			*dd->ipath_statusp |= IPATH_STATUS_INITTED;
-
-			/*
-			 * setup to verify we get an interrupt, and fallback
-			 * to an alternate if necessary and possible
-			 */
-			if (!reinit) {
-				init_timer(&dd->ipath_intrchk_timer);
-				dd->ipath_intrchk_timer.function =
-					verify_interrupt;
-				dd->ipath_intrchk_timer.data =
-					(unsigned long) dd;
-			}
-			dd->ipath_intrchk_timer.expires = jiffies + HZ/2;
-			add_timer(&dd->ipath_intrchk_timer);
-		} else
-			ipath_dev_err(dd, "No interrupts enabled, couldn't "
-				      "setup interrupt address\n");
-
-		if (dd->ipath_cfgports > ipath_stats.sps_nports)
-			/*
-			 * sps_nports is a global, so, we set it to
-			 * the highest number of ports of any of the
-			 * chips we find; we never decrement it, at
-			 * least for now.  Since this might have changed
-			 * over disable/enable or prior to reset, always
-			 * do the check and potentially adjust.
-			 */
-			ipath_stats.sps_nports = dd->ipath_cfgports;
-	} else
-		ipath_dbg("Failed (%d) to initialize chip\n", ret);
-
-	/* if ret is non-zero, we probably should do some cleanup
-	   here... */
-	return ret;
-}
-
-static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
-{
-	struct ipath_devdata *dd;
-	unsigned long flags;
-	unsigned short val;
-	int ret;
-
-	ret = ipath_parse_ushort(str, &val);
-
-	spin_lock_irqsave(&ipath_devs_lock, flags);
-
-	if (ret < 0)
-		goto bail;
-
-	if (val == 0) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
-		if (dd->ipath_kregbase)
-			continue;
-		if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
-			   (dd->ipath_cfgports *
-			    IPATH_MIN_USER_PORT_BUFCNT)))
-		{
-			ipath_dev_err(
-				dd,
-				"Allocating %d PIO bufs for kernel leaves "
-				"too few for %d user ports (%d each)\n",
-				val, dd->ipath_cfgports - 1,
-				IPATH_MIN_USER_PORT_BUFCNT);
-			ret = -EINVAL;
-			goto bail;
-		}
-		dd->ipath_lastport_piobuf =
-			dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
-	}
-
-	ipath_kpiobufs = val;
-	ret = 0;
-bail:
-	spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
deleted file mode 100644
index 01ba792..0000000
--- a/drivers/infiniband/hw/ipath/ipath_intr.c
+++ /dev/null
@@ -1,1273 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-
-/*
- * Called when we might have an error that is specific to a particular
- * PIO buffer, and may need to cancel that buffer, so it can be re-used.
- */
-void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
-{
-	u32 piobcnt;
-	unsigned long sbuf[4];
-	/*
-	 * it's possible that sendbuffererror could have bits set; might
-	 * have already done this as a result of hardware error handling
-	 */
-	piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-	/* read these before writing errorclear */
-	sbuf[0] = ipath_read_kreg64(
-		dd, dd->ipath_kregs->kr_sendbuffererror);
-	sbuf[1] = ipath_read_kreg64(
-		dd, dd->ipath_kregs->kr_sendbuffererror + 1);
-	if (piobcnt > 128)
-		sbuf[2] = ipath_read_kreg64(
-			dd, dd->ipath_kregs->kr_sendbuffererror + 2);
-	if (piobcnt > 192)
-		sbuf[3] = ipath_read_kreg64(
-			dd, dd->ipath_kregs->kr_sendbuffererror + 3);
-	else
-		sbuf[3] = 0;
-
-	if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
-		int i;
-		if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) &&
-			time_after(dd->ipath_lastcancel, jiffies)) {
-			__IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
-					  "SendbufErrs %lx %lx", sbuf[0],
-					  sbuf[1]);
-			if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
-				printk(" %lx %lx ", sbuf[2], sbuf[3]);
-			printk("\n");
-		}
-
-		for (i = 0; i < piobcnt; i++)
-			if (test_bit(i, sbuf))
-				ipath_disarm_piobufs(dd, i, 1);
-		/* ignore armlaunch errs for a bit */
-		dd->ipath_lastcancel = jiffies+3;
-	}
-}
-
-
-/* These are all rcv-related errors which we want to count for stats */
-#define E_SUM_PKTERRS \
-	(INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
-	 INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
-	 INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
-	 INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
-	 INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
-	 INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
-
-/* These are all send-related errors which we want to count for stats */
-#define E_SUM_ERRS \
-	(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
-	 INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-	 INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
-	 INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
-	 INFINIPATH_E_INVALIDADDR)
-
-/*
- * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore
- * errors not related to freeze and cancelling buffers.  Can't ignore
- * armlaunch because could get more while still cleaning up, and need
- * to cancel those as they happen.
- */
-#define E_SPKT_ERRS_IGNORE \
-	 (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-	 INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \
-	 INFINIPATH_E_SPKTLEN)
-
-/*
- * these are errors that can occur when the link changes state while
- * a packet is being sent or received.  This doesn't cover things
- * like EBP or VCRC that can be the result of a sending having the
- * link change state, so we receive a "known bad" packet.
- */
-#define E_SUM_LINK_PKTERRS \
-	(INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-	 INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
-	 INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \
-	 INFINIPATH_E_RUNEXPCHAR)
-
-static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
-{
-	u64 ignore_this_time = 0;
-
-	ipath_disarm_senderrbufs(dd);
-	if ((errs & E_SUM_LINK_PKTERRS) &&
-	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-		/*
-		 * This can happen when SMA is trying to bring the link
-		 * up, but the IB link changes state at the "wrong" time.
-		 * The IB logic then complains that the packet isn't
-		 * valid.  We don't want to confuse people, so we just
-		 * don't print them, except at debug
-		 */
-		ipath_dbg("Ignoring packet errors %llx, because link not "
-			  "ACTIVE\n", (unsigned long long) errs);
-		ignore_this_time = errs & E_SUM_LINK_PKTERRS;
-	}
-
-	return ignore_this_time;
-}
-
-/* generic hw error messages... */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
-	{ \
-		.mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a <<    \
-			  INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ),   \
-		.msg = "TXE " #a " Memory Parity"	     \
-	}
-#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
-	{ \
-		.mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a <<    \
-			  INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ),   \
-		.msg = "RXE " #a " Memory Parity"	     \
-	}
-
-static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
-	INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
-	INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
-
-	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
-	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
-	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
-
-	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
-	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
-	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
-	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
-	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
-	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
-	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
-};
-
-/**
- * ipath_format_hwmsg - format a single hwerror message
- * @msg message buffer
- * @msgl length of message buffer
- * @hwmsg message to add to message buffer
- */
-static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
-{
-	strlcat(msg, "[", msgl);
-	strlcat(msg, hwmsg, msgl);
-	strlcat(msg, "]", msgl);
-}
-
-/**
- * ipath_format_hwerrors - format hardware error messages for display
- * @hwerrs hardware errors bit vector
- * @hwerrmsgs hardware error descriptions
- * @nhwerrmsgs number of hwerrmsgs
- * @msg message buffer
- * @msgl message buffer length
- */
-void ipath_format_hwerrors(u64 hwerrs,
-			   const struct ipath_hwerror_msgs *hwerrmsgs,
-			   size_t nhwerrmsgs,
-			   char *msg, size_t msgl)
-{
-	int i;
-	const int glen =
-	    ARRAY_SIZE(ipath_generic_hwerror_msgs);
-
-	for (i=0; i<glen; i++) {
-		if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
-			ipath_format_hwmsg(msg, msgl,
-					   ipath_generic_hwerror_msgs[i].msg);
-		}
-	}
-
-	for (i=0; i<nhwerrmsgs; i++) {
-		if (hwerrs & hwerrmsgs[i].mask) {
-			ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
-		}
-	}
-}
-
-/* return the strings for the most common link states */
-static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
-{
-	char *ret;
-	u32 state;
-
-	state = ipath_ib_state(dd, ibcs);
-	if (state == dd->ib_init)
-		ret = "Init";
-	else if (state == dd->ib_arm)
-		ret = "Arm";
-	else if (state == dd->ib_active)
-		ret = "Active";
-	else
-		ret = "Down";
-	return ret;
-}
-
-void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
-{
-	struct ib_event event;
-
-	event.device = &dd->verbs_dev->ibdev;
-	event.element.port_num = 1;
-	event.event = ev;
-	ib_dispatch_event(&event);
-}
-
-static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
-				     ipath_err_t errs)
-{
-	u32 ltstate, lstate, ibstate, lastlstate;
-	u32 init = dd->ib_init;
-	u32 arm = dd->ib_arm;
-	u32 active = dd->ib_active;
-	const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-
-	lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */
-	ibstate = ipath_ib_state(dd, ibcs);
-	/* linkstate at last interrupt */
-	lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
-	ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */
-
-	/*
-	 * Since going into a recovery state causes the link state to go
-	 * down and since recovery is transitory, it is better if we "miss"
-	 * ever seeing the link training state go into recovery (i.e.,
-	 * ignore this transition for link state special handling purposes)
-	 * without even updating ipath_lastibcstat.
-	 */
-	if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) ||
-	    (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) ||
-	    (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE))
-		goto done;
-
-	/*
-	 * if linkstate transitions into INIT from any of the various down
-	 * states, or if it transitions from any of the up (INIT or better)
-	 * states into any of the down states (except link recovery), then
-	 * call the chip-specific code to take appropriate actions.
-	 */
-	if (lstate >= INFINIPATH_IBCS_L_STATE_INIT &&
-		lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) {
-		/* transitioned to UP */
-		if (dd->ipath_f_ib_updown(dd, 1, ibcs)) {
-			/* link came up, so we must no longer be disabled */
-			dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
-			ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n");
-			goto skip_ibchange; /* chip-code handled */
-		}
-	} else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT ||
-		(dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) &&
-		ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT &&
-		ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
-		int handled;
-		handled = dd->ipath_f_ib_updown(dd, 0, ibcs);
-		dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY;
-		if (handled) {
-			ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n");
-			goto skip_ibchange; /* chip-code handled */
-		}
-	}
-
-	/*
-	 * Significant enough to always print and get into logs, if it was
-	 * unexpected.  If it was a requested state change, we'll have
-	 * already cleared the flags, so we won't print this warning
-	 */
-	if ((ibstate != arm && ibstate != active) &&
-	    (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) {
-		dev_info(&dd->pcidev->dev, "Link state changed from %s "
-			 "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ?
-			 "ARM" : "ACTIVE", ib_linkstate(dd, ibcs));
-	}
-
-	if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
-	    ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
-		u32 lastlts;
-		lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
-		/*
-		 * Ignore cycling back and forth from Polling.Active to
-		 * Polling.Quiet while waiting for the other end of the link
-		 * to come up, except to try and decide if we are connected
-		 * to a live IB device or not.  We will cycle back and
-		 * forth between them if no cable is plugged in, the other
-		 * device is powered off or disabled, etc.
-		 */
-		if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
-		    lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
-			if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
-			     (++dd->ipath_ibpollcnt == 40)) {
-				dd->ipath_flags |= IPATH_NOCABLE;
-				*dd->ipath_statusp |=
-					IPATH_STATUS_IB_NOCABLE;
-				ipath_cdbg(LINKVERB, "Set NOCABLE\n");
-			}
-			ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n",
-				ipath_ibcstatus_str[ltstate], ibstate);
-			goto skip_ibchange;
-		}
-	}
-
-	dd->ipath_ibpollcnt = 0; /* not poll*, now */
-	ipath_stats.sps_iblink++;
-
-	if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
-		u64 linkrecov;
-		linkrecov = ipath_snap_cntr(dd,
-			dd->ipath_cregs->cr_iblinkerrrecovcnt);
-		if (linkrecov != dd->ipath_lastlinkrecov) {
-			ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
-				(unsigned long long) ibcs,
-				ib_linkstate(dd, ibcs),
-				ipath_ibcstatus_str[ltstate],
-				(unsigned long long) linkrecov);
-			/* and no more until active again */
-			dd->ipath_lastlinkrecov = 0;
-			ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
-			goto skip_ibchange;
-		}
-	}
-
-	if (ibstate == init || ibstate == arm || ibstate == active) {
-		*dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
-		if (ibstate == init || ibstate == arm) {
-			*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-			if (dd->ipath_flags & IPATH_LINKACTIVE)
-				signal_ib_event(dd, IB_EVENT_PORT_ERR);
-		}
-		if (ibstate == arm) {
-			dd->ipath_flags |= IPATH_LINKARMED;
-			dd->ipath_flags &= ~(IPATH_LINKUNK |
-				IPATH_LINKINIT | IPATH_LINKDOWN |
-				IPATH_LINKACTIVE | IPATH_NOCABLE);
-			ipath_hol_down(dd);
-		} else  if (ibstate == init) {
-			/*
-			 * set INIT and DOWN.  Down is checked by
-			 * most of the other code, but INIT is
-			 * useful to know in a few places.
-			 */
-			dd->ipath_flags |= IPATH_LINKINIT |
-				IPATH_LINKDOWN;
-			dd->ipath_flags &= ~(IPATH_LINKUNK |
-				IPATH_LINKARMED | IPATH_LINKACTIVE |
-				IPATH_NOCABLE);
-			ipath_hol_down(dd);
-		} else {  /* active */
-			dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
-				dd->ipath_cregs->cr_iblinkerrrecovcnt);
-			*dd->ipath_statusp |=
-				IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
-			dd->ipath_flags |= IPATH_LINKACTIVE;
-			dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-				| IPATH_LINKDOWN | IPATH_LINKARMED |
-				IPATH_NOCABLE);
-			if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-				ipath_restart_sdma(dd);
-			signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
-			/* LED active not handled in chip _f_updown */
-			dd->ipath_f_setextled(dd, lstate, ltstate);
-			ipath_hol_up(dd);
-		}
-
-		/*
-		 * print after we've already done the work, so as not to
-		 * delay the state changes and notifications, for debugging
-		 */
-		if (lstate == lastlstate)
-			ipath_cdbg(LINKVERB, "Unchanged from last: %s "
-				"(%x)\n", ib_linkstate(dd, ibcs), ibstate);
-		else
-			ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n",
-				  dd->ipath_unit, ib_linkstate(dd, ibcs),
-				  ipath_ibcstatus_str[ltstate],  ibstate);
-	} else { /* down */
-		if (dd->ipath_flags & IPATH_LINKACTIVE)
-			signal_ib_event(dd, IB_EVENT_PORT_ERR);
-		dd->ipath_flags |= IPATH_LINKDOWN;
-		dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-				     | IPATH_LINKACTIVE |
-				     IPATH_LINKARMED);
-		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-		dd->ipath_lli_counter = 0;
-
-		if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN)
-			ipath_cdbg(VERBOSE, "Unit %u link state down "
-				   "(state 0x%x), from %s\n",
-				   dd->ipath_unit, lstate,
-				   ib_linkstate(dd, dd->ipath_lastibcstat));
-		else
-			ipath_cdbg(LINKVERB, "Unit %u link state changed "
-				   "to %s (0x%x) from down (%x)\n",
-				   dd->ipath_unit,
-				   ipath_ibcstatus_str[ltstate],
-				   ibstate, lastlstate);
-	}
-
-skip_ibchange:
-	dd->ipath_lastibcstat = ibcs;
-done:
-	return;
-}
-
-static void handle_supp_msgs(struct ipath_devdata *dd,
-			     unsigned supp_msgs, char *msg, u32 msgsz)
-{
-	/*
-	 * Print the message unless it's ibc status change only, which
-	 * happens so often we never want to count it.
-	 */
-	if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
-		int iserr;
-		ipath_err_t mask;
-		iserr = ipath_decode_err(dd, msg, msgsz,
-					 dd->ipath_lasterror &
-					 ~INFINIPATH_E_IBSTATUSCHANGED);
-
-		mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-			INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
-
-		/* if we're in debug, then don't mask SDMADISABLED msgs */
-		if (ipath_debug & __IPATH_DBG)
-			mask &= ~INFINIPATH_E_SDMADISABLED;
-
-		if (dd->ipath_lasterror & ~mask)
-			ipath_dev_err(dd, "Suppressed %u messages for "
-				      "fast-repeating errors (%s) (%llx)\n",
-				      supp_msgs, msg,
-				      (unsigned long long)
-				      dd->ipath_lasterror);
-		else {
-			/*
-			 * rcvegrfull and rcvhdrqfull are "normal", for some
-			 * types of processes (mostly benchmarks) that send
-			 * huge numbers of messages, while not processing
-			 * them. So only complain about these at debug
-			 * level.
-			 */
-			if (iserr)
-				ipath_dbg("Suppressed %u messages for %s\n",
-					  supp_msgs, msg);
-			else
-				ipath_cdbg(ERRPKT,
-					"Suppressed %u messages for %s\n",
-					  supp_msgs, msg);
-		}
-	}
-}
-
-static unsigned handle_frequent_errors(struct ipath_devdata *dd,
-				       ipath_err_t errs, char *msg,
-				       u32 msgsz, int *noprint)
-{
-	unsigned long nc;
-	static unsigned long nextmsg_time;
-	static unsigned nmsgs, supp_msgs;
-
-	/*
-	 * Throttle back "fast" messages to no more than 10 per 5 seconds.
-	 * This isn't perfect, but it's a reasonable heuristic. If we get
-	 * more than 10, give a 6x longer delay.
-	 */
-	nc = jiffies;
-	if (nmsgs > 10) {
-		if (time_before(nc, nextmsg_time)) {
-			*noprint = 1;
-			if (!supp_msgs++)
-				nextmsg_time = nc + HZ * 3;
-		}
-		else if (supp_msgs) {
-			handle_supp_msgs(dd, supp_msgs, msg, msgsz);
-			supp_msgs = 0;
-			nmsgs = 0;
-		}
-	}
-	else if (!nmsgs++ || time_after(nc, nextmsg_time))
-		nextmsg_time = nc + HZ / 2;
-
-	return supp_msgs;
-}
-
-static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
-{
-	unsigned long flags;
-	int expected;
-
-	if (ipath_debug & __IPATH_DBG) {
-		char msg[128];
-		ipath_decode_err(dd, msg, sizeof msg, errs &
-			INFINIPATH_E_SDMAERRS);
-		ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg);
-	}
-	if (ipath_debug & __IPATH_VERBDBG) {
-		unsigned long tl, hd, status, lengen;
-		tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
-		hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
-		status = ipath_read_kreg64(dd
-			, dd->ipath_kregs->kr_senddmastatus);
-		lengen = ipath_read_kreg64(dd,
-			dd->ipath_kregs->kr_senddmalengen);
-		ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
-			"lengen 0x%lx\n", tl, hd, status, lengen);
-	}
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-	__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-	expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-	if (!expected)
-		ipath_cancel_sends(dd, 1);
-}
-
-static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
-{
-	unsigned long flags;
-	int expected;
-
-	if ((istat & INFINIPATH_I_SDMAINT) &&
-	    !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-		ipath_sdma_intr(dd);
-
-	if (istat & INFINIPATH_I_SDMADISABLED) {
-		expected = test_bit(IPATH_SDMA_ABORTING,
-			&dd->ipath_sdma_status);
-		ipath_dbg("%s SDmaDisabled intr\n",
-			expected ? "expected" : "unexpected");
-		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-		__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-		if (!expected)
-			ipath_cancel_sends(dd, 1);
-		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-			tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-	}
-}
-
-static int handle_hdrq_full(struct ipath_devdata *dd)
-{
-	int chkerrpkts = 0;
-	u32 hd, tl;
-	u32 i;
-
-	ipath_stats.sps_hdrqfull++;
-	for (i = 0; i < dd->ipath_cfgports; i++) {
-		struct ipath_portdata *pd = dd->ipath_pd[i];
-
-		if (i == 0) {
-			/*
-			 * For kernel receive queues, we just want to know
-			 * if there are packets in the queue that we can
-			 * process.
-			 */
-			if (pd->port_head != ipath_get_hdrqtail(pd))
-				chkerrpkts |= 1 << i;
-			continue;
-		}
-
-		/* Skip if user context is not open */
-		if (!pd || !pd->port_cnt)
-			continue;
-
-		/* Don't report the same point multiple times. */
-		if (dd->ipath_flags & IPATH_NODMA_RTAIL)
-			tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i);
-		else
-			tl = ipath_get_rcvhdrtail(pd);
-		if (tl == pd->port_lastrcvhdrqtail)
-			continue;
-
-		hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i);
-		if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
-			pd->port_lastrcvhdrqtail = tl;
-			pd->port_hdrqfull++;
-			/* flush hdrqfull so that poll() sees it */
-			wmb();
-			wake_up_interruptible(&pd->port_wait);
-		}
-	}
-
-	return chkerrpkts;
-}
-
-static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
-{
-	char msg[128];
-	u64 ignore_this_time = 0;
-	u64 iserr = 0;
-	int chkerrpkts = 0, noprint = 0;
-	unsigned supp_msgs;
-	int log_idx;
-
-	/*
-	 * don't report errors that are masked, either at init
-	 * (not set in ipath_errormask), or temporarily (set in
-	 * ipath_maskederrs)
-	 */
-	errs &= dd->ipath_errormask & ~dd->ipath_maskederrs;
-
-	supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg,
-		&noprint);
-
-	/* do these first, they are most important */
-	if (errs & INFINIPATH_E_HARDWARE) {
-		/* reuse same msg buf */
-		dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
-	} else {
-		u64 mask;
-		for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
-			mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
-			if (errs & mask)
-				ipath_inc_eeprom_err(dd, log_idx, 1);
-		}
-	}
-
-	if (errs & INFINIPATH_E_SDMAERRS)
-		handle_sdma_errors(dd, errs);
-
-	if (!noprint && (errs & ~dd->ipath_e_bitsextant))
-		ipath_dev_err(dd, "error interrupt with unknown errors "
-			      "%llx set\n", (unsigned long long)
-			      (errs & ~dd->ipath_e_bitsextant));
-
-	if (errs & E_SUM_ERRS)
-		ignore_this_time = handle_e_sum_errs(dd, errs);
-	else if ((errs & E_SUM_LINK_PKTERRS) &&
-	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-		/*
-		 * This can happen when SMA is trying to bring the link
-		 * up, but the IB link changes state at the "wrong" time.
-		 * The IB logic then complains that the packet isn't
-		 * valid.  We don't want to confuse people, so we just
-		 * don't print them, except at debug
-		 */
-		ipath_dbg("Ignoring packet errors %llx, because link not "
-			  "ACTIVE\n", (unsigned long long) errs);
-		ignore_this_time = errs & E_SUM_LINK_PKTERRS;
-	}
-
-	if (supp_msgs == 250000) {
-		int s_iserr;
-		/*
-		 * It's not entirely reasonable assuming that the errors set
-		 * in the last clear period are all responsible for the
-		 * problem, but the alternative is to assume it's the only
-		 * ones on this particular interrupt, which also isn't great
-		 */
-		dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
-
-		dd->ipath_errormask &= ~dd->ipath_maskederrs;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-				 dd->ipath_errormask);
-		s_iserr = ipath_decode_err(dd, msg, sizeof msg,
-					   dd->ipath_maskederrs);
-
-		if (dd->ipath_maskederrs &
-		    ~(INFINIPATH_E_RRCVEGRFULL |
-		      INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
-			ipath_dev_err(dd, "Temporarily disabling "
-			    "error(s) %llx reporting; too frequent (%s)\n",
-				(unsigned long long) dd->ipath_maskederrs,
-				msg);
-		else {
-			/*
-			 * rcvegrfull and rcvhdrqfull are "normal",
-			 * for some types of processes (mostly benchmarks)
-			 * that send huge numbers of messages, while not
-			 * processing them.  So only complain about
-			 * these at debug level.
-			 */
-			if (s_iserr)
-				ipath_dbg("Temporarily disabling reporting "
-				    "too frequent queue full errors (%s)\n",
-				    msg);
-			else
-				ipath_cdbg(ERRPKT,
-				    "Temporarily disabling reporting too"
-				    " frequent packet errors (%s)\n",
-				    msg);
-		}
-
-		/*
-		 * Re-enable the masked errors after around 3 minutes.  in
-		 * ipath_get_faststats().  If we have a series of fast
-		 * repeating but different errors, the interval will keep
-		 * stretching out, but that's OK, as that's pretty
-		 * catastrophic.
-		 */
-		dd->ipath_unmasktime = jiffies + HZ * 180;
-	}
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
-	if (ignore_this_time)
-		errs &= ~ignore_this_time;
-	if (errs & ~dd->ipath_lasterror) {
-		errs &= ~dd->ipath_lasterror;
-		/* never suppress duplicate hwerrors or ibstatuschange */
-		dd->ipath_lasterror |= errs &
-			~(INFINIPATH_E_HARDWARE |
-			  INFINIPATH_E_IBSTATUSCHANGED);
-	}
-
-	if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) {
-		dd->ipath_spectriggerhit++;
-		ipath_dbg("%lu special trigger hits\n",
-			dd->ipath_spectriggerhit);
-	}
-
-	/* likely due to cancel; so suppress message unless verbose */
-	if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
-		time_after(dd->ipath_lastcancel, jiffies)) {
-		/* armlaunch takes precedence; it often causes both. */
-		ipath_cdbg(VERBOSE,
-			"Suppressed %s error (%llx) after sendbuf cancel\n",
-			(errs &  INFINIPATH_E_SPIOARMLAUNCH) ?
-			"armlaunch" : "sendpktlen", (unsigned long long)errs);
-		errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
-	}
-
-	if (!errs)
-		return 0;
-
-	if (!noprint) {
-		ipath_err_t mask;
-		/*
-		 * The ones we mask off are handled specially below
-		 * or above.  Also mask SDMADISABLED by default as it
-		 * is too chatty.
-		 */
-		mask = INFINIPATH_E_IBSTATUSCHANGED |
-			INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-			INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED;
-
-		/* if we're in debug, then don't mask SDMADISABLED msgs */
-		if (ipath_debug & __IPATH_DBG)
-			mask &= ~INFINIPATH_E_SDMADISABLED;
-
-		ipath_decode_err(dd, msg, sizeof msg, errs & ~mask);
-	} else
-		/* so we don't need if (!noprint) at strlcat's below */
-		*msg = 0;
-
-	if (errs & E_SUM_PKTERRS) {
-		ipath_stats.sps_pkterrs++;
-		chkerrpkts = 1;
-	}
-	if (errs & E_SUM_ERRS)
-		ipath_stats.sps_errs++;
-
-	if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
-		ipath_stats.sps_crcerrs++;
-		chkerrpkts = 1;
-	}
-	iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS);
-
-
-	/*
-	 * We don't want to print these two as they happen, or we can make
-	 * the situation even worse, because it takes so long to print
-	 * messages to serial consoles.  Kernel ports get printed from
-	 * fast_stats, no more than every 5 seconds, user ports get printed
-	 * on close
-	 */
-	if (errs & INFINIPATH_E_RRCVHDRFULL)
-		chkerrpkts |= handle_hdrq_full(dd);
-	if (errs & INFINIPATH_E_RRCVEGRFULL) {
-		struct ipath_portdata *pd = dd->ipath_pd[0];
-
-		/*
-		 * since this is of less importance and not likely to
-		 * happen without also getting hdrfull, only count
-		 * occurrences; don't check each port (or even the kernel
-		 * vs user)
-		 */
-		ipath_stats.sps_etidfull++;
-		if (pd->port_head != ipath_get_hdrqtail(pd))
-			chkerrpkts |= 1;
-	}
-
-	/*
-	 * do this before IBSTATUSCHANGED, in case both bits set in a single
-	 * interrupt; we want the STATUSCHANGE to "win", so we do our
-	 * internal copy of state machine correctly
-	 */
-	if (errs & INFINIPATH_E_RIBLOSTLINK) {
-		/*
-		 * force through block below
-		 */
-		errs |= INFINIPATH_E_IBSTATUSCHANGED;
-		ipath_stats.sps_iblink++;
-		dd->ipath_flags |= IPATH_LINKDOWN;
-		dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-				     | IPATH_LINKARMED | IPATH_LINKACTIVE);
-		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-
-		ipath_dbg("Lost link, link now down (%s)\n",
-			ipath_ibcstatus_str[ipath_read_kreg64(dd,
-			dd->ipath_kregs->kr_ibcstatus) & 0xf]);
-	}
-	if (errs & INFINIPATH_E_IBSTATUSCHANGED)
-		handle_e_ibstatuschanged(dd, errs);
-
-	if (errs & INFINIPATH_E_RESET) {
-		if (!noprint)
-			ipath_dev_err(dd, "Got reset, requires re-init "
-				      "(unload and reload driver)\n");
-		dd->ipath_flags &= ~IPATH_INITTED;	/* needs re-init */
-		/* mark as having had error */
-		*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
-		*dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
-	}
-
-	if (!noprint && *msg) {
-		if (iserr)
-			ipath_dev_err(dd, "%s error\n", msg);
-	}
-	if (dd->ipath_state_wanted & dd->ipath_flags) {
-		ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, "
-			   "waking\n", dd->ipath_state_wanted,
-			   dd->ipath_flags);
-		wake_up_interruptible(&ipath_state_wait);
-	}
-
-	return chkerrpkts;
-}
-
-/*
- * try to cleanup as much as possible for anything that might have gone
- * wrong while in freeze mode, such as pio buffers being written by user
- * processes (causing armlaunch), send errors due to going into freeze mode,
- * etc., and try to avoid causing extra interrupts while doing so.
- * Forcibly update the in-memory pioavail register copies after cleanup
- * because the chip won't do it while in freeze mode (the register values
- * themselves are kept correct).
- * Make sure that we don't lose any important interrupts by using the chip
- * feature that says that writing 0 to a bit in *clear that is set in
- * *status will cause an interrupt to be generated again (if allowed by
- * the *mask value).
- */
-void ipath_clear_freeze(struct ipath_devdata *dd)
-{
-	/* disable error interrupts, to avoid confusion */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
-
-	/* also disable interrupts; errormask is sometimes overwriten */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-
-	ipath_cancel_sends(dd, 1);
-
-	/* clear the freeze, and be sure chip saw it */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-			 dd->ipath_control);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-	/* force in-memory update now we are out of freeze */
-	ipath_force_pio_avail_update(dd);
-
-	/*
-	 * force new interrupt if any hwerr, error or interrupt bits are
-	 * still set, and clear "safe" send packet errors related to freeze
-	 * and cancelling sends.  Re-enable error interrupts before possible
-	 * force of re-interrupt on pending interrupts.
-	 */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-		E_SPKT_ERRS_IGNORE);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-		dd->ipath_errormask);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-}
-
-
-/* this is separate to allow for better optimization of ipath_intr() */
-
-static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
-{
-	/*
-	 * sometimes happen during driver init and unload, don't want
-	 * to process any interrupts at that point
-	 */
-
-	/* this is just a bandaid, not a fix, if something goes badly
-	 * wrong */
-	if (++*unexpectp > 100) {
-		if (++*unexpectp > 105) {
-			/*
-			 * ok, we must be taking somebody else's interrupts,
-			 * due to a messed up mptable and/or PIRQ table, so
-			 * unregister the interrupt.  We've seen this during
-			 * linuxbios development work, and it may happen in
-			 * the future again.
-			 */
-			if (dd->pcidev && dd->ipath_irq) {
-				ipath_dev_err(dd, "Now %u unexpected "
-					      "interrupts, unregistering "
-					      "interrupt handler\n",
-					      *unexpectp);
-				ipath_dbg("free_irq of irq %d\n",
-					  dd->ipath_irq);
-				dd->ipath_f_free_irq(dd);
-			}
-		}
-		if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
-			ipath_dev_err(dd, "%u unexpected interrupts, "
-				      "disabling interrupts completely\n",
-				      *unexpectp);
-			/*
-			 * disable all interrupts, something is very wrong
-			 */
-			ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
-					 0ULL);
-		}
-	} else if (*unexpectp > 1)
-		ipath_dbg("Interrupt when not ready, should not happen, "
-			  "ignoring\n");
-}
-
-static noinline void ipath_bad_regread(struct ipath_devdata *dd)
-{
-	static int allbits;
-
-	/* separate routine, for better optimization of ipath_intr() */
-
-	/*
-	 * We print the message and disable interrupts, in hope of
-	 * having a better chance of debugging the problem.
-	 */
-	ipath_dev_err(dd,
-		      "Read of interrupt status failed (all bits set)\n");
-	if (allbits++) {
-		/* disable all interrupts, something is very wrong */
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-		if (allbits == 2) {
-			ipath_dev_err(dd, "Still bad interrupt status, "
-				      "unregistering interrupt\n");
-			dd->ipath_f_free_irq(dd);
-		} else if (allbits > 2) {
-			if ((allbits % 10000) == 0)
-				printk(".");
-		} else
-			ipath_dev_err(dd, "Disabling interrupts, "
-				      "multiple errors\n");
-	}
-}
-
-static void handle_layer_pioavail(struct ipath_devdata *dd)
-{
-	unsigned long flags;
-	int ret;
-
-	ret = ipath_ib_piobufavail(dd->verbs_dev);
-	if (ret > 0)
-		goto set;
-
-	return;
-set:
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			 dd->ipath_sendctrl);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-}
-
-/*
- * Handle receive interrupts for user ports; this means a user
- * process was waiting for a packet to arrive, and didn't want
- * to poll
- */
-static void handle_urcv(struct ipath_devdata *dd, u64 istat)
-{
-	u64 portr;
-	int i;
-	int rcvdint = 0;
-
-	/*
-	 * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
-	 * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
-	 * would both like timely updates of the bits so that
-	 * we don't pass them by unnecessarily.  the rmb()
-	 * here ensures that we see them promptly -- the
-	 * corresponding wmb()'s are in ipath_poll_urgent()
-	 * and ipath_poll_next()...
-	 */
-	rmb();
-	portr = ((istat >> dd->ipath_i_rcvavail_shift) &
-		 dd->ipath_i_rcvavail_mask) |
-		((istat >> dd->ipath_i_rcvurg_shift) &
-		 dd->ipath_i_rcvurg_mask);
-	for (i = 1; i < dd->ipath_cfgports; i++) {
-		struct ipath_portdata *pd = dd->ipath_pd[i];
-
-		if (portr & (1 << i) && pd && pd->port_cnt) {
-			if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
-					       &pd->port_flag)) {
-				clear_bit(i + dd->ipath_r_intravail_shift,
-					  &dd->ipath_rcvctrl);
-				wake_up_interruptible(&pd->port_wait);
-				rcvdint = 1;
-			} else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
-						      &pd->port_flag)) {
-				pd->port_urgent++;
-				wake_up_interruptible(&pd->port_wait);
-			}
-		}
-	}
-	if (rcvdint) {
-		/* only want to take one interrupt, so turn off the rcv
-		 * interrupt for all the ports that we set the rcv_waiting
-		 * (but never for kernel port)
-		 */
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-				 dd->ipath_rcvctrl);
-	}
-}
-
-irqreturn_t ipath_intr(int irq, void *data)
-{
-	struct ipath_devdata *dd = data;
-	u64 istat, chk0rcv = 0;
-	ipath_err_t estat = 0;
-	irqreturn_t ret;
-	static unsigned unexpected = 0;
-	u64 kportrbits;
-
-	ipath_stats.sps_ints++;
-
-	if (dd->ipath_int_counter != (u32) -1)
-		dd->ipath_int_counter++;
-
-	if (!(dd->ipath_flags & IPATH_PRESENT)) {
-		/*
-		 * This return value is not great, but we do not want the
-		 * interrupt core code to remove our interrupt handler
-		 * because we don't appear to be handling an interrupt
-		 * during a chip reset.
-		 */
-		return IRQ_HANDLED;
-	}
-
-	/*
-	 * this needs to be flags&initted, not statusp, so we keep
-	 * taking interrupts even after link goes down, etc.
-	 * Also, we *must* clear the interrupt at some point, or we won't
-	 * take it again, which can be real bad for errors, etc...
-	 */
-
-	if (!(dd->ipath_flags & IPATH_INITTED)) {
-		ipath_bad_intr(dd, &unexpected);
-		ret = IRQ_NONE;
-		goto bail;
-	}
-
-	istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
-
-	if (unlikely(!istat)) {
-		ipath_stats.sps_nullintr++;
-		ret = IRQ_NONE; /* not our interrupt, or already handled */
-		goto bail;
-	}
-	if (unlikely(istat == -1)) {
-		ipath_bad_regread(dd);
-		/* don't know if it was our interrupt or not */
-		ret = IRQ_NONE;
-		goto bail;
-	}
-
-	if (unexpected)
-		unexpected = 0;
-
-	if (unlikely(istat & ~dd->ipath_i_bitsextant))
-		ipath_dev_err(dd,
-			      "interrupt with unknown interrupts %Lx set\n",
-			      (unsigned long long)
-			      istat & ~dd->ipath_i_bitsextant);
-	else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
-		ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
-			(unsigned long long) istat);
-
-	if (istat & INFINIPATH_I_ERROR) {
-		ipath_stats.sps_errints++;
-		estat = ipath_read_kreg64(dd,
-					  dd->ipath_kregs->kr_errorstatus);
-		if (!estat)
-			dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
-				 "but no error bits set!\n",
-				 (unsigned long long) istat);
-		else if (estat == -1LL)
-			/*
-			 * should we try clearing all, or hope next read
-			 * works?
-			 */
-			ipath_dev_err(dd, "Read of error status failed "
-				      "(all bits set); ignoring\n");
-		else
-			chk0rcv |= handle_errors(dd, estat);
-	}
-
-	if (istat & INFINIPATH_I_GPIO) {
-		/*
-		 * GPIO interrupts fall in two broad classes:
-		 * GPIO_2 indicates (on some HT4xx boards) that a packet
-		 *        has arrived for Port 0. Checking for this
-		 *        is controlled by flag IPATH_GPIO_INTR.
-		 * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
-		 *        errors that we need to count. Checking for this
-		 *        is controlled by flag IPATH_GPIO_ERRINTRS.
-		 */
-		u32 gpiostatus;
-		u32 to_clear = 0;
-
-		gpiostatus = ipath_read_kreg32(
-			dd, dd->ipath_kregs->kr_gpio_status);
-		/* First the error-counter case. */
-		if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
-		    (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
-			/* want to clear the bits we see asserted. */
-			to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
-
-			/*
-			 * Count appropriately, clear bits out of our copy,
-			 * as they have been "handled".
-			 */
-			if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
-				ipath_dbg("FlowCtl on UnsupVL\n");
-				dd->ipath_rxfc_unsupvl_errs++;
-			}
-			if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
-				ipath_dbg("Overrun Threshold exceeded\n");
-				dd->ipath_overrun_thresh_errs++;
-			}
-			if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
-				ipath_dbg("Local Link Integrity error\n");
-				dd->ipath_lli_errs++;
-			}
-			gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
-		}
-		/* Now the Port0 Receive case */
-		if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
-		    (dd->ipath_flags & IPATH_GPIO_INTR)) {
-			/*
-			 * GPIO status bit 2 is set, and we expected it.
-			 * clear it and indicate in p0bits.
-			 * This probably only happens if a Port0 pkt
-			 * arrives at _just_ the wrong time, and we
-			 * handle that by seting chk0rcv;
-			 */
-			to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
-			gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
-			chk0rcv = 1;
-		}
-		if (gpiostatus) {
-			/*
-			 * Some unexpected bits remain. If they could have
-			 * caused the interrupt, complain and clear.
-			 * To avoid repetition of this condition, also clear
-			 * the mask. It is almost certainly due to error.
-			 */
-			const u32 mask = (u32) dd->ipath_gpio_mask;
-
-			if (mask & gpiostatus) {
-				ipath_dbg("Unexpected GPIO IRQ bits %x\n",
-				  gpiostatus & mask);
-				to_clear |= (gpiostatus & mask);
-				dd->ipath_gpio_mask &= ~(gpiostatus & mask);
-				ipath_write_kreg(dd,
-					dd->ipath_kregs->kr_gpio_mask,
-					dd->ipath_gpio_mask);
-			}
-		}
-		if (to_clear) {
-			ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
-					(u64) to_clear);
-		}
-	}
-
-	/*
-	 * Clear the interrupt bits we found set, unless they are receive
-	 * related, in which case we already cleared them above, and don't
-	 * want to clear them again, because we might lose an interrupt.
-	 * Clear it early, so we "know" know the chip will have seen this by
-	 * the time we process the queue, and will re-interrupt if necessary.
-	 * The processor itself won't take the interrupt again until we return.
-	 */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
-
-	/*
-	 * Handle kernel receive queues before checking for pio buffers
-	 * available since receives can overflow; piobuf waiters can afford
-	 * a few extra cycles, since they were waiting anyway, and user's
-	 * waiting for receive are at the bottom.
-	 */
-	kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
-		(1ULL << dd->ipath_i_rcvurg_shift);
-	if (chk0rcv || (istat & kportrbits)) {
-		istat &= ~kportrbits;
-		ipath_kreceive(dd->ipath_pd[0]);
-	}
-
-	if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
-		     (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
-		handle_urcv(dd, istat);
-
-	if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
-		handle_sdma_intr(dd, istat);
-
-	if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-		dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-				 dd->ipath_sendctrl);
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-		/* always process; sdma verbs uses PIO for acks and VL15  */
-		handle_layer_pioavail(dd);
-	}
-
-	ret = IRQ_HANDLED;
-
-bail:
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
deleted file mode 100644
index f0f9471..0000000
--- a/drivers/infiniband/hw/ipath/ipath_kernel.h
+++ /dev/null
@@ -1,1373 +0,0 @@
-#ifndef _IPATH_KERNEL_H
-#define _IPATH_KERNEL_H
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This header file is the base header file for infinipath kernel code
- * ipath_user.h serves a similar purpose for user code.
- */
-
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/scatterlist.h>
-#include <asm/io.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_common.h"
-#include "ipath_debug.h"
-#include "ipath_registers.h"
-
-/* only s/w major version of InfiniPath we can handle */
-#define IPATH_CHIP_VERS_MAJ 2U
-
-/* don't care about this except printing */
-#define IPATH_CHIP_VERS_MIN 0U
-
-/* temporary, maybe always */
-extern struct infinipath_stats ipath_stats;
-
-#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
-/*
- * First-cut critierion for "device is active" is
- * two thousand dwords combined Tx, Rx traffic per
- * 5-second interval. SMA packets are 64 dwords,
- * and occur "a few per second", presumably each way.
- */
-#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
-/*
- * Struct used to indicate which errors are logged in each of the
- * error-counters that are logged to EEPROM. A counter is incremented
- * _once_ (saturating at 255) for each event with any bits set in
- * the error or hwerror register masks below.
- */
-#define IPATH_EEP_LOG_CNT (4)
-struct ipath_eep_log_mask {
-	u64 errs_to_log;
-	u64 hwerrs_to_log;
-};
-
-struct ipath_portdata {
-	void **port_rcvegrbuf;
-	dma_addr_t *port_rcvegrbuf_phys;
-	/* rcvhdrq base, needs mmap before useful */
-	void *port_rcvhdrq;
-	/* kernel virtual address where hdrqtail is updated */
-	void *port_rcvhdrtail_kvaddr;
-	/*
-	 * temp buffer for expected send setup, allocated at open, instead
-	 * of each setup call
-	 */
-	void *port_tid_pg_list;
-	/* when waiting for rcv or pioavail */
-	wait_queue_head_t port_wait;
-	/*
-	 * rcvegr bufs base, physical, must fit
-	 * in 44 bits so 32 bit programs mmap64 44 bit works)
-	 */
-	dma_addr_t port_rcvegr_phys;
-	/* mmap of hdrq, must fit in 44 bits */
-	dma_addr_t port_rcvhdrq_phys;
-	dma_addr_t port_rcvhdrqtailaddr_phys;
-	/*
-	 * number of opens (including slave subports) on this instance
-	 * (ignoring forks, dup, etc. for now)
-	 */
-	int port_cnt;
-	/*
-	 * how much space to leave at start of eager TID entries for
-	 * protocol use, on each TID
-	 */
-	/* instead of calculating it */
-	unsigned port_port;
-	/* non-zero if port is being shared. */
-	u16 port_subport_cnt;
-	/* non-zero if port is being shared. */
-	u16 port_subport_id;
-	/* number of pio bufs for this port (all procs, if shared) */
-	u32 port_piocnt;
-	/* first pio buffer for this port */
-	u32 port_pio_base;
-	/* chip offset of PIO buffers for this port */
-	u32 port_piobufs;
-	/* how many alloc_pages() chunks in port_rcvegrbuf_pages */
-	u32 port_rcvegrbuf_chunks;
-	/* how many egrbufs per chunk */
-	u32 port_rcvegrbufs_perchunk;
-	/* order for port_rcvegrbuf_pages */
-	size_t port_rcvegrbuf_size;
-	/* rcvhdrq size (for freeing) */
-	size_t port_rcvhdrq_size;
-	/* next expected TID to check when looking for free */
-	u32 port_tidcursor;
-	/* next expected TID to check */
-	unsigned long port_flag;
-	/* what happened */
-	unsigned long int_flag;
-	/* WAIT_RCV that timed out, no interrupt */
-	u32 port_rcvwait_to;
-	/* WAIT_PIO that timed out, no interrupt */
-	u32 port_piowait_to;
-	/* WAIT_RCV already happened, no wait */
-	u32 port_rcvnowait;
-	/* WAIT_PIO already happened, no wait */
-	u32 port_pionowait;
-	/* total number of rcvhdrqfull errors */
-	u32 port_hdrqfull;
-	/*
-	 * Used to suppress multiple instances of same
-	 * port staying stuck at same point.
-	 */
-	u32 port_lastrcvhdrqtail;
-	/* saved total number of rcvhdrqfull errors for poll edge trigger */
-	u32 port_hdrqfull_poll;
-	/* total number of polled urgent packets */
-	u32 port_urgent;
-	/* saved total number of polled urgent packets for poll edge trigger */
-	u32 port_urgent_poll;
-	/* pid of process using this port */
-	struct pid *port_pid;
-	struct pid *port_subpid[INFINIPATH_MAX_SUBPORT];
-	/* same size as task_struct .comm[] */
-	char port_comm[16];
-	/* pkeys set by this use of this port */
-	u16 port_pkeys[4];
-	/* so file ops can get at unit */
-	struct ipath_devdata *port_dd;
-	/* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
-	void *subport_uregbase;
-	/* An array of pages for the eager receive buffers * N */
-	void *subport_rcvegrbuf;
-	/* An array of pages for the eager header queue entries * N */
-	void *subport_rcvhdr_base;
-	/* The version of the library which opened this port */
-	u32 userversion;
-	/* Bitmask of active slaves */
-	u32 active_slaves;
-	/* Type of packets or conditions we want to poll for */
-	u16 poll_type;
-	/* port rcvhdrq head offset */
-	u32 port_head;
-	/* receive packet sequence counter */
-	u32 port_seq_cnt;
-};
-
-struct sk_buff;
-struct ipath_sge_state;
-struct ipath_verbs_txreq;
-
-/*
- * control information for layered drivers
- */
-struct _ipath_layer {
-	void *l_arg;
-};
-
-struct ipath_skbinfo {
-	struct sk_buff *skb;
-	dma_addr_t phys;
-};
-
-struct ipath_sdma_txreq {
-	int                 flags;
-	int                 sg_count;
-	union {
-		struct scatterlist *sg;
-		void *map_addr;
-	};
-	void              (*callback)(void *, int);
-	void               *callback_cookie;
-	int                 callback_status;
-	u16                 start_idx;  /* sdma private */
-	u16                 next_descq_idx;  /* sdma private */
-	struct list_head    list;       /* sdma private */
-};
-
-struct ipath_sdma_desc {
-	__le64 qw[2];
-};
-
-#define IPATH_SDMA_TXREQ_F_USELARGEBUF  0x1
-#define IPATH_SDMA_TXREQ_F_HEADTOHOST   0x2
-#define IPATH_SDMA_TXREQ_F_INTREQ       0x4
-#define IPATH_SDMA_TXREQ_F_FREEBUF      0x8
-#define IPATH_SDMA_TXREQ_F_FREEDESC     0x10
-#define IPATH_SDMA_TXREQ_F_VL15         0x20
-
-#define IPATH_SDMA_TXREQ_S_OK        0
-#define IPATH_SDMA_TXREQ_S_SENDERROR 1
-#define IPATH_SDMA_TXREQ_S_ABORTED   2
-#define IPATH_SDMA_TXREQ_S_SHUTDOWN  3
-
-#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG	(1ull << 63)
-#define IPATH_SDMA_STATUS_ABORT_IN_PROG			(1ull << 62)
-#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE		(1ull << 61)
-#define IPATH_SDMA_STATUS_SCB_EMPTY			(1ull << 30)
-
-/* max dwords in small buffer packet */
-#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
-
-/*
- * Possible IB config parameters for ipath_f_get/set_ib_cfg()
- */
-#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
-#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
-#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
-#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
-#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
-#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
-#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
-#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
-#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
-#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
-#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */
-
-
-struct ipath_devdata {
-	struct list_head ipath_list;
-
-	struct ipath_kregs const *ipath_kregs;
-	struct ipath_cregs const *ipath_cregs;
-
-	/* mem-mapped pointer to base of chip regs */
-	u64 __iomem *ipath_kregbase;
-	/* end of mem-mapped chip space; range checking */
-	u64 __iomem *ipath_kregend;
-	/* physical address of chip for io_remap, etc. */
-	unsigned long ipath_physaddr;
-	/* base of memory alloced for ipath_kregbase, for free */
-	u64 *ipath_kregalloc;
-	/* ipath_cfgports pointers */
-	struct ipath_portdata **ipath_pd;
-	/* sk_buffs used by port 0 eager receive queue */
-	struct ipath_skbinfo *ipath_port0_skbinfo;
-	/* kvirt address of 1st 2k pio buffer */
-	void __iomem *ipath_pio2kbase;
-	/* kvirt address of 1st 4k pio buffer */
-	void __iomem *ipath_pio4kbase;
-	/*
-	 * points to area where PIOavail registers will be DMA'ed.
-	 * Has to be on a page of it's own, because the page will be
-	 * mapped into user program space.  This copy is *ONLY* ever
-	 * written by DMA, not by the driver!  Need a copy per device
-	 * when we get to multiple devices
-	 */
-	volatile __le64 *ipath_pioavailregs_dma;
-	/* physical address where updates occur */
-	dma_addr_t ipath_pioavailregs_phys;
-	struct _ipath_layer ipath_layer;
-	/* setup intr */
-	int (*ipath_f_intrsetup)(struct ipath_devdata *);
-	/* fallback to alternate interrupt type if possible */
-	int (*ipath_f_intr_fallback)(struct ipath_devdata *);
-	/* setup on-chip bus config */
-	int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
-	/* hard reset chip */
-	int (*ipath_f_reset)(struct ipath_devdata *);
-	int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
-				     size_t);
-	void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
-	void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
-					size_t);
-	void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
-	int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
-	int (*ipath_f_early_init)(struct ipath_devdata *);
-	void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
-	void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
-				u32, unsigned long);
-	void (*ipath_f_tidtemplate)(struct ipath_devdata *);
-	void (*ipath_f_cleanup)(struct ipath_devdata *);
-	void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
-	/* fill out chip-specific fields */
-	int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
-	/* free irq */
-	void (*ipath_f_free_irq)(struct ipath_devdata *);
-	struct ipath_message_header *(*ipath_f_get_msgheader)
-					(struct ipath_devdata *, __le32 *);
-	void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
-	int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
-	int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
-	void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16);
-	void (*ipath_f_read_counters)(struct ipath_devdata *,
-					struct infinipath_counters *);
-	void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
-	/* per chip actions needed for IB Link up/down changes */
-	int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
-
-	unsigned ipath_lastegr_idx;
-	struct ipath_ibdev *verbs_dev;
-	struct timer_list verbs_timer;
-	/* total dwords sent (summed from counter) */
-	u64 ipath_sword;
-	/* total dwords rcvd (summed from counter) */
-	u64 ipath_rword;
-	/* total packets sent (summed from counter) */
-	u64 ipath_spkts;
-	/* total packets rcvd (summed from counter) */
-	u64 ipath_rpkts;
-	/* ipath_statusp initially points to this. */
-	u64 _ipath_status;
-	/* GUID for this interface, in network order */
-	__be64 ipath_guid;
-	/*
-	 * aggregrate of error bits reported since last cleared, for
-	 * limiting of error reporting
-	 */
-	ipath_err_t ipath_lasterror;
-	/*
-	 * aggregrate of error bits reported since last cleared, for
-	 * limiting of hwerror reporting
-	 */
-	ipath_err_t ipath_lasthwerror;
-	/* errors masked because they occur too fast */
-	ipath_err_t ipath_maskederrs;
-	u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
-	/* these 5 fields are used to establish deltas for IB Symbol
-	 * errors and linkrecovery errors. They can be reported on
-	 * some chips during link negotiation prior to INIT, and with
-	 * DDR when faking DDR negotiations with non-IBTA switches.
-	 * The chip counters are adjusted at driver unload if there is
-	 * a non-zero delta.
-	 */
-	u64 ibdeltainprog;
-	u64 ibsymdelta;
-	u64 ibsymsnap;
-	u64 iblnkerrdelta;
-	u64 iblnkerrsnap;
-
-	/* time in jiffies at which to re-enable maskederrs */
-	unsigned long ipath_unmasktime;
-	/* count of egrfull errors, combined for all ports */
-	u64 ipath_last_tidfull;
-	/* for ipath_qcheck() */
-	u64 ipath_lastport0rcv_cnt;
-	/* template for writing TIDs  */
-	u64 ipath_tidtemplate;
-	/* value to write to free TIDs */
-	u64 ipath_tidinvalid;
-	/* IBA6120 rcv interrupt setup */
-	u64 ipath_rhdrhead_intr_off;
-
-	/* size of memory at ipath_kregbase */
-	u32 ipath_kregsize;
-	/* number of registers used for pioavail */
-	u32 ipath_pioavregs;
-	/* IPATH_POLL, etc. */
-	u32 ipath_flags;
-	/* ipath_flags driver is waiting for */
-	u32 ipath_state_wanted;
-	/* last buffer for user use, first buf for kernel use is this
-	 * index. */
-	u32 ipath_lastport_piobuf;
-	/* is a stats timer active */
-	u32 ipath_stats_timer_active;
-	/* number of interrupts for this device -- saturates... */
-	u32 ipath_int_counter;
-	/* dwords sent read from counter */
-	u32 ipath_lastsword;
-	/* dwords received read from counter */
-	u32 ipath_lastrword;
-	/* sent packets read from counter */
-	u32 ipath_lastspkts;
-	/* received packets read from counter */
-	u32 ipath_lastrpkts;
-	/* pio bufs allocated per port */
-	u32 ipath_pbufsport;
-	/* if remainder on bufs/port, ports < extrabuf get 1 extra */
-	u32 ipath_ports_extrabuf;
-	u32 ipath_pioupd_thresh; /* update threshold, some chips */
-	/*
-	 * number of ports configured as max; zero is set to number chip
-	 * supports, less gives more pio bufs/port, etc.
-	 */
-	u32 ipath_cfgports;
-	/* count of port 0 hdrqfull errors */
-	u32 ipath_p0_hdrqfull;
-	/* port 0 number of receive eager buffers */
-	u32 ipath_p0_rcvegrcnt;
-
-	/*
-	 * index of last piobuffer we used.  Speeds up searching, by
-	 * starting at this point.  Doesn't matter if multiple cpu's use and
-	 * update, last updater is only write that matters.  Whenever it
-	 * wraps, we update shadow copies.  Need a copy per device when we
-	 * get to multiple devices
-	 */
-	u32 ipath_lastpioindex;
-	u32 ipath_lastpioindexl;
-	/* max length of freezemsg */
-	u32 ipath_freezelen;
-	/*
-	 * consecutive times we wanted a PIO buffer but were unable to
-	 * get one
-	 */
-	u32 ipath_consec_nopiobuf;
-	/*
-	 * hint that we should update ipath_pioavailshadow before
-	 * looking for a PIO buffer
-	 */
-	u32 ipath_upd_pio_shadow;
-	/* so we can rewrite it after a chip reset */
-	u32 ipath_pcibar0;
-	/* so we can rewrite it after a chip reset */
-	u32 ipath_pcibar1;
-	u32 ipath_x1_fix_tries;
-	u32 ipath_autoneg_tries;
-	u32 serdes_first_init_done;
-
-	struct ipath_relock {
-		atomic_t ipath_relock_timer_active;
-		struct timer_list ipath_relock_timer;
-		unsigned int ipath_relock_interval; /* in jiffies */
-	} ipath_relock_singleton;
-
-	/* interrupt number */
-	int ipath_irq;
-	/* HT/PCI Vendor ID (here for NodeInfo) */
-	u16 ipath_vendorid;
-	/* HT/PCI Device ID (here for NodeInfo) */
-	u16 ipath_deviceid;
-	/* offset in HT config space of slave/primary interface block */
-	u8 ipath_ht_slave_off;
-	/* for write combining settings */
-	int wc_cookie;
-	/* ref count for each pkey */
-	atomic_t ipath_pkeyrefs[4];
-	/* shadow copy of struct page *'s for exp tid pages */
-	struct page **ipath_pageshadow;
-	/* shadow copy of dma handles for exp tid pages */
-	dma_addr_t *ipath_physshadow;
-	u64 __iomem *ipath_egrtidbase;
-	/* lock to workaround chip bug 9437 and others */
-	spinlock_t ipath_kernel_tid_lock;
-	spinlock_t ipath_user_tid_lock;
-	spinlock_t ipath_sendctrl_lock;
-	/* around ipath_pd and (user ports) port_cnt use (intr vs free) */
-	spinlock_t ipath_uctxt_lock;
-
-	/*
-	 * IPATH_STATUS_*,
-	 * this address is mapped readonly into user processes so they can
-	 * get status cheaply, whenever they want.
-	 */
-	u64 *ipath_statusp;
-	/* freeze msg if hw error put chip in freeze */
-	char *ipath_freezemsg;
-	/* pci access data structure */
-	struct pci_dev *pcidev;
-	struct cdev *user_cdev;
-	struct cdev *diag_cdev;
-	struct device *user_dev;
-	struct device *diag_dev;
-	/* timer used to prevent stats overflow, error throttling, etc. */
-	struct timer_list ipath_stats_timer;
-	/* timer to verify interrupts work, and fallback if possible */
-	struct timer_list ipath_intrchk_timer;
-	void *ipath_dummy_hdrq;	/* used after port close */
-	dma_addr_t ipath_dummy_hdrq_phys;
-
-	/* SendDMA related entries */
-	spinlock_t            ipath_sdma_lock;
-	unsigned long         ipath_sdma_status;
-	unsigned long         ipath_sdma_abort_jiffies;
-	unsigned long         ipath_sdma_abort_intr_timeout;
-	unsigned long         ipath_sdma_buf_jiffies;
-	struct ipath_sdma_desc *ipath_sdma_descq;
-	u64		      ipath_sdma_descq_added;
-	u64		      ipath_sdma_descq_removed;
-	int		      ipath_sdma_desc_nreserved;
-	u16                   ipath_sdma_descq_cnt;
-	u16                   ipath_sdma_descq_tail;
-	u16                   ipath_sdma_descq_head;
-	u16                   ipath_sdma_next_intr;
-	u16                   ipath_sdma_reset_wait;
-	u8                    ipath_sdma_generation;
-	struct tasklet_struct ipath_sdma_abort_task;
-	struct tasklet_struct ipath_sdma_notify_task;
-	struct list_head      ipath_sdma_activelist;
-	struct list_head      ipath_sdma_notifylist;
-	atomic_t              ipath_sdma_vl15_count;
-	struct timer_list     ipath_sdma_vl15_timer;
-
-	dma_addr_t       ipath_sdma_descq_phys;
-	volatile __le64 *ipath_sdma_head_dma;
-	dma_addr_t       ipath_sdma_head_phys;
-
-	unsigned long ipath_ureg_align; /* user register alignment */
-
-	struct delayed_work ipath_autoneg_work;
-	wait_queue_head_t ipath_autoneg_wait;
-
-	/* HoL blocking / user app forward-progress state */
-	unsigned          ipath_hol_state;
-	unsigned          ipath_hol_next;
-	struct timer_list ipath_hol_timer;
-
-	/*
-	 * Shadow copies of registers; size indicates read access size.
-	 * Most of them are readonly, but some are write-only register,
-	 * where we manipulate the bits in the shadow copy, and then write
-	 * the shadow copy to infinipath.
-	 *
-	 * We deliberately make most of these 32 bits, since they have
-	 * restricted range.  For any that we read, we won't to generate 32
-	 * bit accesses, since Opteron will generate 2 separate 32 bit HT
-	 * transactions for a 64 bit read, and we want to avoid unnecessary
-	 * HT transactions.
-	 */
-
-	/* This is the 64 bit group */
-
-	/*
-	 * shadow of pioavail, check to be sure it's large enough at
-	 * init time.
-	 */
-	unsigned long ipath_pioavailshadow[8];
-	/* bitmap of send buffers available for the kernel to use with PIO. */
-	unsigned long ipath_pioavailkernel[8];
-	/* shadow of kr_gpio_out, for rmw ops */
-	u64 ipath_gpio_out;
-	/* shadow the gpio mask register */
-	u64 ipath_gpio_mask;
-	/* shadow the gpio output enable, etc... */
-	u64 ipath_extctrl;
-	/* kr_revision shadow */
-	u64 ipath_revision;
-	/*
-	 * shadow of ibcctrl, for interrupt handling of link changes,
-	 * etc.
-	 */
-	u64 ipath_ibcctrl;
-	/*
-	 * last ibcstatus, to suppress "duplicate" status change messages,
-	 * mostly from 2 to 3
-	 */
-	u64 ipath_lastibcstat;
-	/* hwerrmask shadow */
-	ipath_err_t ipath_hwerrmask;
-	ipath_err_t ipath_errormask; /* errormask shadow */
-	/* interrupt config reg shadow */
-	u64 ipath_intconfig;
-	/* kr_sendpiobufbase value */
-	u64 ipath_piobufbase;
-	/* kr_ibcddrctrl shadow */
-	u64 ipath_ibcddrctrl;
-
-	/* these are the "32 bit" regs */
-
-	/*
-	 * number of GUIDs in the flash for this interface; may need some
-	 * rethinking for setting on other ifaces
-	 */
-	u32 ipath_nguid;
-	/*
-	 * the following two are 32-bit bitmasks, but {test,clear,set}_bit
-	 * all expect bit fields to be "unsigned long"
-	 */
-	/* shadow kr_rcvctrl */
-	unsigned long ipath_rcvctrl;
-	/* shadow kr_sendctrl */
-	unsigned long ipath_sendctrl;
-	/* to not count armlaunch after cancel */
-	unsigned long ipath_lastcancel;
-	/* count cases where special trigger was needed (double write) */
-	unsigned long ipath_spectriggerhit;
-
-	/* value we put in kr_rcvhdrcnt */
-	u32 ipath_rcvhdrcnt;
-	/* value we put in kr_rcvhdrsize */
-	u32 ipath_rcvhdrsize;
-	/* value we put in kr_rcvhdrentsize */
-	u32 ipath_rcvhdrentsize;
-	/* offset of last entry in rcvhdrq */
-	u32 ipath_hdrqlast;
-	/* kr_portcnt value */
-	u32 ipath_portcnt;
-	/* kr_pagealign value */
-	u32 ipath_palign;
-	/* number of "2KB" PIO buffers */
-	u32 ipath_piobcnt2k;
-	/* size in bytes of "2KB" PIO buffers */
-	u32 ipath_piosize2k;
-	/* number of "4KB" PIO buffers */
-	u32 ipath_piobcnt4k;
-	/* size in bytes of "4KB" PIO buffers */
-	u32 ipath_piosize4k;
-	u32 ipath_pioreserved; /* reserved special-inkernel; */
-	/* kr_rcvegrbase value */
-	u32 ipath_rcvegrbase;
-	/* kr_rcvegrcnt value */
-	u32 ipath_rcvegrcnt;
-	/* kr_rcvtidbase value */
-	u32 ipath_rcvtidbase;
-	/* kr_rcvtidcnt value */
-	u32 ipath_rcvtidcnt;
-	/* kr_sendregbase */
-	u32 ipath_sregbase;
-	/* kr_userregbase */
-	u32 ipath_uregbase;
-	/* kr_counterregbase */
-	u32 ipath_cregbase;
-	/* shadow the control register contents */
-	u32 ipath_control;
-	/* PCI revision register (HTC rev on FPGA) */
-	u32 ipath_pcirev;
-
-	/* chip address space used by 4k pio buffers */
-	u32 ipath_4kalign;
-	/* The MTU programmed for this unit */
-	u32 ipath_ibmtu;
-	/*
-	 * The max size IB packet, included IB headers that we can send.
-	 * Starts same as ipath_piosize, but is affected when ibmtu is
-	 * changed, or by size of eager buffers
-	 */
-	u32 ipath_ibmaxlen;
-	/*
-	 * ibmaxlen at init time, limited by chip and by receive buffer
-	 * size.  Not changed after init.
-	 */
-	u32 ipath_init_ibmaxlen;
-	/* size of each rcvegrbuffer */
-	u32 ipath_rcvegrbufsize;
-	/* localbus width (1, 2,4,8,16,32) from config space  */
-	u32 ipath_lbus_width;
-	/* localbus speed (HT: 200,400,800,1000; PCIe 2500) */
-	u32 ipath_lbus_speed;
-	/*
-	 * number of sequential ibcstatus change for polling active/quiet
-	 * (i.e., link not coming up).
-	 */
-	u32 ipath_ibpollcnt;
-	/* low and high portions of MSI capability/vector */
-	u32 ipath_msi_lo;
-	/* saved after PCIe init for restore after reset */
-	u32 ipath_msi_hi;
-	/* MSI data (vector) saved for restore */
-	u16 ipath_msi_data;
-	/* MLID programmed for this instance */
-	u16 ipath_mlid;
-	/* LID programmed for this instance */
-	u16 ipath_lid;
-	/* list of pkeys programmed; 0 if not set */
-	u16 ipath_pkeys[4];
-	/*
-	 * ASCII serial number, from flash, large enough for original
-	 * all digit strings, and longer QLogic serial number format
-	 */
-	u8 ipath_serial[16];
-	/* human readable board version */
-	u8 ipath_boardversion[96];
-	u8 ipath_lbus_info[32]; /* human readable localbus info */
-	/* chip major rev, from ipath_revision */
-	u8 ipath_majrev;
-	/* chip minor rev, from ipath_revision */
-	u8 ipath_minrev;
-	/* board rev, from ipath_revision */
-	u8 ipath_boardrev;
-	/* saved for restore after reset */
-	u8 ipath_pci_cacheline;
-	/* LID mask control */
-	u8 ipath_lmc;
-	/* link width supported */
-	u8 ipath_link_width_supported;
-	/* link speed supported */
-	u8 ipath_link_speed_supported;
-	u8 ipath_link_width_enabled;
-	u8 ipath_link_speed_enabled;
-	u8 ipath_link_width_active;
-	u8 ipath_link_speed_active;
-	/* Rx Polarity inversion (compensate for ~tx on partner) */
-	u8 ipath_rx_pol_inv;
-
-	u8 ipath_r_portenable_shift;
-	u8 ipath_r_intravail_shift;
-	u8 ipath_r_tailupd_shift;
-	u8 ipath_r_portcfg_shift;
-
-	/* unit # of this chip, if present */
-	int ipath_unit;
-
-	/* local link integrity counter */
-	u32 ipath_lli_counter;
-	/* local link integrity errors */
-	u32 ipath_lli_errors;
-	/*
-	 * Above counts only cases where _successive_ LocalLinkIntegrity
-	 * errors were seen in the receive headers of kern-packets.
-	 * Below are the three (monotonically increasing) counters
-	 * maintained via GPIO interrupts on iba6120-rev2.
-	 */
-	u32 ipath_rxfc_unsupvl_errs;
-	u32 ipath_overrun_thresh_errs;
-	u32 ipath_lli_errs;
-
-	/*
-	 * Not all devices managed by a driver instance are the same
-	 * type, so these fields must be per-device.
-	 */
-	u64 ipath_i_bitsextant;
-	ipath_err_t ipath_e_bitsextant;
-	ipath_err_t ipath_hwe_bitsextant;
-
-	/*
-	 * Below should be computable from number of ports,
-	 * since they are never modified.
-	 */
-	u64 ipath_i_rcvavail_mask;
-	u64 ipath_i_rcvurg_mask;
-	u16 ipath_i_rcvurg_shift;
-	u16 ipath_i_rcvavail_shift;
-
-	/*
-	 * Register bits for selecting i2c direction and values, used for
-	 * I2C serial flash.
-	 */
-	u8 ipath_gpio_sda_num;
-	u8 ipath_gpio_scl_num;
-	u8 ipath_i2c_chain_type;
-	u64 ipath_gpio_sda;
-	u64 ipath_gpio_scl;
-
-	/* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
-	spinlock_t ipath_gpio_lock;
-
-	/*
-	 * IB link and linktraining states and masks that vary per chip in
-	 * some way.  Set at init, to avoid each IB status change interrupt
-	 */
-	u8 ibcs_ls_shift;
-	u8 ibcs_lts_mask;
-	u32 ibcs_mask;
-	u32 ib_init;
-	u32 ib_arm;
-	u32 ib_active;
-
-	u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
-
-	/*
-	 * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol
-	 * reg. Changes for IBA7220
-	 */
-	u8 ibcc_lic_mask; /* LinkInitCmd */
-	u8 ibcc_lc_shift; /* LinkCmd */
-	u8 ibcc_mpl_shift; /* Maxpktlen */
-
-	u8 delay_mult;
-
-	/* used to override LED behavior */
-	u8 ipath_led_override;  /* Substituted for normal value, if non-zero */
-	u16 ipath_led_override_timeoff; /* delta to next timer event */
-	u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */
-	u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */
-	atomic_t ipath_led_override_timer_active;
-	/* Used to flash LEDs in override mode */
-	struct timer_list ipath_led_override_timer;
-
-	/* Support (including locks) for EEPROM logging of errors and time */
-	/* control access to actual counters, timer */
-	spinlock_t ipath_eep_st_lock;
-	/* control high-level access to EEPROM */
-	struct mutex ipath_eep_lock;
-	/* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
-	uint64_t ipath_traffic_wds;
-	/* active time is kept in seconds, but logged in hours */
-	atomic_t ipath_active_time;
-	/* Below are nominal shadow of EEPROM, new since last EEPROM update */
-	uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
-	uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
-	uint16_t ipath_eep_hrs;
-	/*
-	 * masks for which bits of errs, hwerrs that cause
-	 * each of the counters to increment.
-	 */
-	struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
-
-	/* interrupt mitigation reload register info */
-	u16 ipath_jint_idle_ticks;	/* idle clock ticks */
-	u16 ipath_jint_max_packets;	/* max packets across all ports */
-
-	/*
-	 * lock for access to SerDes, and flags to sequence preset
-	 * versus steady-state. 7220-only at the moment.
-	 */
-	spinlock_t ipath_sdepb_lock;
-	u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
-};
-
-/* ipath_hol_state values (stopping/starting user proc, send flushing) */
-#define IPATH_HOL_UP       0
-#define IPATH_HOL_DOWN     1
-/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */
-#define IPATH_HOL_DOWNSTOP 0
-#define IPATH_HOL_DOWNCONT 1
-
-/* bit positions for sdma_status */
-#define IPATH_SDMA_ABORTING  0
-#define IPATH_SDMA_DISARMED  1
-#define IPATH_SDMA_DISABLED  2
-#define IPATH_SDMA_LAYERBUF  3
-#define IPATH_SDMA_RUNNING  30
-#define IPATH_SDMA_SHUTDOWN 31
-
-/* bit combinations that correspond to abort states */
-#define IPATH_SDMA_ABORT_NONE 0
-#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING)
-#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \
-	(1UL << IPATH_SDMA_DISARMED))
-#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \
-	(1UL << IPATH_SDMA_DISABLED))
-#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \
-	(1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
-#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \
-	(1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
-
-#define IPATH_SDMA_BUF_NONE 0
-#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF)
-
-/* Private data for file operations */
-struct ipath_filedata {
-	struct ipath_portdata *pd;
-	unsigned subport;
-	unsigned tidcursor;
-	struct ipath_user_sdma_queue *pq;
-};
-extern struct list_head ipath_dev_list;
-extern spinlock_t ipath_devs_lock;
-extern struct ipath_devdata *ipath_lookup(int unit);
-
-int ipath_init_chip(struct ipath_devdata *, int);
-int ipath_enable_wc(struct ipath_devdata *dd);
-void ipath_disable_wc(struct ipath_devdata *dd);
-int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
-void ipath_shutdown_device(struct ipath_devdata *);
-void ipath_clear_freeze(struct ipath_devdata *);
-
-struct file_operations;
-int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
-		    struct cdev **cdevp, struct device **devp);
-void ipath_cdev_cleanup(struct cdev **cdevp,
-			struct device **devp);
-
-int ipath_diag_add(struct ipath_devdata *);
-void ipath_diag_remove(struct ipath_devdata *);
-
-extern wait_queue_head_t ipath_state_wait;
-
-int ipath_user_add(struct ipath_devdata *dd);
-void ipath_user_remove(struct ipath_devdata *dd);
-
-struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
-
-extern int ipath_diag_inuse;
-
-irqreturn_t ipath_intr(int irq, void *devid);
-int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
-		     ipath_err_t err);
-#if __IPATH_INFO || __IPATH_DBG
-extern const char *ipath_ibcstatus_str[];
-#endif
-
-/* clean up any per-chip chip-specific stuff */
-void ipath_chip_cleanup(struct ipath_devdata *);
-/* clean up any chip type-specific stuff */
-void ipath_chip_done(void);
-
-void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
-			  unsigned cnt);
-void ipath_cancel_sends(struct ipath_devdata *, int);
-
-int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
-void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
-
-int ipath_parse_ushort(const char *str, unsigned short *valp);
-
-void ipath_kreceive(struct ipath_portdata *);
-int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
-int ipath_reset_device(int);
-void ipath_get_faststats(unsigned long);
-int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
-int ipath_set_linkstate(struct ipath_devdata *, u8);
-int ipath_set_mtu(struct ipath_devdata *, u16);
-int ipath_set_lid(struct ipath_devdata *, u32, u8);
-int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
-void ipath_enable_armlaunch(struct ipath_devdata *);
-void ipath_disable_armlaunch(struct ipath_devdata *);
-void ipath_hol_down(struct ipath_devdata *);
-void ipath_hol_up(struct ipath_devdata *);
-void ipath_hol_event(unsigned long);
-void ipath_toggle_rclkrls(struct ipath_devdata *);
-void ipath_sd7220_clr_ibpar(struct ipath_devdata *);
-void ipath_set_relock_poll(struct ipath_devdata *, int);
-void ipath_shutdown_relock_poll(struct ipath_devdata *);
-
-/* for use in system calls, where we want to know device type, etc. */
-#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
-#define subport_fp(fp) \
-	((struct ipath_filedata *)(fp)->private_data)->subport
-#define tidcursor_fp(fp) \
-	((struct ipath_filedata *)(fp)->private_data)->tidcursor
-#define user_sdma_queue_fp(fp) \
-	((struct ipath_filedata *)(fp)->private_data)->pq
-
-/*
- * values for ipath_flags
- */
-		/* chip can report link latency (IB 1.2) */
-#define IPATH_HAS_LINK_LATENCY 0x1
-		/* The chip is up and initted */
-#define IPATH_INITTED       0x2
-		/* set if any user code has set kr_rcvhdrsize */
-#define IPATH_RCVHDRSZ_SET  0x4
-		/* The chip is present and valid for accesses */
-#define IPATH_PRESENT       0x8
-		/* HT link0 is only 8 bits wide, ignore upper byte crc
-		 * errors, etc. */
-#define IPATH_8BIT_IN_HT0   0x10
-		/* HT link1 is only 8 bits wide, ignore upper byte crc
-		 * errors, etc. */
-#define IPATH_8BIT_IN_HT1   0x20
-		/* The link is down */
-#define IPATH_LINKDOWN      0x40
-		/* The link level is up (0x11) */
-#define IPATH_LINKINIT      0x80
-		/* The link is in the armed (0x21) state */
-#define IPATH_LINKARMED     0x100
-		/* The link is in the active (0x31) state */
-#define IPATH_LINKACTIVE    0x200
-		/* link current state is unknown */
-#define IPATH_LINKUNK       0x400
-		/* Write combining flush needed for PIO */
-#define IPATH_PIO_FLUSH_WC  0x1000
-		/* DMA Receive tail pointer */
-#define IPATH_NODMA_RTAIL   0x2000
-		/* no IB cable, or no device on IB cable */
-#define IPATH_NOCABLE       0x4000
-		/* Supports port zero per packet receive interrupts via
-		 * GPIO */
-#define IPATH_GPIO_INTR     0x8000
-		/* uses the coded 4byte TID, not 8 byte */
-#define IPATH_4BYTE_TID     0x10000
-		/* packet/word counters are 32 bit, else those 4 counters
-		 * are 64bit */
-#define IPATH_32BITCOUNTERS 0x20000
-		/* Interrupt register is 64 bits */
-#define IPATH_INTREG_64     0x40000
-		/* can miss port0 rx interrupts */
-#define IPATH_DISABLED      0x80000 /* administratively disabled */
-		/* Use GPIO interrupts for new counters */
-#define IPATH_GPIO_ERRINTRS 0x100000
-#define IPATH_SWAP_PIOBUFS  0x200000
-		/* Supports Send DMA */
-#define IPATH_HAS_SEND_DMA  0x400000
-		/* Supports Send Count (not just word count) in PBC */
-#define IPATH_HAS_PBC_CNT   0x800000
-		/* Suppress heartbeat, even if turning off loopback */
-#define IPATH_NO_HRTBT      0x1000000
-#define IPATH_HAS_THRESH_UPDATE 0x4000000
-#define IPATH_HAS_MULT_IB_SPEED 0x8000000
-#define IPATH_IB_AUTONEG_INPROG 0x10000000
-#define IPATH_IB_AUTONEG_FAILED 0x20000000
-		/* Linkdown-disable intentionally, Do not attempt to bring up */
-#define IPATH_IB_LINK_DISABLED 0x40000000
-#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */
-
-/* Bits in GPIO for the added interrupts */
-#define IPATH_GPIO_PORT0_BIT 2
-#define IPATH_GPIO_RXUVL_BIT 3
-#define IPATH_GPIO_OVRUN_BIT 4
-#define IPATH_GPIO_LLI_BIT 5
-#define IPATH_GPIO_ERRINTR_MASK 0x38
-
-/* portdata flag bit offsets */
-		/* waiting for a packet to arrive */
-#define IPATH_PORT_WAITING_RCV   2
-		/* master has not finished initializing */
-#define IPATH_PORT_MASTER_UNINIT 4
-		/* waiting for an urgent packet to arrive */
-#define IPATH_PORT_WAITING_URG 5
-
-/* free up any allocated data at closes */
-void ipath_free_data(struct ipath_portdata *dd);
-u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *);
-void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
-				unsigned len, int avail);
-void ipath_init_iba6110_funcs(struct ipath_devdata *);
-void ipath_get_eeprom_info(struct ipath_devdata *);
-int ipath_update_eeprom_log(struct ipath_devdata *dd);
-void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
-u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
-void ipath_disarm_senderrbufs(struct ipath_devdata *);
-void ipath_force_pio_avail_update(struct ipath_devdata *);
-void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
-
-/*
- * Set LED override, only the two LSBs have "public" meaning, but
- * any non-zero value substitutes them for the Link and LinkTrain
- * LED states.
- */
-#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
-#define IPATH_LED_LOG 2  /* Logical (link) YELLOW LED */
-void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
-
-/* send dma routines */
-int setup_sdma(struct ipath_devdata *);
-void teardown_sdma(struct ipath_devdata *);
-void ipath_restart_sdma(struct ipath_devdata *);
-void ipath_sdma_intr(struct ipath_devdata *);
-int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *,
-			  u32, struct ipath_verbs_txreq *);
-/* ipath_sdma_lock should be locked before calling this. */
-int ipath_sdma_make_progress(struct ipath_devdata *dd);
-
-/* must be called under ipath_sdma_lock */
-static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd)
-{
-	return dd->ipath_sdma_descq_cnt -
-		(dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) -
-		1 - dd->ipath_sdma_desc_nreserved;
-}
-
-static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt)
-{
-	dd->ipath_sdma_desc_nreserved += cnt;
-}
-
-static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt)
-{
-	dd->ipath_sdma_desc_nreserved -= cnt;
-}
-
-/*
- * number of words used for protocol header if not set by ipath_userinit();
- */
-#define IPATH_DFLT_RCVHDRSIZE 9
-
-int ipath_get_user_pages(unsigned long, size_t, struct page **);
-void ipath_release_user_pages(struct page **, size_t);
-void ipath_release_user_pages_on_close(struct page **, size_t);
-int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
-int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
-int ipath_tempsense_read(struct ipath_devdata *, u8 regnum);
-int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data);
-
-/* these are used for the registers that vary with port */
-void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
-			   unsigned, u64);
-
-/*
- * We could have a single register get/put routine, that takes a group type,
- * but this is somewhat clearer and cleaner.  It also gives us some error
- * checking.  64 bit register reads should always work, but are inefficient
- * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
- * so we use kreg32 wherever possible.  User register and counter register
- * reads are always 32 bit reads, so only one form of those routines.
- */
-
-/*
- * At the moment, none of the s-registers are writable, so no
- * ipath_write_sreg().
- */
-
-/**
- * ipath_read_ureg32 - read 32-bit virtualized per-port register
- * @dd: device
- * @regno: register number
- * @port: port number
- *
- * Return the contents of a register that is virtualized to be per port.
- * Returns -1 on errors (not distinguishable from valid contents at
- * runtime; we may add a separate error variable at some point).
- */
-static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
-				    ipath_ureg regno, int port)
-{
-	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-		return 0;
-
-	return readl(regno + (u64 __iomem *)
-		     (dd->ipath_uregbase +
-		      (char __iomem *)dd->ipath_kregbase +
-		      dd->ipath_ureg_align * port));
-}
-
-/**
- * ipath_write_ureg - write 32-bit virtualized per-port register
- * @dd: device
- * @regno: register number
- * @value: value
- * @port: port
- *
- * Write the contents of a register that is virtualized to be per port.
- */
-static inline void ipath_write_ureg(const struct ipath_devdata *dd,
-				    ipath_ureg regno, u64 value, int port)
-{
-	u64 __iomem *ubase = (u64 __iomem *)
-		(dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
-		 dd->ipath_ureg_align * port);
-	if (dd->ipath_kregbase)
-		writeq(value, &ubase[regno]);
-}
-
-static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
-				    ipath_kreg regno)
-{
-	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-		return -1;
-	return readl((u32 __iomem *) & dd->ipath_kregbase[regno]);
-}
-
-static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
-				    ipath_kreg regno)
-{
-	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-		return -1;
-
-	return readq(&dd->ipath_kregbase[regno]);
-}
-
-static inline void ipath_write_kreg(const struct ipath_devdata *dd,
-				    ipath_kreg regno, u64 value)
-{
-	if (dd->ipath_kregbase)
-		writeq(value, &dd->ipath_kregbase[regno]);
-}
-
-static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
-				  ipath_sreg regno)
-{
-	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-		return 0;
-
-	return readq(regno + (u64 __iomem *)
-		     (dd->ipath_cregbase +
-		      (char __iomem *)dd->ipath_kregbase));
-}
-
-static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
-					 ipath_sreg regno)
-{
-	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-		return 0;
-	return readl(regno + (u64 __iomem *)
-		     (dd->ipath_cregbase +
-		      (char __iomem *)dd->ipath_kregbase));
-}
-
-static inline void ipath_write_creg(const struct ipath_devdata *dd,
-				    ipath_creg regno, u64 value)
-{
-	if (dd->ipath_kregbase)
-		writeq(value, regno + (u64 __iomem *)
-		       (dd->ipath_cregbase +
-			(char __iomem *)dd->ipath_kregbase));
-}
-
-static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
-{
-	*((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
-}
-
-static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
-{
-	return (u32) le64_to_cpu(*((volatile __le64 *)
-				pd->port_rcvhdrtail_kvaddr));
-}
-
-static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd)
-{
-	const struct ipath_devdata *dd = pd->port_dd;
-	u32 hdrqtail;
-
-	if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-		__le32 *rhf_addr;
-		u32 seq;
-
-		rhf_addr = (__le32 *) pd->port_rcvhdrq +
-			pd->port_head + dd->ipath_rhf_offset;
-		seq = ipath_hdrget_seq(rhf_addr);
-		hdrqtail = pd->port_head;
-		if (seq == pd->port_seq_cnt)
-			hdrqtail++;
-	} else
-		hdrqtail = ipath_get_rcvhdrtail(pd);
-
-	return hdrqtail;
-}
-
-static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
-{
-	return (dd->ipath_flags & IPATH_INTREG_64) ?
-		ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
-}
-
-/*
- * from contents of IBCStatus (or a saved copy), return linkstate
- * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
- * everywhere, anyway (and should be, for almost all purposes).
- */
-static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
-{
-	u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
-		INFINIPATH_IBCS_LINKSTATE_MASK;
-	if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
-		state = INFINIPATH_IBCS_L_STATE_ACTIVE;
-	return state;
-}
-
-/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
-static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
-{
-	return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
-		dd->ibcs_lts_mask;
-}
-
-/*
- * from contents of IBCStatus (or a saved copy), return logical link state
- * combination of link state and linktraining state (down, active, init,
- * arm, etc.
- */
-static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs)
-{
-	u32 ibs;
-	ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
-		dd->ibcs_lts_mask;
-	ibs |= (u32)(ibcs &
-		(INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift));
-	return ibs;
-}
-
-/*
- * sysfs interface.
- */
-
-struct device_driver;
-
-extern const char ib_ipath_version[];
-
-extern const struct attribute_group *ipath_driver_attr_groups[];
-
-int ipath_device_create_group(struct device *, struct ipath_devdata *);
-void ipath_device_remove_group(struct device *, struct ipath_devdata *);
-int ipath_expose_reset(struct device *);
-
-int ipath_init_ipathfs(void);
-void ipath_exit_ipathfs(void);
-int ipathfs_add_device(struct ipath_devdata *);
-int ipathfs_remove_device(struct ipath_devdata *);
-
-/*
- * dma_addr wrappers - all 0's invalid for hw
- */
-dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
-			  size_t, int);
-dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
-const char *ipath_get_unit_name(int unit);
-
-/*
- * Flush write combining store buffers (if present) and perform a write
- * barrier.
- */
-#if defined(CONFIG_X86_64)
-#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
-#else
-#define ipath_flush_wc() wmb()
-#endif
-
-extern unsigned ipath_debug; /* debugging bit mask */
-extern unsigned ipath_linkrecovery;
-extern unsigned ipath_mtu4096;
-extern struct mutex ipath_mutex;
-
-#define IPATH_DRV_NAME		"ib_ipath"
-#define IPATH_MAJOR		233
-#define IPATH_USER_MINOR_BASE	0
-#define IPATH_DIAGPKT_MINOR	127
-#define IPATH_DIAG_MINOR_BASE	129
-#define IPATH_NMINORS		255
-
-#define ipath_dev_err(dd,fmt,...) \
-	do { \
-		const struct ipath_devdata *__dd = (dd); \
-		if (__dd->pcidev) \
-			dev_err(&__dd->pcidev->dev, "%s: " fmt, \
-				ipath_get_unit_name(__dd->ipath_unit), \
-				##__VA_ARGS__); \
-		else \
-			printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
-			       ipath_get_unit_name(__dd->ipath_unit), \
-			       ##__VA_ARGS__); \
-	} while (0)
-
-#if _IPATH_DEBUGGING
-
-# define __IPATH_DBG_WHICH(which,fmt,...) \
-	do { \
-		if (unlikely(ipath_debug & (which))) \
-			printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
-			       __func__,##__VA_ARGS__); \
-	} while(0)
-
-# define ipath_dbg(fmt,...) \
-	__IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
-# define ipath_cdbg(which,fmt,...) \
-	__IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
-
-#else /* ! _IPATH_DEBUGGING */
-
-# define ipath_dbg(fmt,...)
-# define ipath_cdbg(which,fmt,...)
-
-#endif /* _IPATH_DEBUGGING */
-
-/*
- * this is used for formatting hw error messages...
- */
-struct ipath_hwerror_msgs {
-	u64 mask;
-	const char *msg;
-};
-
-#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
-
-/* in ipath_intr.c... */
-void ipath_format_hwerrors(u64 hwerrs,
-			   const struct ipath_hwerror_msgs *hwerrmsgs,
-			   size_t nhwerrmsgs,
-			   char *msg, size_t lmsg);
-
-#endif				/* _IPATH_KERNEL_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c
deleted file mode 100644
index c0e933f..0000000
--- a/drivers/infiniband/hw/ipath/ipath_keys.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <asm/io.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/**
- * ipath_alloc_lkey - allocate an lkey
- * @rkt: lkey table in which to allocate the lkey
- * @mr: memory region that this lkey protects
- *
- * Returns 1 if successful, otherwise returns 0.
- */
-
-int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
-{
-	unsigned long flags;
-	u32 r;
-	u32 n;
-	int ret;
-
-	spin_lock_irqsave(&rkt->lock, flags);
-
-	/* Find the next available LKEY */
-	r = n = rkt->next;
-	for (;;) {
-		if (rkt->table[r] == NULL)
-			break;
-		r = (r + 1) & (rkt->max - 1);
-		if (r == n) {
-			spin_unlock_irqrestore(&rkt->lock, flags);
-			ipath_dbg("LKEY table full\n");
-			ret = 0;
-			goto bail;
-		}
-	}
-	rkt->next = (r + 1) & (rkt->max - 1);
-	/*
-	 * Make sure lkey is never zero which is reserved to indicate an
-	 * unrestricted LKEY.
-	 */
-	rkt->gen++;
-	mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
-		((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
-		 << 8);
-	if (mr->lkey == 0) {
-		mr->lkey |= 1 << 8;
-		rkt->gen++;
-	}
-	rkt->table[r] = mr;
-	spin_unlock_irqrestore(&rkt->lock, flags);
-
-	ret = 1;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_free_lkey - free an lkey
- * @rkt: table from which to free the lkey
- * @lkey: lkey id to free
- */
-void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
-{
-	unsigned long flags;
-	u32 r;
-
-	if (lkey == 0)
-		return;
-	r = lkey >> (32 - ib_ipath_lkey_table_size);
-	spin_lock_irqsave(&rkt->lock, flags);
-	rkt->table[r] = NULL;
-	spin_unlock_irqrestore(&rkt->lock, flags);
-}
-
-/**
- * ipath_lkey_ok - check IB SGE for validity and initialize
- * @rkt: table containing lkey to check SGE against
- * @isge: outgoing internal SGE
- * @sge: SGE to check
- * @acc: access flags
- *
- * Return 1 if valid and successful, otherwise returns 0.
- *
- * Check the IB SGE for validity and initialize our internal version
- * of it.
- */
-int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
-		  struct ib_sge *sge, int acc)
-{
-	struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
-	struct ipath_mregion *mr;
-	unsigned n, m;
-	size_t off;
-	int ret;
-
-	/*
-	 * We use LKEY == zero for kernel virtual addresses
-	 * (see ipath_get_dma_mr and ipath_dma.c).
-	 */
-	if (sge->lkey == 0) {
-		/* always a kernel port, no locking needed */
-		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
-
-		if (pd->user) {
-			ret = 0;
-			goto bail;
-		}
-		isge->mr = NULL;
-		isge->vaddr = (void *) sge->addr;
-		isge->length = sge->length;
-		isge->sge_length = sge->length;
-		ret = 1;
-		goto bail;
-	}
-	mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
-	if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
-		     qp->ibqp.pd != mr->pd)) {
-		ret = 0;
-		goto bail;
-	}
-
-	off = sge->addr - mr->user_base;
-	if (unlikely(sge->addr < mr->user_base ||
-		     off + sge->length > mr->length ||
-		     (mr->access_flags & acc) != acc)) {
-		ret = 0;
-		goto bail;
-	}
-
-	off += mr->offset;
-	m = 0;
-	n = 0;
-	while (off >= mr->map[m]->segs[n].length) {
-		off -= mr->map[m]->segs[n].length;
-		n++;
-		if (n >= IPATH_SEGSZ) {
-			m++;
-			n = 0;
-		}
-	}
-	isge->mr = mr;
-	isge->vaddr = mr->map[m]->segs[n].vaddr + off;
-	isge->length = mr->map[m]->segs[n].length - off;
-	isge->sge_length = sge->length;
-	isge->m = m;
-	isge->n = n;
-
-	ret = 1;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_rkey_ok - check the IB virtual address, length, and RKEY
- * @dev: infiniband device
- * @ss: SGE state
- * @len: length of data
- * @vaddr: virtual address to place data
- * @rkey: rkey to check
- * @acc: access flags
- *
- * Return 1 if successful, otherwise 0.
- */
-int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
-		  u32 len, u64 vaddr, u32 rkey, int acc)
-{
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	struct ipath_lkey_table *rkt = &dev->lk_table;
-	struct ipath_sge *sge = &ss->sge;
-	struct ipath_mregion *mr;
-	unsigned n, m;
-	size_t off;
-	int ret;
-
-	/*
-	 * We use RKEY == zero for kernel virtual addresses
-	 * (see ipath_get_dma_mr and ipath_dma.c).
-	 */
-	if (rkey == 0) {
-		/* always a kernel port, no locking needed */
-		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
-
-		if (pd->user) {
-			ret = 0;
-			goto bail;
-		}
-		sge->mr = NULL;
-		sge->vaddr = (void *) vaddr;
-		sge->length = len;
-		sge->sge_length = len;
-		ss->sg_list = NULL;
-		ss->num_sge = 1;
-		ret = 1;
-		goto bail;
-	}
-
-	mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
-	if (unlikely(mr == NULL || mr->lkey != rkey ||
-		     qp->ibqp.pd != mr->pd)) {
-		ret = 0;
-		goto bail;
-	}
-
-	off = vaddr - mr->iova;
-	if (unlikely(vaddr < mr->iova || off + len > mr->length ||
-		     (mr->access_flags & acc) == 0)) {
-		ret = 0;
-		goto bail;
-	}
-
-	off += mr->offset;
-	m = 0;
-	n = 0;
-	while (off >= mr->map[m]->segs[n].length) {
-		off -= mr->map[m]->segs[n].length;
-		n++;
-		if (n >= IPATH_SEGSZ) {
-			m++;
-			n = 0;
-		}
-	}
-	sge->mr = mr;
-	sge->vaddr = mr->map[m]->segs[n].vaddr + off;
-	sge->length = mr->map[m]->segs[n].length - off;
-	sge->sge_length = len;
-	sge->m = m;
-	sge->n = n;
-	ss->sg_list = NULL;
-	ss->num_sge = 1;
-
-	ret = 1;
-
-bail:
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
deleted file mode 100644
index ad3a926..0000000
--- a/drivers/infiniband/hw/ipath/ipath_mad.c
+++ /dev/null
@@ -1,1521 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/ib_smi.h>
-#include <rdma/ib_pma.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-#define IB_SMP_UNSUP_VERSION	cpu_to_be16(0x0004)
-#define IB_SMP_UNSUP_METHOD	cpu_to_be16(0x0008)
-#define IB_SMP_UNSUP_METH_ATTR	cpu_to_be16(0x000C)
-#define IB_SMP_INVALID_FIELD	cpu_to_be16(0x001C)
-
-static int reply(struct ib_smp *smp)
-{
-	/*
-	 * The verbs framework will handle the directed/LID route
-	 * packet changes.
-	 */
-	smp->method = IB_MGMT_METHOD_GET_RESP;
-	if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-		smp->status |= IB_SMP_DIRECTION;
-	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
-static int recv_subn_get_nodedescription(struct ib_smp *smp,
-					 struct ib_device *ibdev)
-{
-	if (smp->attr_mod)
-		smp->status |= IB_SMP_INVALID_FIELD;
-
-	memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
-
-	return reply(smp);
-}
-
-struct nodeinfo {
-	u8 base_version;
-	u8 class_version;
-	u8 node_type;
-	u8 num_ports;
-	__be64 sys_guid;
-	__be64 node_guid;
-	__be64 port_guid;
-	__be16 partition_cap;
-	__be16 device_id;
-	__be32 revision;
-	u8 local_port_num;
-	u8 vendor_id[3];
-} __attribute__ ((packed));
-
-static int recv_subn_get_nodeinfo(struct ib_smp *smp,
-				  struct ib_device *ibdev, u8 port)
-{
-	struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
-	struct ipath_devdata *dd = to_idev(ibdev)->dd;
-	u32 vendor, majrev, minrev;
-
-	/* GUID 0 is illegal */
-	if (smp->attr_mod || (dd->ipath_guid == 0))
-		smp->status |= IB_SMP_INVALID_FIELD;
-
-	nip->base_version = 1;
-	nip->class_version = 1;
-	nip->node_type = 1;	/* channel adapter */
-	/*
-	 * XXX The num_ports value will need a layer function to get
-	 * the value if we ever have more than one IB port on a chip.
-	 * We will also need to get the GUID for the port.
-	 */
-	nip->num_ports = ibdev->phys_port_cnt;
-	/* This is already in network order */
-	nip->sys_guid = to_idev(ibdev)->sys_image_guid;
-	nip->node_guid = dd->ipath_guid;
-	nip->port_guid = dd->ipath_guid;
-	nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd));
-	nip->device_id = cpu_to_be16(dd->ipath_deviceid);
-	majrev = dd->ipath_majrev;
-	minrev = dd->ipath_minrev;
-	nip->revision = cpu_to_be32((majrev << 16) | minrev);
-	nip->local_port_num = port;
-	vendor = dd->ipath_vendorid;
-	nip->vendor_id[0] = IPATH_SRC_OUI_1;
-	nip->vendor_id[1] = IPATH_SRC_OUI_2;
-	nip->vendor_id[2] = IPATH_SRC_OUI_3;
-
-	return reply(smp);
-}
-
-static int recv_subn_get_guidinfo(struct ib_smp *smp,
-				  struct ib_device *ibdev)
-{
-	u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
-	__be64 *p = (__be64 *) smp->data;
-
-	/* 32 blocks of 8 64-bit GUIDs per block */
-
-	memset(smp->data, 0, sizeof(smp->data));
-
-	/*
-	 * We only support one GUID for now.  If this changes, the
-	 * portinfo.guid_cap field needs to be updated too.
-	 */
-	if (startgx == 0) {
-		__be64 g = to_idev(ibdev)->dd->ipath_guid;
-		if (g == 0)
-			/* GUID 0 is illegal */
-			smp->status |= IB_SMP_INVALID_FIELD;
-		else
-			/* The first is a copy of the read-only HW GUID. */
-			*p = g;
-	} else
-		smp->status |= IB_SMP_INVALID_FIELD;
-
-	return reply(smp);
-}
-
-static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
-{
-	(void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
-}
-
-static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s)
-{
-	(void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s);
-}
-
-static int get_overrunthreshold(struct ipath_devdata *dd)
-{
-	return (dd->ipath_ibcctrl >>
-		INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
-		INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
-}
-
-/**
- * set_overrunthreshold - set the overrun threshold
- * @dd: the infinipath device
- * @n: the new threshold
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
-{
-	unsigned v;
-
-	v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
-		INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
-	if (v != n) {
-		dd->ipath_ibcctrl &=
-			~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
-			  INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
-		dd->ipath_ibcctrl |=
-			(u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-				 dd->ipath_ibcctrl);
-	}
-	return 0;
-}
-
-static int get_phyerrthreshold(struct ipath_devdata *dd)
-{
-	return (dd->ipath_ibcctrl >>
-		INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-		INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-}
-
-/**
- * set_phyerrthreshold - set the physical error threshold
- * @dd: the infinipath device
- * @n: the new threshold
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
-{
-	unsigned v;
-
-	v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-		INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-	if (v != n) {
-		dd->ipath_ibcctrl &=
-			~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
-			  INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
-		dd->ipath_ibcctrl |=
-			(u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-				 dd->ipath_ibcctrl);
-	}
-	return 0;
-}
-
-/**
- * get_linkdowndefaultstate - get the default linkdown state
- * @dd: the infinipath device
- *
- * Returns zero if the default is POLL, 1 if the default is SLEEP.
- */
-static int get_linkdowndefaultstate(struct ipath_devdata *dd)
-{
-	return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
-}
-
-static int recv_subn_get_portinfo(struct ib_smp *smp,
-				  struct ib_device *ibdev, u8 port)
-{
-	struct ipath_ibdev *dev;
-	struct ipath_devdata *dd;
-	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
-	u16 lid;
-	u8 ibcstat;
-	u8 mtu;
-	int ret;
-
-	if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
-		smp->status |= IB_SMP_INVALID_FIELD;
-		ret = reply(smp);
-		goto bail;
-	}
-
-	dev = to_idev(ibdev);
-	dd = dev->dd;
-
-	/* Clear all fields.  Only set the non-zero fields. */
-	memset(smp->data, 0, sizeof(smp->data));
-
-	/* Only return the mkey if the protection field allows it. */
-	if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
-	    dev->mkeyprot == 0)
-		pip->mkey = dev->mkey;
-	pip->gid_prefix = dev->gid_prefix;
-	lid = dd->ipath_lid;
-	pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
-	pip->sm_lid = cpu_to_be16(dev->sm_lid);
-	pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
-	/* pip->diag_code; */
-	pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
-	pip->local_port_num = port;
-	pip->link_width_enabled = dd->ipath_link_width_enabled;
-	pip->link_width_supported = dd->ipath_link_width_supported;
-	pip->link_width_active = dd->ipath_link_width_active;
-	pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4;
-	ibcstat = dd->ipath_lastibcstat;
-	/* map LinkState to IB portinfo values.  */
-	pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1;
-
-	pip->portphysstate_linkdown =
-		(ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) |
-		(get_linkdowndefaultstate(dd) ? 1 : 2);
-	pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc;
-	pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) |
-		dd->ipath_link_speed_enabled;
-	switch (dd->ipath_ibmtu) {
-	case 4096:
-		mtu = IB_MTU_4096;
-		break;
-	case 2048:
-		mtu = IB_MTU_2048;
-		break;
-	case 1024:
-		mtu = IB_MTU_1024;
-		break;
-	case 512:
-		mtu = IB_MTU_512;
-		break;
-	case 256:
-		mtu = IB_MTU_256;
-		break;
-	default:		/* oops, something is wrong */
-		mtu = IB_MTU_2048;
-		break;
-	}
-	pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
-	pip->vlcap_inittype = 0x10;	/* VLCap = VL0, InitType = 0 */
-	pip->vl_high_limit = dev->vl_high_limit;
-	/* pip->vl_arb_high_cap; // only one VL */
-	/* pip->vl_arb_low_cap; // only one VL */
-	/* InitTypeReply = 0 */
-	/* our mtu cap depends on whether 4K MTU enabled or not */
-	pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
-	/* HCAs ignore VLStallCount and HOQLife */
-	/* pip->vlstallcnt_hoqlife; */
-	pip->operationalvl_pei_peo_fpi_fpo = 0x10;	/* OVLs = 1 */
-	pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
-	/* P_KeyViolations are counted by hardware. */
-	pip->pkey_violations =
-		cpu_to_be16((ipath_get_cr_errpkey(dd) -
-			     dev->z_pkey_violations) & 0xFFFF);
-	pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
-	/* Only the hardware GUID is supported for now */
-	pip->guid_cap = 1;
-	pip->clientrereg_resv_subnetto = dev->subnet_timeout;
-	/* 32.768 usec. response time (guessing) */
-	pip->resv_resptimevalue = 3;
-	pip->localphyerrors_overrunerrors =
-		(get_phyerrthreshold(dd) << 4) |
-		get_overrunthreshold(dd);
-	/* pip->max_credit_hint; */
-	if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
-		u32 v;
-
-		v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY);
-		pip->link_roundtrip_latency[0] = v >> 16;
-		pip->link_roundtrip_latency[1] = v >> 8;
-		pip->link_roundtrip_latency[2] = v;
-	}
-
-	ret = reply(smp);
-
-bail:
-	return ret;
-}
-
-/**
- * get_pkeys - return the PKEY table for port 0
- * @dd: the infinipath device
- * @pkeys: the pkey table is placed here
- */
-static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys)
-{
-	/* always a kernel port, no locking needed */
-	struct ipath_portdata *pd = dd->ipath_pd[0];
-
-	memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
-
-	return 0;
-}
-
-static int recv_subn_get_pkeytable(struct ib_smp *smp,
-				   struct ib_device *ibdev)
-{
-	u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
-	u16 *p = (u16 *) smp->data;
-	__be16 *q = (__be16 *) smp->data;
-
-	/* 64 blocks of 32 16-bit P_Key entries */
-
-	memset(smp->data, 0, sizeof(smp->data));
-	if (startpx == 0) {
-		struct ipath_ibdev *dev = to_idev(ibdev);
-		unsigned i, n = ipath_get_npkeys(dev->dd);
-
-		get_pkeys(dev->dd, p);
-
-		for (i = 0; i < n; i++)
-			q[i] = cpu_to_be16(p[i]);
-	} else
-		smp->status |= IB_SMP_INVALID_FIELD;
-
-	return reply(smp);
-}
-
-static int recv_subn_set_guidinfo(struct ib_smp *smp,
-				  struct ib_device *ibdev)
-{
-	/* The only GUID we support is the first read-only entry. */
-	return recv_subn_get_guidinfo(smp, ibdev);
-}
-
-/**
- * set_linkdowndefaultstate - set the default linkdown state
- * @dd: the infinipath device
- * @sleep: the new state
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
-{
-	if (sleep)
-		dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
-	else
-		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-			 dd->ipath_ibcctrl);
-	return 0;
-}
-
-/**
- * recv_subn_set_portinfo - set port information
- * @smp: the incoming SM packet
- * @ibdev: the infiniband device
- * @port: the port on the device
- *
- * Set Portinfo (see ch. 14.2.5.6).
- */
-static int recv_subn_set_portinfo(struct ib_smp *smp,
-				  struct ib_device *ibdev, u8 port)
-{
-	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
-	struct ib_event event;
-	struct ipath_ibdev *dev;
-	struct ipath_devdata *dd;
-	char clientrereg = 0;
-	u16 lid, smlid;
-	u8 lwe;
-	u8 lse;
-	u8 state;
-	u16 lstate;
-	u32 mtu;
-	int ret, ore;
-
-	if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
-		goto err;
-
-	dev = to_idev(ibdev);
-	dd = dev->dd;
-	event.device = ibdev;
-	event.element.port_num = port;
-
-	dev->mkey = pip->mkey;
-	dev->gid_prefix = pip->gid_prefix;
-	dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
-
-	lid = be16_to_cpu(pip->lid);
-	if (dd->ipath_lid != lid ||
-	    dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
-		/* Must be a valid unicast LID address. */
-		if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
-			goto err;
-		ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
-		event.event = IB_EVENT_LID_CHANGE;
-		ib_dispatch_event(&event);
-	}
-
-	smlid = be16_to_cpu(pip->sm_lid);
-	if (smlid != dev->sm_lid) {
-		/* Must be a valid unicast LID address. */
-		if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE)
-			goto err;
-		dev->sm_lid = smlid;
-		event.event = IB_EVENT_SM_CHANGE;
-		ib_dispatch_event(&event);
-	}
-
-	/* Allow 1x or 4x to be set (see 14.2.6.6). */
-	lwe = pip->link_width_enabled;
-	if (lwe) {
-		if (lwe == 0xFF)
-			lwe = dd->ipath_link_width_supported;
-		else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported))
-			goto err;
-		set_link_width_enabled(dd, lwe);
-	}
-
-	/* Allow 2.5 or 5.0 Gbs. */
-	lse = pip->linkspeedactive_enabled & 0xF;
-	if (lse) {
-		if (lse == 15)
-			lse = dd->ipath_link_speed_supported;
-		else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported))
-			goto err;
-		set_link_speed_enabled(dd, lse);
-	}
-
-	/* Set link down default state. */
-	switch (pip->portphysstate_linkdown & 0xF) {
-	case 0: /* NOP */
-		break;
-	case 1: /* SLEEP */
-		if (set_linkdowndefaultstate(dd, 1))
-			goto err;
-		break;
-	case 2: /* POLL */
-		if (set_linkdowndefaultstate(dd, 0))
-			goto err;
-		break;
-	default:
-		goto err;
-	}
-
-	dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
-	dev->vl_high_limit = pip->vl_high_limit;
-
-	switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
-	case IB_MTU_256:
-		mtu = 256;
-		break;
-	case IB_MTU_512:
-		mtu = 512;
-		break;
-	case IB_MTU_1024:
-		mtu = 1024;
-		break;
-	case IB_MTU_2048:
-		mtu = 2048;
-		break;
-	case IB_MTU_4096:
-		if (!ipath_mtu4096)
-			goto err;
-		mtu = 4096;
-		break;
-	default:
-		/* XXX We have already partially updated our state! */
-		goto err;
-	}
-	ipath_set_mtu(dd, mtu);
-
-	dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
-
-	/* We only support VL0 */
-	if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
-		goto err;
-
-	if (pip->mkey_violations == 0)
-		dev->mkey_violations = 0;
-
-	/*
-	 * Hardware counter can't be reset so snapshot and subtract
-	 * later.
-	 */
-	if (pip->pkey_violations == 0)
-		dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
-
-	if (pip->qkey_violations == 0)
-		dev->qkey_violations = 0;
-
-	ore = pip->localphyerrors_overrunerrors;
-	if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
-		goto err;
-
-	if (set_overrunthreshold(dd, (ore & 0xF)))
-		goto err;
-
-	dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
-
-	if (pip->clientrereg_resv_subnetto & 0x80) {
-		clientrereg = 1;
-		event.event = IB_EVENT_CLIENT_REREGISTER;
-		ib_dispatch_event(&event);
-	}
-
-	/*
-	 * Do the port state change now that the other link parameters
-	 * have been set.
-	 * Changing the port physical state only makes sense if the link
-	 * is down or is being set to down.
-	 */
-	state = pip->linkspeed_portstate & 0xF;
-	lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
-	if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
-		goto err;
-
-	/*
-	 * Only state changes of DOWN, ARM, and ACTIVE are valid
-	 * and must be in the correct state to take effect (see 7.2.6).
-	 */
-	switch (state) {
-	case IB_PORT_NOP:
-		if (lstate == 0)
-			break;
-		/* FALLTHROUGH */
-	case IB_PORT_DOWN:
-		if (lstate == 0)
-			lstate = IPATH_IB_LINKDOWN_ONLY;
-		else if (lstate == 1)
-			lstate = IPATH_IB_LINKDOWN_SLEEP;
-		else if (lstate == 2)
-			lstate = IPATH_IB_LINKDOWN;
-		else if (lstate == 3)
-			lstate = IPATH_IB_LINKDOWN_DISABLE;
-		else
-			goto err;
-		ipath_set_linkstate(dd, lstate);
-		if (lstate == IPATH_IB_LINKDOWN_DISABLE) {
-			ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-			goto done;
-		}
-		ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED |
-				IPATH_LINKACTIVE, 1000);
-		break;
-	case IB_PORT_ARMED:
-		ipath_set_linkstate(dd, IPATH_IB_LINKARM);
-		break;
-	case IB_PORT_ACTIVE:
-		ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
-		break;
-	default:
-		/* XXX We have already partially updated our state! */
-		goto err;
-	}
-
-	ret = recv_subn_get_portinfo(smp, ibdev, port);
-
-	if (clientrereg)
-		pip->clientrereg_resv_subnetto |= 0x80;
-
-	goto done;
-
-err:
-	smp->status |= IB_SMP_INVALID_FIELD;
-	ret = recv_subn_get_portinfo(smp, ibdev, port);
-
-done:
-	return ret;
-}
-
-/**
- * rm_pkey - decrecment the reference count for the given PKEY
- * @dd: the infinipath device
- * @key: the PKEY index
- *
- * Return true if this was the last reference and the hardware table entry
- * needs to be changed.
- */
-static int rm_pkey(struct ipath_devdata *dd, u16 key)
-{
-	int i;
-	int ret;
-
-	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-		if (dd->ipath_pkeys[i] != key)
-			continue;
-		if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
-			dd->ipath_pkeys[i] = 0;
-			ret = 1;
-			goto bail;
-		}
-		break;
-	}
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * add_pkey - add the given PKEY to the hardware table
- * @dd: the infinipath device
- * @key: the PKEY
- *
- * Return an error code if unable to add the entry, zero if no change,
- * or 1 if the hardware PKEY register needs to be updated.
- */
-static int add_pkey(struct ipath_devdata *dd, u16 key)
-{
-	int i;
-	u16 lkey = key & 0x7FFF;
-	int any = 0;
-	int ret;
-
-	if (lkey == 0x7FFF) {
-		ret = 0;
-		goto bail;
-	}
-
-	/* Look for an empty slot or a matching PKEY. */
-	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-		if (!dd->ipath_pkeys[i]) {
-			any++;
-			continue;
-		}
-		/* If it matches exactly, try to increment the ref count */
-		if (dd->ipath_pkeys[i] == key) {
-			if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
-				ret = 0;
-				goto bail;
-			}
-			/* Lost the race. Look for an empty slot below. */
-			atomic_dec(&dd->ipath_pkeyrefs[i]);
-			any++;
-		}
-		/*
-		 * It makes no sense to have both the limited and unlimited
-		 * PKEY set at the same time since the unlimited one will
-		 * disable the limited one.
-		 */
-		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
-			ret = -EEXIST;
-			goto bail;
-		}
-	}
-	if (!any) {
-		ret = -EBUSY;
-		goto bail;
-	}
-	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-		if (!dd->ipath_pkeys[i] &&
-		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
-			/* for ipathstats, etc. */
-			ipath_stats.sps_pkeys[i] = lkey;
-			dd->ipath_pkeys[i] = key;
-			ret = 1;
-			goto bail;
-		}
-	}
-	ret = -EBUSY;
-
-bail:
-	return ret;
-}
-
-/**
- * set_pkeys - set the PKEY table for port 0
- * @dd: the infinipath device
- * @pkeys: the PKEY table
- */
-static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
-{
-	struct ipath_portdata *pd;
-	int i;
-	int changed = 0;
-
-	/* always a kernel port, no locking needed */
-	pd = dd->ipath_pd[0];
-
-	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-		u16 key = pkeys[i];
-		u16 okey = pd->port_pkeys[i];
-
-		if (key == okey)
-			continue;
-		/*
-		 * The value of this PKEY table entry is changing.
-		 * Remove the old entry in the hardware's array of PKEYs.
-		 */
-		if (okey & 0x7FFF)
-			changed |= rm_pkey(dd, okey);
-		if (key & 0x7FFF) {
-			int ret = add_pkey(dd, key);
-
-			if (ret < 0)
-				key = 0;
-			else
-				changed |= ret;
-		}
-		pd->port_pkeys[i] = key;
-	}
-	if (changed) {
-		u64 pkey;
-		struct ib_event event;
-
-		pkey = (u64) dd->ipath_pkeys[0] |
-			((u64) dd->ipath_pkeys[1] << 16) |
-			((u64) dd->ipath_pkeys[2] << 32) |
-			((u64) dd->ipath_pkeys[3] << 48);
-		ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
-			   (unsigned long long) pkey);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
-				 pkey);
-
-		event.event = IB_EVENT_PKEY_CHANGE;
-		event.device = &dd->verbs_dev->ibdev;
-		event.element.port_num = port;
-		ib_dispatch_event(&event);
-	}
-	return 0;
-}
-
-static int recv_subn_set_pkeytable(struct ib_smp *smp,
-				   struct ib_device *ibdev, u8 port)
-{
-	u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
-	__be16 *p = (__be16 *) smp->data;
-	u16 *q = (u16 *) smp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	unsigned i, n = ipath_get_npkeys(dev->dd);
-
-	for (i = 0; i < n; i++)
-		q[i] = be16_to_cpu(p[i]);
-
-	if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
-		smp->status |= IB_SMP_INVALID_FIELD;
-
-	return recv_subn_get_pkeytable(smp, ibdev);
-}
-
-static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp)
-{
-	struct ib_class_port_info *p =
-		(struct ib_class_port_info *)pmp->data;
-
-	memset(pmp->data, 0, sizeof(pmp->data));
-
-	if (pmp->mad_hdr.attr_mod != 0)
-		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-	/* Indicate AllPortSelect is valid (only one port anyway) */
-	p->capability_mask = cpu_to_be16(1 << 8);
-	p->base_version = 1;
-	p->class_version = 1;
-	/*
-	 * Expected response time is 4.096 usec. * 2^18 == 1.073741824
-	 * sec.
-	 */
-	p->resp_time_value = 18;
-
-	return reply((struct ib_smp *) pmp);
-}
-
-/*
- * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
- * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
- * We support 5 counters which only count the mandatory quantities.
- */
-#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
-#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
-				    COUNTER_MASK(1, 1) | \
-				    COUNTER_MASK(1, 2) | \
-				    COUNTER_MASK(1, 3) | \
-				    COUNTER_MASK(1, 4))
-
-static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
-					   struct ib_device *ibdev, u8 port)
-{
-	struct ib_pma_portsamplescontrol *p =
-		(struct ib_pma_portsamplescontrol *)pmp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-	unsigned long flags;
-	u8 port_select = p->port_select;
-
-	memset(pmp->data, 0, sizeof(pmp->data));
-
-	p->port_select = port_select;
-	if (pmp->mad_hdr.attr_mod != 0 ||
-	    (port_select != port && port_select != 0xFF))
-		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-	/*
-	 * Ticks are 10x the link transfer period which for 2.5Gbs is 4
-	 * nsec.  0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec.  Sample
-	 * intervals are counted in ticks.  Since we use Linux timers, that
-	 * count in jiffies, we can't sample for less than 1000 ticks if HZ
-	 * == 1000 (4000 ticks if HZ is 250).  link_speed_active returns 2 for
-	 * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that
-	 * have hardware support for delaying packets.
-	 */
-	if (crp->cr_psstat)
-		p->tick = dev->dd->ipath_link_speed_active - 1;
-	else
-		p->tick = 250;		/* 1 usec. */
-	p->counter_width = 4;	/* 32 bit counters */
-	p->counter_mask0_9 = COUNTER_MASK0_9;
-	spin_lock_irqsave(&dev->pending_lock, flags);
-	if (crp->cr_psstat)
-		p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-	else
-		p->sample_status = dev->pma_sample_status;
-	p->sample_start = cpu_to_be32(dev->pma_sample_start);
-	p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
-	p->tag = cpu_to_be16(dev->pma_tag);
-	p->counter_select[0] = dev->pma_counter_select[0];
-	p->counter_select[1] = dev->pma_counter_select[1];
-	p->counter_select[2] = dev->pma_counter_select[2];
-	p->counter_select[3] = dev->pma_counter_select[3];
-	p->counter_select[4] = dev->pma_counter_select[4];
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-	return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
-					   struct ib_device *ibdev, u8 port)
-{
-	struct ib_pma_portsamplescontrol *p =
-		(struct ib_pma_portsamplescontrol *)pmp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-	unsigned long flags;
-	u8 status;
-	int ret;
-
-	if (pmp->mad_hdr.attr_mod != 0 ||
-	    (p->port_select != port && p->port_select != 0xFF)) {
-		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-		ret = reply((struct ib_smp *) pmp);
-		goto bail;
-	}
-
-	spin_lock_irqsave(&dev->pending_lock, flags);
-	if (crp->cr_psstat)
-		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-	else
-		status = dev->pma_sample_status;
-	if (status == IB_PMA_SAMPLE_STATUS_DONE) {
-		dev->pma_sample_start = be32_to_cpu(p->sample_start);
-		dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
-		dev->pma_tag = be16_to_cpu(p->tag);
-		dev->pma_counter_select[0] = p->counter_select[0];
-		dev->pma_counter_select[1] = p->counter_select[1];
-		dev->pma_counter_select[2] = p->counter_select[2];
-		dev->pma_counter_select[3] = p->counter_select[3];
-		dev->pma_counter_select[4] = p->counter_select[4];
-		if (crp->cr_psstat) {
-			ipath_write_creg(dev->dd, crp->cr_psinterval,
-					 dev->pma_sample_interval);
-			ipath_write_creg(dev->dd, crp->cr_psstart,
-					 dev->pma_sample_start);
-		} else
-			dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
-	}
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-	ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
-
-bail:
-	return ret;
-}
-
-static u64 get_counter(struct ipath_ibdev *dev,
-		       struct ipath_cregs const *crp,
-		       __be16 sel)
-{
-	u64 ret;
-
-	switch (sel) {
-	case IB_PMA_PORT_XMIT_DATA:
-		ret = (crp->cr_psxmitdatacount) ?
-			ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
-			dev->ipath_sword;
-		break;
-	case IB_PMA_PORT_RCV_DATA:
-		ret = (crp->cr_psrcvdatacount) ?
-			ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
-			dev->ipath_rword;
-		break;
-	case IB_PMA_PORT_XMIT_PKTS:
-		ret = (crp->cr_psxmitpktscount) ?
-			ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
-			dev->ipath_spkts;
-		break;
-	case IB_PMA_PORT_RCV_PKTS:
-		ret = (crp->cr_psrcvpktscount) ?
-			ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
-			dev->ipath_rpkts;
-		break;
-	case IB_PMA_PORT_XMIT_WAIT:
-		ret = (crp->cr_psxmitwaitcount) ?
-			ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
-			dev->ipath_xmit_wait;
-		break;
-	default:
-		ret = 0;
-	}
-
-	return ret;
-}
-
-static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp,
-					  struct ib_device *ibdev)
-{
-	struct ib_pma_portsamplesresult *p =
-		(struct ib_pma_portsamplesresult *)pmp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-	u8 status;
-	int i;
-
-	memset(pmp->data, 0, sizeof(pmp->data));
-	p->tag = cpu_to_be16(dev->pma_tag);
-	if (crp->cr_psstat)
-		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-	else
-		status = dev->pma_sample_status;
-	p->sample_status = cpu_to_be16(status);
-	for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
-		p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
-		    cpu_to_be32(
-			get_counter(dev, crp, dev->pma_counter_select[i]));
-
-	return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
-					      struct ib_device *ibdev)
-{
-	struct ib_pma_portsamplesresult_ext *p =
-		(struct ib_pma_portsamplesresult_ext *)pmp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-	u8 status;
-	int i;
-
-	memset(pmp->data, 0, sizeof(pmp->data));
-	p->tag = cpu_to_be16(dev->pma_tag);
-	if (crp->cr_psstat)
-		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-	else
-		status = dev->pma_sample_status;
-	p->sample_status = cpu_to_be16(status);
-	/* 64 bits */
-	p->extended_width = cpu_to_be32(0x80000000);
-	for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
-		p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
-		    cpu_to_be64(
-			get_counter(dev, crp, dev->pma_counter_select[i]));
-
-	return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portcounters(struct ib_pma_mad *pmp,
-				     struct ib_device *ibdev, u8 port)
-{
-	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-		pmp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_verbs_counters cntrs;
-	u8 port_select = p->port_select;
-
-	ipath_get_counters(dev->dd, &cntrs);
-
-	/* Adjust counters for any resets done. */
-	cntrs.symbol_error_counter -= dev->z_symbol_error_counter;
-	cntrs.link_error_recovery_counter -=
-		dev->z_link_error_recovery_counter;
-	cntrs.link_downed_counter -= dev->z_link_downed_counter;
-	cntrs.port_rcv_errors += dev->rcv_errors;
-	cntrs.port_rcv_errors -= dev->z_port_rcv_errors;
-	cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors;
-	cntrs.port_xmit_discards -= dev->z_port_xmit_discards;
-	cntrs.port_xmit_data -= dev->z_port_xmit_data;
-	cntrs.port_rcv_data -= dev->z_port_rcv_data;
-	cntrs.port_xmit_packets -= dev->z_port_xmit_packets;
-	cntrs.port_rcv_packets -= dev->z_port_rcv_packets;
-	cntrs.local_link_integrity_errors -=
-		dev->z_local_link_integrity_errors;
-	cntrs.excessive_buffer_overrun_errors -=
-		dev->z_excessive_buffer_overrun_errors;
-	cntrs.vl15_dropped -= dev->z_vl15_dropped;
-	cntrs.vl15_dropped += dev->n_vl15_dropped;
-
-	memset(pmp->data, 0, sizeof(pmp->data));
-
-	p->port_select = port_select;
-	if (pmp->mad_hdr.attr_mod != 0 ||
-	    (port_select != port && port_select != 0xFF))
-		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-	if (cntrs.symbol_error_counter > 0xFFFFUL)
-		p->symbol_error_counter = cpu_to_be16(0xFFFF);
-	else
-		p->symbol_error_counter =
-			cpu_to_be16((u16)cntrs.symbol_error_counter);
-	if (cntrs.link_error_recovery_counter > 0xFFUL)
-		p->link_error_recovery_counter = 0xFF;
-	else
-		p->link_error_recovery_counter =
-			(u8)cntrs.link_error_recovery_counter;
-	if (cntrs.link_downed_counter > 0xFFUL)
-		p->link_downed_counter = 0xFF;
-	else
-		p->link_downed_counter = (u8)cntrs.link_downed_counter;
-	if (cntrs.port_rcv_errors > 0xFFFFUL)
-		p->port_rcv_errors = cpu_to_be16(0xFFFF);
-	else
-		p->port_rcv_errors =
-			cpu_to_be16((u16) cntrs.port_rcv_errors);
-	if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
-		p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
-	else
-		p->port_rcv_remphys_errors =
-			cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
-	if (cntrs.port_xmit_discards > 0xFFFFUL)
-		p->port_xmit_discards = cpu_to_be16(0xFFFF);
-	else
-		p->port_xmit_discards =
-			cpu_to_be16((u16)cntrs.port_xmit_discards);
-	if (cntrs.local_link_integrity_errors > 0xFUL)
-		cntrs.local_link_integrity_errors = 0xFUL;
-	if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
-		cntrs.excessive_buffer_overrun_errors = 0xFUL;
-	p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
-		cntrs.excessive_buffer_overrun_errors;
-	if (cntrs.vl15_dropped > 0xFFFFUL)
-		p->vl15_dropped = cpu_to_be16(0xFFFF);
-	else
-		p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
-	if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
-		p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
-	else
-		p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
-	if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
-		p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
-	else
-		p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
-	if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
-		p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
-	else
-		p->port_xmit_packets =
-			cpu_to_be32((u32)cntrs.port_xmit_packets);
-	if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
-		p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
-	else
-		p->port_rcv_packets =
-			cpu_to_be32((u32) cntrs.port_rcv_packets);
-
-	return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp,
-					 struct ib_device *ibdev, u8 port)
-{
-	struct ib_pma_portcounters_ext *p =
-		(struct ib_pma_portcounters_ext *)pmp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	u64 swords, rwords, spkts, rpkts, xwait;
-	u8 port_select = p->port_select;
-
-	ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
-				&rpkts, &xwait);
-
-	/* Adjust counters for any resets done. */
-	swords -= dev->z_port_xmit_data;
-	rwords -= dev->z_port_rcv_data;
-	spkts -= dev->z_port_xmit_packets;
-	rpkts -= dev->z_port_rcv_packets;
-
-	memset(pmp->data, 0, sizeof(pmp->data));
-
-	p->port_select = port_select;
-	if (pmp->mad_hdr.attr_mod != 0 ||
-	    (port_select != port && port_select != 0xFF))
-		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-	p->port_xmit_data = cpu_to_be64(swords);
-	p->port_rcv_data = cpu_to_be64(rwords);
-	p->port_xmit_packets = cpu_to_be64(spkts);
-	p->port_rcv_packets = cpu_to_be64(rpkts);
-	p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
-	p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
-	p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
-	p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
-
-	return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_set_portcounters(struct ib_pma_mad *pmp,
-				     struct ib_device *ibdev, u8 port)
-{
-	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-		pmp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_verbs_counters cntrs;
-
-	/*
-	 * Since the HW doesn't support clearing counters, we save the
-	 * current count and subtract it from future responses.
-	 */
-	ipath_get_counters(dev->dd, &cntrs);
-
-	if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
-		dev->z_symbol_error_counter = cntrs.symbol_error_counter;
-
-	if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
-		dev->z_link_error_recovery_counter =
-			cntrs.link_error_recovery_counter;
-
-	if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
-		dev->z_link_downed_counter = cntrs.link_downed_counter;
-
-	if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
-		dev->z_port_rcv_errors =
-			cntrs.port_rcv_errors + dev->rcv_errors;
-
-	if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
-		dev->z_port_rcv_remphys_errors =
-			cntrs.port_rcv_remphys_errors;
-
-	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
-		dev->z_port_xmit_discards = cntrs.port_xmit_discards;
-
-	if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
-		dev->z_local_link_integrity_errors =
-			cntrs.local_link_integrity_errors;
-
-	if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
-		dev->z_excessive_buffer_overrun_errors =
-			cntrs.excessive_buffer_overrun_errors;
-
-	if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
-		dev->n_vl15_dropped = 0;
-		dev->z_vl15_dropped = cntrs.vl15_dropped;
-	}
-
-	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
-		dev->z_port_xmit_data = cntrs.port_xmit_data;
-
-	if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
-		dev->z_port_rcv_data = cntrs.port_rcv_data;
-
-	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
-		dev->z_port_xmit_packets = cntrs.port_xmit_packets;
-
-	if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
-		dev->z_port_rcv_packets = cntrs.port_rcv_packets;
-
-	return recv_pma_get_portcounters(pmp, ibdev, port);
-}
-
-static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
-					 struct ib_device *ibdev, u8 port)
-{
-	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-		pmp->data;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	u64 swords, rwords, spkts, rpkts, xwait;
-
-	ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
-				&rpkts, &xwait);
-
-	if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
-		dev->z_port_xmit_data = swords;
-
-	if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
-		dev->z_port_rcv_data = rwords;
-
-	if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
-		dev->z_port_xmit_packets = spkts;
-
-	if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
-		dev->z_port_rcv_packets = rpkts;
-
-	if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
-		dev->n_unicast_xmit = 0;
-
-	if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
-		dev->n_unicast_rcv = 0;
-
-	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
-		dev->n_multicast_xmit = 0;
-
-	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
-		dev->n_multicast_rcv = 0;
-
-	return recv_pma_get_portcounters_ext(pmp, ibdev, port);
-}
-
-static int process_subn(struct ib_device *ibdev, int mad_flags,
-			u8 port_num, const struct ib_mad *in_mad,
-			struct ib_mad *out_mad)
-{
-	struct ib_smp *smp = (struct ib_smp *)out_mad;
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	int ret;
-
-	*out_mad = *in_mad;
-	if (smp->class_version != 1) {
-		smp->status |= IB_SMP_UNSUP_VERSION;
-		ret = reply(smp);
-		goto bail;
-	}
-
-	/* Is the mkey in the process of expiring? */
-	if (dev->mkey_lease_timeout &&
-	    time_after_eq(jiffies, dev->mkey_lease_timeout)) {
-		/* Clear timeout and mkey protection field. */
-		dev->mkey_lease_timeout = 0;
-		dev->mkeyprot = 0;
-	}
-
-	/*
-	 * M_Key checking depends on
-	 * Portinfo:M_Key_protect_bits
-	 */
-	if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
-	    dev->mkey != smp->mkey &&
-	    (smp->method == IB_MGMT_METHOD_SET ||
-	     (smp->method == IB_MGMT_METHOD_GET &&
-	      dev->mkeyprot >= 2))) {
-		if (dev->mkey_violations != 0xFFFF)
-			++dev->mkey_violations;
-		if (dev->mkey_lease_timeout ||
-		    dev->mkey_lease_period == 0) {
-			ret = IB_MAD_RESULT_SUCCESS |
-				IB_MAD_RESULT_CONSUMED;
-			goto bail;
-		}
-		dev->mkey_lease_timeout = jiffies +
-			dev->mkey_lease_period * HZ;
-		/* Future: Generate a trap notice. */
-		ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-		goto bail;
-	} else if (dev->mkey_lease_timeout)
-		dev->mkey_lease_timeout = 0;
-
-	switch (smp->method) {
-	case IB_MGMT_METHOD_GET:
-		switch (smp->attr_id) {
-		case IB_SMP_ATTR_NODE_DESC:
-			ret = recv_subn_get_nodedescription(smp, ibdev);
-			goto bail;
-		case IB_SMP_ATTR_NODE_INFO:
-			ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
-			goto bail;
-		case IB_SMP_ATTR_GUID_INFO:
-			ret = recv_subn_get_guidinfo(smp, ibdev);
-			goto bail;
-		case IB_SMP_ATTR_PORT_INFO:
-			ret = recv_subn_get_portinfo(smp, ibdev, port_num);
-			goto bail;
-		case IB_SMP_ATTR_PKEY_TABLE:
-			ret = recv_subn_get_pkeytable(smp, ibdev);
-			goto bail;
-		case IB_SMP_ATTR_SM_INFO:
-			if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
-				ret = IB_MAD_RESULT_SUCCESS |
-					IB_MAD_RESULT_CONSUMED;
-				goto bail;
-			}
-			if (dev->port_cap_flags & IB_PORT_SM) {
-				ret = IB_MAD_RESULT_SUCCESS;
-				goto bail;
-			}
-			/* FALLTHROUGH */
-		default:
-			smp->status |= IB_SMP_UNSUP_METH_ATTR;
-			ret = reply(smp);
-			goto bail;
-		}
-
-	case IB_MGMT_METHOD_SET:
-		switch (smp->attr_id) {
-		case IB_SMP_ATTR_GUID_INFO:
-			ret = recv_subn_set_guidinfo(smp, ibdev);
-			goto bail;
-		case IB_SMP_ATTR_PORT_INFO:
-			ret = recv_subn_set_portinfo(smp, ibdev, port_num);
-			goto bail;
-		case IB_SMP_ATTR_PKEY_TABLE:
-			ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
-			goto bail;
-		case IB_SMP_ATTR_SM_INFO:
-			if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
-				ret = IB_MAD_RESULT_SUCCESS |
-					IB_MAD_RESULT_CONSUMED;
-				goto bail;
-			}
-			if (dev->port_cap_flags & IB_PORT_SM) {
-				ret = IB_MAD_RESULT_SUCCESS;
-				goto bail;
-			}
-			/* FALLTHROUGH */
-		default:
-			smp->status |= IB_SMP_UNSUP_METH_ATTR;
-			ret = reply(smp);
-			goto bail;
-		}
-
-	case IB_MGMT_METHOD_TRAP:
-	case IB_MGMT_METHOD_REPORT:
-	case IB_MGMT_METHOD_REPORT_RESP:
-	case IB_MGMT_METHOD_TRAP_REPRESS:
-	case IB_MGMT_METHOD_GET_RESP:
-		/*
-		 * The ib_mad module will call us to process responses
-		 * before checking for other consumers.
-		 * Just tell the caller to process it normally.
-		 */
-		ret = IB_MAD_RESULT_SUCCESS;
-		goto bail;
-	default:
-		smp->status |= IB_SMP_UNSUP_METHOD;
-		ret = reply(smp);
-	}
-
-bail:
-	return ret;
-}
-
-static int process_perf(struct ib_device *ibdev, u8 port_num,
-			const struct ib_mad *in_mad,
-			struct ib_mad *out_mad)
-{
-	struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
-	int ret;
-
-	*out_mad = *in_mad;
-	if (pmp->mad_hdr.class_version != 1) {
-		pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
-		ret = reply((struct ib_smp *) pmp);
-		goto bail;
-	}
-
-	switch (pmp->mad_hdr.method) {
-	case IB_MGMT_METHOD_GET:
-		switch (pmp->mad_hdr.attr_id) {
-		case IB_PMA_CLASS_PORT_INFO:
-			ret = recv_pma_get_classportinfo(pmp);
-			goto bail;
-		case IB_PMA_PORT_SAMPLES_CONTROL:
-			ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
-							      port_num);
-			goto bail;
-		case IB_PMA_PORT_SAMPLES_RESULT:
-			ret = recv_pma_get_portsamplesresult(pmp, ibdev);
-			goto bail;
-		case IB_PMA_PORT_SAMPLES_RESULT_EXT:
-			ret = recv_pma_get_portsamplesresult_ext(pmp,
-								 ibdev);
-			goto bail;
-		case IB_PMA_PORT_COUNTERS:
-			ret = recv_pma_get_portcounters(pmp, ibdev,
-							port_num);
-			goto bail;
-		case IB_PMA_PORT_COUNTERS_EXT:
-			ret = recv_pma_get_portcounters_ext(pmp, ibdev,
-							    port_num);
-			goto bail;
-		default:
-			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-			ret = reply((struct ib_smp *) pmp);
-			goto bail;
-		}
-
-	case IB_MGMT_METHOD_SET:
-		switch (pmp->mad_hdr.attr_id) {
-		case IB_PMA_PORT_SAMPLES_CONTROL:
-			ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
-							      port_num);
-			goto bail;
-		case IB_PMA_PORT_COUNTERS:
-			ret = recv_pma_set_portcounters(pmp, ibdev,
-							port_num);
-			goto bail;
-		case IB_PMA_PORT_COUNTERS_EXT:
-			ret = recv_pma_set_portcounters_ext(pmp, ibdev,
-							    port_num);
-			goto bail;
-		default:
-			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-			ret = reply((struct ib_smp *) pmp);
-			goto bail;
-		}
-
-	case IB_MGMT_METHOD_GET_RESP:
-		/*
-		 * The ib_mad module will call us to process responses
-		 * before checking for other consumers.
-		 * Just tell the caller to process it normally.
-		 */
-		ret = IB_MAD_RESULT_SUCCESS;
-		goto bail;
-	default:
-		pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
-		ret = reply((struct ib_smp *) pmp);
-	}
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_process_mad - process an incoming MAD packet
- * @ibdev: the infiniband device this packet came in on
- * @mad_flags: MAD flags
- * @port_num: the port number this packet came in on
- * @in_wc: the work completion entry for this packet
- * @in_grh: the global route header for this packet
- * @in_mad: the incoming MAD
- * @out_mad: any outgoing MAD reply
- *
- * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
- * interested in processing.
- *
- * Note that the verbs framework has already done the MAD sanity checks,
- * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
- * MADs.
- *
- * This is called by the ib_mad module.
- */
-int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-		      const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-		      const struct ib_mad_hdr *in, size_t in_mad_size,
-		      struct ib_mad_hdr *out, size_t *out_mad_size,
-		      u16 *out_mad_pkey_index)
-{
-	int ret;
-	const struct ib_mad *in_mad = (const struct ib_mad *)in;
-	struct ib_mad *out_mad = (struct ib_mad *)out;
-
-	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-			 *out_mad_size != sizeof(*out_mad)))
-		return IB_MAD_RESULT_FAILURE;
-
-	switch (in_mad->mad_hdr.mgmt_class) {
-	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
-	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
-		ret = process_subn(ibdev, mad_flags, port_num,
-				   in_mad, out_mad);
-		goto bail;
-	case IB_MGMT_CLASS_PERF_MGMT:
-		ret = process_perf(ibdev, port_num, in_mad, out_mad);
-		goto bail;
-	default:
-		ret = IB_MAD_RESULT_SUCCESS;
-	}
-
-bail:
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_mmap.c b/drivers/infiniband/hw/ipath/ipath_mmap.c
deleted file mode 100644
index e732742..0000000
--- a/drivers/infiniband/hw/ipath/ipath_mmap.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/errno.h>
-#include <asm/pgtable.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_release_mmap_info - free mmap info structure
- * @ref: a pointer to the kref within struct ipath_mmap_info
- */
-void ipath_release_mmap_info(struct kref *ref)
-{
-	struct ipath_mmap_info *ip =
-		container_of(ref, struct ipath_mmap_info, ref);
-	struct ipath_ibdev *dev = to_idev(ip->context->device);
-
-	spin_lock_irq(&dev->pending_lock);
-	list_del(&ip->pending_mmaps);
-	spin_unlock_irq(&dev->pending_lock);
-
-	vfree(ip->obj);
-	kfree(ip);
-}
-
-/*
- * open and close keep track of how many times the CQ is mapped,
- * to avoid releasing it.
- */
-static void ipath_vma_open(struct vm_area_struct *vma)
-{
-	struct ipath_mmap_info *ip = vma->vm_private_data;
-
-	kref_get(&ip->ref);
-}
-
-static void ipath_vma_close(struct vm_area_struct *vma)
-{
-	struct ipath_mmap_info *ip = vma->vm_private_data;
-
-	kref_put(&ip->ref, ipath_release_mmap_info);
-}
-
-static const struct vm_operations_struct ipath_vm_ops = {
-	.open =     ipath_vma_open,
-	.close =    ipath_vma_close,
-};
-
-/**
- * ipath_mmap - create a new mmap region
- * @context: the IB user context of the process making the mmap() call
- * @vma: the VMA to be initialized
- * Return zero if the mmap is OK. Otherwise, return an errno.
- */
-int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-	struct ipath_ibdev *dev = to_idev(context->device);
-	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long size = vma->vm_end - vma->vm_start;
-	struct ipath_mmap_info *ip, *pp;
-	int ret = -EINVAL;
-
-	/*
-	 * Search the device's list of objects waiting for a mmap call.
-	 * Normally, this list is very short since a call to create a
-	 * CQ, QP, or SRQ is soon followed by a call to mmap().
-	 */
-	spin_lock_irq(&dev->pending_lock);
-	list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
-				 pending_mmaps) {
-		/* Only the creator is allowed to mmap the object */
-		if (context != ip->context || (__u64) offset != ip->offset)
-			continue;
-		/* Don't allow a mmap larger than the object. */
-		if (size > ip->size)
-			break;
-
-		list_del_init(&ip->pending_mmaps);
-		spin_unlock_irq(&dev->pending_lock);
-
-		ret = remap_vmalloc_range(vma, ip->obj, 0);
-		if (ret)
-			goto done;
-		vma->vm_ops = &ipath_vm_ops;
-		vma->vm_private_data = ip;
-		ipath_vma_open(vma);
-		goto done;
-	}
-	spin_unlock_irq(&dev->pending_lock);
-done:
-	return ret;
-}
-
-/*
- * Allocate information for ipath_mmap
- */
-struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
-					       u32 size,
-					       struct ib_ucontext *context,
-					       void *obj) {
-	struct ipath_mmap_info *ip;
-
-	ip = kmalloc(sizeof *ip, GFP_KERNEL);
-	if (!ip)
-		goto bail;
-
-	size = PAGE_ALIGN(size);
-
-	spin_lock_irq(&dev->mmap_offset_lock);
-	if (dev->mmap_offset == 0)
-		dev->mmap_offset = PAGE_SIZE;
-	ip->offset = dev->mmap_offset;
-	dev->mmap_offset += size;
-	spin_unlock_irq(&dev->mmap_offset_lock);
-
-	INIT_LIST_HEAD(&ip->pending_mmaps);
-	ip->size = size;
-	ip->context = context;
-	ip->obj = obj;
-	kref_init(&ip->ref);
-
-bail:
-	return ip;
-}
-
-void ipath_update_mmap_info(struct ipath_ibdev *dev,
-			    struct ipath_mmap_info *ip,
-			    u32 size, void *obj) {
-	size = PAGE_ALIGN(size);
-
-	spin_lock_irq(&dev->mmap_offset_lock);
-	if (dev->mmap_offset == 0)
-		dev->mmap_offset = PAGE_SIZE;
-	ip->offset = dev->mmap_offset;
-	dev->mmap_offset += size;
-	spin_unlock_irq(&dev->mmap_offset_lock);
-
-	ip->size = size;
-	ip->obj = obj;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
deleted file mode 100644
index c7278f6..0000000
--- a/drivers/infiniband/hw/ipath/ipath_mr.c
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/slab.h>
-
-#include <rdma/ib_umem.h>
-#include <rdma/ib_pack.h>
-#include <rdma/ib_smi.h>
-
-#include "ipath_verbs.h"
-
-/* Fast memory region */
-struct ipath_fmr {
-	struct ib_fmr ibfmr;
-	u8 page_shift;
-	struct ipath_mregion mr;        /* must be last */
-};
-
-static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
-{
-	return container_of(ibfmr, struct ipath_fmr, ibfmr);
-}
-
-/**
- * ipath_get_dma_mr - get a DMA memory region
- * @pd: protection domain for this memory region
- * @acc: access flags
- *
- * Returns the memory region on success, otherwise returns an errno.
- * Note that all DMA addresses should be created via the
- * struct ib_dma_mapping_ops functions (see ipath_dma.c).
- */
-struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
-{
-	struct ipath_mr *mr;
-	struct ib_mr *ret;
-
-	mr = kzalloc(sizeof *mr, GFP_KERNEL);
-	if (!mr) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
-
-	mr->mr.access_flags = acc;
-	ret = &mr->ibmr;
-
-bail:
-	return ret;
-}
-
-static struct ipath_mr *alloc_mr(int count,
-				 struct ipath_lkey_table *lk_table)
-{
-	struct ipath_mr *mr;
-	int m, i = 0;
-
-	/* Allocate struct plus pointers to first level page tables. */
-	m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
-	mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
-	if (!mr)
-		goto done;
-
-	/* Allocate first level page tables. */
-	for (; i < m; i++) {
-		mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
-		if (!mr->mr.map[i])
-			goto bail;
-	}
-	mr->mr.mapsz = m;
-
-	/*
-	 * ib_reg_phys_mr() will initialize mr->ibmr except for
-	 * lkey and rkey.
-	 */
-	if (!ipath_alloc_lkey(lk_table, &mr->mr))
-		goto bail;
-	mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
-
-	goto done;
-
-bail:
-	while (i) {
-		i--;
-		kfree(mr->mr.map[i]);
-	}
-	kfree(mr);
-	mr = NULL;
-
-done:
-	return mr;
-}
-
-/**
- * ipath_reg_phys_mr - register a physical memory region
- * @pd: protection domain for this memory region
- * @buffer_list: pointer to the list of physical buffers to register
- * @num_phys_buf: the number of physical buffers to register
- * @iova_start: the starting address passed over IB which maps to this MR
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
-				struct ib_phys_buf *buffer_list,
-				int num_phys_buf, int acc, u64 *iova_start)
-{
-	struct ipath_mr *mr;
-	int n, m, i;
-	struct ib_mr *ret;
-
-	mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
-	if (mr == NULL) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
-
-	mr->mr.pd = pd;
-	mr->mr.user_base = *iova_start;
-	mr->mr.iova = *iova_start;
-	mr->mr.length = 0;
-	mr->mr.offset = 0;
-	mr->mr.access_flags = acc;
-	mr->mr.max_segs = num_phys_buf;
-	mr->umem = NULL;
-
-	m = 0;
-	n = 0;
-	for (i = 0; i < num_phys_buf; i++) {
-		mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
-		mr->mr.map[m]->segs[n].length = buffer_list[i].size;
-		mr->mr.length += buffer_list[i].size;
-		n++;
-		if (n == IPATH_SEGSZ) {
-			m++;
-			n = 0;
-		}
-	}
-
-	ret = &mr->ibmr;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_reg_user_mr - register a userspace memory region
- * @pd: protection domain for this memory region
- * @start: starting userspace address
- * @length: length of region to register
- * @virt_addr: virtual address to use (from HCA's point of view)
- * @mr_access_flags: access flags for this memory region
- * @udata: unused by the InfiniPath driver
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-				u64 virt_addr, int mr_access_flags,
-				struct ib_udata *udata)
-{
-	struct ipath_mr *mr;
-	struct ib_umem *umem;
-	int n, m, entry;
-	struct scatterlist *sg;
-	struct ib_mr *ret;
-
-	if (length == 0) {
-		ret = ERR_PTR(-EINVAL);
-		goto bail;
-	}
-
-	umem = ib_umem_get(pd->uobject->context, start, length,
-			   mr_access_flags, 0);
-	if (IS_ERR(umem))
-		return (void *) umem;
-
-	n = umem->nmap;
-	mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
-	if (!mr) {
-		ret = ERR_PTR(-ENOMEM);
-		ib_umem_release(umem);
-		goto bail;
-	}
-
-	mr->mr.pd = pd;
-	mr->mr.user_base = start;
-	mr->mr.iova = virt_addr;
-	mr->mr.length = length;
-	mr->mr.offset = ib_umem_offset(umem);
-	mr->mr.access_flags = mr_access_flags;
-	mr->mr.max_segs = n;
-	mr->umem = umem;
-
-	m = 0;
-	n = 0;
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		void *vaddr;
-
-		vaddr = page_address(sg_page(sg));
-		if (!vaddr) {
-			ret = ERR_PTR(-EINVAL);
-			goto bail;
-		}
-		mr->mr.map[m]->segs[n].vaddr = vaddr;
-		mr->mr.map[m]->segs[n].length = umem->page_size;
-		n++;
-		if (n == IPATH_SEGSZ) {
-			m++;
-			n = 0;
-		}
-	}
-	ret = &mr->ibmr;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_dereg_mr - unregister and free a memory region
- * @ibmr: the memory region to free
- *
- * Returns 0 on success.
- *
- * Note that this is called to free MRs created by ipath_get_dma_mr()
- * or ipath_reg_user_mr().
- */
-int ipath_dereg_mr(struct ib_mr *ibmr)
-{
-	struct ipath_mr *mr = to_imr(ibmr);
-	int i;
-
-	ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
-	i = mr->mr.mapsz;
-	while (i) {
-		i--;
-		kfree(mr->mr.map[i]);
-	}
-
-	if (mr->umem)
-		ib_umem_release(mr->umem);
-
-	kfree(mr);
-	return 0;
-}
-
-/**
- * ipath_alloc_fmr - allocate a fast memory region
- * @pd: the protection domain for this memory region
- * @mr_access_flags: access flags for this memory region
- * @fmr_attr: fast memory region attributes
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
-			       struct ib_fmr_attr *fmr_attr)
-{
-	struct ipath_fmr *fmr;
-	int m, i = 0;
-	struct ib_fmr *ret;
-
-	/* Allocate struct plus pointers to first level page tables. */
-	m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
-	fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
-	if (!fmr)
-		goto bail;
-
-	/* Allocate first level page tables. */
-	for (; i < m; i++) {
-		fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
-					 GFP_KERNEL);
-		if (!fmr->mr.map[i])
-			goto bail;
-	}
-	fmr->mr.mapsz = m;
-
-	/*
-	 * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
-	 * rkey.
-	 */
-	if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
-		goto bail;
-	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
-	/*
-	 * Resources are allocated but no valid mapping (RKEY can't be
-	 * used).
-	 */
-	fmr->mr.pd = pd;
-	fmr->mr.user_base = 0;
-	fmr->mr.iova = 0;
-	fmr->mr.length = 0;
-	fmr->mr.offset = 0;
-	fmr->mr.access_flags = mr_access_flags;
-	fmr->mr.max_segs = fmr_attr->max_pages;
-	fmr->page_shift = fmr_attr->page_shift;
-
-	ret = &fmr->ibfmr;
-	goto done;
-
-bail:
-	while (i)
-		kfree(fmr->mr.map[--i]);
-	kfree(fmr);
-	ret = ERR_PTR(-ENOMEM);
-
-done:
-	return ret;
-}
-
-/**
- * ipath_map_phys_fmr - set up a fast memory region
- * @ibmfr: the fast memory region to set up
- * @page_list: the list of pages to associate with the fast memory region
- * @list_len: the number of pages to associate with the fast memory region
- * @iova: the virtual address of the start of the fast memory region
- *
- * This may be called from interrupt context.
- */
-
-int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
-		       int list_len, u64 iova)
-{
-	struct ipath_fmr *fmr = to_ifmr(ibfmr);
-	struct ipath_lkey_table *rkt;
-	unsigned long flags;
-	int m, n, i;
-	u32 ps;
-	int ret;
-
-	if (list_len > fmr->mr.max_segs) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	rkt = &to_idev(ibfmr->device)->lk_table;
-	spin_lock_irqsave(&rkt->lock, flags);
-	fmr->mr.user_base = iova;
-	fmr->mr.iova = iova;
-	ps = 1 << fmr->page_shift;
-	fmr->mr.length = list_len * ps;
-	m = 0;
-	n = 0;
-	ps = 1 << fmr->page_shift;
-	for (i = 0; i < list_len; i++) {
-		fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
-		fmr->mr.map[m]->segs[n].length = ps;
-		if (++n == IPATH_SEGSZ) {
-			m++;
-			n = 0;
-		}
-	}
-	spin_unlock_irqrestore(&rkt->lock, flags);
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_unmap_fmr - unmap fast memory regions
- * @fmr_list: the list of fast memory regions to unmap
- *
- * Returns 0 on success.
- */
-int ipath_unmap_fmr(struct list_head *fmr_list)
-{
-	struct ipath_fmr *fmr;
-	struct ipath_lkey_table *rkt;
-	unsigned long flags;
-
-	list_for_each_entry(fmr, fmr_list, ibfmr.list) {
-		rkt = &to_idev(fmr->ibfmr.device)->lk_table;
-		spin_lock_irqsave(&rkt->lock, flags);
-		fmr->mr.user_base = 0;
-		fmr->mr.iova = 0;
-		fmr->mr.length = 0;
-		spin_unlock_irqrestore(&rkt->lock, flags);
-	}
-	return 0;
-}
-
-/**
- * ipath_dealloc_fmr - deallocate a fast memory region
- * @ibfmr: the fast memory region to deallocate
- *
- * Returns 0 on success.
- */
-int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
-{
-	struct ipath_fmr *fmr = to_ifmr(ibfmr);
-	int i;
-
-	ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
-	i = fmr->mr.mapsz;
-	while (i)
-		kfree(fmr->mr.map[--i]);
-	kfree(fmr);
-	return 0;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
deleted file mode 100644
index face876..0000000
--- a/drivers/infiniband/hw/ipath/ipath_qp.c
+++ /dev/null
@@ -1,1080 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-#define BITS_PER_PAGE		(PAGE_SIZE*BITS_PER_BYTE)
-#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
-#define mk_qpn(qpt, map, off)	(((map) - (qpt)->map) * BITS_PER_PAGE + \
-				 (off))
-#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
-						      BITS_PER_PAGE, off)
-
-/*
- * Convert the AETH credit code into the number of credits.
- */
-static u32 credit_table[31] = {
-	0,			/* 0 */
-	1,			/* 1 */
-	2,			/* 2 */
-	3,			/* 3 */
-	4,			/* 4 */
-	6,			/* 5 */
-	8,			/* 6 */
-	12,			/* 7 */
-	16,			/* 8 */
-	24,			/* 9 */
-	32,			/* A */
-	48,			/* B */
-	64,			/* C */
-	96,			/* D */
-	128,			/* E */
-	192,			/* F */
-	256,			/* 10 */
-	384,			/* 11 */
-	512,			/* 12 */
-	768,			/* 13 */
-	1024,			/* 14 */
-	1536,			/* 15 */
-	2048,			/* 16 */
-	3072,			/* 17 */
-	4096,			/* 18 */
-	6144,			/* 19 */
-	8192,			/* 1A */
-	12288,			/* 1B */
-	16384,			/* 1C */
-	24576,			/* 1D */
-	32768			/* 1E */
-};
-
-
-static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map)
-{
-	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	unsigned long flags;
-
-	/*
-	 * Free the page if someone raced with us installing it.
-	 */
-
-	spin_lock_irqsave(&qpt->lock, flags);
-	if (map->page)
-		free_page(page);
-	else
-		map->page = (void *)page;
-	spin_unlock_irqrestore(&qpt->lock, flags);
-}
-
-
-static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type)
-{
-	u32 i, offset, max_scan, qpn;
-	struct qpn_map *map;
-	u32 ret = -1;
-
-	if (type == IB_QPT_SMI)
-		ret = 0;
-	else if (type == IB_QPT_GSI)
-		ret = 1;
-
-	if (ret != -1) {
-		map = &qpt->map[0];
-		if (unlikely(!map->page)) {
-			get_map_page(qpt, map);
-			if (unlikely(!map->page)) {
-				ret = -ENOMEM;
-				goto bail;
-			}
-		}
-		if (!test_and_set_bit(ret, map->page))
-			atomic_dec(&map->n_free);
-		else
-			ret = -EBUSY;
-		goto bail;
-	}
-
-	qpn = qpt->last + 1;
-	if (qpn >= QPN_MAX)
-		qpn = 2;
-	offset = qpn & BITS_PER_PAGE_MASK;
-	map = &qpt->map[qpn / BITS_PER_PAGE];
-	max_scan = qpt->nmaps - !offset;
-	for (i = 0;;) {
-		if (unlikely(!map->page)) {
-			get_map_page(qpt, map);
-			if (unlikely(!map->page))
-				break;
-		}
-		if (likely(atomic_read(&map->n_free))) {
-			do {
-				if (!test_and_set_bit(offset, map->page)) {
-					atomic_dec(&map->n_free);
-					qpt->last = qpn;
-					ret = qpn;
-					goto bail;
-				}
-				offset = find_next_offset(map, offset);
-				qpn = mk_qpn(qpt, map, offset);
-				/*
-				 * This test differs from alloc_pidmap().
-				 * If find_next_offset() does find a zero
-				 * bit, we don't need to check for QPN
-				 * wrapping around past our starting QPN.
-				 * We just need to be sure we don't loop
-				 * forever.
-				 */
-			} while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
-		}
-		/*
-		 * In order to keep the number of pages allocated to a
-		 * minimum, we scan the all existing pages before increasing
-		 * the size of the bitmap table.
-		 */
-		if (++i > max_scan) {
-			if (qpt->nmaps == QPNMAP_ENTRIES)
-				break;
-			map = &qpt->map[qpt->nmaps++];
-			offset = 0;
-		} else if (map < &qpt->map[qpt->nmaps]) {
-			++map;
-			offset = 0;
-		} else {
-			map = &qpt->map[0];
-			offset = 2;
-		}
-		qpn = mk_qpn(qpt, map, offset);
-	}
-
-	ret = -ENOMEM;
-
-bail:
-	return ret;
-}
-
-static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
-{
-	struct qpn_map *map;
-
-	map = qpt->map + qpn / BITS_PER_PAGE;
-	if (map->page)
-		clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
-	atomic_inc(&map->n_free);
-}
-
-/**
- * ipath_alloc_qpn - allocate a QP number
- * @qpt: the QP table
- * @qp: the QP
- * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
- *
- * Allocate the next available QPN and put the QP into the hash table.
- * The hash table holds a reference to the QP.
- */
-static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
-			   enum ib_qp_type type)
-{
-	unsigned long flags;
-	int ret;
-
-	ret = alloc_qpn(qpt, type);
-	if (ret < 0)
-		goto bail;
-	qp->ibqp.qp_num = ret;
-
-	/* Add the QP to the hash table. */
-	spin_lock_irqsave(&qpt->lock, flags);
-
-	ret %= qpt->max;
-	qp->next = qpt->table[ret];
-	qpt->table[ret] = qp;
-	atomic_inc(&qp->refcount);
-
-	spin_unlock_irqrestore(&qpt->lock, flags);
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_free_qp - remove a QP from the QP table
- * @qpt: the QP table
- * @qp: the QP to remove
- *
- * Remove the QP from the table so it can't be found asynchronously by
- * the receive interrupt routine.
- */
-static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
-{
-	struct ipath_qp *q, **qpp;
-	unsigned long flags;
-
-	spin_lock_irqsave(&qpt->lock, flags);
-
-	/* Remove QP from the hash table. */
-	qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
-	for (; (q = *qpp) != NULL; qpp = &q->next) {
-		if (q == qp) {
-			*qpp = qp->next;
-			qp->next = NULL;
-			atomic_dec(&qp->refcount);
-			break;
-		}
-	}
-
-	spin_unlock_irqrestore(&qpt->lock, flags);
-}
-
-/**
- * ipath_free_all_qps - check for QPs still in use
- * @qpt: the QP table to empty
- *
- * There should not be any QPs still in use.
- * Free memory for table.
- */
-unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
-{
-	unsigned long flags;
-	struct ipath_qp *qp;
-	u32 n, qp_inuse = 0;
-
-	spin_lock_irqsave(&qpt->lock, flags);
-	for (n = 0; n < qpt->max; n++) {
-		qp = qpt->table[n];
-		qpt->table[n] = NULL;
-
-		for (; qp; qp = qp->next)
-			qp_inuse++;
-	}
-	spin_unlock_irqrestore(&qpt->lock, flags);
-
-	for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
-		if (qpt->map[n].page)
-			free_page((unsigned long) qpt->map[n].page);
-	return qp_inuse;
-}
-
-/**
- * ipath_lookup_qpn - return the QP with the given QPN
- * @qpt: the QP table
- * @qpn: the QP number to look up
- *
- * The caller is responsible for decrementing the QP reference count
- * when done.
- */
-struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
-{
-	unsigned long flags;
-	struct ipath_qp *qp;
-
-	spin_lock_irqsave(&qpt->lock, flags);
-
-	for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
-		if (qp->ibqp.qp_num == qpn) {
-			atomic_inc(&qp->refcount);
-			break;
-		}
-	}
-
-	spin_unlock_irqrestore(&qpt->lock, flags);
-	return qp;
-}
-
-/**
- * ipath_reset_qp - initialize the QP state to the reset state
- * @qp: the QP to reset
- * @type: the QP type
- */
-static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
-{
-	qp->remote_qpn = 0;
-	qp->qkey = 0;
-	qp->qp_access_flags = 0;
-	atomic_set(&qp->s_dma_busy, 0);
-	qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
-	qp->s_hdrwords = 0;
-	qp->s_wqe = NULL;
-	qp->s_pkt_delay = 0;
-	qp->s_draining = 0;
-	qp->s_psn = 0;
-	qp->r_psn = 0;
-	qp->r_msn = 0;
-	if (type == IB_QPT_RC) {
-		qp->s_state = IB_OPCODE_RC_SEND_LAST;
-		qp->r_state = IB_OPCODE_RC_SEND_LAST;
-	} else {
-		qp->s_state = IB_OPCODE_UC_SEND_LAST;
-		qp->r_state = IB_OPCODE_UC_SEND_LAST;
-	}
-	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
-	qp->r_nak_state = 0;
-	qp->r_aflags = 0;
-	qp->r_flags = 0;
-	qp->s_rnr_timeout = 0;
-	qp->s_head = 0;
-	qp->s_tail = 0;
-	qp->s_cur = 0;
-	qp->s_last = 0;
-	qp->s_ssn = 1;
-	qp->s_lsn = 0;
-	memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
-	qp->r_head_ack_queue = 0;
-	qp->s_tail_ack_queue = 0;
-	qp->s_num_rd_atomic = 0;
-	if (qp->r_rq.wq) {
-		qp->r_rq.wq->head = 0;
-		qp->r_rq.wq->tail = 0;
-	}
-}
-
-/**
- * ipath_error_qp - put a QP into the error state
- * @qp: the QP to put into the error state
- * @err: the receive completion error to signal if a RWQE is active
- *
- * Flushes both send and receive work queues.
- * Returns true if last WQE event should be generated.
- * The QP s_lock should be held and interrupts disabled.
- * If we are already in error state, just return.
- */
-
-int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
-{
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	struct ib_wc wc;
-	int ret = 0;
-
-	if (qp->state == IB_QPS_ERR)
-		goto bail;
-
-	qp->state = IB_QPS_ERR;
-
-	spin_lock(&dev->pending_lock);
-	if (!list_empty(&qp->timerwait))
-		list_del_init(&qp->timerwait);
-	if (!list_empty(&qp->piowait))
-		list_del_init(&qp->piowait);
-	spin_unlock(&dev->pending_lock);
-
-	/* Schedule the sending tasklet to drain the send work queue. */
-	if (qp->s_last != qp->s_head)
-		ipath_schedule_send(qp);
-
-	memset(&wc, 0, sizeof(wc));
-	wc.qp = &qp->ibqp;
-	wc.opcode = IB_WC_RECV;
-
-	if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
-		wc.wr_id = qp->r_wr_id;
-		wc.status = err;
-		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-	}
-	wc.status = IB_WC_WR_FLUSH_ERR;
-
-	if (qp->r_rq.wq) {
-		struct ipath_rwq *wq;
-		u32 head;
-		u32 tail;
-
-		spin_lock(&qp->r_rq.lock);
-
-		/* sanity check pointers before trusting them */
-		wq = qp->r_rq.wq;
-		head = wq->head;
-		if (head >= qp->r_rq.size)
-			head = 0;
-		tail = wq->tail;
-		if (tail >= qp->r_rq.size)
-			tail = 0;
-		while (tail != head) {
-			wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
-			if (++tail >= qp->r_rq.size)
-				tail = 0;
-			ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-		}
-		wq->tail = tail;
-
-		spin_unlock(&qp->r_rq.lock);
-	} else if (qp->ibqp.event_handler)
-		ret = 1;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_modify_qp - modify the attributes of a queue pair
- * @ibqp: the queue pair who's attributes we're modifying
- * @attr: the new attributes
- * @attr_mask: the mask of attributes to modify
- * @udata: user data for ipathverbs.so
- *
- * Returns 0 on success, otherwise returns an errno.
- */
-int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		    int attr_mask, struct ib_udata *udata)
-{
-	struct ipath_ibdev *dev = to_idev(ibqp->device);
-	struct ipath_qp *qp = to_iqp(ibqp);
-	enum ib_qp_state cur_state, new_state;
-	int lastwqe = 0;
-	int ret;
-
-	spin_lock_irq(&qp->s_lock);
-
-	cur_state = attr_mask & IB_QP_CUR_STATE ?
-		attr->cur_qp_state : qp->state;
-	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
-	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
-				attr_mask, IB_LINK_LAYER_UNSPECIFIED))
-		goto inval;
-
-	if (attr_mask & IB_QP_AV) {
-		if (attr->ah_attr.dlid == 0 ||
-		    attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE)
-			goto inval;
-
-		if ((attr->ah_attr.ah_flags & IB_AH_GRH) &&
-		    (attr->ah_attr.grh.sgid_index > 1))
-			goto inval;
-	}
-
-	if (attr_mask & IB_QP_PKEY_INDEX)
-		if (attr->pkey_index >= ipath_get_npkeys(dev->dd))
-			goto inval;
-
-	if (attr_mask & IB_QP_MIN_RNR_TIMER)
-		if (attr->min_rnr_timer > 31)
-			goto inval;
-
-	if (attr_mask & IB_QP_PORT)
-		if (attr->port_num == 0 ||
-		    attr->port_num > ibqp->device->phys_port_cnt)
-			goto inval;
-
-	/*
-	 * don't allow invalid Path MTU values or greater than 2048
-	 * unless we are configured for a 4KB MTU
-	 */
-	if ((attr_mask & IB_QP_PATH_MTU) &&
-		(ib_mtu_enum_to_int(attr->path_mtu) == -1 ||
-		(attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096)))
-		goto inval;
-
-	if (attr_mask & IB_QP_PATH_MIG_STATE)
-		if (attr->path_mig_state != IB_MIG_MIGRATED &&
-		    attr->path_mig_state != IB_MIG_REARM)
-			goto inval;
-
-	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
-		if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC)
-			goto inval;
-
-	switch (new_state) {
-	case IB_QPS_RESET:
-		if (qp->state != IB_QPS_RESET) {
-			qp->state = IB_QPS_RESET;
-			spin_lock(&dev->pending_lock);
-			if (!list_empty(&qp->timerwait))
-				list_del_init(&qp->timerwait);
-			if (!list_empty(&qp->piowait))
-				list_del_init(&qp->piowait);
-			spin_unlock(&dev->pending_lock);
-			qp->s_flags &= ~IPATH_S_ANY_WAIT;
-			spin_unlock_irq(&qp->s_lock);
-			/* Stop the sending tasklet */
-			tasklet_kill(&qp->s_task);
-			wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
-			spin_lock_irq(&qp->s_lock);
-		}
-		ipath_reset_qp(qp, ibqp->qp_type);
-		break;
-
-	case IB_QPS_SQD:
-		qp->s_draining = qp->s_last != qp->s_cur;
-		qp->state = new_state;
-		break;
-
-	case IB_QPS_SQE:
-		if (qp->ibqp.qp_type == IB_QPT_RC)
-			goto inval;
-		qp->state = new_state;
-		break;
-
-	case IB_QPS_ERR:
-		lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-		break;
-
-	default:
-		qp->state = new_state;
-		break;
-	}
-
-	if (attr_mask & IB_QP_PKEY_INDEX)
-		qp->s_pkey_index = attr->pkey_index;
-
-	if (attr_mask & IB_QP_DEST_QPN)
-		qp->remote_qpn = attr->dest_qp_num;
-
-	if (attr_mask & IB_QP_SQ_PSN) {
-		qp->s_psn = qp->s_next_psn = attr->sq_psn;
-		qp->s_last_psn = qp->s_next_psn - 1;
-	}
-
-	if (attr_mask & IB_QP_RQ_PSN)
-		qp->r_psn = attr->rq_psn;
-
-	if (attr_mask & IB_QP_ACCESS_FLAGS)
-		qp->qp_access_flags = attr->qp_access_flags;
-
-	if (attr_mask & IB_QP_AV) {
-		qp->remote_ah_attr = attr->ah_attr;
-		qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate);
-	}
-
-	if (attr_mask & IB_QP_PATH_MTU)
-		qp->path_mtu = attr->path_mtu;
-
-	if (attr_mask & IB_QP_RETRY_CNT)
-		qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
-
-	if (attr_mask & IB_QP_RNR_RETRY) {
-		qp->s_rnr_retry = attr->rnr_retry;
-		if (qp->s_rnr_retry > 7)
-			qp->s_rnr_retry = 7;
-		qp->s_rnr_retry_cnt = qp->s_rnr_retry;
-	}
-
-	if (attr_mask & IB_QP_MIN_RNR_TIMER)
-		qp->r_min_rnr_timer = attr->min_rnr_timer;
-
-	if (attr_mask & IB_QP_TIMEOUT)
-		qp->timeout = attr->timeout;
-
-	if (attr_mask & IB_QP_QKEY)
-		qp->qkey = attr->qkey;
-
-	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
-		qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
-
-	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
-		qp->s_max_rd_atomic = attr->max_rd_atomic;
-
-	spin_unlock_irq(&qp->s_lock);
-
-	if (lastwqe) {
-		struct ib_event ev;
-
-		ev.device = qp->ibqp.device;
-		ev.element.qp = &qp->ibqp;
-		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-	}
-	ret = 0;
-	goto bail;
-
-inval:
-	spin_unlock_irq(&qp->s_lock);
-	ret = -EINVAL;
-
-bail:
-	return ret;
-}
-
-int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		   int attr_mask, struct ib_qp_init_attr *init_attr)
-{
-	struct ipath_qp *qp = to_iqp(ibqp);
-
-	attr->qp_state = qp->state;
-	attr->cur_qp_state = attr->qp_state;
-	attr->path_mtu = qp->path_mtu;
-	attr->path_mig_state = 0;
-	attr->qkey = qp->qkey;
-	attr->rq_psn = qp->r_psn;
-	attr->sq_psn = qp->s_next_psn;
-	attr->dest_qp_num = qp->remote_qpn;
-	attr->qp_access_flags = qp->qp_access_flags;
-	attr->cap.max_send_wr = qp->s_size - 1;
-	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
-	attr->cap.max_send_sge = qp->s_max_sge;
-	attr->cap.max_recv_sge = qp->r_rq.max_sge;
-	attr->cap.max_inline_data = 0;
-	attr->ah_attr = qp->remote_ah_attr;
-	memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
-	attr->pkey_index = qp->s_pkey_index;
-	attr->alt_pkey_index = 0;
-	attr->en_sqd_async_notify = 0;
-	attr->sq_draining = qp->s_draining;
-	attr->max_rd_atomic = qp->s_max_rd_atomic;
-	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
-	attr->min_rnr_timer = qp->r_min_rnr_timer;
-	attr->port_num = 1;
-	attr->timeout = qp->timeout;
-	attr->retry_cnt = qp->s_retry_cnt;
-	attr->rnr_retry = qp->s_rnr_retry_cnt;
-	attr->alt_port_num = 0;
-	attr->alt_timeout = 0;
-
-	init_attr->event_handler = qp->ibqp.event_handler;
-	init_attr->qp_context = qp->ibqp.qp_context;
-	init_attr->send_cq = qp->ibqp.send_cq;
-	init_attr->recv_cq = qp->ibqp.recv_cq;
-	init_attr->srq = qp->ibqp.srq;
-	init_attr->cap = attr->cap;
-	if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR)
-		init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-	else
-		init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
-	init_attr->qp_type = qp->ibqp.qp_type;
-	init_attr->port_num = 1;
-	return 0;
-}
-
-/**
- * ipath_compute_aeth - compute the AETH (syndrome + MSN)
- * @qp: the queue pair to compute the AETH for
- *
- * Returns the AETH.
- */
-__be32 ipath_compute_aeth(struct ipath_qp *qp)
-{
-	u32 aeth = qp->r_msn & IPATH_MSN_MASK;
-
-	if (qp->ibqp.srq) {
-		/*
-		 * Shared receive queues don't generate credits.
-		 * Set the credit field to the invalid value.
-		 */
-		aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT;
-	} else {
-		u32 min, max, x;
-		u32 credits;
-		struct ipath_rwq *wq = qp->r_rq.wq;
-		u32 head;
-		u32 tail;
-
-		/* sanity check pointers before trusting them */
-		head = wq->head;
-		if (head >= qp->r_rq.size)
-			head = 0;
-		tail = wq->tail;
-		if (tail >= qp->r_rq.size)
-			tail = 0;
-		/*
-		 * Compute the number of credits available (RWQEs).
-		 * XXX Not holding the r_rq.lock here so there is a small
-		 * chance that the pair of reads are not atomic.
-		 */
-		credits = head - tail;
-		if ((int)credits < 0)
-			credits += qp->r_rq.size;
-		/*
-		 * Binary search the credit table to find the code to
-		 * use.
-		 */
-		min = 0;
-		max = 31;
-		for (;;) {
-			x = (min + max) / 2;
-			if (credit_table[x] == credits)
-				break;
-			if (credit_table[x] > credits)
-				max = x;
-			else if (min == x)
-				break;
-			else
-				min = x;
-		}
-		aeth |= x << IPATH_AETH_CREDIT_SHIFT;
-	}
-	return cpu_to_be32(aeth);
-}
-
-/**
- * ipath_create_qp - create a queue pair for a device
- * @ibpd: the protection domain who's device we create the queue pair for
- * @init_attr: the attributes of the queue pair
- * @udata: unused by InfiniPath
- *
- * Returns the queue pair on success, otherwise returns an errno.
- *
- * Called by the ib_create_qp() core verbs function.
- */
-struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
-			      struct ib_qp_init_attr *init_attr,
-			      struct ib_udata *udata)
-{
-	struct ipath_qp *qp;
-	int err;
-	struct ipath_swqe *swq = NULL;
-	struct ipath_ibdev *dev;
-	size_t sz;
-	size_t sg_list_sz;
-	struct ib_qp *ret;
-
-	if (init_attr->create_flags) {
-		ret = ERR_PTR(-EINVAL);
-		goto bail;
-	}
-
-	if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
-	    init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
-		ret = ERR_PTR(-EINVAL);
-		goto bail;
-	}
-
-	/* Check receive queue parameters if no SRQ is specified. */
-	if (!init_attr->srq) {
-		if (init_attr->cap.max_recv_sge > ib_ipath_max_sges ||
-		    init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) {
-			ret = ERR_PTR(-EINVAL);
-			goto bail;
-		}
-		if (init_attr->cap.max_send_sge +
-		    init_attr->cap.max_send_wr +
-		    init_attr->cap.max_recv_sge +
-		    init_attr->cap.max_recv_wr == 0) {
-			ret = ERR_PTR(-EINVAL);
-			goto bail;
-		}
-	}
-
-	switch (init_attr->qp_type) {
-	case IB_QPT_UC:
-	case IB_QPT_RC:
-	case IB_QPT_UD:
-	case IB_QPT_SMI:
-	case IB_QPT_GSI:
-		sz = sizeof(struct ipath_sge) *
-			init_attr->cap.max_send_sge +
-			sizeof(struct ipath_swqe);
-		swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
-		if (swq == NULL) {
-			ret = ERR_PTR(-ENOMEM);
-			goto bail;
-		}
-		sz = sizeof(*qp);
-		sg_list_sz = 0;
-		if (init_attr->srq) {
-			struct ipath_srq *srq = to_isrq(init_attr->srq);
-
-			if (srq->rq.max_sge > 1)
-				sg_list_sz = sizeof(*qp->r_sg_list) *
-					(srq->rq.max_sge - 1);
-		} else if (init_attr->cap.max_recv_sge > 1)
-			sg_list_sz = sizeof(*qp->r_sg_list) *
-				(init_attr->cap.max_recv_sge - 1);
-		qp = kmalloc(sz + sg_list_sz, GFP_KERNEL);
-		if (!qp) {
-			ret = ERR_PTR(-ENOMEM);
-			goto bail_swq;
-		}
-		if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
-		    init_attr->qp_type == IB_QPT_SMI ||
-		    init_attr->qp_type == IB_QPT_GSI)) {
-			qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
-			if (!qp->r_ud_sg_list) {
-				ret = ERR_PTR(-ENOMEM);
-				goto bail_qp;
-			}
-		} else
-			qp->r_ud_sg_list = NULL;
-		if (init_attr->srq) {
-			sz = 0;
-			qp->r_rq.size = 0;
-			qp->r_rq.max_sge = 0;
-			qp->r_rq.wq = NULL;
-			init_attr->cap.max_recv_wr = 0;
-			init_attr->cap.max_recv_sge = 0;
-		} else {
-			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
-			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
-			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
-				sizeof(struct ipath_rwqe);
-			qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
-					      qp->r_rq.size * sz);
-			if (!qp->r_rq.wq) {
-				ret = ERR_PTR(-ENOMEM);
-				goto bail_sg_list;
-			}
-		}
-
-		/*
-		 * ib_create_qp() will initialize qp->ibqp
-		 * except for qp->ibqp.qp_num.
-		 */
-		spin_lock_init(&qp->s_lock);
-		spin_lock_init(&qp->r_rq.lock);
-		atomic_set(&qp->refcount, 0);
-		init_waitqueue_head(&qp->wait);
-		init_waitqueue_head(&qp->wait_dma);
-		tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
-		INIT_LIST_HEAD(&qp->piowait);
-		INIT_LIST_HEAD(&qp->timerwait);
-		qp->state = IB_QPS_RESET;
-		qp->s_wq = swq;
-		qp->s_size = init_attr->cap.max_send_wr + 1;
-		qp->s_max_sge = init_attr->cap.max_send_sge;
-		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
-			qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
-		else
-			qp->s_flags = 0;
-		dev = to_idev(ibpd->device);
-		err = ipath_alloc_qpn(&dev->qp_table, qp,
-				      init_attr->qp_type);
-		if (err) {
-			ret = ERR_PTR(err);
-			vfree(qp->r_rq.wq);
-			goto bail_sg_list;
-		}
-		qp->ip = NULL;
-		qp->s_tx = NULL;
-		ipath_reset_qp(qp, init_attr->qp_type);
-		break;
-
-	default:
-		/* Don't support raw QPs */
-		ret = ERR_PTR(-ENOSYS);
-		goto bail;
-	}
-
-	init_attr->cap.max_inline_data = 0;
-
-	/*
-	 * Return the address of the RWQ as the offset to mmap.
-	 * See ipath_mmap() for details.
-	 */
-	if (udata && udata->outlen >= sizeof(__u64)) {
-		if (!qp->r_rq.wq) {
-			__u64 offset = 0;
-
-			err = ib_copy_to_udata(udata, &offset,
-					       sizeof(offset));
-			if (err) {
-				ret = ERR_PTR(err);
-				goto bail_ip;
-			}
-		} else {
-			u32 s = sizeof(struct ipath_rwq) +
-				qp->r_rq.size * sz;
-
-			qp->ip =
-			    ipath_create_mmap_info(dev, s,
-						   ibpd->uobject->context,
-						   qp->r_rq.wq);
-			if (!qp->ip) {
-				ret = ERR_PTR(-ENOMEM);
-				goto bail_ip;
-			}
-
-			err = ib_copy_to_udata(udata, &(qp->ip->offset),
-					       sizeof(qp->ip->offset));
-			if (err) {
-				ret = ERR_PTR(err);
-				goto bail_ip;
-			}
-		}
-	}
-
-	spin_lock(&dev->n_qps_lock);
-	if (dev->n_qps_allocated == ib_ipath_max_qps) {
-		spin_unlock(&dev->n_qps_lock);
-		ret = ERR_PTR(-ENOMEM);
-		goto bail_ip;
-	}
-
-	dev->n_qps_allocated++;
-	spin_unlock(&dev->n_qps_lock);
-
-	if (qp->ip) {
-		spin_lock_irq(&dev->pending_lock);
-		list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
-		spin_unlock_irq(&dev->pending_lock);
-	}
-
-	ret = &qp->ibqp;
-	goto bail;
-
-bail_ip:
-	if (qp->ip)
-		kref_put(&qp->ip->ref, ipath_release_mmap_info);
-	else
-		vfree(qp->r_rq.wq);
-	ipath_free_qp(&dev->qp_table, qp);
-	free_qpn(&dev->qp_table, qp->ibqp.qp_num);
-bail_sg_list:
-	kfree(qp->r_ud_sg_list);
-bail_qp:
-	kfree(qp);
-bail_swq:
-	vfree(swq);
-bail:
-	return ret;
-}
-
-/**
- * ipath_destroy_qp - destroy a queue pair
- * @ibqp: the queue pair to destroy
- *
- * Returns 0 on success.
- *
- * Note that this can be called while the QP is actively sending or
- * receiving!
- */
-int ipath_destroy_qp(struct ib_qp *ibqp)
-{
-	struct ipath_qp *qp = to_iqp(ibqp);
-	struct ipath_ibdev *dev = to_idev(ibqp->device);
-
-	/* Make sure HW and driver activity is stopped. */
-	spin_lock_irq(&qp->s_lock);
-	if (qp->state != IB_QPS_RESET) {
-		qp->state = IB_QPS_RESET;
-		spin_lock(&dev->pending_lock);
-		if (!list_empty(&qp->timerwait))
-			list_del_init(&qp->timerwait);
-		if (!list_empty(&qp->piowait))
-			list_del_init(&qp->piowait);
-		spin_unlock(&dev->pending_lock);
-		qp->s_flags &= ~IPATH_S_ANY_WAIT;
-		spin_unlock_irq(&qp->s_lock);
-		/* Stop the sending tasklet */
-		tasklet_kill(&qp->s_task);
-		wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
-	} else
-		spin_unlock_irq(&qp->s_lock);
-
-	ipath_free_qp(&dev->qp_table, qp);
-
-	if (qp->s_tx) {
-		atomic_dec(&qp->refcount);
-		if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
-			kfree(qp->s_tx->txreq.map_addr);
-		spin_lock_irq(&dev->pending_lock);
-		list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
-		spin_unlock_irq(&dev->pending_lock);
-		qp->s_tx = NULL;
-	}
-
-	wait_event(qp->wait, !atomic_read(&qp->refcount));
-
-	/* all user's cleaned up, mark it available */
-	free_qpn(&dev->qp_table, qp->ibqp.qp_num);
-	spin_lock(&dev->n_qps_lock);
-	dev->n_qps_allocated--;
-	spin_unlock(&dev->n_qps_lock);
-
-	if (qp->ip)
-		kref_put(&qp->ip->ref, ipath_release_mmap_info);
-	else
-		vfree(qp->r_rq.wq);
-	kfree(qp->r_ud_sg_list);
-	vfree(qp->s_wq);
-	kfree(qp);
-	return 0;
-}
-
-/**
- * ipath_init_qp_table - initialize the QP table for a device
- * @idev: the device who's QP table we're initializing
- * @size: the size of the QP table
- *
- * Returns 0 on success, otherwise returns an errno.
- */
-int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
-{
-	int i;
-	int ret;
-
-	idev->qp_table.last = 1;	/* QPN 0 and 1 are special. */
-	idev->qp_table.max = size;
-	idev->qp_table.nmaps = 1;
-	idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table),
-				       GFP_KERNEL);
-	if (idev->qp_table.table == NULL) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
-		atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
-		idev->qp_table.map[i].page = NULL;
-	}
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_get_credit - flush the send work queue of a QP
- * @qp: the qp who's send work queue to flush
- * @aeth: the Acknowledge Extended Transport Header
- *
- * The QP s_lock should be held.
- */
-void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
-{
-	u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK;
-
-	/*
-	 * If the credit is invalid, we can send
-	 * as many packets as we like.  Otherwise, we have to
-	 * honor the credit field.
-	 */
-	if (credit == IPATH_AETH_CREDIT_INVAL)
-		qp->s_lsn = (u32) -1;
-	else if (qp->s_lsn != (u32) -1) {
-		/* Compute new LSN (i.e., MSN + credit) */
-		credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK;
-		if (ipath_cmp24(credit, qp->s_lsn) > 0)
-			qp->s_lsn = credit;
-	}
-
-	/* Restart sending if it was blocked due to lack of credits. */
-	if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
-	    qp->s_cur != qp->s_head &&
-	    (qp->s_lsn == (u32) -1 ||
-	     ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
-			 qp->s_lsn + 1) <= 0))
-		ipath_schedule_send(qp);
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
deleted file mode 100644
index 79b3dbc..0000000
--- a/drivers/infiniband/hw/ipath/ipath_rc.c
+++ /dev/null
@@ -1,1969 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/io.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_RC_##x
-
-static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
-		       u32 psn, u32 pmtu)
-{
-	u32 len;
-
-	len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
-	ss->sge = wqe->sg_list[0];
-	ss->sg_list = wqe->sg_list + 1;
-	ss->num_sge = wqe->wr.num_sge;
-	ipath_skip_sge(ss, len);
-	return wqe->length - len;
-}
-
-/**
- * ipath_init_restart- initialize the qp->s_sge after a restart
- * @qp: the QP who's SGE we're restarting
- * @wqe: the work queue to initialize the QP's SGE from
- *
- * The QP s_lock should be held and interrupts disabled.
- */
-static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
-{
-	struct ipath_ibdev *dev;
-
-	qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
-				ib_mtu_enum_to_int(qp->path_mtu));
-	dev = to_idev(qp->ibqp.device);
-	spin_lock(&dev->pending_lock);
-	if (list_empty(&qp->timerwait))
-		list_add_tail(&qp->timerwait,
-			      &dev->pending[dev->pending_index]);
-	spin_unlock(&dev->pending_lock);
-}
-
-/**
- * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
- * @qp: a pointer to the QP
- * @ohdr: a pointer to the IB header being constructed
- * @pmtu: the path MTU
- *
- * Return 1 if constructed; otherwise, return 0.
- * Note that we are in the responder's side of the QP context.
- * Note the QP s_lock must be held.
- */
-static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
-			     struct ipath_other_headers *ohdr, u32 pmtu)
-{
-	struct ipath_ack_entry *e;
-	u32 hwords;
-	u32 len;
-	u32 bth0;
-	u32 bth2;
-
-	/* Don't send an ACK if we aren't supposed to. */
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-		goto bail;
-
-	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
-	hwords = 5;
-
-	switch (qp->s_ack_state) {
-	case OP(RDMA_READ_RESPONSE_LAST):
-	case OP(RDMA_READ_RESPONSE_ONLY):
-	case OP(ATOMIC_ACKNOWLEDGE):
-		/*
-		 * We can increment the tail pointer now that the last
-		 * response has been sent instead of only being
-		 * constructed.
-		 */
-		if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
-			qp->s_tail_ack_queue = 0;
-		/* FALLTHROUGH */
-	case OP(SEND_ONLY):
-	case OP(ACKNOWLEDGE):
-		/* Check for no next entry in the queue. */
-		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
-			if (qp->s_flags & IPATH_S_ACK_PENDING)
-				goto normal;
-			qp->s_ack_state = OP(ACKNOWLEDGE);
-			goto bail;
-		}
-
-		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-		if (e->opcode == OP(RDMA_READ_REQUEST)) {
-			/* Copy SGE state in case we need to resend */
-			qp->s_ack_rdma_sge = e->rdma_sge;
-			qp->s_cur_sge = &qp->s_ack_rdma_sge;
-			len = e->rdma_sge.sge.sge_length;
-			if (len > pmtu) {
-				len = pmtu;
-				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
-			} else {
-				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
-				e->sent = 1;
-			}
-			ohdr->u.aeth = ipath_compute_aeth(qp);
-			hwords++;
-			qp->s_ack_rdma_psn = e->psn;
-			bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
-		} else {
-			/* COMPARE_SWAP or FETCH_ADD */
-			qp->s_cur_sge = NULL;
-			len = 0;
-			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
-			ohdr->u.at.aeth = ipath_compute_aeth(qp);
-			ohdr->u.at.atomic_ack_eth[0] =
-				cpu_to_be32(e->atomic_data >> 32);
-			ohdr->u.at.atomic_ack_eth[1] =
-				cpu_to_be32(e->atomic_data);
-			hwords += sizeof(ohdr->u.at) / sizeof(u32);
-			bth2 = e->psn;
-			e->sent = 1;
-		}
-		bth0 = qp->s_ack_state << 24;
-		break;
-
-	case OP(RDMA_READ_RESPONSE_FIRST):
-		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-		/* FALLTHROUGH */
-	case OP(RDMA_READ_RESPONSE_MIDDLE):
-		len = qp->s_ack_rdma_sge.sge.sge_length;
-		if (len > pmtu)
-			len = pmtu;
-		else {
-			ohdr->u.aeth = ipath_compute_aeth(qp);
-			hwords++;
-			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
-			qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
-		}
-		bth0 = qp->s_ack_state << 24;
-		bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
-		break;
-
-	default:
-	normal:
-		/*
-		 * Send a regular ACK.
-		 * Set the s_ack_state so we wait until after sending
-		 * the ACK before setting s_ack_state to ACKNOWLEDGE
-		 * (see above).
-		 */
-		qp->s_ack_state = OP(SEND_ONLY);
-		qp->s_flags &= ~IPATH_S_ACK_PENDING;
-		qp->s_cur_sge = NULL;
-		if (qp->s_nak_state)
-			ohdr->u.aeth =
-				cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
-					    (qp->s_nak_state <<
-					     IPATH_AETH_CREDIT_SHIFT));
-		else
-			ohdr->u.aeth = ipath_compute_aeth(qp);
-		hwords++;
-		len = 0;
-		bth0 = OP(ACKNOWLEDGE) << 24;
-		bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
-	}
-	qp->s_hdrwords = hwords;
-	qp->s_cur_size = len;
-	ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
-	return 1;
-
-bail:
-	return 0;
-}
-
-/**
- * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
- * @qp: a pointer to the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_rc_req(struct ipath_qp *qp)
-{
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	struct ipath_other_headers *ohdr;
-	struct ipath_sge_state *ss;
-	struct ipath_swqe *wqe;
-	u32 hwords;
-	u32 len;
-	u32 bth0;
-	u32 bth2;
-	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-	char newreq;
-	unsigned long flags;
-	int ret = 0;
-
-	ohdr = &qp->s_hdr.u.oth;
-	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-		ohdr = &qp->s_hdr.u.l.oth;
-
-	/*
-	 * The lock is needed to synchronize between the sending tasklet,
-	 * the receive interrupt handler, and timeout resends.
-	 */
-	spin_lock_irqsave(&qp->s_lock, flags);
-
-	/* Sending responses has higher priority over sending requests. */
-	if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
-	     (qp->s_flags & IPATH_S_ACK_PENDING) ||
-	     qp->s_ack_state != OP(ACKNOWLEDGE)) &&
-	    ipath_make_rc_ack(dev, qp, ohdr, pmtu))
-		goto done;
-
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
-		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-			goto bail;
-		/* We are in the error state, flush the work request. */
-		if (qp->s_last == qp->s_head)
-			goto bail;
-		/* If DMAs are in progress, we can't flush immediately. */
-		if (atomic_read(&qp->s_dma_busy)) {
-			qp->s_flags |= IPATH_S_WAIT_DMA;
-			goto bail;
-		}
-		wqe = get_swqe_ptr(qp, qp->s_last);
-		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-		goto done;
-	}
-
-	/* Leave BUSY set until RNR timeout. */
-	if (qp->s_rnr_timeout) {
-		qp->s_flags |= IPATH_S_WAITING;
-		goto bail;
-	}
-
-	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
-	hwords = 5;
-	bth0 = 1 << 22; /* Set M bit */
-
-	/* Send a request. */
-	wqe = get_swqe_ptr(qp, qp->s_cur);
-	switch (qp->s_state) {
-	default:
-		if (!(ib_ipath_state_ops[qp->state] &
-		    IPATH_PROCESS_NEXT_SEND_OK))
-			goto bail;
-		/*
-		 * Resend an old request or start a new one.
-		 *
-		 * We keep track of the current SWQE so that
-		 * we don't reset the "furthest progress" state
-		 * if we need to back up.
-		 */
-		newreq = 0;
-		if (qp->s_cur == qp->s_tail) {
-			/* Check if send work queue is empty. */
-			if (qp->s_tail == qp->s_head)
-				goto bail;
-			/*
-			 * If a fence is requested, wait for previous
-			 * RDMA read and atomic operations to finish.
-			 */
-			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
-			    qp->s_num_rd_atomic) {
-				qp->s_flags |= IPATH_S_FENCE_PENDING;
-				goto bail;
-			}
-			wqe->psn = qp->s_next_psn;
-			newreq = 1;
-		}
-		/*
-		 * Note that we have to be careful not to modify the
-		 * original work request since we may need to resend
-		 * it.
-		 */
-		len = wqe->length;
-		ss = &qp->s_sge;
-		bth2 = 0;
-		switch (wqe->wr.opcode) {
-		case IB_WR_SEND:
-		case IB_WR_SEND_WITH_IMM:
-			/* If no credit, return. */
-			if (qp->s_lsn != (u32) -1 &&
-			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
-				qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
-				goto bail;
-			}
-			wqe->lpsn = wqe->psn;
-			if (len > pmtu) {
-				wqe->lpsn += (len - 1) / pmtu;
-				qp->s_state = OP(SEND_FIRST);
-				len = pmtu;
-				break;
-			}
-			if (wqe->wr.opcode == IB_WR_SEND)
-				qp->s_state = OP(SEND_ONLY);
-			else {
-				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
-				/* Immediate data comes after the BTH */
-				ohdr->u.imm_data = wqe->wr.ex.imm_data;
-				hwords += 1;
-			}
-			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-				bth0 |= 1 << 23;
-			bth2 = 1 << 31;	/* Request ACK. */
-			if (++qp->s_cur == qp->s_size)
-				qp->s_cur = 0;
-			break;
-
-		case IB_WR_RDMA_WRITE:
-			if (newreq && qp->s_lsn != (u32) -1)
-				qp->s_lsn++;
-			/* FALLTHROUGH */
-		case IB_WR_RDMA_WRITE_WITH_IMM:
-			/* If no credit, return. */
-			if (qp->s_lsn != (u32) -1 &&
-			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
-				qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
-				goto bail;
-			}
-			ohdr->u.rc.reth.vaddr =
-				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
-			ohdr->u.rc.reth.rkey =
-				cpu_to_be32(wqe->wr.wr.rdma.rkey);
-			ohdr->u.rc.reth.length = cpu_to_be32(len);
-			hwords += sizeof(struct ib_reth) / sizeof(u32);
-			wqe->lpsn = wqe->psn;
-			if (len > pmtu) {
-				wqe->lpsn += (len - 1) / pmtu;
-				qp->s_state = OP(RDMA_WRITE_FIRST);
-				len = pmtu;
-				break;
-			}
-			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-				qp->s_state = OP(RDMA_WRITE_ONLY);
-			else {
-				qp->s_state =
-					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-				/* Immediate data comes after RETH */
-				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-				hwords += 1;
-				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-					bth0 |= 1 << 23;
-			}
-			bth2 = 1 << 31;	/* Request ACK. */
-			if (++qp->s_cur == qp->s_size)
-				qp->s_cur = 0;
-			break;
-
-		case IB_WR_RDMA_READ:
-			/*
-			 * Don't allow more operations to be started
-			 * than the QP limits allow.
-			 */
-			if (newreq) {
-				if (qp->s_num_rd_atomic >=
-				    qp->s_max_rd_atomic) {
-					qp->s_flags |= IPATH_S_RDMAR_PENDING;
-					goto bail;
-				}
-				qp->s_num_rd_atomic++;
-				if (qp->s_lsn != (u32) -1)
-					qp->s_lsn++;
-				/*
-				 * Adjust s_next_psn to count the
-				 * expected number of responses.
-				 */
-				if (len > pmtu)
-					qp->s_next_psn += (len - 1) / pmtu;
-				wqe->lpsn = qp->s_next_psn++;
-			}
-			ohdr->u.rc.reth.vaddr =
-				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
-			ohdr->u.rc.reth.rkey =
-				cpu_to_be32(wqe->wr.wr.rdma.rkey);
-			ohdr->u.rc.reth.length = cpu_to_be32(len);
-			qp->s_state = OP(RDMA_READ_REQUEST);
-			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-			ss = NULL;
-			len = 0;
-			if (++qp->s_cur == qp->s_size)
-				qp->s_cur = 0;
-			break;
-
-		case IB_WR_ATOMIC_CMP_AND_SWP:
-		case IB_WR_ATOMIC_FETCH_AND_ADD:
-			/*
-			 * Don't allow more operations to be started
-			 * than the QP limits allow.
-			 */
-			if (newreq) {
-				if (qp->s_num_rd_atomic >=
-				    qp->s_max_rd_atomic) {
-					qp->s_flags |= IPATH_S_RDMAR_PENDING;
-					goto bail;
-				}
-				qp->s_num_rd_atomic++;
-				if (qp->s_lsn != (u32) -1)
-					qp->s_lsn++;
-				wqe->lpsn = wqe->psn;
-			}
-			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
-				qp->s_state = OP(COMPARE_SWAP);
-				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-					wqe->wr.wr.atomic.swap);
-				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
-					wqe->wr.wr.atomic.compare_add);
-			} else {
-				qp->s_state = OP(FETCH_ADD);
-				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-					wqe->wr.wr.atomic.compare_add);
-				ohdr->u.atomic_eth.compare_data = 0;
-			}
-			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
-				wqe->wr.wr.atomic.remote_addr >> 32);
-			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
-				wqe->wr.wr.atomic.remote_addr);
-			ohdr->u.atomic_eth.rkey = cpu_to_be32(
-				wqe->wr.wr.atomic.rkey);
-			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
-			ss = NULL;
-			len = 0;
-			if (++qp->s_cur == qp->s_size)
-				qp->s_cur = 0;
-			break;
-
-		default:
-			goto bail;
-		}
-		qp->s_sge.sge = wqe->sg_list[0];
-		qp->s_sge.sg_list = wqe->sg_list + 1;
-		qp->s_sge.num_sge = wqe->wr.num_sge;
-		qp->s_len = wqe->length;
-		if (newreq) {
-			qp->s_tail++;
-			if (qp->s_tail >= qp->s_size)
-				qp->s_tail = 0;
-		}
-		bth2 |= qp->s_psn & IPATH_PSN_MASK;
-		if (wqe->wr.opcode == IB_WR_RDMA_READ)
-			qp->s_psn = wqe->lpsn + 1;
-		else {
-			qp->s_psn++;
-			if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-				qp->s_next_psn = qp->s_psn;
-		}
-		/*
-		 * Put the QP on the pending list so lost ACKs will cause
-		 * a retry.  More than one request can be pending so the
-		 * QP may already be on the dev->pending list.
-		 */
-		spin_lock(&dev->pending_lock);
-		if (list_empty(&qp->timerwait))
-			list_add_tail(&qp->timerwait,
-				      &dev->pending[dev->pending_index]);
-		spin_unlock(&dev->pending_lock);
-		break;
-
-	case OP(RDMA_READ_RESPONSE_FIRST):
-		/*
-		 * This case can only happen if a send is restarted.
-		 * See ipath_restart_rc().
-		 */
-		ipath_init_restart(qp, wqe);
-		/* FALLTHROUGH */
-	case OP(SEND_FIRST):
-		qp->s_state = OP(SEND_MIDDLE);
-		/* FALLTHROUGH */
-	case OP(SEND_MIDDLE):
-		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-			qp->s_next_psn = qp->s_psn;
-		ss = &qp->s_sge;
-		len = qp->s_len;
-		if (len > pmtu) {
-			len = pmtu;
-			break;
-		}
-		if (wqe->wr.opcode == IB_WR_SEND)
-			qp->s_state = OP(SEND_LAST);
-		else {
-			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-			/* Immediate data comes after the BTH */
-			ohdr->u.imm_data = wqe->wr.ex.imm_data;
-			hwords += 1;
-		}
-		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-			bth0 |= 1 << 23;
-		bth2 |= 1 << 31;	/* Request ACK. */
-		qp->s_cur++;
-		if (qp->s_cur >= qp->s_size)
-			qp->s_cur = 0;
-		break;
-
-	case OP(RDMA_READ_RESPONSE_LAST):
-		/*
-		 * This case can only happen if a RDMA write is restarted.
-		 * See ipath_restart_rc().
-		 */
-		ipath_init_restart(qp, wqe);
-		/* FALLTHROUGH */
-	case OP(RDMA_WRITE_FIRST):
-		qp->s_state = OP(RDMA_WRITE_MIDDLE);
-		/* FALLTHROUGH */
-	case OP(RDMA_WRITE_MIDDLE):
-		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-			qp->s_next_psn = qp->s_psn;
-		ss = &qp->s_sge;
-		len = qp->s_len;
-		if (len > pmtu) {
-			len = pmtu;
-			break;
-		}
-		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-			qp->s_state = OP(RDMA_WRITE_LAST);
-		else {
-			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-			/* Immediate data comes after the BTH */
-			ohdr->u.imm_data = wqe->wr.ex.imm_data;
-			hwords += 1;
-			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-				bth0 |= 1 << 23;
-		}
-		bth2 |= 1 << 31;	/* Request ACK. */
-		qp->s_cur++;
-		if (qp->s_cur >= qp->s_size)
-			qp->s_cur = 0;
-		break;
-
-	case OP(RDMA_READ_RESPONSE_MIDDLE):
-		/*
-		 * This case can only happen if a RDMA read is restarted.
-		 * See ipath_restart_rc().
-		 */
-		ipath_init_restart(qp, wqe);
-		len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
-		ohdr->u.rc.reth.vaddr =
-			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
-		ohdr->u.rc.reth.rkey =
-			cpu_to_be32(wqe->wr.wr.rdma.rkey);
-		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
-		qp->s_state = OP(RDMA_READ_REQUEST);
-		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-		bth2 = qp->s_psn & IPATH_PSN_MASK;
-		qp->s_psn = wqe->lpsn + 1;
-		ss = NULL;
-		len = 0;
-		qp->s_cur++;
-		if (qp->s_cur == qp->s_size)
-			qp->s_cur = 0;
-		break;
-	}
-	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
-		bth2 |= 1 << 31;	/* Request ACK. */
-	qp->s_len -= len;
-	qp->s_hdrwords = hwords;
-	qp->s_cur_sge = ss;
-	qp->s_cur_size = len;
-	ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
-done:
-	ret = 1;
-	goto unlock;
-
-bail:
-	qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-	return ret;
-}
-
-/**
- * send_rc_ack - Construct an ACK packet and send it
- * @qp: a pointer to the QP
- *
- * This is called from ipath_rc_rcv() and only uses the receive
- * side QP state.
- * Note that RDMA reads and atomics are handled in the
- * send side QP state and tasklet.
- */
-static void send_rc_ack(struct ipath_qp *qp)
-{
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	struct ipath_devdata *dd;
-	u16 lrh0;
-	u32 bth0;
-	u32 hwords;
-	u32 __iomem *piobuf;
-	struct ipath_ib_header hdr;
-	struct ipath_other_headers *ohdr;
-	unsigned long flags;
-
-	spin_lock_irqsave(&qp->s_lock, flags);
-
-	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
-	if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
-	    (qp->s_flags & IPATH_S_ACK_PENDING) ||
-	    qp->s_ack_state != OP(ACKNOWLEDGE))
-		goto queue_ack;
-
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-
-	/* Don't try to send ACKs if the link isn't ACTIVE */
-	dd = dev->dd;
-	if (!(dd->ipath_flags & IPATH_LINKACTIVE))
-		goto done;
-
-	piobuf = ipath_getpiobuf(dd, 0, NULL);
-	if (!piobuf) {
-		/*
-		 * We are out of PIO buffers at the moment.
-		 * Pass responsibility for sending the ACK to the
-		 * send tasklet so that when a PIO buffer becomes
-		 * available, the ACK is sent ahead of other outgoing
-		 * packets.
-		 */
-		spin_lock_irqsave(&qp->s_lock, flags);
-		goto queue_ack;
-	}
-
-	/* Construct the header. */
-	ohdr = &hdr.u.oth;
-	lrh0 = IPATH_LRH_BTH;
-	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
-	hwords = 6;
-	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-		hwords += ipath_make_grh(dev, &hdr.u.l.grh,
-					 &qp->remote_ah_attr.grh,
-					 hwords, 0);
-		ohdr = &hdr.u.l.oth;
-		lrh0 = IPATH_LRH_GRH;
-	}
-	/* read pkey_index w/o lock (its atomic) */
-	bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
-		(OP(ACKNOWLEDGE) << 24) | (1 << 22);
-	if (qp->r_nak_state)
-		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
-					    (qp->r_nak_state <<
-					     IPATH_AETH_CREDIT_SHIFT));
-	else
-		ohdr->u.aeth = ipath_compute_aeth(qp);
-	lrh0 |= qp->remote_ah_attr.sl << 4;
-	hdr.lrh[0] = cpu_to_be16(lrh0);
-	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
-	hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
-				 qp->remote_ah_attr.src_path_bits);
-	ohdr->bth[0] = cpu_to_be32(bth0);
-	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
-
-	writeq(hwords + 1, piobuf);
-
-	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
-		u32 *hdrp = (u32 *) &hdr;
-
-		ipath_flush_wc();
-		__iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
-		ipath_flush_wc();
-		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
-	} else
-		__iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
-
-	ipath_flush_wc();
-
-	dev->n_unicast_xmit++;
-	goto done;
-
-queue_ack:
-	if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
-		dev->n_rc_qacks++;
-		qp->s_flags |= IPATH_S_ACK_PENDING;
-		qp->s_nak_state = qp->r_nak_state;
-		qp->s_ack_psn = qp->r_ack_psn;
-
-		/* Schedule the send tasklet. */
-		ipath_schedule_send(qp);
-	}
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-	return;
-}
-
-/**
- * reset_psn - reset the QP state to send starting from PSN
- * @qp: the QP
- * @psn: the packet sequence number to restart at
- *
- * This is called from ipath_rc_rcv() to process an incoming RC ACK
- * for the given QP.
- * Called at interrupt level with the QP s_lock held.
- */
-static void reset_psn(struct ipath_qp *qp, u32 psn)
-{
-	u32 n = qp->s_last;
-	struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
-	u32 opcode;
-
-	qp->s_cur = n;
-
-	/*
-	 * If we are starting the request from the beginning,
-	 * let the normal send code handle initialization.
-	 */
-	if (ipath_cmp24(psn, wqe->psn) <= 0) {
-		qp->s_state = OP(SEND_LAST);
-		goto done;
-	}
-
-	/* Find the work request opcode corresponding to the given PSN. */
-	opcode = wqe->wr.opcode;
-	for (;;) {
-		int diff;
-
-		if (++n == qp->s_size)
-			n = 0;
-		if (n == qp->s_tail)
-			break;
-		wqe = get_swqe_ptr(qp, n);
-		diff = ipath_cmp24(psn, wqe->psn);
-		if (diff < 0)
-			break;
-		qp->s_cur = n;
-		/*
-		 * If we are starting the request from the beginning,
-		 * let the normal send code handle initialization.
-		 */
-		if (diff == 0) {
-			qp->s_state = OP(SEND_LAST);
-			goto done;
-		}
-		opcode = wqe->wr.opcode;
-	}
-
-	/*
-	 * Set the state to restart in the middle of a request.
-	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
-	 * See ipath_make_rc_req().
-	 */
-	switch (opcode) {
-	case IB_WR_SEND:
-	case IB_WR_SEND_WITH_IMM:
-		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
-		break;
-
-	case IB_WR_RDMA_WRITE:
-	case IB_WR_RDMA_WRITE_WITH_IMM:
-		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
-		break;
-
-	case IB_WR_RDMA_READ:
-		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-		break;
-
-	default:
-		/*
-		 * This case shouldn't happen since its only
-		 * one PSN per req.
-		 */
-		qp->s_state = OP(SEND_LAST);
-	}
-done:
-	qp->s_psn = psn;
-}
-
-/**
- * ipath_restart_rc - back up requester to resend the last un-ACKed request
- * @qp: the QP to restart
- * @psn: packet sequence number for the request
- * @wc: the work completion request
- *
- * The QP s_lock should be held and interrupts disabled.
- */
-void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
-{
-	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
-	struct ipath_ibdev *dev;
-
-	if (qp->s_retry == 0) {
-		ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-		ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-		goto bail;
-	}
-	qp->s_retry--;
-
-	/*
-	 * Remove the QP from the timeout queue.
-	 * Note: it may already have been removed by ipath_ib_timer().
-	 */
-	dev = to_idev(qp->ibqp.device);
-	spin_lock(&dev->pending_lock);
-	if (!list_empty(&qp->timerwait))
-		list_del_init(&qp->timerwait);
-	if (!list_empty(&qp->piowait))
-		list_del_init(&qp->piowait);
-	spin_unlock(&dev->pending_lock);
-
-	if (wqe->wr.opcode == IB_WR_RDMA_READ)
-		dev->n_rc_resends++;
-	else
-		dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
-
-	reset_psn(qp, psn);
-	ipath_schedule_send(qp);
-
-bail:
-	return;
-}
-
-static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
-{
-	qp->s_last_psn = psn;
-}
-
-/**
- * do_rc_ack - process an incoming RC ACK
- * @qp: the QP the ACK came in on
- * @psn: the packet sequence number of the ACK
- * @opcode: the opcode of the request that resulted in the ACK
- *
- * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
- * for the given QP.
- * Called at interrupt level with the QP s_lock held and interrupts disabled.
- * Returns 1 if OK, 0 if current operation should be aborted (NAK).
- */
-static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
-		     u64 val)
-{
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	struct ib_wc wc;
-	enum ib_wc_status status;
-	struct ipath_swqe *wqe;
-	int ret = 0;
-	u32 ack_psn;
-	int diff;
-
-	/*
-	 * Remove the QP from the timeout queue (or RNR timeout queue).
-	 * If ipath_ib_timer() has already removed it,
-	 * it's OK since we hold the QP s_lock and ipath_restart_rc()
-	 * just won't find anything to restart if we ACK everything.
-	 */
-	spin_lock(&dev->pending_lock);
-	if (!list_empty(&qp->timerwait))
-		list_del_init(&qp->timerwait);
-	spin_unlock(&dev->pending_lock);
-
-	/*
-	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
-	 * requests and implicitly NAK RDMA read and atomic requests issued
-	 * before the NAK'ed request.  The MSN won't include the NAK'ed
-	 * request but will include an ACK'ed request(s).
-	 */
-	ack_psn = psn;
-	if (aeth >> 29)
-		ack_psn--;
-	wqe = get_swqe_ptr(qp, qp->s_last);
-
-	/*
-	 * The MSN might be for a later WQE than the PSN indicates so
-	 * only complete WQEs that the PSN finishes.
-	 */
-	while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
-		/*
-		 * RDMA_READ_RESPONSE_ONLY is a special case since
-		 * we want to generate completion events for everything
-		 * before the RDMA read, copy the data, then generate
-		 * the completion for the read.
-		 */
-		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
-		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
-		    diff == 0) {
-			ret = 1;
-			goto bail;
-		}
-		/*
-		 * If this request is a RDMA read or atomic, and the ACK is
-		 * for a later operation, this ACK NAKs the RDMA read or
-		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
-		 * can ACK a RDMA read and likewise for atomic ops.  Note
-		 * that the NAK case can only happen if relaxed ordering is
-		 * used and requests are sent after an RDMA read or atomic
-		 * is sent but before the response is received.
-		 */
-		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
-		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
-		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
-			/*
-			 * The last valid PSN seen is the previous
-			 * request's.
-			 */
-			update_last_psn(qp, wqe->psn - 1);
-			/* Retry this request. */
-			ipath_restart_rc(qp, wqe->psn);
-			/*
-			 * No need to process the ACK/NAK since we are
-			 * restarting an earlier request.
-			 */
-			goto bail;
-		}
-		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-			*(u64 *) wqe->sg_list[0].vaddr = val;
-		if (qp->s_num_rd_atomic &&
-		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
-		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
-			qp->s_num_rd_atomic--;
-			/* Restart sending task if fence is complete */
-			if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
-			     !qp->s_num_rd_atomic) ||
-			    qp->s_flags & IPATH_S_RDMAR_PENDING)
-				ipath_schedule_send(qp);
-		}
-		/* Post a send completion queue entry if requested. */
-		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-			memset(&wc, 0, sizeof wc);
-			wc.wr_id = wqe->wr.wr_id;
-			wc.status = IB_WC_SUCCESS;
-			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-			wc.byte_len = wqe->length;
-			wc.qp = &qp->ibqp;
-			wc.src_qp = qp->remote_qpn;
-			wc.slid = qp->remote_ah_attr.dlid;
-			wc.sl = qp->remote_ah_attr.sl;
-			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
-		}
-		qp->s_retry = qp->s_retry_cnt;
-		/*
-		 * If we are completing a request which is in the process of
-		 * being resent, we can stop resending it since we know the
-		 * responder has already seen it.
-		 */
-		if (qp->s_last == qp->s_cur) {
-			if (++qp->s_cur >= qp->s_size)
-				qp->s_cur = 0;
-			qp->s_last = qp->s_cur;
-			if (qp->s_last == qp->s_tail)
-				break;
-			wqe = get_swqe_ptr(qp, qp->s_cur);
-			qp->s_state = OP(SEND_LAST);
-			qp->s_psn = wqe->psn;
-		} else {
-			if (++qp->s_last >= qp->s_size)
-				qp->s_last = 0;
-			if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
-				qp->s_draining = 0;
-			if (qp->s_last == qp->s_tail)
-				break;
-			wqe = get_swqe_ptr(qp, qp->s_last);
-		}
-	}
-
-	switch (aeth >> 29) {
-	case 0:		/* ACK */
-		dev->n_rc_acks++;
-		/* If this is a partial ACK, reset the retransmit timer. */
-		if (qp->s_last != qp->s_tail) {
-			spin_lock(&dev->pending_lock);
-			if (list_empty(&qp->timerwait))
-				list_add_tail(&qp->timerwait,
-					&dev->pending[dev->pending_index]);
-			spin_unlock(&dev->pending_lock);
-			/*
-			 * If we get a partial ACK for a resent operation,
-			 * we can stop resending the earlier packets and
-			 * continue with the next packet the receiver wants.
-			 */
-			if (ipath_cmp24(qp->s_psn, psn) <= 0) {
-				reset_psn(qp, psn + 1);
-				ipath_schedule_send(qp);
-			}
-		} else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
-			qp->s_state = OP(SEND_LAST);
-			qp->s_psn = psn + 1;
-		}
-		ipath_get_credit(qp, aeth);
-		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-		qp->s_retry = qp->s_retry_cnt;
-		update_last_psn(qp, psn);
-		ret = 1;
-		goto bail;
-
-	case 1:		/* RNR NAK */
-		dev->n_rnr_naks++;
-		if (qp->s_last == qp->s_tail)
-			goto bail;
-		if (qp->s_rnr_retry == 0) {
-			status = IB_WC_RNR_RETRY_EXC_ERR;
-			goto class_b;
-		}
-		if (qp->s_rnr_retry_cnt < 7)
-			qp->s_rnr_retry--;
-
-		/* The last valid PSN is the previous PSN. */
-		update_last_psn(qp, psn - 1);
-
-		if (wqe->wr.opcode == IB_WR_RDMA_READ)
-			dev->n_rc_resends++;
-		else
-			dev->n_rc_resends +=
-				(qp->s_psn - psn) & IPATH_PSN_MASK;
-
-		reset_psn(qp, psn);
-
-		qp->s_rnr_timeout =
-			ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
-					   IPATH_AETH_CREDIT_MASK];
-		ipath_insert_rnr_queue(qp);
-		ipath_schedule_send(qp);
-		goto bail;
-
-	case 3:		/* NAK */
-		if (qp->s_last == qp->s_tail)
-			goto bail;
-		/* The last valid PSN is the previous PSN. */
-		update_last_psn(qp, psn - 1);
-		switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
-			IPATH_AETH_CREDIT_MASK) {
-		case 0:	/* PSN sequence error */
-			dev->n_seq_naks++;
-			/*
-			 * Back up to the responder's expected PSN.
-			 * Note that we might get a NAK in the middle of an
-			 * RDMA READ response which terminates the RDMA
-			 * READ.
-			 */
-			ipath_restart_rc(qp, psn);
-			break;
-
-		case 1:	/* Invalid Request */
-			status = IB_WC_REM_INV_REQ_ERR;
-			dev->n_other_naks++;
-			goto class_b;
-
-		case 2:	/* Remote Access Error */
-			status = IB_WC_REM_ACCESS_ERR;
-			dev->n_other_naks++;
-			goto class_b;
-
-		case 3:	/* Remote Operation Error */
-			status = IB_WC_REM_OP_ERR;
-			dev->n_other_naks++;
-		class_b:
-			ipath_send_complete(qp, wqe, status);
-			ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-			break;
-
-		default:
-			/* Ignore other reserved NAK error codes */
-			goto reserved;
-		}
-		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-		goto bail;
-
-	default:		/* 2: reserved */
-	reserved:
-		/* Ignore reserved NAK codes. */
-		goto bail;
-	}
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_rc_rcv_resp - process an incoming RC response packet
- * @dev: the device this packet came in on
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @hdrsize: the header length
- * @pmtu: the path MTU
- * @header_in_data: true if part of the header data is in the data buffer
- *
- * This is called from ipath_rc_rcv() to process an incoming RC response
- * packet for the given QP.
- * Called at interrupt level.
- */
-static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
-				     struct ipath_other_headers *ohdr,
-				     void *data, u32 tlen,
-				     struct ipath_qp *qp,
-				     u32 opcode,
-				     u32 psn, u32 hdrsize, u32 pmtu,
-				     int header_in_data)
-{
-	struct ipath_swqe *wqe;
-	enum ib_wc_status status;
-	unsigned long flags;
-	int diff;
-	u32 pad;
-	u32 aeth;
-	u64 val;
-
-	spin_lock_irqsave(&qp->s_lock, flags);
-
-	/* Double check we can process this now that we hold the s_lock. */
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-		goto ack_done;
-
-	/* Ignore invalid responses. */
-	if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
-		goto ack_done;
-
-	/* Ignore duplicate responses. */
-	diff = ipath_cmp24(psn, qp->s_last_psn);
-	if (unlikely(diff <= 0)) {
-		/* Update credits for "ghost" ACKs */
-		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
-			if (!header_in_data)
-				aeth = be32_to_cpu(ohdr->u.aeth);
-			else {
-				aeth = be32_to_cpu(((__be32 *) data)[0]);
-				data += sizeof(__be32);
-			}
-			if ((aeth >> 29) == 0)
-				ipath_get_credit(qp, aeth);
-		}
-		goto ack_done;
-	}
-
-	if (unlikely(qp->s_last == qp->s_tail))
-		goto ack_done;
-	wqe = get_swqe_ptr(qp, qp->s_last);
-	status = IB_WC_SUCCESS;
-
-	switch (opcode) {
-	case OP(ACKNOWLEDGE):
-	case OP(ATOMIC_ACKNOWLEDGE):
-	case OP(RDMA_READ_RESPONSE_FIRST):
-		if (!header_in_data)
-			aeth = be32_to_cpu(ohdr->u.aeth);
-		else {
-			aeth = be32_to_cpu(((__be32 *) data)[0]);
-			data += sizeof(__be32);
-		}
-		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
-			if (!header_in_data) {
-				__be32 *p = ohdr->u.at.atomic_ack_eth;
-
-				val = ((u64) be32_to_cpu(p[0]) << 32) |
-					be32_to_cpu(p[1]);
-			} else
-				val = be64_to_cpu(((__be64 *) data)[0]);
-		} else
-			val = 0;
-		if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
-		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
-			goto ack_done;
-		hdrsize += 4;
-		wqe = get_swqe_ptr(qp, qp->s_last);
-		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-			goto ack_op_err;
-		qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
-		/*
-		 * If this is a response to a resent RDMA read, we
-		 * have to be careful to copy the data to the right
-		 * location.
-		 */
-		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-						  wqe, psn, pmtu);
-		goto read_middle;
-
-	case OP(RDMA_READ_RESPONSE_MIDDLE):
-		/* no AETH, no ACK */
-		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
-			dev->n_rdma_seq++;
-			if (qp->r_flags & IPATH_R_RDMAR_SEQ)
-				goto ack_done;
-			qp->r_flags |= IPATH_R_RDMAR_SEQ;
-			ipath_restart_rc(qp, qp->s_last_psn + 1);
-			goto ack_done;
-		}
-		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-			goto ack_op_err;
-	read_middle:
-		if (unlikely(tlen != (hdrsize + pmtu + 4)))
-			goto ack_len_err;
-		if (unlikely(pmtu >= qp->s_rdma_read_len))
-			goto ack_len_err;
-
-		/* We got a response so update the timeout. */
-		spin_lock(&dev->pending_lock);
-		if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
-			list_move_tail(&qp->timerwait,
-				       &dev->pending[dev->pending_index]);
-		spin_unlock(&dev->pending_lock);
-
-		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
-			qp->s_retry = qp->s_retry_cnt;
-
-		/*
-		 * Update the RDMA receive state but do the copy w/o
-		 * holding the locks and blocking interrupts.
-		 */
-		qp->s_rdma_read_len -= pmtu;
-		update_last_psn(qp, psn);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
-		goto bail;
-
-	case OP(RDMA_READ_RESPONSE_ONLY):
-		if (!header_in_data)
-			aeth = be32_to_cpu(ohdr->u.aeth);
-		else
-			aeth = be32_to_cpu(((__be32 *) data)[0]);
-		if (!do_rc_ack(qp, aeth, psn, opcode, 0))
-			goto ack_done;
-		/* Get the number of bytes the message was padded by. */
-		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-		/*
-		 * Check that the data size is >= 0 && <= pmtu.
-		 * Remember to account for the AETH header (4) and
-		 * ICRC (4).
-		 */
-		if (unlikely(tlen < (hdrsize + pad + 8)))
-			goto ack_len_err;
-		/*
-		 * If this is a response to a resent RDMA read, we
-		 * have to be careful to copy the data to the right
-		 * location.
-		 */
-		wqe = get_swqe_ptr(qp, qp->s_last);
-		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-						  wqe, psn, pmtu);
-		goto read_last;
-
-	case OP(RDMA_READ_RESPONSE_LAST):
-		/* ACKs READ req. */
-		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
-			dev->n_rdma_seq++;
-			if (qp->r_flags & IPATH_R_RDMAR_SEQ)
-				goto ack_done;
-			qp->r_flags |= IPATH_R_RDMAR_SEQ;
-			ipath_restart_rc(qp, qp->s_last_psn + 1);
-			goto ack_done;
-		}
-		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-			goto ack_op_err;
-		/* Get the number of bytes the message was padded by. */
-		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-		/*
-		 * Check that the data size is >= 1 && <= pmtu.
-		 * Remember to account for the AETH header (4) and
-		 * ICRC (4).
-		 */
-		if (unlikely(tlen <= (hdrsize + pad + 8)))
-			goto ack_len_err;
-	read_last:
-		tlen -= hdrsize + pad + 8;
-		if (unlikely(tlen != qp->s_rdma_read_len))
-			goto ack_len_err;
-		if (!header_in_data)
-			aeth = be32_to_cpu(ohdr->u.aeth);
-		else {
-			aeth = be32_to_cpu(((__be32 *) data)[0]);
-			data += sizeof(__be32);
-		}
-		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
-		(void) do_rc_ack(qp, aeth, psn,
-				 OP(RDMA_READ_RESPONSE_LAST), 0);
-		goto ack_done;
-	}
-
-ack_op_err:
-	status = IB_WC_LOC_QP_OP_ERR;
-	goto ack_err;
-
-ack_len_err:
-	status = IB_WC_LOC_LEN_ERR;
-ack_err:
-	ipath_send_complete(qp, wqe, status);
-	ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-ack_done:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-bail:
-	return;
-}
-
-/**
- * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
- * @dev: the device this packet came in on
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @diff: the difference between the PSN and the expected PSN
- * @header_in_data: true if part of the header data is in the data buffer
- *
- * This is called from ipath_rc_rcv() to process an unexpected
- * incoming RC packet for the given QP.
- * Called at interrupt level.
- * Return 1 if no more processing is needed; otherwise return 0 to
- * schedule a response to be sent.
- */
-static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
-				     struct ipath_other_headers *ohdr,
-				     void *data,
-				     struct ipath_qp *qp,
-				     u32 opcode,
-				     u32 psn,
-				     int diff,
-				     int header_in_data)
-{
-	struct ipath_ack_entry *e;
-	u8 i, prev;
-	int old_req;
-	unsigned long flags;
-
-	if (diff > 0) {
-		/*
-		 * Packet sequence error.
-		 * A NAK will ACK earlier sends and RDMA writes.
-		 * Don't queue the NAK if we already sent one.
-		 */
-		if (!qp->r_nak_state) {
-			qp->r_nak_state = IB_NAK_PSN_ERROR;
-			/* Use the expected PSN. */
-			qp->r_ack_psn = qp->r_psn;
-			goto send_ack;
-		}
-		goto done;
-	}
-
-	/*
-	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
-	 * write or atomic op.  Don't NAK errors, just silently drop
-	 * the duplicate request.  Note that r_sge, r_len, and
-	 * r_rcv_len may be in use so don't modify them.
-	 *
-	 * We are supposed to ACK the earliest duplicate PSN but we
-	 * can coalesce an outstanding duplicate ACK.  We have to
-	 * send the earliest so that RDMA reads can be restarted at
-	 * the requester's expected PSN.
-	 *
-	 * First, find where this duplicate PSN falls within the
-	 * ACKs previously sent.
-	 */
-	psn &= IPATH_PSN_MASK;
-	e = NULL;
-	old_req = 1;
-
-	spin_lock_irqsave(&qp->s_lock, flags);
-	/* Double check we can process this now that we hold the s_lock. */
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-		goto unlock_done;
-
-	for (i = qp->r_head_ack_queue; ; i = prev) {
-		if (i == qp->s_tail_ack_queue)
-			old_req = 0;
-		if (i)
-			prev = i - 1;
-		else
-			prev = IPATH_MAX_RDMA_ATOMIC;
-		if (prev == qp->r_head_ack_queue) {
-			e = NULL;
-			break;
-		}
-		e = &qp->s_ack_queue[prev];
-		if (!e->opcode) {
-			e = NULL;
-			break;
-		}
-		if (ipath_cmp24(psn, e->psn) >= 0) {
-			if (prev == qp->s_tail_ack_queue)
-				old_req = 0;
-			break;
-		}
-	}
-	switch (opcode) {
-	case OP(RDMA_READ_REQUEST): {
-		struct ib_reth *reth;
-		u32 offset;
-		u32 len;
-
-		/*
-		 * If we didn't find the RDMA read request in the ack queue,
-		 * or the send tasklet is already backed up to send an
-		 * earlier entry, we can ignore this request.
-		 */
-		if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
-			goto unlock_done;
-		/* RETH comes after BTH */
-		if (!header_in_data)
-			reth = &ohdr->u.rc.reth;
-		else {
-			reth = (struct ib_reth *)data;
-			data += sizeof(*reth);
-		}
-		/*
-		 * Address range must be a subset of the original
-		 * request and start on pmtu boundaries.
-		 * We reuse the old ack_queue slot since the requester
-		 * should not back up and request an earlier PSN for the
-		 * same request.
-		 */
-		offset = ((psn - e->psn) & IPATH_PSN_MASK) *
-			ib_mtu_enum_to_int(qp->path_mtu);
-		len = be32_to_cpu(reth->length);
-		if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
-			goto unlock_done;
-		if (len != 0) {
-			u32 rkey = be32_to_cpu(reth->rkey);
-			u64 vaddr = be64_to_cpu(reth->vaddr);
-			int ok;
-
-			ok = ipath_rkey_ok(qp, &e->rdma_sge,
-					   len, vaddr, rkey,
-					   IB_ACCESS_REMOTE_READ);
-			if (unlikely(!ok))
-				goto unlock_done;
-		} else {
-			e->rdma_sge.sg_list = NULL;
-			e->rdma_sge.num_sge = 0;
-			e->rdma_sge.sge.mr = NULL;
-			e->rdma_sge.sge.vaddr = NULL;
-			e->rdma_sge.sge.length = 0;
-			e->rdma_sge.sge.sge_length = 0;
-		}
-		e->psn = psn;
-		qp->s_ack_state = OP(ACKNOWLEDGE);
-		qp->s_tail_ack_queue = prev;
-		break;
-	}
-
-	case OP(COMPARE_SWAP):
-	case OP(FETCH_ADD): {
-		/*
-		 * If we didn't find the atomic request in the ack queue
-		 * or the send tasklet is already backed up to send an
-		 * earlier entry, we can ignore this request.
-		 */
-		if (!e || e->opcode != (u8) opcode || old_req)
-			goto unlock_done;
-		qp->s_ack_state = OP(ACKNOWLEDGE);
-		qp->s_tail_ack_queue = prev;
-		break;
-	}
-
-	default:
-		if (old_req)
-			goto unlock_done;
-		/*
-		 * Resend the most recent ACK if this request is
-		 * after all the previous RDMA reads and atomics.
-		 */
-		if (i == qp->r_head_ack_queue) {
-			spin_unlock_irqrestore(&qp->s_lock, flags);
-			qp->r_nak_state = 0;
-			qp->r_ack_psn = qp->r_psn - 1;
-			goto send_ack;
-		}
-		/*
-		 * Try to send a simple ACK to work around a Mellanox bug
-		 * which doesn't accept a RDMA read response or atomic
-		 * response as an ACK for earlier SENDs or RDMA writes.
-		 */
-		if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
-		    !(qp->s_flags & IPATH_S_ACK_PENDING) &&
-		    qp->s_ack_state == OP(ACKNOWLEDGE)) {
-			spin_unlock_irqrestore(&qp->s_lock, flags);
-			qp->r_nak_state = 0;
-			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
-			goto send_ack;
-		}
-		/*
-		 * Resend the RDMA read or atomic op which
-		 * ACKs this duplicate request.
-		 */
-		qp->s_ack_state = OP(ACKNOWLEDGE);
-		qp->s_tail_ack_queue = i;
-		break;
-	}
-	qp->r_nak_state = 0;
-	ipath_schedule_send(qp);
-
-unlock_done:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-	return 1;
-
-send_ack:
-	return 0;
-}
-
-void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
-{
-	unsigned long flags;
-	int lastwqe;
-
-	spin_lock_irqsave(&qp->s_lock, flags);
-	lastwqe = ipath_error_qp(qp, err);
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-
-	if (lastwqe) {
-		struct ib_event ev;
-
-		ev.device = qp->ibqp.device;
-		ev.element.qp = &qp->ibqp;
-		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-	}
-}
-
-static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
-{
-	unsigned next;
-
-	next = n + 1;
-	if (next > IPATH_MAX_RDMA_ATOMIC)
-		next = 0;
-	if (n == qp->s_tail_ack_queue) {
-		qp->s_tail_ack_queue = next;
-		qp->s_ack_state = OP(ACKNOWLEDGE);
-	}
-}
-
-/**
- * ipath_rc_rcv - process an incoming RC packet
- * @dev: the device this packet came in on
- * @hdr: the header of this packet
- * @has_grh: true if the header has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP for this packet
- *
- * This is called from ipath_qp_rcv() to process an incoming RC packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-	struct ipath_other_headers *ohdr;
-	u32 opcode;
-	u32 hdrsize;
-	u32 psn;
-	u32 pad;
-	struct ib_wc wc;
-	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-	int diff;
-	struct ib_reth *reth;
-	int header_in_data;
-	unsigned long flags;
-
-	/* Validate the SLID. See Ch. 9.6.1.5 */
-	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
-		goto done;
-
-	/* Check for GRH */
-	if (!has_grh) {
-		ohdr = &hdr->u.oth;
-		hdrsize = 8 + 12;	/* LRH + BTH */
-		psn = be32_to_cpu(ohdr->bth[2]);
-		header_in_data = 0;
-	} else {
-		ohdr = &hdr->u.l.oth;
-		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
-		/*
-		 * The header with GRH is 60 bytes and the core driver sets
-		 * the eager header buffer size to 56 bytes so the last 4
-		 * bytes of the BTH header (PSN) is in the data buffer.
-		 */
-		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-		if (header_in_data) {
-			psn = be32_to_cpu(((__be32 *) data)[0]);
-			data += sizeof(__be32);
-		} else
-			psn = be32_to_cpu(ohdr->bth[2]);
-	}
-
-	/*
-	 * Process responses (ACKs) before anything else.  Note that the
-	 * packet sequence number will be for something in the send work
-	 * queue rather than the expected receive packet sequence number.
-	 * In other words, this QP is the requester.
-	 */
-	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
-		ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
-				  hdrsize, pmtu, header_in_data);
-		goto done;
-	}
-
-	/* Compute 24 bits worth of difference. */
-	diff = ipath_cmp24(psn, qp->r_psn);
-	if (unlikely(diff)) {
-		if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
-				       psn, diff, header_in_data))
-			goto done;
-		goto send_ack;
-	}
-
-	/* Check for opcode sequence errors. */
-	switch (qp->r_state) {
-	case OP(SEND_FIRST):
-	case OP(SEND_MIDDLE):
-		if (opcode == OP(SEND_MIDDLE) ||
-		    opcode == OP(SEND_LAST) ||
-		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-			break;
-		goto nack_inv;
-
-	case OP(RDMA_WRITE_FIRST):
-	case OP(RDMA_WRITE_MIDDLE):
-		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-		    opcode == OP(RDMA_WRITE_LAST) ||
-		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-			break;
-		goto nack_inv;
-
-	default:
-		if (opcode == OP(SEND_MIDDLE) ||
-		    opcode == OP(SEND_LAST) ||
-		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
-		    opcode == OP(RDMA_WRITE_MIDDLE) ||
-		    opcode == OP(RDMA_WRITE_LAST) ||
-		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-			goto nack_inv;
-		/*
-		 * Note that it is up to the requester to not send a new
-		 * RDMA read or atomic operation before receiving an ACK
-		 * for the previous operation.
-		 */
-		break;
-	}
-
-	memset(&wc, 0, sizeof wc);
-
-	/* OK, process the packet. */
-	switch (opcode) {
-	case OP(SEND_FIRST):
-		if (!ipath_get_rwqe(qp, 0))
-			goto rnr_nak;
-		qp->r_rcv_len = 0;
-		/* FALLTHROUGH */
-	case OP(SEND_MIDDLE):
-	case OP(RDMA_WRITE_MIDDLE):
-	send_middle:
-		/* Check for invalid length PMTU or posted rwqe len. */
-		if (unlikely(tlen != (hdrsize + pmtu + 4)))
-			goto nack_inv;
-		qp->r_rcv_len += pmtu;
-		if (unlikely(qp->r_rcv_len > qp->r_len))
-			goto nack_inv;
-		ipath_copy_sge(&qp->r_sge, data, pmtu);
-		break;
-
-	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-		/* consume RWQE */
-		if (!ipath_get_rwqe(qp, 1))
-			goto rnr_nak;
-		goto send_last_imm;
-
-	case OP(SEND_ONLY):
-	case OP(SEND_ONLY_WITH_IMMEDIATE):
-		if (!ipath_get_rwqe(qp, 0))
-			goto rnr_nak;
-		qp->r_rcv_len = 0;
-		if (opcode == OP(SEND_ONLY))
-			goto send_last;
-		/* FALLTHROUGH */
-	case OP(SEND_LAST_WITH_IMMEDIATE):
-	send_last_imm:
-		if (header_in_data) {
-			wc.ex.imm_data = *(__be32 *) data;
-			data += sizeof(__be32);
-		} else {
-			/* Immediate data comes after BTH */
-			wc.ex.imm_data = ohdr->u.imm_data;
-		}
-		hdrsize += 4;
-		wc.wc_flags = IB_WC_WITH_IMM;
-		/* FALLTHROUGH */
-	case OP(SEND_LAST):
-	case OP(RDMA_WRITE_LAST):
-	send_last:
-		/* Get the number of bytes the message was padded by. */
-		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-		/* Check for invalid length. */
-		/* XXX LAST len should be >= 1 */
-		if (unlikely(tlen < (hdrsize + pad + 4)))
-			goto nack_inv;
-		/* Don't count the CRC. */
-		tlen -= (hdrsize + pad + 4);
-		wc.byte_len = tlen + qp->r_rcv_len;
-		if (unlikely(wc.byte_len > qp->r_len))
-			goto nack_inv;
-		ipath_copy_sge(&qp->r_sge, data, tlen);
-		qp->r_msn++;
-		if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-			break;
-		wc.wr_id = qp->r_wr_id;
-		wc.status = IB_WC_SUCCESS;
-		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
-		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-		else
-			wc.opcode = IB_WC_RECV;
-		wc.qp = &qp->ibqp;
-		wc.src_qp = qp->remote_qpn;
-		wc.slid = qp->remote_ah_attr.dlid;
-		wc.sl = qp->remote_ah_attr.sl;
-		/* Signal completion event if the solicited bit is set. */
-		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-			       (ohdr->bth[0] &
-				cpu_to_be32(1 << 23)) != 0);
-		break;
-
-	case OP(RDMA_WRITE_FIRST):
-	case OP(RDMA_WRITE_ONLY):
-	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-		if (unlikely(!(qp->qp_access_flags &
-			       IB_ACCESS_REMOTE_WRITE)))
-			goto nack_inv;
-		/* consume RWQE */
-		/* RETH comes after BTH */
-		if (!header_in_data)
-			reth = &ohdr->u.rc.reth;
-		else {
-			reth = (struct ib_reth *)data;
-			data += sizeof(*reth);
-		}
-		hdrsize += sizeof(*reth);
-		qp->r_len = be32_to_cpu(reth->length);
-		qp->r_rcv_len = 0;
-		if (qp->r_len != 0) {
-			u32 rkey = be32_to_cpu(reth->rkey);
-			u64 vaddr = be64_to_cpu(reth->vaddr);
-			int ok;
-
-			/* Check rkey & NAK */
-			ok = ipath_rkey_ok(qp, &qp->r_sge,
-					   qp->r_len, vaddr, rkey,
-					   IB_ACCESS_REMOTE_WRITE);
-			if (unlikely(!ok))
-				goto nack_acc;
-		} else {
-			qp->r_sge.sg_list = NULL;
-			qp->r_sge.sge.mr = NULL;
-			qp->r_sge.sge.vaddr = NULL;
-			qp->r_sge.sge.length = 0;
-			qp->r_sge.sge.sge_length = 0;
-		}
-		if (opcode == OP(RDMA_WRITE_FIRST))
-			goto send_middle;
-		else if (opcode == OP(RDMA_WRITE_ONLY))
-			goto send_last;
-		if (!ipath_get_rwqe(qp, 1))
-			goto rnr_nak;
-		goto send_last_imm;
-
-	case OP(RDMA_READ_REQUEST): {
-		struct ipath_ack_entry *e;
-		u32 len;
-		u8 next;
-
-		if (unlikely(!(qp->qp_access_flags &
-			       IB_ACCESS_REMOTE_READ)))
-			goto nack_inv;
-		next = qp->r_head_ack_queue + 1;
-		if (next > IPATH_MAX_RDMA_ATOMIC)
-			next = 0;
-		spin_lock_irqsave(&qp->s_lock, flags);
-		/* Double check we can process this while holding the s_lock. */
-		if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-			goto unlock;
-		if (unlikely(next == qp->s_tail_ack_queue)) {
-			if (!qp->s_ack_queue[next].sent)
-				goto nack_inv_unlck;
-			ipath_update_ack_queue(qp, next);
-		}
-		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		/* RETH comes after BTH */
-		if (!header_in_data)
-			reth = &ohdr->u.rc.reth;
-		else {
-			reth = (struct ib_reth *)data;
-			data += sizeof(*reth);
-		}
-		len = be32_to_cpu(reth->length);
-		if (len) {
-			u32 rkey = be32_to_cpu(reth->rkey);
-			u64 vaddr = be64_to_cpu(reth->vaddr);
-			int ok;
-
-			/* Check rkey & NAK */
-			ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
-					   rkey, IB_ACCESS_REMOTE_READ);
-			if (unlikely(!ok))
-				goto nack_acc_unlck;
-			/*
-			 * Update the next expected PSN.  We add 1 later
-			 * below, so only add the remainder here.
-			 */
-			if (len > pmtu)
-				qp->r_psn += (len - 1) / pmtu;
-		} else {
-			e->rdma_sge.sg_list = NULL;
-			e->rdma_sge.num_sge = 0;
-			e->rdma_sge.sge.mr = NULL;
-			e->rdma_sge.sge.vaddr = NULL;
-			e->rdma_sge.sge.length = 0;
-			e->rdma_sge.sge.sge_length = 0;
-		}
-		e->opcode = opcode;
-		e->sent = 0;
-		e->psn = psn;
-		/*
-		 * We need to increment the MSN here instead of when we
-		 * finish sending the result since a duplicate request would
-		 * increment it more than once.
-		 */
-		qp->r_msn++;
-		qp->r_psn++;
-		qp->r_state = opcode;
-		qp->r_nak_state = 0;
-		qp->r_head_ack_queue = next;
-
-		/* Schedule the send tasklet. */
-		ipath_schedule_send(qp);
-
-		goto unlock;
-	}
-
-	case OP(COMPARE_SWAP):
-	case OP(FETCH_ADD): {
-		struct ib_atomic_eth *ateth;
-		struct ipath_ack_entry *e;
-		u64 vaddr;
-		atomic64_t *maddr;
-		u64 sdata;
-		u32 rkey;
-		u8 next;
-
-		if (unlikely(!(qp->qp_access_flags &
-			       IB_ACCESS_REMOTE_ATOMIC)))
-			goto nack_inv;
-		next = qp->r_head_ack_queue + 1;
-		if (next > IPATH_MAX_RDMA_ATOMIC)
-			next = 0;
-		spin_lock_irqsave(&qp->s_lock, flags);
-		/* Double check we can process this while holding the s_lock. */
-		if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-			goto unlock;
-		if (unlikely(next == qp->s_tail_ack_queue)) {
-			if (!qp->s_ack_queue[next].sent)
-				goto nack_inv_unlck;
-			ipath_update_ack_queue(qp, next);
-		}
-		if (!header_in_data)
-			ateth = &ohdr->u.atomic_eth;
-		else
-			ateth = (struct ib_atomic_eth *)data;
-		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
-			be32_to_cpu(ateth->vaddr[1]);
-		if (unlikely(vaddr & (sizeof(u64) - 1)))
-			goto nack_inv_unlck;
-		rkey = be32_to_cpu(ateth->rkey);
-		/* Check rkey & NAK */
-		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
-					    sizeof(u64), vaddr, rkey,
-					    IB_ACCESS_REMOTE_ATOMIC)))
-			goto nack_acc_unlck;
-		/* Perform atomic OP and save result. */
-		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
-		sdata = be64_to_cpu(ateth->swap_data);
-		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
-			(u64) atomic64_add_return(sdata, maddr) - sdata :
-			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
-				      be64_to_cpu(ateth->compare_data),
-				      sdata);
-		e->opcode = opcode;
-		e->sent = 0;
-		e->psn = psn & IPATH_PSN_MASK;
-		qp->r_msn++;
-		qp->r_psn++;
-		qp->r_state = opcode;
-		qp->r_nak_state = 0;
-		qp->r_head_ack_queue = next;
-
-		/* Schedule the send tasklet. */
-		ipath_schedule_send(qp);
-
-		goto unlock;
-	}
-
-	default:
-		/* NAK unknown opcodes. */
-		goto nack_inv;
-	}
-	qp->r_psn++;
-	qp->r_state = opcode;
-	qp->r_ack_psn = psn;
-	qp->r_nak_state = 0;
-	/* Send an ACK if requested or required. */
-	if (psn & (1 << 31))
-		goto send_ack;
-	goto done;
-
-rnr_nak:
-	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
-	qp->r_ack_psn = qp->r_psn;
-	goto send_ack;
-
-nack_inv_unlck:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_inv:
-	ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
-	qp->r_ack_psn = qp->r_psn;
-	goto send_ack;
-
-nack_acc_unlck:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_acc:
-	ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
-	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
-	qp->r_ack_psn = qp->r_psn;
-send_ack:
-	send_rc_ack(qp);
-	goto done;
-
-unlock:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-	return;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h
deleted file mode 100644
index 8f44d0c..0000000
--- a/drivers/infiniband/hw/ipath/ipath_registers.h
+++ /dev/null
@@ -1,512 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_REGISTERS_H
-#define _IPATH_REGISTERS_H
-
-/*
- * This file should only be included by kernel source, and by the diags.  It
- * defines the registers, and their contents, for InfiniPath chips.
- */
-
-/*
- * These are the InfiniPath register and buffer bit definitions,
- * that are visible to software, and needed only by the kernel
- * and diag code.  A few, that are visible to protocol and user
- * code are in ipath_common.h.  Some bits are specific
- * to a given chip implementation, and have been moved to the
- * chip-specific source file
- */
-
-/* kr_revision bits */
-#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
-#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
-#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
-#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
-#define INFINIPATH_R_ARCH_MASK 0xFF
-#define INFINIPATH_R_ARCH_SHIFT 16
-#define INFINIPATH_R_SOFTWARE_MASK 0xFF
-#define INFINIPATH_R_SOFTWARE_SHIFT 24
-#define INFINIPATH_R_BOARDID_MASK 0xFF
-#define INFINIPATH_R_BOARDID_SHIFT 32
-
-/* kr_control bits */
-#define INFINIPATH_C_FREEZEMODE 0x00000002
-#define INFINIPATH_C_LINKENABLE 0x00000004
-
-/* kr_sendctrl bits */
-#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
-#define INFINIPATH_S_UPDTHRESH_SHIFT 24
-#define INFINIPATH_S_UPDTHRESH_MASK 0x1f
-
-#define IPATH_S_ABORT		0
-#define IPATH_S_PIOINTBUFAVAIL	1
-#define IPATH_S_PIOBUFAVAILUPD	2
-#define IPATH_S_PIOENABLE	3
-#define IPATH_S_SDMAINTENABLE	9
-#define IPATH_S_SDMASINGLEDESCRIPTOR	10
-#define IPATH_S_SDMAENABLE	11
-#define IPATH_S_SDMAHALT	12
-#define IPATH_S_DISARM		31
-
-#define INFINIPATH_S_ABORT		(1U << IPATH_S_ABORT)
-#define INFINIPATH_S_PIOINTBUFAVAIL	(1U << IPATH_S_PIOINTBUFAVAIL)
-#define INFINIPATH_S_PIOBUFAVAILUPD	(1U << IPATH_S_PIOBUFAVAILUPD)
-#define INFINIPATH_S_PIOENABLE		(1U << IPATH_S_PIOENABLE)
-#define INFINIPATH_S_SDMAINTENABLE	(1U << IPATH_S_SDMAINTENABLE)
-#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
-					(1U << IPATH_S_SDMASINGLEDESCRIPTOR)
-#define INFINIPATH_S_SDMAENABLE		(1U << IPATH_S_SDMAENABLE)
-#define INFINIPATH_S_SDMAHALT		(1U << IPATH_S_SDMAHALT)
-#define INFINIPATH_S_DISARM		(1U << IPATH_S_DISARM)
-
-/* kr_rcvctrl bits that are the same on multiple chips */
-#define INFINIPATH_R_PORTENABLE_SHIFT 0
-#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
-
-/* kr_intstatus, kr_intclear, kr_intmask bits */
-#define INFINIPATH_I_SDMAINT		0x8000000000000000ULL
-#define INFINIPATH_I_SDMADISABLED	0x4000000000000000ULL
-#define INFINIPATH_I_ERROR		0x0000000080000000ULL
-#define INFINIPATH_I_SPIOSENT		0x0000000040000000ULL
-#define INFINIPATH_I_SPIOBUFAVAIL	0x0000000020000000ULL
-#define INFINIPATH_I_GPIO		0x0000000010000000ULL
-#define INFINIPATH_I_JINT		0x0000000004000000ULL
-
-/* kr_errorstatus, kr_errorclear, kr_errormask bits */
-#define INFINIPATH_E_RFORMATERR			0x0000000000000001ULL
-#define INFINIPATH_E_RVCRC			0x0000000000000002ULL
-#define INFINIPATH_E_RICRC			0x0000000000000004ULL
-#define INFINIPATH_E_RMINPKTLEN			0x0000000000000008ULL
-#define INFINIPATH_E_RMAXPKTLEN			0x0000000000000010ULL
-#define INFINIPATH_E_RLONGPKTLEN		0x0000000000000020ULL
-#define INFINIPATH_E_RSHORTPKTLEN		0x0000000000000040ULL
-#define INFINIPATH_E_RUNEXPCHAR			0x0000000000000080ULL
-#define INFINIPATH_E_RUNSUPVL			0x0000000000000100ULL
-#define INFINIPATH_E_REBP			0x0000000000000200ULL
-#define INFINIPATH_E_RIBFLOW			0x0000000000000400ULL
-#define INFINIPATH_E_RBADVERSION		0x0000000000000800ULL
-#define INFINIPATH_E_RRCVEGRFULL		0x0000000000001000ULL
-#define INFINIPATH_E_RRCVHDRFULL		0x0000000000002000ULL
-#define INFINIPATH_E_RBADTID			0x0000000000004000ULL
-#define INFINIPATH_E_RHDRLEN			0x0000000000008000ULL
-#define INFINIPATH_E_RHDR			0x0000000000010000ULL
-#define INFINIPATH_E_RIBLOSTLINK		0x0000000000020000ULL
-#define INFINIPATH_E_SENDSPECIALTRIGGER		0x0000000008000000ULL
-#define INFINIPATH_E_SDMADISABLED		0x0000000010000000ULL
-#define INFINIPATH_E_SMINPKTLEN			0x0000000020000000ULL
-#define INFINIPATH_E_SMAXPKTLEN			0x0000000040000000ULL
-#define INFINIPATH_E_SUNDERRUN			0x0000000080000000ULL
-#define INFINIPATH_E_SPKTLEN			0x0000000100000000ULL
-#define INFINIPATH_E_SDROPPEDSMPPKT		0x0000000200000000ULL
-#define INFINIPATH_E_SDROPPEDDATAPKT		0x0000000400000000ULL
-#define INFINIPATH_E_SPIOARMLAUNCH		0x0000000800000000ULL
-#define INFINIPATH_E_SUNEXPERRPKTNUM		0x0000001000000000ULL
-#define INFINIPATH_E_SUNSUPVL			0x0000002000000000ULL
-#define INFINIPATH_E_SENDBUFMISUSE		0x0000004000000000ULL
-#define INFINIPATH_E_SDMAGENMISMATCH		0x0000008000000000ULL
-#define INFINIPATH_E_SDMAOUTOFBOUND		0x0000010000000000ULL
-#define INFINIPATH_E_SDMATAILOUTOFBOUND		0x0000020000000000ULL
-#define INFINIPATH_E_SDMABASE			0x0000040000000000ULL
-#define INFINIPATH_E_SDMA1STDESC		0x0000080000000000ULL
-#define INFINIPATH_E_SDMARPYTAG			0x0000100000000000ULL
-#define INFINIPATH_E_SDMADWEN			0x0000200000000000ULL
-#define INFINIPATH_E_SDMAMISSINGDW		0x0000400000000000ULL
-#define INFINIPATH_E_SDMAUNEXPDATA		0x0000800000000000ULL
-#define INFINIPATH_E_IBSTATUSCHANGED		0x0001000000000000ULL
-#define INFINIPATH_E_INVALIDADDR		0x0002000000000000ULL
-#define INFINIPATH_E_RESET			0x0004000000000000ULL
-#define INFINIPATH_E_HARDWARE			0x0008000000000000ULL
-#define INFINIPATH_E_SDMADESCADDRMISALIGN	0x0010000000000000ULL
-#define INFINIPATH_E_INVALIDEEPCMD		0x0020000000000000ULL
-
-/*
- * this is used to print "common" packet errors only when the
- * __IPATH_ERRPKTDBG bit is set in ipath_debug.
- */
-#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \
-		| INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \
-		| INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \
-		| INFINIPATH_E_REBP )
-
-/* Convenience for decoding Send DMA errors */
-#define INFINIPATH_E_SDMAERRS ( \
-	INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \
-	INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \
-	INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \
-	INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \
-	INFINIPATH_E_SDMAUNEXPDATA | \
-	INFINIPATH_E_SDMADESCADDRMISALIGN | \
-	INFINIPATH_E_SDMADISABLED | \
-	INFINIPATH_E_SENDBUFMISUSE)
-
-/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
-/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
- * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2:  expTID, 3: eagerTID
- * 		bit 4: flag buffer, 5: datainfo, 6: header info */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
-#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
-#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
-#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
-/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF	0x1ULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC	0x2ULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
-/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
-#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF   0x01ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ  0x02ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID   0x04ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF  0x10ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO  0x40ULL
-/* waldo specific -- find the rest in ipath_6110.c */
-#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
-/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */
-#define INFINIPATH_HWE_MEMBISTFAILED	0x0040000000000000ULL
-
-/* kr_hwdiagctrl bits */
-#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
-#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
-#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
-#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
-#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR  0x0000000400000000ULL
-#define INFINIPATH_DC_COUNTERDISABLE            0x1000000000000000ULL
-#define INFINIPATH_DC_COUNTERWREN               0x2000000000000000ULL
-#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
-#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
-
-/* kr_ibcctrl bits */
-#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
-#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
-#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
-#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
-#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
-#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
-/* cycle through TS1/TS2 till OK */
-#define INFINIPATH_IBCC_LINKINITCMD_POLL 2
-/* wait for TS1, then go on */
-#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3
-#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
-#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
-#define INFINIPATH_IBCC_LINKCMD_DOWN 1		/* move to 0x11 */
-#define INFINIPATH_IBCC_LINKCMD_ARMED 2		/* move to 0x21 */
-#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3	/* move to 0x31 */
-#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
-#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
-#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
-#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
-#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
-#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
-#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
-#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
-#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
-#define INFINIPATH_IBCC_LOOPBACK             0x8000000000000000ULL
-#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
-
-/* kr_ibcstatus bits */
-#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
-#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
-
-#define INFINIPATH_IBCS_TXREADY       0x40000000
-#define INFINIPATH_IBCS_TXCREDITOK    0x80000000
-/* link training states (shift by
-   INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
-#define INFINIPATH_IBCS_LT_STATE_DISABLED	0x00
-#define INFINIPATH_IBCS_LT_STATE_LINKUP		0x01
-#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE	0x02
-#define INFINIPATH_IBCS_LT_STATE_POLLQUIET	0x03
-#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY	0x04
-#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET	0x05
-#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE	0x08
-#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG	0x09
-#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT	0x0a
-#define INFINIPATH_IBCS_LT_STATE_CFGIDLE	0x0b
-#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN	0x0c
-#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT	0x0e
-#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE	0x0f
-/* link state machine states (shift by ibcs_ls_shift) */
-#define INFINIPATH_IBCS_L_STATE_DOWN		0x0
-#define INFINIPATH_IBCS_L_STATE_INIT		0x1
-#define INFINIPATH_IBCS_L_STATE_ARM		0x2
-#define INFINIPATH_IBCS_L_STATE_ACTIVE		0x3
-#define INFINIPATH_IBCS_L_STATE_ACT_DEFER	0x4
-
-
-/* kr_extstatus bits */
-#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
-#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
-#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
-
-/* kr_extctrl bits */
-#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
-#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
-#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
-#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
-#define INFINIPATH_EXTC_SERDESENABLE         0x80000000ULL
-#define INFINIPATH_EXTC_SERDESCONNECT        0x40000000ULL
-#define INFINIPATH_EXTC_SERDESENTRUNKING     0x20000000ULL
-#define INFINIPATH_EXTC_SERDESDISRXFIFO      0x10000000ULL
-#define INFINIPATH_EXTC_SERDESENPLPBK1       0x08000000ULL
-#define INFINIPATH_EXTC_SERDESENPLPBK2       0x04000000ULL
-#define INFINIPATH_EXTC_SERDESENENCDEC       0x02000000ULL
-#define INFINIPATH_EXTC_LED1SECPORT_ON       0x00000020ULL
-#define INFINIPATH_EXTC_LED2SECPORT_ON       0x00000010ULL
-#define INFINIPATH_EXTC_LED1PRIPORT_ON       0x00000008ULL
-#define INFINIPATH_EXTC_LED2PRIPORT_ON       0x00000004ULL
-#define INFINIPATH_EXTC_LEDGBLOK_ON          0x00000002ULL
-#define INFINIPATH_EXTC_LEDGBLERR_OFF        0x00000001ULL
-
-/* kr_partitionkey bits */
-#define INFINIPATH_PKEY_SIZE 16
-#define INFINIPATH_PKEY_MASK 0xFFFF
-#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
-
-/* kr_serdesconfig0 bits */
-#define INFINIPATH_SERDC0_RESET_MASK  0xfULL	/* overal reset bits */
-#define INFINIPATH_SERDC0_RESET_PLL   0x10000000ULL	/* pll reset */
-/* tx idle enables (per lane) */
-#define INFINIPATH_SERDC0_TXIDLE      0xF000ULL
-/* rx detect enables (per lane) */
-#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL
-/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */
-#define INFINIPATH_SERDC0_L1PWR_DN	 0xF0ULL
-
-/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */
-#define INFINIPATH_XGXS_RX_POL_SHIFT 19
-#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
-
-
-/*
- * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
- * PIO send buffers.  This is well beyond anything currently
- * defined in the InfiniBand spec.
- */
-#define IPATH_PIO_MAXIBHDR 128
-
-typedef u64 ipath_err_t;
-
-/* The following change with the type of device, so
- * need to be part of the ipath_devdata struct, or
- * we could have problems plugging in devices of
- * different types (e.g. one HT, one PCIE)
- * in one system, to be managed by one driver.
- * On the other hand, this file is may also be included
- * by other code, so leave the declarations here
- * temporarily. Minor footprint issue if common-model
- * linker used, none if C89+ linker used.
- */
-
-/* mask of defined bits for various registers */
-extern u64 infinipath_i_bitsextant;
-extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
-
-/* masks that are different in various chips, or only exist in some chips */
-extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
-
-/*
- * These are the infinipath general register numbers (not offsets).
- * The kernel registers are used directly, those beyond the kernel
- * registers are calculated from one of the base registers.  The use of
- * an integer type doesn't allow type-checking as thorough as, say,
- * an enum but allows for better hiding of chip differences.
- */
-typedef const u16 ipath_kreg,	/* infinipath general registers */
- ipath_creg,			/* infinipath counter registers */
- ipath_sreg;			/* kernel-only, infinipath send registers */
-
-/*
- * These are the chip registers common to all infinipath chips, and
- * used both by the kernel and the diagnostics or other user code.
- * They are all implemented such that 64 bit accesses work.
- * Some implement no more than 32 bits.  Because 64 bit reads
- * require 2 HT cmds on opteron, we access those with 32 bit
- * reads for efficiency (they are written as 64 bits, since
- * the extra 32 bits are nearly free on writes, and it slightly reduces
- * complexity).  The rest are all accessed as 64 bits.
- */
-struct ipath_kregs {
-	/* These are the 32 bit group */
-	ipath_kreg kr_control;
-	ipath_kreg kr_counterregbase;
-	ipath_kreg kr_intmask;
-	ipath_kreg kr_intstatus;
-	ipath_kreg kr_pagealign;
-	ipath_kreg kr_portcnt;
-	ipath_kreg kr_rcvtidbase;
-	ipath_kreg kr_rcvtidcnt;
-	ipath_kreg kr_rcvegrbase;
-	ipath_kreg kr_rcvegrcnt;
-	ipath_kreg kr_scratch;
-	ipath_kreg kr_sendctrl;
-	ipath_kreg kr_sendpiobufbase;
-	ipath_kreg kr_sendpiobufcnt;
-	ipath_kreg kr_sendpiosize;
-	ipath_kreg kr_sendregbase;
-	ipath_kreg kr_userregbase;
-	/* These are the 64 bit group */
-	ipath_kreg kr_debugport;
-	ipath_kreg kr_debugportselect;
-	ipath_kreg kr_errorclear;
-	ipath_kreg kr_errormask;
-	ipath_kreg kr_errorstatus;
-	ipath_kreg kr_extctrl;
-	ipath_kreg kr_extstatus;
-	ipath_kreg kr_gpio_clear;
-	ipath_kreg kr_gpio_mask;
-	ipath_kreg kr_gpio_out;
-	ipath_kreg kr_gpio_status;
-	ipath_kreg kr_hwdiagctrl;
-	ipath_kreg kr_hwerrclear;
-	ipath_kreg kr_hwerrmask;
-	ipath_kreg kr_hwerrstatus;
-	ipath_kreg kr_ibcctrl;
-	ipath_kreg kr_ibcstatus;
-	ipath_kreg kr_intblocked;
-	ipath_kreg kr_intclear;
-	ipath_kreg kr_interruptconfig;
-	ipath_kreg kr_mdio;
-	ipath_kreg kr_partitionkey;
-	ipath_kreg kr_rcvbthqp;
-	ipath_kreg kr_rcvbufbase;
-	ipath_kreg kr_rcvbufsize;
-	ipath_kreg kr_rcvctrl;
-	ipath_kreg kr_rcvhdrcnt;
-	ipath_kreg kr_rcvhdrentsize;
-	ipath_kreg kr_rcvhdrsize;
-	ipath_kreg kr_rcvintmembase;
-	ipath_kreg kr_rcvintmemsize;
-	ipath_kreg kr_revision;
-	ipath_kreg kr_sendbuffererror;
-	ipath_kreg kr_sendpioavailaddr;
-	ipath_kreg kr_serdesconfig0;
-	ipath_kreg kr_serdesconfig1;
-	ipath_kreg kr_serdesstatus;
-	ipath_kreg kr_txintmembase;
-	ipath_kreg kr_txintmemsize;
-	ipath_kreg kr_xgxsconfig;
-	ipath_kreg kr_ibpllcfg;
-	/* use these two (and the following N ports) only with
-	 * ipath_k*_kreg64_port(); not *kreg64() */
-	ipath_kreg kr_rcvhdraddr;
-	ipath_kreg kr_rcvhdrtailaddr;
-
-	/* remaining registers are not present on all types of infinipath
-	   chips  */
-	ipath_kreg kr_rcvpktledcnt;
-	ipath_kreg kr_pcierbuftestreg0;
-	ipath_kreg kr_pcierbuftestreg1;
-	ipath_kreg kr_pcieq0serdesconfig0;
-	ipath_kreg kr_pcieq0serdesconfig1;
-	ipath_kreg kr_pcieq0serdesstatus;
-	ipath_kreg kr_pcieq1serdesconfig0;
-	ipath_kreg kr_pcieq1serdesconfig1;
-	ipath_kreg kr_pcieq1serdesstatus;
-	ipath_kreg kr_hrtbt_guid;
-	ipath_kreg kr_ibcddrctrl;
-	ipath_kreg kr_ibcddrstatus;
-	ipath_kreg kr_jintreload;
-
-	/* send dma related regs */
-	ipath_kreg kr_senddmabase;
-	ipath_kreg kr_senddmalengen;
-	ipath_kreg kr_senddmatail;
-	ipath_kreg kr_senddmahead;
-	ipath_kreg kr_senddmaheadaddr;
-	ipath_kreg kr_senddmabufmask0;
-	ipath_kreg kr_senddmabufmask1;
-	ipath_kreg kr_senddmabufmask2;
-	ipath_kreg kr_senddmastatus;
-
-	/* SerDes related regs (IBA7220-only) */
-	ipath_kreg kr_ibserdesctrl;
-	ipath_kreg kr_ib_epbacc;
-	ipath_kreg kr_ib_epbtrans;
-	ipath_kreg kr_pcie_epbacc;
-	ipath_kreg kr_pcie_epbtrans;
-	ipath_kreg kr_ib_ddsrxeq;
-};
-
-struct ipath_cregs {
-	ipath_creg cr_badformatcnt;
-	ipath_creg cr_erricrccnt;
-	ipath_creg cr_errlinkcnt;
-	ipath_creg cr_errlpcrccnt;
-	ipath_creg cr_errpkey;
-	ipath_creg cr_errrcvflowctrlcnt;
-	ipath_creg cr_err_rlencnt;
-	ipath_creg cr_errslencnt;
-	ipath_creg cr_errtidfull;
-	ipath_creg cr_errtidvalid;
-	ipath_creg cr_errvcrccnt;
-	ipath_creg cr_ibstatuschange;
-	ipath_creg cr_intcnt;
-	ipath_creg cr_invalidrlencnt;
-	ipath_creg cr_invalidslencnt;
-	ipath_creg cr_lbflowstallcnt;
-	ipath_creg cr_iblinkdowncnt;
-	ipath_creg cr_iblinkerrrecovcnt;
-	ipath_creg cr_ibsymbolerrcnt;
-	ipath_creg cr_pktrcvcnt;
-	ipath_creg cr_pktrcvflowctrlcnt;
-	ipath_creg cr_pktsendcnt;
-	ipath_creg cr_pktsendflowcnt;
-	ipath_creg cr_portovflcnt;
-	ipath_creg cr_rcvebpcnt;
-	ipath_creg cr_rcvovflcnt;
-	ipath_creg cr_rxdroppktcnt;
-	ipath_creg cr_senddropped;
-	ipath_creg cr_sendstallcnt;
-	ipath_creg cr_sendunderruncnt;
-	ipath_creg cr_unsupvlcnt;
-	ipath_creg cr_wordrcvcnt;
-	ipath_creg cr_wordsendcnt;
-	ipath_creg cr_vl15droppedpktcnt;
-	ipath_creg cr_rxotherlocalphyerrcnt;
-	ipath_creg cr_excessbufferovflcnt;
-	ipath_creg cr_locallinkintegrityerrcnt;
-	ipath_creg cr_rxvlerrcnt;
-	ipath_creg cr_rxdlidfltrcnt;
-	ipath_creg cr_psstat;
-	ipath_creg cr_psstart;
-	ipath_creg cr_psinterval;
-	ipath_creg cr_psrcvdatacount;
-	ipath_creg cr_psrcvpktscount;
-	ipath_creg cr_psxmitdatacount;
-	ipath_creg cr_psxmitpktscount;
-	ipath_creg cr_psxmitwaitcount;
-};
-
-#endif				/* _IPATH_REGISTERS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
deleted file mode 100644
index 1f95bba..0000000
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ /dev/null
@@ -1,734 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/*
- * Convert the AETH RNR timeout code into the number of milliseconds.
- */
-const u32 ib_ipath_rnr_table[32] = {
-	656,			/* 0 */
-	1,			/* 1 */
-	1,			/* 2 */
-	1,			/* 3 */
-	1,			/* 4 */
-	1,			/* 5 */
-	1,			/* 6 */
-	1,			/* 7 */
-	1,			/* 8 */
-	1,			/* 9 */
-	1,			/* A */
-	1,			/* B */
-	1,			/* C */
-	1,			/* D */
-	2,			/* E */
-	2,			/* F */
-	3,			/* 10 */
-	4,			/* 11 */
-	6,			/* 12 */
-	8,			/* 13 */
-	11,			/* 14 */
-	16,			/* 15 */
-	21,			/* 16 */
-	31,			/* 17 */
-	41,			/* 18 */
-	62,			/* 19 */
-	82,			/* 1A */
-	123,			/* 1B */
-	164,			/* 1C */
-	246,			/* 1D */
-	328,			/* 1E */
-	492			/* 1F */
-};
-
-/**
- * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
- * @qp: the QP
- *
- * Called with the QP s_lock held and interrupts disabled.
- * XXX Use a simple list for now.  We might need a priority
- * queue if we have lots of QPs waiting for RNR timeouts
- * but that should be rare.
- */
-void ipath_insert_rnr_queue(struct ipath_qp *qp)
-{
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-
-	/* We already did a spin_lock_irqsave(), so just use spin_lock */
-	spin_lock(&dev->pending_lock);
-	if (list_empty(&dev->rnrwait))
-		list_add(&qp->timerwait, &dev->rnrwait);
-	else {
-		struct list_head *l = &dev->rnrwait;
-		struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
-						  timerwait);
-
-		while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
-			qp->s_rnr_timeout -= nqp->s_rnr_timeout;
-			l = l->next;
-			if (l->next == &dev->rnrwait) {
-				nqp = NULL;
-				break;
-			}
-			nqp = list_entry(l->next, struct ipath_qp,
-					 timerwait);
-		}
-		if (nqp)
-			nqp->s_rnr_timeout -= qp->s_rnr_timeout;
-		list_add(&qp->timerwait, l);
-	}
-	spin_unlock(&dev->pending_lock);
-}
-
-/**
- * ipath_init_sge - Validate a RWQE and fill in the SGE state
- * @qp: the QP
- *
- * Return 1 if OK.
- */
-int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
-		   u32 *lengthp, struct ipath_sge_state *ss)
-{
-	int i, j, ret;
-	struct ib_wc wc;
-
-	*lengthp = 0;
-	for (i = j = 0; i < wqe->num_sge; i++) {
-		if (wqe->sg_list[i].length == 0)
-			continue;
-		/* Check LKEY */
-		if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
-				   &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
-			goto bad_lkey;
-		*lengthp += wqe->sg_list[i].length;
-		j++;
-	}
-	ss->num_sge = j;
-	ret = 1;
-	goto bail;
-
-bad_lkey:
-	memset(&wc, 0, sizeof(wc));
-	wc.wr_id = wqe->wr_id;
-	wc.status = IB_WC_LOC_PROT_ERR;
-	wc.opcode = IB_WC_RECV;
-	wc.qp = &qp->ibqp;
-	/* Signal solicited completion event. */
-	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-	ret = 0;
-bail:
-	return ret;
-}
-
-/**
- * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
- * @qp: the QP
- * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
- *
- * Return 0 if no RWQE is available, otherwise return 1.
- *
- * Can be called from interrupt level.
- */
-int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
-{
-	unsigned long flags;
-	struct ipath_rq *rq;
-	struct ipath_rwq *wq;
-	struct ipath_srq *srq;
-	struct ipath_rwqe *wqe;
-	void (*handler)(struct ib_event *, void *);
-	u32 tail;
-	int ret;
-
-	if (qp->ibqp.srq) {
-		srq = to_isrq(qp->ibqp.srq);
-		handler = srq->ibsrq.event_handler;
-		rq = &srq->rq;
-	} else {
-		srq = NULL;
-		handler = NULL;
-		rq = &qp->r_rq;
-	}
-
-	spin_lock_irqsave(&rq->lock, flags);
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-		ret = 0;
-		goto unlock;
-	}
-
-	wq = rq->wq;
-	tail = wq->tail;
-	/* Validate tail before using it since it is user writable. */
-	if (tail >= rq->size)
-		tail = 0;
-	do {
-		if (unlikely(tail == wq->head)) {
-			ret = 0;
-			goto unlock;
-		}
-		/* Make sure entry is read after head index is read. */
-		smp_rmb();
-		wqe = get_rwqe_ptr(rq, tail);
-		if (++tail >= rq->size)
-			tail = 0;
-		if (wr_id_only)
-			break;
-		qp->r_sge.sg_list = qp->r_sg_list;
-	} while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge));
-	qp->r_wr_id = wqe->wr_id;
-	wq->tail = tail;
-
-	ret = 1;
-	set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
-	if (handler) {
-		u32 n;
-
-		/*
-		 * validate head pointer value and compute
-		 * the number of remaining WQEs.
-		 */
-		n = wq->head;
-		if (n >= rq->size)
-			n = 0;
-		if (n < tail)
-			n += rq->size - tail;
-		else
-			n -= tail;
-		if (n < srq->limit) {
-			struct ib_event ev;
-
-			srq->limit = 0;
-			spin_unlock_irqrestore(&rq->lock, flags);
-			ev.device = qp->ibqp.device;
-			ev.element.srq = qp->ibqp.srq;
-			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-			handler(&ev, srq->ibsrq.srq_context);
-			goto bail;
-		}
-	}
-unlock:
-	spin_unlock_irqrestore(&rq->lock, flags);
-bail:
-	return ret;
-}
-
-/**
- * ipath_ruc_loopback - handle UC and RC lookback requests
- * @sqp: the sending QP
- *
- * This is called from ipath_do_send() to
- * forward a WQE addressed to the same HCA.
- * Note that although we are single threaded due to the tasklet, we still
- * have to protect against post_send().  We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void ipath_ruc_loopback(struct ipath_qp *sqp)
-{
-	struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
-	struct ipath_qp *qp;
-	struct ipath_swqe *wqe;
-	struct ipath_sge *sge;
-	unsigned long flags;
-	struct ib_wc wc;
-	u64 sdata;
-	atomic64_t *maddr;
-	enum ib_wc_status send_status;
-
-	/*
-	 * Note that we check the responder QP state after
-	 * checking the requester's state.
-	 */
-	qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
-
-	spin_lock_irqsave(&sqp->s_lock, flags);
-
-	/* Return if we are already busy processing a work request. */
-	if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
-	    !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
-		goto unlock;
-
-	sqp->s_flags |= IPATH_S_BUSY;
-
-again:
-	if (sqp->s_last == sqp->s_head)
-		goto clr_busy;
-	wqe = get_swqe_ptr(sqp, sqp->s_last);
-
-	/* Return if it is not OK to start a new work reqeust. */
-	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
-		if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
-			goto clr_busy;
-		/* We are in the error state, flush the work request. */
-		send_status = IB_WC_WR_FLUSH_ERR;
-		goto flush_send;
-	}
-
-	/*
-	 * We can rely on the entry not changing without the s_lock
-	 * being held until we update s_last.
-	 * We increment s_cur to indicate s_last is in progress.
-	 */
-	if (sqp->s_last == sqp->s_cur) {
-		if (++sqp->s_cur >= sqp->s_size)
-			sqp->s_cur = 0;
-	}
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
-
-	if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-		dev->n_pkt_drops++;
-		/*
-		 * For RC, the requester would timeout and retry so
-		 * shortcut the timeouts and just signal too many retries.
-		 */
-		if (sqp->ibqp.qp_type == IB_QPT_RC)
-			send_status = IB_WC_RETRY_EXC_ERR;
-		else
-			send_status = IB_WC_SUCCESS;
-		goto serr;
-	}
-
-	memset(&wc, 0, sizeof wc);
-	send_status = IB_WC_SUCCESS;
-
-	sqp->s_sge.sge = wqe->sg_list[0];
-	sqp->s_sge.sg_list = wqe->sg_list + 1;
-	sqp->s_sge.num_sge = wqe->wr.num_sge;
-	sqp->s_len = wqe->length;
-	switch (wqe->wr.opcode) {
-	case IB_WR_SEND_WITH_IMM:
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = wqe->wr.ex.imm_data;
-		/* FALLTHROUGH */
-	case IB_WR_SEND:
-		if (!ipath_get_rwqe(qp, 0))
-			goto rnr_nak;
-		break;
-
-	case IB_WR_RDMA_WRITE_WITH_IMM:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-			goto inv_err;
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = wqe->wr.ex.imm_data;
-		if (!ipath_get_rwqe(qp, 1))
-			goto rnr_nak;
-		/* FALLTHROUGH */
-	case IB_WR_RDMA_WRITE:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-			goto inv_err;
-		if (wqe->length == 0)
-			break;
-		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
-					    wqe->wr.wr.rdma.remote_addr,
-					    wqe->wr.wr.rdma.rkey,
-					    IB_ACCESS_REMOTE_WRITE)))
-			goto acc_err;
-		break;
-
-	case IB_WR_RDMA_READ:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-			goto inv_err;
-		if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
-					    wqe->wr.wr.rdma.remote_addr,
-					    wqe->wr.wr.rdma.rkey,
-					    IB_ACCESS_REMOTE_READ)))
-			goto acc_err;
-		qp->r_sge.sge = wqe->sg_list[0];
-		qp->r_sge.sg_list = wqe->sg_list + 1;
-		qp->r_sge.num_sge = wqe->wr.num_sge;
-		break;
-
-	case IB_WR_ATOMIC_CMP_AND_SWP:
-	case IB_WR_ATOMIC_FETCH_AND_ADD:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-			goto inv_err;
-		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
-					    wqe->wr.wr.atomic.remote_addr,
-					    wqe->wr.wr.atomic.rkey,
-					    IB_ACCESS_REMOTE_ATOMIC)))
-			goto acc_err;
-		/* Perform atomic OP and save result. */
-		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
-		sdata = wqe->wr.wr.atomic.compare_add;
-		*(u64 *) sqp->s_sge.sge.vaddr =
-			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
-			(u64) atomic64_add_return(sdata, maddr) - sdata :
-			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
-				      sdata, wqe->wr.wr.atomic.swap);
-		goto send_comp;
-
-	default:
-		send_status = IB_WC_LOC_QP_OP_ERR;
-		goto serr;
-	}
-
-	sge = &sqp->s_sge.sge;
-	while (sqp->s_len) {
-		u32 len = sqp->s_len;
-
-		if (len > sge->length)
-			len = sge->length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
-		ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (--sqp->s_sge.num_sge)
-				*sge = *sqp->s_sge.sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
-			if (++sge->n >= IPATH_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-		sqp->s_len -= len;
-	}
-
-	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-		goto send_comp;
-
-	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-	else
-		wc.opcode = IB_WC_RECV;
-	wc.wr_id = qp->r_wr_id;
-	wc.status = IB_WC_SUCCESS;
-	wc.byte_len = wqe->length;
-	wc.qp = &qp->ibqp;
-	wc.src_qp = qp->remote_qpn;
-	wc.slid = qp->remote_ah_attr.dlid;
-	wc.sl = qp->remote_ah_attr.sl;
-	wc.port_num = 1;
-	/* Signal completion event if the solicited bit is set. */
-	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-		       wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
-	spin_lock_irqsave(&sqp->s_lock, flags);
-flush_send:
-	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-	ipath_send_complete(sqp, wqe, send_status);
-	goto again;
-
-rnr_nak:
-	/* Handle RNR NAK */
-	if (qp->ibqp.qp_type == IB_QPT_UC)
-		goto send_comp;
-	/*
-	 * Note: we don't need the s_lock held since the BUSY flag
-	 * makes this single threaded.
-	 */
-	if (sqp->s_rnr_retry == 0) {
-		send_status = IB_WC_RNR_RETRY_EXC_ERR;
-		goto serr;
-	}
-	if (sqp->s_rnr_retry_cnt < 7)
-		sqp->s_rnr_retry--;
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
-		goto clr_busy;
-	sqp->s_flags |= IPATH_S_WAITING;
-	dev->n_rnr_naks++;
-	sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
-	ipath_insert_rnr_queue(sqp);
-	goto clr_busy;
-
-inv_err:
-	send_status = IB_WC_REM_INV_REQ_ERR;
-	wc.status = IB_WC_LOC_QP_OP_ERR;
-	goto err;
-
-acc_err:
-	send_status = IB_WC_REM_ACCESS_ERR;
-	wc.status = IB_WC_LOC_PROT_ERR;
-err:
-	/* responder goes to error state */
-	ipath_rc_error(qp, wc.status);
-
-serr:
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	ipath_send_complete(sqp, wqe, send_status);
-	if (sqp->ibqp.qp_type == IB_QPT_RC) {
-		int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
-		sqp->s_flags &= ~IPATH_S_BUSY;
-		spin_unlock_irqrestore(&sqp->s_lock, flags);
-		if (lastwqe) {
-			struct ib_event ev;
-
-			ev.device = sqp->ibqp.device;
-			ev.element.qp = &sqp->ibqp;
-			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
-		}
-		goto done;
-	}
-clr_busy:
-	sqp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
-	if (qp && atomic_dec_and_test(&qp->refcount))
-		wake_up(&qp->wait);
-}
-
-static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
-{
-	if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
-	    qp->ibqp.qp_type == IB_QPT_SMI) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-		dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-				 dd->ipath_sendctrl);
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-	}
-}
-
-/**
- * ipath_no_bufs_available - tell the layer driver we need buffers
- * @qp: the QP that caused the problem
- * @dev: the device we ran out of buffers on
- *
- * Called when we run out of PIO buffers.
- * If we are now in the error state, return zero to flush the
- * send work request.
- */
-static int ipath_no_bufs_available(struct ipath_qp *qp,
-				    struct ipath_ibdev *dev)
-{
-	unsigned long flags;
-	int ret = 1;
-
-	/*
-	 * Note that as soon as want_buffer() is called and
-	 * possibly before it returns, ipath_ib_piobufavail()
-	 * could be called. Therefore, put QP on the piowait list before
-	 * enabling the PIO avail interrupt.
-	 */
-	spin_lock_irqsave(&qp->s_lock, flags);
-	if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
-		dev->n_piowait++;
-		qp->s_flags |= IPATH_S_WAITING;
-		qp->s_flags &= ~IPATH_S_BUSY;
-		spin_lock(&dev->pending_lock);
-		if (list_empty(&qp->piowait))
-			list_add_tail(&qp->piowait, &dev->piowait);
-		spin_unlock(&dev->pending_lock);
-	} else
-		ret = 0;
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (ret)
-		want_buffer(dev->dd, qp);
-	return ret;
-}
-
-/**
- * ipath_make_grh - construct a GRH header
- * @dev: a pointer to the ipath device
- * @hdr: a pointer to the GRH header being constructed
- * @grh: the global route address to send to
- * @hwords: the number of 32 bit words of header being sent
- * @nwords: the number of 32 bit words of data being sent
- *
- * Return the size of the header in 32 bit words.
- */
-u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
-		   struct ib_global_route *grh, u32 hwords, u32 nwords)
-{
-	hdr->version_tclass_flow =
-		cpu_to_be32((6 << 28) |
-			    (grh->traffic_class << 20) |
-			    grh->flow_label);
-	hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
-	/* next_hdr is defined by C8-7 in ch. 8.4.1 */
-	hdr->next_hdr = 0x1B;
-	hdr->hop_limit = grh->hop_limit;
-	/* The SGID is 32-bit aligned. */
-	hdr->sgid.global.subnet_prefix = dev->gid_prefix;
-	hdr->sgid.global.interface_id = dev->dd->ipath_guid;
-	hdr->dgid = grh->dgid;
-
-	/* GRH header size in 32-bit words. */
-	return sizeof(struct ib_grh) / sizeof(u32);
-}
-
-void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
-			   struct ipath_other_headers *ohdr,
-			   u32 bth0, u32 bth2)
-{
-	u16 lrh0;
-	u32 nwords;
-	u32 extra_bytes;
-
-	/* Construct the header. */
-	extra_bytes = -qp->s_cur_size & 3;
-	nwords = (qp->s_cur_size + extra_bytes) >> 2;
-	lrh0 = IPATH_LRH_BTH;
-	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
-						 &qp->remote_ah_attr.grh,
-						 qp->s_hdrwords, nwords);
-		lrh0 = IPATH_LRH_GRH;
-	}
-	lrh0 |= qp->remote_ah_attr.sl << 4;
-	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
-	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
-	qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
-				       qp->remote_ah_attr.src_path_bits);
-	bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
-	bth0 |= extra_bytes << 20;
-	ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
-	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-	ohdr->bth[2] = cpu_to_be32(bth2);
-}
-
-/**
- * ipath_do_send - perform a send on a QP
- * @data: contains a pointer to the QP
- *
- * Process entries in the send work queue until credit or queue is
- * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
- * Otherwise, two threads could send packets out of order.
- */
-void ipath_do_send(unsigned long data)
-{
-	struct ipath_qp *qp = (struct ipath_qp *)data;
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	int (*make_req)(struct ipath_qp *qp);
-	unsigned long flags;
-
-	if ((qp->ibqp.qp_type == IB_QPT_RC ||
-	     qp->ibqp.qp_type == IB_QPT_UC) &&
-	    qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
-		ipath_ruc_loopback(qp);
-		goto bail;
-	}
-
-	if (qp->ibqp.qp_type == IB_QPT_RC)
-	       make_req = ipath_make_rc_req;
-	else if (qp->ibqp.qp_type == IB_QPT_UC)
-	       make_req = ipath_make_uc_req;
-	else
-	       make_req = ipath_make_ud_req;
-
-	spin_lock_irqsave(&qp->s_lock, flags);
-
-	/* Return if we are already busy processing a work request. */
-	if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
-	    !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-		goto bail;
-	}
-
-	qp->s_flags |= IPATH_S_BUSY;
-
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-
-again:
-	/* Check for a constructed packet to be sent. */
-	if (qp->s_hdrwords != 0) {
-		/*
-		 * If no PIO bufs are available, return.  An interrupt will
-		 * call ipath_ib_piobufavail() when one is available.
-		 */
-		if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
-				     qp->s_cur_sge, qp->s_cur_size)) {
-			if (ipath_no_bufs_available(qp, dev))
-				goto bail;
-		}
-		dev->n_unicast_xmit++;
-		/* Record that we sent the packet and s_hdr is empty. */
-		qp->s_hdrwords = 0;
-	}
-
-	if (make_req(qp))
-		goto again;
-
-bail:;
-}
-
-/*
- * This should be called with s_lock held.
- */
-void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
-			 enum ib_wc_status status)
-{
-	u32 old_last, last;
-
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
-		return;
-
-	/* See ch. 11.2.4.1 and 10.7.3.1 */
-	if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-	    status != IB_WC_SUCCESS) {
-		struct ib_wc wc;
-
-		memset(&wc, 0, sizeof wc);
-		wc.wr_id = wqe->wr.wr_id;
-		wc.status = status;
-		wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-		wc.qp = &qp->ibqp;
-		if (status == IB_WC_SUCCESS)
-			wc.byte_len = wqe->length;
-		ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
-			       status != IB_WC_SUCCESS);
-	}
-
-	old_last = last = qp->s_last;
-	if (++last >= qp->s_size)
-		last = 0;
-	qp->s_last = last;
-	if (qp->s_cur == old_last)
-		qp->s_cur = last;
-	if (qp->s_tail == old_last)
-		qp->s_tail = last;
-	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
-		qp->s_draining = 0;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c
deleted file mode 100644
index 17a5177..0000000
--- a/drivers/infiniband/hw/ipath/ipath_sdma.c
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/spinlock.h>
-#include <linux/gfp.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */
-
-static void vl15_watchdog_enq(struct ipath_devdata *dd)
-{
-	/* ipath_sdma_lock must already be held */
-	if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) {
-		unsigned long interval = (HZ + 19) / 20;
-		dd->ipath_sdma_vl15_timer.expires = jiffies + interval;
-		add_timer(&dd->ipath_sdma_vl15_timer);
-	}
-}
-
-static void vl15_watchdog_deq(struct ipath_devdata *dd)
-{
-	/* ipath_sdma_lock must already be held */
-	if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) {
-		unsigned long interval = (HZ + 19) / 20;
-		mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval);
-	} else {
-		del_timer(&dd->ipath_sdma_vl15_timer);
-	}
-}
-
-static void vl15_watchdog_timeout(unsigned long opaque)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-	if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) {
-		ipath_dbg("vl15 watchdog timeout - clearing\n");
-		ipath_cancel_sends(dd, 1);
-		ipath_hol_down(dd);
-	} else {
-		ipath_dbg("vl15 watchdog timeout - "
-			  "condition already cleared\n");
-	}
-}
-
-static void unmap_desc(struct ipath_devdata *dd, unsigned head)
-{
-	__le64 *descqp = &dd->ipath_sdma_descq[head].qw[0];
-	u64 desc[2];
-	dma_addr_t addr;
-	size_t len;
-
-	desc[0] = le64_to_cpu(descqp[0]);
-	desc[1] = le64_to_cpu(descqp[1]);
-
-	addr = (desc[1] << 32) | (desc[0] >> 32);
-	len = (desc[0] >> 14) & (0x7ffULL << 2);
-	dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
-}
-
-/*
- * ipath_sdma_lock should be locked before calling this.
- */
-int ipath_sdma_make_progress(struct ipath_devdata *dd)
-{
-	struct list_head *lp = NULL;
-	struct ipath_sdma_txreq *txp = NULL;
-	u16 dmahead;
-	u16 start_idx = 0;
-	int progress = 0;
-
-	if (!list_empty(&dd->ipath_sdma_activelist)) {
-		lp = dd->ipath_sdma_activelist.next;
-		txp = list_entry(lp, struct ipath_sdma_txreq, list);
-		start_idx = txp->start_idx;
-	}
-
-	/*
-	 * Read the SDMA head register in order to know that the
-	 * interrupt clear has been written to the chip.
-	 * Otherwise, we may not get an interrupt for the last
-	 * descriptor in the queue.
-	 */
-	dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead);
-	/* sanity check return value for error handling (chip reset, etc.) */
-	if (dmahead >= dd->ipath_sdma_descq_cnt)
-		goto done;
-
-	while (dd->ipath_sdma_descq_head != dmahead) {
-		if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC &&
-		    dd->ipath_sdma_descq_head == start_idx) {
-			unmap_desc(dd, dd->ipath_sdma_descq_head);
-			start_idx++;
-			if (start_idx == dd->ipath_sdma_descq_cnt)
-				start_idx = 0;
-		}
-
-		/* increment free count and head */
-		dd->ipath_sdma_descq_removed++;
-		if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt)
-			dd->ipath_sdma_descq_head = 0;
-
-		if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
-			/* move to notify list */
-			if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-				vl15_watchdog_deq(dd);
-			list_move_tail(lp, &dd->ipath_sdma_notifylist);
-			if (!list_empty(&dd->ipath_sdma_activelist)) {
-				lp = dd->ipath_sdma_activelist.next;
-				txp = list_entry(lp, struct ipath_sdma_txreq,
-						 list);
-				start_idx = txp->start_idx;
-			} else {
-				lp = NULL;
-				txp = NULL;
-			}
-		}
-		progress = 1;
-	}
-
-	if (progress)
-		tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
-
-done:
-	return progress;
-}
-
-static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
-{
-	struct ipath_sdma_txreq *txp, *txp_next;
-
-	list_for_each_entry_safe(txp, txp_next, list, list) {
-		list_del_init(&txp->list);
-
-		if (txp->callback)
-			(*txp->callback)(txp->callback_cookie,
-					 txp->callback_status);
-	}
-}
-
-static void sdma_notify_taskbody(struct ipath_devdata *dd)
-{
-	unsigned long flags;
-	struct list_head list;
-
-	INIT_LIST_HEAD(&list);
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-	list_splice_init(&dd->ipath_sdma_notifylist, &list);
-
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-	ipath_sdma_notify(dd, &list);
-
-	/*
-	 * The IB verbs layer needs to see the callback before getting
-	 * the call to ipath_ib_piobufavail() because the callback
-	 * handles releasing resources the next send will need.
-	 * Otherwise, we could do these calls in
-	 * ipath_sdma_make_progress().
-	 */
-	ipath_ib_piobufavail(dd->verbs_dev);
-}
-
-static void sdma_notify_task(unsigned long opaque)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-	if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-		sdma_notify_taskbody(dd);
-}
-
-static void dump_sdma_state(struct ipath_devdata *dd)
-{
-	unsigned long reg;
-
-	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus);
-	ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg);
-
-	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl);
-	ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg);
-
-	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0);
-	ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg);
-
-	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1);
-	ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg);
-
-	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2);
-	ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg);
-
-	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
-	ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg);
-
-	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
-	ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg);
-}
-
-static void sdma_abort_task(unsigned long opaque)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-	u64 status;
-	unsigned long flags;
-
-	if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-		return;
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-	status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;
-
-	/* nothing to do */
-	if (status == IPATH_SDMA_ABORT_NONE)
-		goto unlock;
-
-	/* ipath_sdma_abort() is done, waiting for interrupt */
-	if (status == IPATH_SDMA_ABORT_DISARMED) {
-		if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))
-			goto resched_noprint;
-		/* give up, intr got lost somewhere */
-		ipath_dbg("give up waiting for SDMADISABLED intr\n");
-		__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-		status = IPATH_SDMA_ABORT_ABORTED;
-	}
-
-	/* everything is stopped, time to clean up and restart */
-	if (status == IPATH_SDMA_ABORT_ABORTED) {
-		struct ipath_sdma_txreq *txp, *txpnext;
-		u64 hwstatus;
-		int notify = 0;
-
-		hwstatus = ipath_read_kreg64(dd,
-				dd->ipath_kregs->kr_senddmastatus);
-
-		if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
-				 IPATH_SDMA_STATUS_ABORT_IN_PROG	     |
-				 IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
-		    !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
-			if (dd->ipath_sdma_reset_wait > 0) {
-				/* not done shutting down sdma */
-				--dd->ipath_sdma_reset_wait;
-				goto resched;
-			}
-			ipath_cdbg(VERBOSE, "gave up waiting for quiescent "
-				"status after SDMA reset, continuing\n");
-			dump_sdma_state(dd);
-		}
-
-		/* dequeue all "sent" requests */
-		list_for_each_entry_safe(txp, txpnext,
-					 &dd->ipath_sdma_activelist, list) {
-			txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
-			if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-				vl15_watchdog_deq(dd);
-			list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
-			notify = 1;
-		}
-		if (notify)
-			tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
-
-		/* reset our notion of head and tail */
-		dd->ipath_sdma_descq_tail = 0;
-		dd->ipath_sdma_descq_head = 0;
-		dd->ipath_sdma_head_dma[0] = 0;
-		dd->ipath_sdma_generation = 0;
-		dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added;
-
-		/* Reset SendDmaLenGen */
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen,
-			(u64) dd->ipath_sdma_descq_cnt | (1ULL << 18));
-
-		/* done with sdma state for a bit */
-		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-		/*
-		 * Don't restart sdma here (with the exception
-		 * below). Wait until link is up to ACTIVE.  VL15 MADs
-		 * used to bring the link up use PIO, and multiple link
-		 * transitions otherwise cause the sdma engine to be
-		 * stopped and started multiple times.
-		 * The disable is done here, including the shadow,
-		 * so the state is kept consistent.
-		 * See ipath_restart_sdma() for the actual starting
-		 * of sdma.
-		 */
-		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-		dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-				 dd->ipath_sendctrl);
-		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-		/* make sure I see next message */
-		dd->ipath_sdma_abort_jiffies = 0;
-
-		/*
-		 * Not everything that takes SDMA offline is a link
-		 * status change.  If the link was up, restart SDMA.
-		 */
-		if (dd->ipath_flags & IPATH_LINKACTIVE)
-			ipath_restart_sdma(dd);
-
-		goto done;
-	}
-
-resched:
-	/*
-	 * for now, keep spinning
-	 * JAG - this is bad to just have default be a loop without
-	 * state change
-	 */
-	if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {
-		ipath_dbg("looping with status 0x%08lx\n",
-			  dd->ipath_sdma_status);
-		dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
-	}
-resched_noprint:
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-	if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-		tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-	return;
-
-unlock:
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-done:
-	return;
-}
-
-/*
- * This is called from interrupt context.
- */
-void ipath_sdma_intr(struct ipath_devdata *dd)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-	(void) ipath_sdma_make_progress(dd);
-
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-}
-
-static int alloc_sdma(struct ipath_devdata *dd)
-{
-	int ret = 0;
-
-	/* Allocate memory for SendDMA descriptor FIFO */
-	dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev,
-		SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL);
-
-	if (!dd->ipath_sdma_descq) {
-		ipath_dev_err(dd, "failed to allocate SendDMA descriptor "
-			"FIFO memory\n");
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	dd->ipath_sdma_descq_cnt =
-		SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc);
-
-	/* Allocate memory for DMA of head register to memory */
-	dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev,
-		PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL);
-	if (!dd->ipath_sdma_head_dma) {
-		ipath_dev_err(dd, "failed to allocate SendDMA head memory\n");
-		ret = -ENOMEM;
-		goto cleanup_descq;
-	}
-	dd->ipath_sdma_head_dma[0] = 0;
-
-	init_timer(&dd->ipath_sdma_vl15_timer);
-	dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout;
-	dd->ipath_sdma_vl15_timer.data = (unsigned long)dd;
-	atomic_set(&dd->ipath_sdma_vl15_count, 0);
-
-	goto done;
-
-cleanup_descq:
-	dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
-		(void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys);
-	dd->ipath_sdma_descq = NULL;
-	dd->ipath_sdma_descq_phys = 0;
-done:
-	return ret;
-}
-
-int setup_sdma(struct ipath_devdata *dd)
-{
-	int ret = 0;
-	unsigned i, n;
-	u64 tmp64;
-	u64 senddmabufmask[3] = { 0 };
-	unsigned long flags;
-
-	ret = alloc_sdma(dd);
-	if (ret)
-		goto done;
-
-	if (!dd->ipath_sdma_descq) {
-		ipath_dev_err(dd, "SendDMA memory not allocated\n");
-		goto done;
-	}
-
-	/*
-	 * Set initial status as if we had been up, then gone down.
-	 * This lets initial start on transition to ACTIVE be the
-	 * same as restart after link flap.
-	 */
-	dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED;
-	dd->ipath_sdma_abort_jiffies = 0;
-	dd->ipath_sdma_generation = 0;
-	dd->ipath_sdma_descq_tail = 0;
-	dd->ipath_sdma_descq_head = 0;
-	dd->ipath_sdma_descq_removed = 0;
-	dd->ipath_sdma_descq_added = 0;
-
-	/* Set SendDmaBase */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase,
-			 dd->ipath_sdma_descq_phys);
-	/* Set SendDmaLenGen */
-	tmp64 = dd->ipath_sdma_descq_cnt;
-	tmp64 |= 1<<18; /* enable generation checking */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64);
-	/* Set SendDmaTail */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail,
-			 dd->ipath_sdma_descq_tail);
-	/* Set SendDmaHeadAddr */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
-			 dd->ipath_sdma_head_phys);
-
-	/*
-	 * Reserve all the former "kernel" piobufs, using high number range
-	 * so we get as many 4K buffers as possible
-	 */
-	n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-	i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
-	ipath_chg_pioavailkernel(dd, i, n - i , 0);
-	for (; i < n; ++i) {
-		unsigned word = i / 64;
-		unsigned bit = i & 63;
-		BUG_ON(word >= 3);
-		senddmabufmask[word] |= 1ULL << bit;
-	}
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
-			 senddmabufmask[0]);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,
-			 senddmabufmask[1]);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2,
-			 senddmabufmask[2]);
-
-	INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
-	INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
-
-	tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
-		     (unsigned long) dd);
-	tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
-		     (unsigned long) dd);
-
-	/*
-	 * No use to turn on SDMA here, as link is probably not ACTIVE
-	 * Just mark it RUNNING and enable the interrupt, and let the
-	 * ipath_restart_sdma() on link transition to ACTIVE actually
-	 * enable it.
-	 */
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	__set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-done:
-	return ret;
-}
-
-void teardown_sdma(struct ipath_devdata *dd)
-{
-	struct ipath_sdma_txreq *txp, *txpnext;
-	unsigned long flags;
-	dma_addr_t sdma_head_phys = 0;
-	dma_addr_t sdma_descq_phys = 0;
-	void *sdma_descq = NULL;
-	void *sdma_head_dma = NULL;
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-	__clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
-	__set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-	__set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status);
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-	tasklet_kill(&dd->ipath_sdma_abort_task);
-	tasklet_kill(&dd->ipath_sdma_notify_task);
-
-	/* turn off sdma */
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-		dd->ipath_sendctrl);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-	/* dequeue all "sent" requests */
-	list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
-				 list) {
-		txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
-		if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-			vl15_watchdog_deq(dd);
-		list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
-	}
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-	sdma_notify_taskbody(dd);
-
-	del_timer_sync(&dd->ipath_sdma_vl15_timer);
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-	dd->ipath_sdma_abort_jiffies = 0;
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0);
-
-	if (dd->ipath_sdma_head_dma) {
-		sdma_head_dma = (void *) dd->ipath_sdma_head_dma;
-		sdma_head_phys = dd->ipath_sdma_head_phys;
-		dd->ipath_sdma_head_dma = NULL;
-		dd->ipath_sdma_head_phys = 0;
-	}
-
-	if (dd->ipath_sdma_descq) {
-		sdma_descq = dd->ipath_sdma_descq;
-		sdma_descq_phys = dd->ipath_sdma_descq_phys;
-		dd->ipath_sdma_descq = NULL;
-		dd->ipath_sdma_descq_phys = 0;
-	}
-
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-	if (sdma_head_dma)
-		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-				  sdma_head_dma, sdma_head_phys);
-
-	if (sdma_descq)
-		dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
-				  sdma_descq, sdma_descq_phys);
-}
-
-/*
- * [Re]start SDMA, if we use it, and it's not already OK.
- * This is called on transition to link ACTIVE, either the first or
- * subsequent times.
- */
-void ipath_restart_sdma(struct ipath_devdata *dd)
-{
-	unsigned long flags;
-	int needed = 1;
-
-	if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
-		goto bail;
-
-	/*
-	 * First, make sure we should, which is to say,
-	 * check that we are "RUNNING" (not in teardown)
-	 * and not "SHUTDOWN"
-	 */
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-	if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)
-		|| test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-			needed = 0;
-	else {
-		__clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-		__clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
-		__clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-	}
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-	if (!needed) {
-		ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
-			dd->ipath_sdma_status);
-		goto bail;
-	}
-	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-	/*
-	 * First clear, just to be safe. Enable is only done
-	 * in chip on 0->1 transition
-	 */
-	dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-	/* notify upper layers */
-	ipath_ib_piobufavail(dd->verbs_dev);
-
-bail:
-	return;
-}
-
-static inline void make_sdma_desc(struct ipath_devdata *dd,
-	u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset)
-{
-	WARN_ON(addr & 3);
-	/* SDmaPhyAddr[47:32] */
-	sdmadesc[1] = addr >> 32;
-	/* SDmaPhyAddr[31:0] */
-	sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
-	/* SDmaGeneration[1:0] */
-	sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30;
-	/* SDmaDwordCount[10:0] */
-	sdmadesc[0] |= (dwlen & 0x7ffULL) << 16;
-	/* SDmaBufOffset[12:2] */
-	sdmadesc[0] |= dwoffset & 0x7ffULL;
-}
-
-/*
- * This function queues one IB packet onto the send DMA queue per call.
- * The caller is responsible for checking:
- * 1) The number of send DMA descriptor entries is less than the size of
- *    the descriptor queue.
- * 2) The IB SGE addresses and lengths are 32-bit aligned
- *    (except possibly the last SGE's length)
- * 3) The SGE addresses are suitable for passing to dma_map_single().
- */
-int ipath_sdma_verbs_send(struct ipath_devdata *dd,
-	struct ipath_sge_state *ss, u32 dwords,
-	struct ipath_verbs_txreq *tx)
-{
-
-	unsigned long flags;
-	struct ipath_sge *sge;
-	int ret = 0;
-	u16 tail;
-	__le64 *descqp;
-	u64 sdmadesc[2];
-	u32 dwoffset;
-	dma_addr_t addr;
-
-	if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) {
-		ipath_dbg("packet size %X > ibmax %X, fail\n",
-			tx->map_len + (dwords<<2), dd->ipath_ibmaxlen);
-		ret = -EMSGSIZE;
-		goto fail;
-	}
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-retry:
-	if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) {
-		ret = -EBUSY;
-		goto unlock;
-	}
-
-	if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) {
-		if (ipath_sdma_make_progress(dd))
-			goto retry;
-		ret = -ENOBUFS;
-		goto unlock;
-	}
-
-	addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
-			      tx->map_len, DMA_TO_DEVICE);
-	if (dma_mapping_error(&dd->pcidev->dev, addr))
-		goto ioerr;
-
-	dwoffset = tx->map_len >> 2;
-	make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
-
-	/* SDmaFirstDesc */
-	sdmadesc[0] |= 1ULL << 12;
-	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
-		sdmadesc[0] |= 1ULL << 14;	/* SDmaUseLargeBuf */
-
-	/* write to the descq */
-	tail = dd->ipath_sdma_descq_tail;
-	descqp = &dd->ipath_sdma_descq[tail].qw[0];
-	*descqp++ = cpu_to_le64(sdmadesc[0]);
-	*descqp++ = cpu_to_le64(sdmadesc[1]);
-
-	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC)
-		tx->txreq.start_idx = tail;
-
-	/* increment the tail */
-	if (++tail == dd->ipath_sdma_descq_cnt) {
-		tail = 0;
-		descqp = &dd->ipath_sdma_descq[0].qw[0];
-		++dd->ipath_sdma_generation;
-	}
-
-	sge = &ss->sge;
-	while (dwords) {
-		u32 dw;
-		u32 len;
-
-		len = dwords << 2;
-		if (len > sge->length)
-			len = sge->length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
-		dw = (len + 3) >> 2;
-		addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
-				      DMA_TO_DEVICE);
-		if (dma_mapping_error(&dd->pcidev->dev, addr))
-			goto unmap;
-		make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
-		/* SDmaUseLargeBuf has to be set in every descriptor */
-		if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
-			sdmadesc[0] |= 1ULL << 14;
-		/* write to the descq */
-		*descqp++ = cpu_to_le64(sdmadesc[0]);
-		*descqp++ = cpu_to_le64(sdmadesc[1]);
-
-		/* increment the tail */
-		if (++tail == dd->ipath_sdma_descq_cnt) {
-			tail = 0;
-			descqp = &dd->ipath_sdma_descq[0].qw[0];
-			++dd->ipath_sdma_generation;
-		}
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (--ss->num_sge)
-				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
-			if (++sge->n >= IPATH_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-
-		dwoffset += dw;
-		dwords -= dw;
-	}
-
-	if (!tail)
-		descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
-	descqp -= 2;
-	/* SDmaLastDesc */
-	descqp[0] |= cpu_to_le64(1ULL << 11);
-	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
-		/* SDmaIntReq */
-		descqp[0] |= cpu_to_le64(1ULL << 15);
-	}
-
-	/* Commit writes to memory and advance the tail on the chip */
-	wmb();
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
-
-	tx->txreq.next_descq_idx = tail;
-	tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
-	dd->ipath_sdma_descq_tail = tail;
-	dd->ipath_sdma_descq_added += tx->txreq.sg_count;
-	list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
-	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
-		vl15_watchdog_enq(dd);
-	goto unlock;
-
-unmap:
-	while (tail != dd->ipath_sdma_descq_tail) {
-		if (!tail)
-			tail = dd->ipath_sdma_descq_cnt - 1;
-		else
-			tail--;
-		unmap_desc(dd, tail);
-	}
-ioerr:
-	ret = -EIO;
-unlock:
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-fail:
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c
deleted file mode 100644
index 2627198..0000000
--- a/drivers/infiniband/hw/ipath/ipath_srq.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_post_srq_receive - post a receive on a shared receive queue
- * @ibsrq: the SRQ to post the receive on
- * @wr: the list of work requests to post
- * @bad_wr: the first WR to cause a problem is put here
- *
- * This may be called from interrupt context.
- */
-int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
-			   struct ib_recv_wr **bad_wr)
-{
-	struct ipath_srq *srq = to_isrq(ibsrq);
-	struct ipath_rwq *wq;
-	unsigned long flags;
-	int ret;
-
-	for (; wr; wr = wr->next) {
-		struct ipath_rwqe *wqe;
-		u32 next;
-		int i;
-
-		if ((unsigned) wr->num_sge > srq->rq.max_sge) {
-			*bad_wr = wr;
-			ret = -EINVAL;
-			goto bail;
-		}
-
-		spin_lock_irqsave(&srq->rq.lock, flags);
-		wq = srq->rq.wq;
-		next = wq->head + 1;
-		if (next >= srq->rq.size)
-			next = 0;
-		if (next == wq->tail) {
-			spin_unlock_irqrestore(&srq->rq.lock, flags);
-			*bad_wr = wr;
-			ret = -ENOMEM;
-			goto bail;
-		}
-
-		wqe = get_rwqe_ptr(&srq->rq, wq->head);
-		wqe->wr_id = wr->wr_id;
-		wqe->num_sge = wr->num_sge;
-		for (i = 0; i < wr->num_sge; i++)
-			wqe->sg_list[i] = wr->sg_list[i];
-		/* Make sure queue entry is written before the head index. */
-		smp_wmb();
-		wq->head = next;
-		spin_unlock_irqrestore(&srq->rq.lock, flags);
-	}
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_create_srq - create a shared receive queue
- * @ibpd: the protection domain of the SRQ to create
- * @srq_init_attr: the attributes of the SRQ
- * @udata: data from libipathverbs when creating a user SRQ
- */
-struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
-				struct ib_srq_init_attr *srq_init_attr,
-				struct ib_udata *udata)
-{
-	struct ipath_ibdev *dev = to_idev(ibpd->device);
-	struct ipath_srq *srq;
-	u32 sz;
-	struct ib_srq *ret;
-
-	if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
-		ret = ERR_PTR(-ENOSYS);
-		goto done;
-	}
-
-	if (srq_init_attr->attr.max_wr == 0) {
-		ret = ERR_PTR(-EINVAL);
-		goto done;
-	}
-
-	if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
-	    (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
-		ret = ERR_PTR(-EINVAL);
-		goto done;
-	}
-
-	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
-	if (!srq) {
-		ret = ERR_PTR(-ENOMEM);
-		goto done;
-	}
-
-	/*
-	 * Need to use vmalloc() if we want to support large #s of entries.
-	 */
-	srq->rq.size = srq_init_attr->attr.max_wr + 1;
-	srq->rq.max_sge = srq_init_attr->attr.max_sge;
-	sz = sizeof(struct ib_sge) * srq->rq.max_sge +
-		sizeof(struct ipath_rwqe);
-	srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
-	if (!srq->rq.wq) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail_srq;
-	}
-
-	/*
-	 * Return the address of the RWQ as the offset to mmap.
-	 * See ipath_mmap() for details.
-	 */
-	if (udata && udata->outlen >= sizeof(__u64)) {
-		int err;
-		u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz;
-
-		srq->ip =
-		    ipath_create_mmap_info(dev, s,
-					   ibpd->uobject->context,
-					   srq->rq.wq);
-		if (!srq->ip) {
-			ret = ERR_PTR(-ENOMEM);
-			goto bail_wq;
-		}
-
-		err = ib_copy_to_udata(udata, &srq->ip->offset,
-				       sizeof(srq->ip->offset));
-		if (err) {
-			ret = ERR_PTR(err);
-			goto bail_ip;
-		}
-	} else
-		srq->ip = NULL;
-
-	/*
-	 * ib_create_srq() will initialize srq->ibsrq.
-	 */
-	spin_lock_init(&srq->rq.lock);
-	srq->rq.wq->head = 0;
-	srq->rq.wq->tail = 0;
-	srq->limit = srq_init_attr->attr.srq_limit;
-
-	spin_lock(&dev->n_srqs_lock);
-	if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
-		spin_unlock(&dev->n_srqs_lock);
-		ret = ERR_PTR(-ENOMEM);
-		goto bail_ip;
-	}
-
- 	dev->n_srqs_allocated++;
-	spin_unlock(&dev->n_srqs_lock);
-
-	if (srq->ip) {
-		spin_lock_irq(&dev->pending_lock);
-		list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
-		spin_unlock_irq(&dev->pending_lock);
-	}
-
-	ret = &srq->ibsrq;
-	goto done;
-
-bail_ip:
-	kfree(srq->ip);
-bail_wq:
-	vfree(srq->rq.wq);
-bail_srq:
-	kfree(srq);
-done:
-	return ret;
-}
-
-/**
- * ipath_modify_srq - modify a shared receive queue
- * @ibsrq: the SRQ to modify
- * @attr: the new attributes of the SRQ
- * @attr_mask: indicates which attributes to modify
- * @udata: user data for ipathverbs.so
- */
-int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-		     enum ib_srq_attr_mask attr_mask,
-		     struct ib_udata *udata)
-{
-	struct ipath_srq *srq = to_isrq(ibsrq);
-	struct ipath_rwq *wq;
-	int ret = 0;
-
-	if (attr_mask & IB_SRQ_MAX_WR) {
-		struct ipath_rwq *owq;
-		struct ipath_rwqe *p;
-		u32 sz, size, n, head, tail;
-
-		/* Check that the requested sizes are below the limits. */
-		if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
-		    ((attr_mask & IB_SRQ_LIMIT) ?
-		     attr->srq_limit : srq->limit) > attr->max_wr) {
-			ret = -EINVAL;
-			goto bail;
-		}
-
-		sz = sizeof(struct ipath_rwqe) +
-			srq->rq.max_sge * sizeof(struct ib_sge);
-		size = attr->max_wr + 1;
-		wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
-		if (!wq) {
-			ret = -ENOMEM;
-			goto bail;
-		}
-
-		/* Check that we can write the offset to mmap. */
-		if (udata && udata->inlen >= sizeof(__u64)) {
-			__u64 offset_addr;
-			__u64 offset = 0;
-
-			ret = ib_copy_from_udata(&offset_addr, udata,
-						 sizeof(offset_addr));
-			if (ret)
-				goto bail_free;
-			udata->outbuf =
-				(void __user *) (unsigned long) offset_addr;
-			ret = ib_copy_to_udata(udata, &offset,
-					       sizeof(offset));
-			if (ret)
-				goto bail_free;
-		}
-
-		spin_lock_irq(&srq->rq.lock);
-		/*
-		 * validate head pointer value and compute
-		 * the number of remaining WQEs.
-		 */
-		owq = srq->rq.wq;
-		head = owq->head;
-		if (head >= srq->rq.size)
-			head = 0;
-		tail = owq->tail;
-		if (tail >= srq->rq.size)
-			tail = 0;
-		n = head;
-		if (n < tail)
-			n += srq->rq.size - tail;
-		else
-			n -= tail;
-		if (size <= n) {
-			ret = -EINVAL;
-			goto bail_unlock;
-		}
-		n = 0;
-		p = wq->wq;
-		while (tail != head) {
-			struct ipath_rwqe *wqe;
-			int i;
-
-			wqe = get_rwqe_ptr(&srq->rq, tail);
-			p->wr_id = wqe->wr_id;
-			p->num_sge = wqe->num_sge;
-			for (i = 0; i < wqe->num_sge; i++)
-				p->sg_list[i] = wqe->sg_list[i];
-			n++;
-			p = (struct ipath_rwqe *)((char *) p + sz);
-			if (++tail >= srq->rq.size)
-				tail = 0;
-		}
-		srq->rq.wq = wq;
-		srq->rq.size = size;
-		wq->head = n;
-		wq->tail = 0;
-		if (attr_mask & IB_SRQ_LIMIT)
-			srq->limit = attr->srq_limit;
-		spin_unlock_irq(&srq->rq.lock);
-
-		vfree(owq);
-
-		if (srq->ip) {
-			struct ipath_mmap_info *ip = srq->ip;
-			struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
-			u32 s = sizeof(struct ipath_rwq) + size * sz;
-
-			ipath_update_mmap_info(dev, ip, s, wq);
-
-			/*
-			 * Return the offset to mmap.
-			 * See ipath_mmap() for details.
-			 */
-			if (udata && udata->inlen >= sizeof(__u64)) {
-				ret = ib_copy_to_udata(udata, &ip->offset,
-						       sizeof(ip->offset));
-				if (ret)
-					goto bail;
-			}
-
-			spin_lock_irq(&dev->pending_lock);
-			if (list_empty(&ip->pending_mmaps))
-				list_add(&ip->pending_mmaps,
-					 &dev->pending_mmaps);
-			spin_unlock_irq(&dev->pending_lock);
-		}
-	} else if (attr_mask & IB_SRQ_LIMIT) {
-		spin_lock_irq(&srq->rq.lock);
-		if (attr->srq_limit >= srq->rq.size)
-			ret = -EINVAL;
-		else
-			srq->limit = attr->srq_limit;
-		spin_unlock_irq(&srq->rq.lock);
-	}
-	goto bail;
-
-bail_unlock:
-	spin_unlock_irq(&srq->rq.lock);
-bail_free:
-	vfree(wq);
-bail:
-	return ret;
-}
-
-int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
-{
-	struct ipath_srq *srq = to_isrq(ibsrq);
-
-	attr->max_wr = srq->rq.size - 1;
-	attr->max_sge = srq->rq.max_sge;
-	attr->srq_limit = srq->limit;
-	return 0;
-}
-
-/**
- * ipath_destroy_srq - destroy a shared receive queue
- * @ibsrq: the SRQ to destroy
- */
-int ipath_destroy_srq(struct ib_srq *ibsrq)
-{
-	struct ipath_srq *srq = to_isrq(ibsrq);
-	struct ipath_ibdev *dev = to_idev(ibsrq->device);
-
-	spin_lock(&dev->n_srqs_lock);
-	dev->n_srqs_allocated--;
-	spin_unlock(&dev->n_srqs_lock);
-	if (srq->ip)
-		kref_put(&srq->ip->ref, ipath_release_mmap_info);
-	else
-		vfree(srq->rq.wq);
-	kfree(srq);
-
-	return 0;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c
deleted file mode 100644
index f63e143..0000000
--- a/drivers/infiniband/hw/ipath/ipath_stats.c
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ipath_kernel.h"
-
-struct infinipath_stats ipath_stats;
-
-/**
- * ipath_snap_cntr - snapshot a chip counter
- * @dd: the infinipath device
- * @creg: the counter to snapshot
- *
- * called from add_timer and user counter read calls, to deal with
- * counters that wrap in "human time".  The words sent and received, and
- * the packets sent and received are all that we worry about.  For now,
- * at least, we don't worry about error counters, because if they wrap
- * that quickly, we probably don't care.  We may eventually just make this
- * handle all the counters.  word counters can wrap in about 20 seconds
- * of full bandwidth traffic, packet counters in a few hours.
- */
-
-u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
-{
-	u32 val, reg64 = 0;
-	u64 val64;
-	unsigned long t0, t1;
-	u64 ret;
-
-	t0 = jiffies;
-	/* If fast increment counters are only 32 bits, snapshot them,
-	 * and maintain them as 64bit values in the driver */
-	if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
-	    (creg == dd->ipath_cregs->cr_wordsendcnt ||
-	     creg == dd->ipath_cregs->cr_wordrcvcnt ||
-	     creg == dd->ipath_cregs->cr_pktsendcnt ||
-	     creg == dd->ipath_cregs->cr_pktrcvcnt)) {
-		val64 = ipath_read_creg(dd, creg);
-		val = val64 == ~0ULL ? ~0U : 0;
-		reg64 = 1;
-	} else			/* val64 just to keep gcc quiet... */
-		val64 = val = ipath_read_creg32(dd, creg);
-	/*
-	 * See if a second has passed.  This is just a way to detect things
-	 * that are quite broken.  Normally this should take just a few
-	 * cycles (the check is for long enough that we don't care if we get
-	 * pre-empted.)  An Opteron HT O read timeout is 4 seconds with
-	 * normal NB values
-	 */
-	t1 = jiffies;
-	if (time_before(t0 + HZ, t1) && val == -1) {
-		ipath_dev_err(dd, "Error!  Read counter 0x%x timed out\n",
-			      creg);
-		ret = 0ULL;
-		goto bail;
-	}
-	if (reg64) {
-		ret = val64;
-		goto bail;
-	}
-
-	if (creg == dd->ipath_cregs->cr_wordsendcnt) {
-		if (val != dd->ipath_lastsword) {
-			dd->ipath_sword += val - dd->ipath_lastsword;
-			dd->ipath_lastsword = val;
-		}
-		val64 = dd->ipath_sword;
-	} else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
-		if (val != dd->ipath_lastrword) {
-			dd->ipath_rword += val - dd->ipath_lastrword;
-			dd->ipath_lastrword = val;
-		}
-		val64 = dd->ipath_rword;
-	} else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
-		if (val != dd->ipath_lastspkts) {
-			dd->ipath_spkts += val - dd->ipath_lastspkts;
-			dd->ipath_lastspkts = val;
-		}
-		val64 = dd->ipath_spkts;
-	} else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
-		if (val != dd->ipath_lastrpkts) {
-			dd->ipath_rpkts += val - dd->ipath_lastrpkts;
-			dd->ipath_lastrpkts = val;
-		}
-		val64 = dd->ipath_rpkts;
-	} else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
-		if (dd->ibdeltainprog)
-			val64 -= val64 - dd->ibsymsnap;
-		val64 -= dd->ibsymdelta;
-	} else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
-		if (dd->ibdeltainprog)
-			val64 -= val64 - dd->iblnkerrsnap;
-		val64 -= dd->iblnkerrdelta;
-	} else
-		val64 = (u64) val;
-
-	ret = val64;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
- * @dd: the infinipath device
- *
- * print the delta of egrfull/hdrqfull errors for kernel ports no more than
- * every 5 seconds.  User processes are printed at close, but kernel doesn't
- * close, so...  Separate routine so may call from other places someday, and
- * so function name when printed by _IPATH_INFO is meaningfull
- */
-static void ipath_qcheck(struct ipath_devdata *dd)
-{
-	static u64 last_tot_hdrqfull;
-	struct ipath_portdata *pd = dd->ipath_pd[0];
-	size_t blen = 0;
-	char buf[128];
-	u32 hdrqtail;
-
-	*buf = 0;
-	if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
-		blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
-				pd->port_hdrqfull -
-				dd->ipath_p0_hdrqfull);
-		dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
-	}
-	if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
-		blen += snprintf(buf + blen, sizeof buf - blen,
-				 "%srcvegrfull %llu",
-				 blen ? ", " : "",
-				 (unsigned long long)
-				 (ipath_stats.sps_etidfull -
-				  dd->ipath_last_tidfull));
-		dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
-	}
-
-	/*
-	 * this is actually the number of hdrq full interrupts, not actual
-	 * events, but at the moment that's mostly what I'm interested in.
-	 * Actual count, etc. is in the counters, if needed.  For production
-	 * users this won't ordinarily be printed.
-	 */
-
-	if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
-	    ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
-		blen += snprintf(buf + blen, sizeof buf - blen,
-				 "%shdrqfull %llu (all ports)",
-				 blen ? ", " : "",
-				 (unsigned long long)
-				 (ipath_stats.sps_hdrqfull -
-				  last_tot_hdrqfull));
-		last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
-	}
-	if (blen)
-		ipath_dbg("%s\n", buf);
-
-	hdrqtail = ipath_get_hdrqtail(pd);
-	if (pd->port_head != hdrqtail) {
-		if (dd->ipath_lastport0rcv_cnt ==
-		    ipath_stats.sps_port0pkts) {
-			ipath_cdbg(PKT, "missing rcv interrupts? "
-				   "port0 hd=%x tl=%x; port0pkts %llx; write"
-				   " hd (w/intr)\n",
-				   pd->port_head, hdrqtail,
-				   (unsigned long long)
-				   ipath_stats.sps_port0pkts);
-			ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail |
-				dd->ipath_rhdrhead_intr_off, pd->port_port);
-		}
-		dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
-	}
-}
-
-static void ipath_chk_errormask(struct ipath_devdata *dd)
-{
-	static u32 fixed;
-	u32 ctrl;
-	unsigned long errormask;
-	unsigned long hwerrs;
-
-	if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
-		return;
-
-	errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
-
-	if (errormask == dd->ipath_errormask)
-		return;
-	fixed++;
-
-	hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
-	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-		dd->ipath_errormask);
-
-	if ((hwerrs & dd->ipath_hwerrmask) ||
-		(ctrl & INFINIPATH_C_FREEZEMODE)) {
-		/* force re-interrupt of pending events, just in case */
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-		dev_info(&dd->pcidev->dev,
-			"errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
-			fixed, errormask, (unsigned long)dd->ipath_errormask,
-			ctrl, hwerrs);
-	} else
-		ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
-			fixed, errormask,
-			(unsigned long)dd->ipath_errormask);
-}
-
-
-/**
- * ipath_get_faststats - get word counters from chip before they overflow
- * @opaque - contains a pointer to the infinipath device ipath_devdata
- *
- * called from add_timer
- */
-void ipath_get_faststats(unsigned long opaque)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-	int i;
-	static unsigned cnt;
-	unsigned long flags;
-	u64 traffic_wds;
-
-	/*
-	 * don't access the chip while running diags, or memory diags can
-	 * fail
-	 */
-	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) ||
-	    ipath_diag_inuse)
-		/* but re-arm the timer, for diags case; won't hurt other */
-		goto done;
-
-	/*
-	 * We now try to maintain a "active timer", based on traffic
-	 * exceeding a threshold, so we need to check the word-counts
-	 * even if they are 64-bit.
-	 */
-	traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
-	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-	traffic_wds -= dd->ipath_traffic_wds;
-	dd->ipath_traffic_wds += traffic_wds;
-	if (traffic_wds  >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
-		atomic_add(5, &dd->ipath_active_time); /* S/B #define */
-	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-
-	if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
-	}
-
-	ipath_qcheck(dd);
-
-	/*
-	 * deal with repeat error suppression.  Doesn't really matter if
-	 * last error was almost a full interval ago, or just a few usecs
-	 * ago; still won't get more than 2 per interval.  We may want
-	 * longer intervals for this eventually, could do with mod, counter
-	 * or separate timer.  Also see code in ipath_handle_errors() and
-	 * ipath_handle_hwerrors().
-	 */
-
-	if (dd->ipath_lasterror)
-		dd->ipath_lasterror = 0;
-	if (dd->ipath_lasthwerror)
-		dd->ipath_lasthwerror = 0;
-	if (dd->ipath_maskederrs
-	    && time_after(jiffies, dd->ipath_unmasktime)) {
-		char ebuf[256];
-		int iserr;
-		iserr = ipath_decode_err(dd, ebuf, sizeof ebuf,
-					 dd->ipath_maskederrs);
-		if (dd->ipath_maskederrs &
-		    ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-		      INFINIPATH_E_PKTERRS))
-			ipath_dev_err(dd, "Re-enabling masked errors "
-				      "(%s)\n", ebuf);
-		else {
-			/*
-			 * rcvegrfull and rcvhdrqfull are "normal", for some
-			 * types of processes (mostly benchmarks) that send
-			 * huge numbers of messages, while not processing
-			 * them.  So only complain about these at debug
-			 * level.
-			 */
-			if (iserr)
-				ipath_dbg(
-					"Re-enabling queue full errors (%s)\n",
-					ebuf);
-			else
-				ipath_cdbg(ERRPKT, "Re-enabling packet"
-					" problem interrupt (%s)\n", ebuf);
-		}
-
-		/* re-enable masked errors */
-		dd->ipath_errormask |= dd->ipath_maskederrs;
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-				 dd->ipath_errormask);
-		dd->ipath_maskederrs = 0;
-	}
-
-	/* limit qfull messages to ~one per minute per port */
-	if ((++cnt & 0x10)) {
-		for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
-			struct ipath_portdata *pd = dd->ipath_pd[i];
-
-			if (pd && pd->port_lastrcvhdrqtail != -1)
-				pd->port_lastrcvhdrqtail = -1;
-		}
-	}
-
-	ipath_chk_errormask(dd);
-done:
-	mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
deleted file mode 100644
index 75558f3..0000000
--- a/drivers/infiniband/hw/ipath/ipath_sysfs.c
+++ /dev/null
@@ -1,1238 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/ctype.h>
-#include <linux/stat.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-/**
- * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
- * @str: the string containing the number
- * @valp: where to put the result
- *
- * returns the number of bytes consumed, or negative value on error
- */
-int ipath_parse_ushort(const char *str, unsigned short *valp)
-{
-	unsigned long val;
-	char *end;
-	int ret;
-
-	if (!isdigit(str[0])) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	val = simple_strtoul(str, &end, 0);
-
-	if (val > 0xffff) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	*valp = val;
-
-	ret = end + 1 - str;
-	if (ret == 0)
-		ret = -EINVAL;
-
-bail:
-	return ret;
-}
-
-static ssize_t show_version(struct device_driver *dev, char *buf)
-{
-	/* The string printed here is already newline-terminated. */
-	return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version);
-}
-
-static ssize_t show_num_units(struct device_driver *dev, char *buf)
-{
-	return scnprintf(buf, PAGE_SIZE, "%d\n",
-			 ipath_count_units(NULL, NULL, NULL));
-}
-
-static ssize_t show_status(struct device *dev,
-			   struct device_attribute *attr,
-			   char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	ssize_t ret;
-
-	if (!dd->ipath_statusp) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
-			(unsigned long long) *(dd->ipath_statusp));
-
-bail:
-	return ret;
-}
-
-static const char *ipath_status_str[] = {
-	"Initted",
-	"Disabled",
-	"Admin_Disabled",
-	"", /* This used to be the old "OIB_SMA" status. */
-	"", /* This used to be the old "SMA" status. */
-	"Present",
-	"IB_link_up",
-	"IB_configured",
-	"NoIBcable",
-	"Fatal_Hardware_Error",
-	NULL,
-};
-
-static ssize_t show_status_str(struct device *dev,
-			       struct device_attribute *attr,
-			       char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int i, any;
-	u64 s;
-	ssize_t ret;
-
-	if (!dd->ipath_statusp) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	s = *(dd->ipath_statusp);
-	*buf = '\0';
-	for (any = i = 0; s && ipath_status_str[i]; i++) {
-		if (s & 1) {
-			if (any && strlcat(buf, " ", PAGE_SIZE) >=
-			    PAGE_SIZE)
-				/* overflow */
-				break;
-			if (strlcat(buf, ipath_status_str[i],
-				    PAGE_SIZE) >= PAGE_SIZE)
-				break;
-			any = 1;
-		}
-		s >>= 1;
-	}
-	if (any)
-		strlcat(buf, "\n", PAGE_SIZE);
-
-	ret = strlen(buf);
-
-bail:
-	return ret;
-}
-
-static ssize_t show_boardversion(struct device *dev,
-			       struct device_attribute *attr,
-			       char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	/* The string printed here is already newline-terminated. */
-	return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
-}
-
-static ssize_t show_localbus_info(struct device *dev,
-			       struct device_attribute *attr,
-			       char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	/* The string printed here is already newline-terminated. */
-	return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info);
-}
-
-static ssize_t show_lmc(struct device *dev,
-			struct device_attribute *attr,
-			char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
-}
-
-static ssize_t store_lmc(struct device *dev,
-			 struct device_attribute *attr,
-			 const char *buf,
-			 size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	u16 lmc = 0;
-	int ret;
-
-	ret = ipath_parse_ushort(buf, &lmc);
-	if (ret < 0)
-		goto invalid;
-
-	if (lmc > 7) {
-		ret = -EINVAL;
-		goto invalid;
-	}
-
-	ipath_set_lid(dd, dd->ipath_lid, lmc);
-
-	goto bail;
-invalid:
-	ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
-bail:
-	return ret;
-}
-
-static ssize_t show_lid(struct device *dev,
-			struct device_attribute *attr,
-			char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
-}
-
-static ssize_t store_lid(struct device *dev,
-			 struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	u16 lid = 0;
-	int ret;
-
-	ret = ipath_parse_ushort(buf, &lid);
-	if (ret < 0)
-		goto invalid;
-
-	if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) {
-		ret = -EINVAL;
-		goto invalid;
-	}
-
-	ipath_set_lid(dd, lid, dd->ipath_lmc);
-
-	goto bail;
-invalid:
-	ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid);
-bail:
-	return ret;
-}
-
-static ssize_t show_mlid(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
-}
-
-static ssize_t store_mlid(struct device *dev,
-			 struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	u16 mlid;
-	int ret;
-
-	ret = ipath_parse_ushort(buf, &mlid);
-	if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE)
-		goto invalid;
-
-	dd->ipath_mlid = mlid;
-
-	goto bail;
-invalid:
-	ipath_dev_err(dd, "attempt to set invalid MLID\n");
-bail:
-	return ret;
-}
-
-static ssize_t show_guid(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	u8 *guid;
-
-	guid = (u8 *) & (dd->ipath_guid);
-
-	return scnprintf(buf, PAGE_SIZE,
-			 "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
-			 guid[0], guid[1], guid[2], guid[3],
-			 guid[4], guid[5], guid[6], guid[7]);
-}
-
-static ssize_t store_guid(struct device *dev,
-			 struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	ssize_t ret;
-	unsigned short guid[8];
-	__be64 new_guid;
-	u8 *ng;
-	int i;
-
-	if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
-		   &guid[0], &guid[1], &guid[2], &guid[3],
-		   &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
-		goto invalid;
-
-	ng = (u8 *) &new_guid;
-
-	for (i = 0; i < 8; i++) {
-		if (guid[i] > 0xff)
-			goto invalid;
-		ng[i] = guid[i];
-	}
-
-	if (new_guid == 0)
-		goto invalid;
-
-	dd->ipath_guid = new_guid;
-	dd->ipath_nguid = 1;
-	if (dd->verbs_dev)
-		dd->verbs_dev->ibdev.node_guid = new_guid;
-
-	ret = strlen(buf);
-	goto bail;
-
-invalid:
-	ipath_dev_err(dd, "attempt to set invalid GUID\n");
-	ret = -EINVAL;
-
-bail:
-	return ret;
-}
-
-static ssize_t show_nguid(struct device *dev,
-			  struct device_attribute *attr,
-			  char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
-}
-
-static ssize_t show_nports(struct device *dev,
-			   struct device_attribute *attr,
-			   char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	/* Return the number of user ports available. */
-	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
-}
-
-static ssize_t show_serial(struct device *dev,
-			   struct device_attribute *attr,
-			   char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	buf[sizeof dd->ipath_serial] = '\0';
-	memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
-	strcat(buf, "\n");
-	return strlen(buf);
-}
-
-static ssize_t show_unit(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
-}
-
-static ssize_t show_jint_max_packets(struct device *dev,
-				     struct device_attribute *attr,
-				     char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
-}
-
-static ssize_t store_jint_max_packets(struct device *dev,
-				      struct device_attribute *attr,
-				      const char *buf,
-				      size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	u16 v = 0;
-	int ret;
-
-	ret = ipath_parse_ushort(buf, &v);
-	if (ret < 0)
-		ipath_dev_err(dd, "invalid jint_max_packets.\n");
-	else
-		dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
-
-	return ret;
-}
-
-static ssize_t show_jint_idle_ticks(struct device *dev,
-				    struct device_attribute *attr,
-				    char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
-}
-
-static ssize_t store_jint_idle_ticks(struct device *dev,
-				     struct device_attribute *attr,
-				     const char *buf,
-				     size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	u16 v = 0;
-	int ret;
-
-	ret = ipath_parse_ushort(buf, &v);
-	if (ret < 0)
-		ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
-	else
-		dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
-
-	return ret;
-}
-
-#define DEVICE_COUNTER(name, attr) \
-	static ssize_t show_counter_##name(struct device *dev, \
-					   struct device_attribute *attr, \
-					   char *buf) \
-	{ \
-		struct ipath_devdata *dd = dev_get_drvdata(dev); \
-		return scnprintf(\
-			buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
-			ipath_snap_cntr( \
-				dd, offsetof(struct infinipath_counters, \
-					     attr) / sizeof(u64)));	\
-	} \
-	static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
-
-DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
-DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
-DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
-DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
-DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
-DEVICE_COUNTER(lb_ints, LBIntCnt);
-DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
-DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
-DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
-DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
-DEVICE_COUNTER(rx_dwords, RxDwordCnt);
-DEVICE_COUNTER(rx_ebps, RxEBPCnt);
-DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
-DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
-DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
-DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
-DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
-DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
-DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
-DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
-DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
-DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
-DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
-DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
-DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
-DEVICE_COUNTER(tx_dwords, TxDwordCnt);
-DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
-DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
-DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
-DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
-DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
-DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
-
-static struct attribute *dev_counter_attributes[] = {
-	&dev_attr_ib_link_downeds.attr,
-	&dev_attr_ib_link_err_recoveries.attr,
-	&dev_attr_ib_status_changes.attr,
-	&dev_attr_ib_symbol_errs.attr,
-	&dev_attr_lb_flow_stalls.attr,
-	&dev_attr_lb_ints.attr,
-	&dev_attr_rx_bad_formats.attr,
-	&dev_attr_rx_buf_ovfls.attr,
-	&dev_attr_rx_data_pkts.attr,
-	&dev_attr_rx_dropped_pkts.attr,
-	&dev_attr_rx_dwords.attr,
-	&dev_attr_rx_ebps.attr,
-	&dev_attr_rx_flow_ctrl_errs.attr,
-	&dev_attr_rx_flow_pkts.attr,
-	&dev_attr_rx_icrc_errs.attr,
-	&dev_attr_rx_len_errs.attr,
-	&dev_attr_rx_link_problems.attr,
-	&dev_attr_rx_lpcrc_errs.attr,
-	&dev_attr_rx_max_min_len_errs.attr,
-	&dev_attr_rx_p0_hdr_egr_ovfls.attr,
-	&dev_attr_rx_p1_hdr_egr_ovfls.attr,
-	&dev_attr_rx_p2_hdr_egr_ovfls.attr,
-	&dev_attr_rx_p3_hdr_egr_ovfls.attr,
-	&dev_attr_rx_p4_hdr_egr_ovfls.attr,
-	&dev_attr_rx_p5_hdr_egr_ovfls.attr,
-	&dev_attr_rx_p6_hdr_egr_ovfls.attr,
-	&dev_attr_rx_p7_hdr_egr_ovfls.attr,
-	&dev_attr_rx_p8_hdr_egr_ovfls.attr,
-	&dev_attr_rx_pkey_mismatches.attr,
-	&dev_attr_rx_tid_full_errs.attr,
-	&dev_attr_rx_tid_valid_errs.attr,
-	&dev_attr_rx_vcrc_errs.attr,
-	&dev_attr_tx_data_pkts.attr,
-	&dev_attr_tx_dropped_pkts.attr,
-	&dev_attr_tx_dwords.attr,
-	&dev_attr_tx_flow_pkts.attr,
-	&dev_attr_tx_flow_stalls.attr,
-	&dev_attr_tx_len_errs.attr,
-	&dev_attr_tx_max_min_len_errs.attr,
-	&dev_attr_tx_underruns.attr,
-	&dev_attr_tx_unsup_vl_errs.attr,
-	NULL
-};
-
-static struct attribute_group dev_counter_attr_group = {
-	.name = "counters",
-	.attrs = dev_counter_attributes
-};
-
-static ssize_t store_reset(struct device *dev,
-			 struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-
-	if (count < 5 || memcmp(buf, "reset", 5)) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	if (dd->ipath_flags & IPATH_DISABLED) {
-		/*
-		 * post-reset init would re-enable interrupts, etc.
-		 * so don't allow reset on disabled devices.  Not
-		 * perfect error, but about the best choice.
-		 */
-		dev_info(dev,"Unit %d is disabled, can't reset\n",
-			 dd->ipath_unit);
-		ret = -EINVAL;
-		goto bail;
-	}
-	ret = ipath_reset_device(dd->ipath_unit);
-bail:
-	return ret<0 ? ret : count;
-}
-
-static ssize_t store_link_state(struct device *dev,
-			 struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret, r;
-	u16 state;
-
-	ret = ipath_parse_ushort(buf, &state);
-	if (ret < 0)
-		goto invalid;
-
-	r = ipath_set_linkstate(dd, state);
-	if (r < 0) {
-		ret = r;
-		goto bail;
-	}
-
-	goto bail;
-invalid:
-	ipath_dev_err(dd, "attempt to set invalid link state\n");
-bail:
-	return ret;
-}
-
-static ssize_t show_mtu(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
-}
-
-static ssize_t store_mtu(struct device *dev,
-			 struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	ssize_t ret;
-	u16 mtu = 0;
-	int r;
-
-	ret = ipath_parse_ushort(buf, &mtu);
-	if (ret < 0)
-		goto invalid;
-
-	r = ipath_set_mtu(dd, mtu);
-	if (r < 0)
-		ret = r;
-
-	goto bail;
-invalid:
-	ipath_dev_err(dd, "attempt to set invalid MTU\n");
-bail:
-	return ret;
-}
-
-static ssize_t show_enabled(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	return scnprintf(buf, PAGE_SIZE, "%u\n",
-			 (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
-}
-
-static ssize_t store_enabled(struct device *dev,
-			 struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	ssize_t ret;
-	u16 enable = 0;
-
-	ret = ipath_parse_ushort(buf, &enable);
-	if (ret < 0) {
-		ipath_dev_err(dd, "attempt to use non-numeric on enable\n");
-		goto bail;
-	}
-
-	if (enable) {
-		if (!(dd->ipath_flags & IPATH_DISABLED))
-			goto bail;
-
-		dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
-		/* same as post-reset */
-		ret = ipath_init_chip(dd, 1);
-		if (ret)
-			ipath_dev_err(dd, "Failed to enable unit %d\n",
-				      dd->ipath_unit);
-		else {
-			dd->ipath_flags &= ~IPATH_DISABLED;
-			*dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
-		}
-	}
-	else if (!(dd->ipath_flags & IPATH_DISABLED)) {
-		dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
-		ipath_shutdown_device(dd);
-		dd->ipath_flags |= IPATH_DISABLED;
-		*dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
-	}
-
-bail:
-	return ret;
-}
-
-static ssize_t store_rx_pol_inv(struct device *dev,
-			  struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret, r;
-	u16 val;
-
-	ret = ipath_parse_ushort(buf, &val);
-	if (ret < 0)
-		goto invalid;
-
-	r = ipath_set_rx_pol_inv(dd, val);
-	if (r < 0) {
-		ret = r;
-		goto bail;
-	}
-
-	goto bail;
-invalid:
-	ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n");
-bail:
-	return ret;
-}
-
-static ssize_t store_led_override(struct device *dev,
-			  struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-	u16 val;
-
-	ret = ipath_parse_ushort(buf, &val);
-	if (ret > 0)
-		ipath_set_led_override(dd, val);
-	else
-		ipath_dev_err(dd, "attempt to set invalid LED override\n");
-	return ret;
-}
-
-static ssize_t show_logged_errs(struct device *dev,
-				struct device_attribute *attr,
-				char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int idx, count;
-
-	/* force consistency with actual EEPROM */
-	if (ipath_update_eeprom_log(dd) != 0)
-		return -ENXIO;
-
-	count = 0;
-	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-		count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
-			dd->ipath_eep_st_errs[idx],
-			idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' ');
-	}
-
-	return count;
-}
-
-/*
- * New sysfs entries to control various IB config. These all turn into
- * accesses via ipath_f_get/set_ib_cfg.
- *
- * Get/Set heartbeat enable. Or of 1=enabled, 2=auto
- */
-static ssize_t show_hrtbt_enb(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-
-	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
-	if (ret >= 0)
-		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-	return ret;
-}
-
-static ssize_t store_hrtbt_enb(struct device *dev,
-			  struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret, r;
-	u16 val;
-
-	ret = ipath_parse_ushort(buf, &val);
-	if (ret >= 0 && val > 3)
-		ret = -EINVAL;
-	if (ret < 0) {
-		ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
-		goto bail;
-	}
-
-	/*
-	 * Set the "intentional" heartbeat enable per either of
-	 * "Enable" and "Auto", as these are normally set together.
-	 * This bit is consulted when leaving loopback mode,
-	 * because entering loopback mode overrides it and automatically
-	 * disables heartbeat.
-	 */
-	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
-	if (r < 0)
-		ret = r;
-	else if (val == IPATH_IB_HRTBT_OFF)
-		dd->ipath_flags |= IPATH_NO_HRTBT;
-	else
-		dd->ipath_flags &= ~IPATH_NO_HRTBT;
-
-bail:
-	return ret;
-}
-
-/*
- * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric,
- * _not_ the particular encoding of any given chip)
- */
-static ssize_t show_lwid_enb(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-
-	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
-	if (ret >= 0)
-		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-	return ret;
-}
-
-static ssize_t store_lwid_enb(struct device *dev,
-			  struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret, r;
-	u16 val;
-
-	ret = ipath_parse_ushort(buf, &val);
-	if (ret >= 0 && (val == 0 || val > 3))
-		ret = -EINVAL;
-	if (ret < 0) {
-		ipath_dev_err(dd,
-			"attempt to set invalid Link Width (enable)\n");
-		goto bail;
-	}
-
-	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
-	if (r < 0)
-		ret = r;
-
-bail:
-	return ret;
-}
-
-/* Get current link width */
-static ssize_t show_lwid(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-
-	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
-	if (ret >= 0)
-		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-	return ret;
-}
-
-/*
- * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR.
- */
-static ssize_t show_spd_enb(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-
-	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
-	if (ret >= 0)
-		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-	return ret;
-}
-
-static ssize_t store_spd_enb(struct device *dev,
-			  struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret, r;
-	u16 val;
-
-	ret = ipath_parse_ushort(buf, &val);
-	if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
-		ret = -EINVAL;
-	if (ret < 0) {
-		ipath_dev_err(dd,
-			"attempt to set invalid Link Speed (enable)\n");
-		goto bail;
-	}
-
-	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
-	if (r < 0)
-		ret = r;
-
-bail:
-	return ret;
-}
-
-/* Get current link speed */
-static ssize_t show_spd(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-
-	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
-	if (ret >= 0)
-		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-	return ret;
-}
-
-/*
- * Get/Set RX polarity-invert enable. 0=no, 1=yes.
- */
-static ssize_t show_rx_polinv_enb(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-
-	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
-	if (ret >= 0)
-		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-	return ret;
-}
-
-static ssize_t store_rx_polinv_enb(struct device *dev,
-			  struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret, r;
-	u16 val;
-
-	ret = ipath_parse_ushort(buf, &val);
-	if (ret >= 0 && val > 1) {
-		ipath_dev_err(dd,
-			"attempt to set invalid Rx Polarity (enable)\n");
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
-	if (r < 0)
-		ret = r;
-
-bail:
-	return ret;
-}
-
-/*
- * Get/Set RX lane-reversal enable. 0=no, 1=yes.
- */
-static ssize_t show_lanerev_enb(struct device *dev,
-			 struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-
-	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
-	if (ret >= 0)
-		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-	return ret;
-}
-
-static ssize_t store_lanerev_enb(struct device *dev,
-			  struct device_attribute *attr,
-			  const char *buf,
-			  size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret, r;
-	u16 val;
-
-	ret = ipath_parse_ushort(buf, &val);
-	if (ret >= 0 && val > 1) {
-		ret = -EINVAL;
-		ipath_dev_err(dd,
-			"attempt to set invalid Lane reversal (enable)\n");
-		goto bail;
-	}
-
-	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
-	if (r < 0)
-		ret = r;
-
-bail:
-	return ret;
-}
-
-static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
-static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
-
-static struct attribute *driver_attributes[] = {
-	&driver_attr_num_units.attr,
-	&driver_attr_version.attr,
-	NULL
-};
-
-static struct attribute_group driver_attr_group = {
-	.attrs = driver_attributes
-};
-
-static ssize_t store_tempsense(struct device *dev,
-			       struct device_attribute *attr,
-			       const char *buf,
-			       size_t count)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret, stat;
-	u16 val;
-
-	ret = ipath_parse_ushort(buf, &val);
-	if (ret <= 0) {
-		ipath_dev_err(dd, "attempt to set invalid tempsense config\n");
-		goto bail;
-	}
-	/* If anything but the highest limit, enable T_CRIT_A "interrupt" */
-	stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0);
-	if (stat) {
-		ipath_dev_err(dd, "Unable to set tempsense config\n");
-		ret = -1;
-		goto bail;
-	}
-	stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF));
-	if (stat) {
-		ipath_dev_err(dd, "Unable to set local Tcrit\n");
-		ret = -1;
-		goto bail;
-	}
-	stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8));
-	if (stat) {
-		ipath_dev_err(dd, "Unable to set remote Tcrit\n");
-		ret = -1;
-		goto bail;
-	}
-
-bail:
-	return ret;
-}
-
-/*
- * dump tempsense regs. in decimal, to ease shell-scripts.
- */
-static ssize_t show_tempsense(struct device *dev,
-			      struct device_attribute *attr,
-			      char *buf)
-{
-	struct ipath_devdata *dd = dev_get_drvdata(dev);
-	int ret;
-	int idx;
-	u8 regvals[8];
-
-	ret = -ENXIO;
-	for (idx = 0; idx < 8; ++idx) {
-		if (idx == 6)
-			continue;
-		ret = ipath_tempsense_read(dd, idx);
-		if (ret < 0)
-			break;
-		regvals[idx] = ret;
-	}
-	if (idx == 8)
-		ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
-			*(signed char *)(regvals),
-			*(signed char *)(regvals + 1),
-			regvals[2], regvals[3],
-			*(signed char *)(regvals + 5),
-			*(signed char *)(regvals + 7));
-	return ret;
-}
-
-const struct attribute_group *ipath_driver_attr_groups[] = {
-	&driver_attr_group,
-	NULL,
-};
-
-static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
-static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
-static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
-static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
-static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
-static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
-static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
-static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
-static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
-static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
-static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
-static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
-static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
-static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
-static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
-static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
-		   show_jint_max_packets, store_jint_max_packets);
-static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
-		   show_jint_idle_ticks, store_jint_idle_ticks);
-static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
-		   show_tempsense, store_tempsense);
-
-static struct attribute *dev_attributes[] = {
-	&dev_attr_guid.attr,
-	&dev_attr_lmc.attr,
-	&dev_attr_lid.attr,
-	&dev_attr_link_state.attr,
-	&dev_attr_mlid.attr,
-	&dev_attr_mtu.attr,
-	&dev_attr_nguid.attr,
-	&dev_attr_nports.attr,
-	&dev_attr_serial.attr,
-	&dev_attr_status.attr,
-	&dev_attr_status_str.attr,
-	&dev_attr_boardversion.attr,
-	&dev_attr_unit.attr,
-	&dev_attr_enabled.attr,
-	&dev_attr_rx_pol_inv.attr,
-	&dev_attr_led_override.attr,
-	&dev_attr_logged_errors.attr,
-	&dev_attr_tempsense.attr,
-	&dev_attr_localbus_info.attr,
-	NULL
-};
-
-static struct attribute_group dev_attr_group = {
-	.attrs = dev_attributes
-};
-
-static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
-		   store_hrtbt_enb);
-static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
-		   store_lwid_enb);
-static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
-static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
-		   store_spd_enb);
-static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
-static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
-		   store_rx_polinv_enb);
-static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
-		   store_lanerev_enb);
-
-static struct attribute *dev_ibcfg_attributes[] = {
-	&dev_attr_hrtbt_enable.attr,
-	&dev_attr_link_width_enable.attr,
-	&dev_attr_link_width.attr,
-	&dev_attr_link_speed_enable.attr,
-	&dev_attr_link_speed.attr,
-	&dev_attr_rx_pol_inv_enable.attr,
-	&dev_attr_rx_lane_rev_enable.attr,
-	NULL
-};
-
-static struct attribute_group dev_ibcfg_attr_group = {
-	.attrs = dev_ibcfg_attributes
-};
-
-/**
- * ipath_expose_reset - create a device reset file
- * @dev: the device structure
- *
- * Only expose a file that lets us reset the device after someone
- * enters diag mode.  A device reset is quite likely to crash the
- * machine entirely, so we don't want to normally make it
- * available.
- *
- * Called with ipath_mutex held.
- */
-int ipath_expose_reset(struct device *dev)
-{
-	static int exposed;
-	int ret;
-
-	if (!exposed) {
-		ret = device_create_file(dev, &dev_attr_reset);
-		exposed = 1;
-	}
-	else
-		ret = 0;
-
-	return ret;
-}
-
-int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
-{
-	int ret;
-
-	ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
-	if (ret)
-		goto bail;
-
-	ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
-	if (ret)
-		goto bail_attrs;
-
-	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
-		ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
-		if (ret)
-			goto bail_counter;
-		ret = device_create_file(dev, &dev_attr_jint_max_packets);
-		if (ret)
-			goto bail_idle;
-
-		ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
-		if (ret)
-			goto bail_max;
-	}
-
-	return 0;
-
-bail_max:
-	device_remove_file(dev, &dev_attr_jint_max_packets);
-bail_idle:
-	device_remove_file(dev, &dev_attr_jint_idle_ticks);
-bail_counter:
-	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
-bail_attrs:
-	sysfs_remove_group(&dev->kobj, &dev_attr_group);
-bail:
-	return ret;
-}
-
-void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
-{
-	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
-
-	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
-		sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
-		device_remove_file(dev, &dev_attr_jint_idle_ticks);
-		device_remove_file(dev, &dev_attr_jint_max_packets);
-	}
-
-	sysfs_remove_group(&dev->kobj, &dev_attr_group);
-
-	device_remove_file(dev, &dev_attr_reset);
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
deleted file mode 100644
index 22e6099..0000000
--- a/drivers/infiniband/hw/ipath/ipath_uc.c
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_UC_##x
-
-/**
- * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
- * @qp: a pointer to the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_uc_req(struct ipath_qp *qp)
-{
-	struct ipath_other_headers *ohdr;
-	struct ipath_swqe *wqe;
-	unsigned long flags;
-	u32 hwords;
-	u32 bth0;
-	u32 len;
-	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-	int ret = 0;
-
-	spin_lock_irqsave(&qp->s_lock, flags);
-
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
-		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-			goto bail;
-		/* We are in the error state, flush the work request. */
-		if (qp->s_last == qp->s_head)
-			goto bail;
-		/* If DMAs are in progress, we can't flush immediately. */
-		if (atomic_read(&qp->s_dma_busy)) {
-			qp->s_flags |= IPATH_S_WAIT_DMA;
-			goto bail;
-		}
-		wqe = get_swqe_ptr(qp, qp->s_last);
-		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-		goto done;
-	}
-
-	ohdr = &qp->s_hdr.u.oth;
-	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-		ohdr = &qp->s_hdr.u.l.oth;
-
-	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
-	hwords = 5;
-	bth0 = 1 << 22; /* Set M bit */
-
-	/* Get the next send request. */
-	wqe = get_swqe_ptr(qp, qp->s_cur);
-	qp->s_wqe = NULL;
-	switch (qp->s_state) {
-	default:
-		if (!(ib_ipath_state_ops[qp->state] &
-		    IPATH_PROCESS_NEXT_SEND_OK))
-			goto bail;
-		/* Check if send work queue is empty. */
-		if (qp->s_cur == qp->s_head)
-			goto bail;
-		/*
-		 * Start a new request.
-		 */
-		qp->s_psn = wqe->psn = qp->s_next_psn;
-		qp->s_sge.sge = wqe->sg_list[0];
-		qp->s_sge.sg_list = wqe->sg_list + 1;
-		qp->s_sge.num_sge = wqe->wr.num_sge;
-		qp->s_len = len = wqe->length;
-		switch (wqe->wr.opcode) {
-		case IB_WR_SEND:
-		case IB_WR_SEND_WITH_IMM:
-			if (len > pmtu) {
-				qp->s_state = OP(SEND_FIRST);
-				len = pmtu;
-				break;
-			}
-			if (wqe->wr.opcode == IB_WR_SEND)
-				qp->s_state = OP(SEND_ONLY);
-			else {
-				qp->s_state =
-					OP(SEND_ONLY_WITH_IMMEDIATE);
-				/* Immediate data comes after the BTH */
-				ohdr->u.imm_data = wqe->wr.ex.imm_data;
-				hwords += 1;
-			}
-			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-				bth0 |= 1 << 23;
-			qp->s_wqe = wqe;
-			if (++qp->s_cur >= qp->s_size)
-				qp->s_cur = 0;
-			break;
-
-		case IB_WR_RDMA_WRITE:
-		case IB_WR_RDMA_WRITE_WITH_IMM:
-			ohdr->u.rc.reth.vaddr =
-				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
-			ohdr->u.rc.reth.rkey =
-				cpu_to_be32(wqe->wr.wr.rdma.rkey);
-			ohdr->u.rc.reth.length = cpu_to_be32(len);
-			hwords += sizeof(struct ib_reth) / 4;
-			if (len > pmtu) {
-				qp->s_state = OP(RDMA_WRITE_FIRST);
-				len = pmtu;
-				break;
-			}
-			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-				qp->s_state = OP(RDMA_WRITE_ONLY);
-			else {
-				qp->s_state =
-					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-				/* Immediate data comes after the RETH */
-				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-				hwords += 1;
-				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-					bth0 |= 1 << 23;
-			}
-			qp->s_wqe = wqe;
-			if (++qp->s_cur >= qp->s_size)
-				qp->s_cur = 0;
-			break;
-
-		default:
-			goto bail;
-		}
-		break;
-
-	case OP(SEND_FIRST):
-		qp->s_state = OP(SEND_MIDDLE);
-		/* FALLTHROUGH */
-	case OP(SEND_MIDDLE):
-		len = qp->s_len;
-		if (len > pmtu) {
-			len = pmtu;
-			break;
-		}
-		if (wqe->wr.opcode == IB_WR_SEND)
-			qp->s_state = OP(SEND_LAST);
-		else {
-			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-			/* Immediate data comes after the BTH */
-			ohdr->u.imm_data = wqe->wr.ex.imm_data;
-			hwords += 1;
-		}
-		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-			bth0 |= 1 << 23;
-		qp->s_wqe = wqe;
-		if (++qp->s_cur >= qp->s_size)
-			qp->s_cur = 0;
-		break;
-
-	case OP(RDMA_WRITE_FIRST):
-		qp->s_state = OP(RDMA_WRITE_MIDDLE);
-		/* FALLTHROUGH */
-	case OP(RDMA_WRITE_MIDDLE):
-		len = qp->s_len;
-		if (len > pmtu) {
-			len = pmtu;
-			break;
-		}
-		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-			qp->s_state = OP(RDMA_WRITE_LAST);
-		else {
-			qp->s_state =
-				OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-			/* Immediate data comes after the BTH */
-			ohdr->u.imm_data = wqe->wr.ex.imm_data;
-			hwords += 1;
-			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-				bth0 |= 1 << 23;
-		}
-		qp->s_wqe = wqe;
-		if (++qp->s_cur >= qp->s_size)
-			qp->s_cur = 0;
-		break;
-	}
-	qp->s_len -= len;
-	qp->s_hdrwords = hwords;
-	qp->s_cur_sge = &qp->s_sge;
-	qp->s_cur_size = len;
-	ipath_make_ruc_header(to_idev(qp->ibqp.device),
-			      qp, ohdr, bth0 | (qp->s_state << 24),
-			      qp->s_next_psn++ & IPATH_PSN_MASK);
-done:
-	ret = 1;
-	goto unlock;
-
-bail:
-	qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-	return ret;
-}
-
-/**
- * ipath_uc_rcv - handle an incoming UC packet
- * @dev: the device the packet came in on
- * @hdr: the header of the packet
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the length of the packet
- * @qp: the QP for this packet.
- *
- * This is called from ipath_qp_rcv() to process an incoming UC packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-	struct ipath_other_headers *ohdr;
-	int opcode;
-	u32 hdrsize;
-	u32 psn;
-	u32 pad;
-	struct ib_wc wc;
-	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-	struct ib_reth *reth;
-	int header_in_data;
-
-	/* Validate the SLID. See Ch. 9.6.1.5 */
-	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
-		goto done;
-
-	/* Check for GRH */
-	if (!has_grh) {
-		ohdr = &hdr->u.oth;
-		hdrsize = 8 + 12;	/* LRH + BTH */
-		psn = be32_to_cpu(ohdr->bth[2]);
-		header_in_data = 0;
-	} else {
-		ohdr = &hdr->u.l.oth;
-		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
-		/*
-		 * The header with GRH is 60 bytes and the
-		 * core driver sets the eager header buffer
-		 * size to 56 bytes so the last 4 bytes of
-		 * the BTH header (PSN) is in the data buffer.
-		 */
-		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-		if (header_in_data) {
-			psn = be32_to_cpu(((__be32 *) data)[0]);
-			data += sizeof(__be32);
-		} else
-			psn = be32_to_cpu(ohdr->bth[2]);
-	}
-	/*
-	 * The opcode is in the low byte when its in network order
-	 * (top byte when in host order).
-	 */
-	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-
-	memset(&wc, 0, sizeof wc);
-
-	/* Compare the PSN verses the expected PSN. */
-	if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
-		/*
-		 * Handle a sequence error.
-		 * Silently drop any current message.
-		 */
-		qp->r_psn = psn;
-	inv:
-		qp->r_state = OP(SEND_LAST);
-		switch (opcode) {
-		case OP(SEND_FIRST):
-		case OP(SEND_ONLY):
-		case OP(SEND_ONLY_WITH_IMMEDIATE):
-			goto send_first;
-
-		case OP(RDMA_WRITE_FIRST):
-		case OP(RDMA_WRITE_ONLY):
-		case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-			goto rdma_first;
-
-		default:
-			dev->n_pkt_drops++;
-			goto done;
-		}
-	}
-
-	/* Check for opcode sequence errors. */
-	switch (qp->r_state) {
-	case OP(SEND_FIRST):
-	case OP(SEND_MIDDLE):
-		if (opcode == OP(SEND_MIDDLE) ||
-		    opcode == OP(SEND_LAST) ||
-		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-			break;
-		goto inv;
-
-	case OP(RDMA_WRITE_FIRST):
-	case OP(RDMA_WRITE_MIDDLE):
-		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-		    opcode == OP(RDMA_WRITE_LAST) ||
-		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-			break;
-		goto inv;
-
-	default:
-		if (opcode == OP(SEND_FIRST) ||
-		    opcode == OP(SEND_ONLY) ||
-		    opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
-		    opcode == OP(RDMA_WRITE_FIRST) ||
-		    opcode == OP(RDMA_WRITE_ONLY) ||
-		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-			break;
-		goto inv;
-	}
-
-	/* OK, process the packet. */
-	switch (opcode) {
-	case OP(SEND_FIRST):
-	case OP(SEND_ONLY):
-	case OP(SEND_ONLY_WITH_IMMEDIATE):
-	send_first:
-		if (qp->r_flags & IPATH_R_REUSE_SGE) {
-			qp->r_flags &= ~IPATH_R_REUSE_SGE;
-			qp->r_sge = qp->s_rdma_read_sge;
-		} else if (!ipath_get_rwqe(qp, 0)) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		/* Save the WQE so we can reuse it in case of an error. */
-		qp->s_rdma_read_sge = qp->r_sge;
-		qp->r_rcv_len = 0;
-		if (opcode == OP(SEND_ONLY))
-			goto send_last;
-		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
-			goto send_last_imm;
-		/* FALLTHROUGH */
-	case OP(SEND_MIDDLE):
-		/* Check for invalid length PMTU or posted rwqe len. */
-		if (unlikely(tlen != (hdrsize + pmtu + 4))) {
-			qp->r_flags |= IPATH_R_REUSE_SGE;
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		qp->r_rcv_len += pmtu;
-		if (unlikely(qp->r_rcv_len > qp->r_len)) {
-			qp->r_flags |= IPATH_R_REUSE_SGE;
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		ipath_copy_sge(&qp->r_sge, data, pmtu);
-		break;
-
-	case OP(SEND_LAST_WITH_IMMEDIATE):
-	send_last_imm:
-		if (header_in_data) {
-			wc.ex.imm_data = *(__be32 *) data;
-			data += sizeof(__be32);
-		} else {
-			/* Immediate data comes after BTH */
-			wc.ex.imm_data = ohdr->u.imm_data;
-		}
-		hdrsize += 4;
-		wc.wc_flags = IB_WC_WITH_IMM;
-		/* FALLTHROUGH */
-	case OP(SEND_LAST):
-	send_last:
-		/* Get the number of bytes the message was padded by. */
-		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-		/* Check for invalid length. */
-		/* XXX LAST len should be >= 1 */
-		if (unlikely(tlen < (hdrsize + pad + 4))) {
-			qp->r_flags |= IPATH_R_REUSE_SGE;
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		/* Don't count the CRC. */
-		tlen -= (hdrsize + pad + 4);
-		wc.byte_len = tlen + qp->r_rcv_len;
-		if (unlikely(wc.byte_len > qp->r_len)) {
-			qp->r_flags |= IPATH_R_REUSE_SGE;
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		wc.opcode = IB_WC_RECV;
-	last_imm:
-		ipath_copy_sge(&qp->r_sge, data, tlen);
-		wc.wr_id = qp->r_wr_id;
-		wc.status = IB_WC_SUCCESS;
-		wc.qp = &qp->ibqp;
-		wc.src_qp = qp->remote_qpn;
-		wc.slid = qp->remote_ah_attr.dlid;
-		wc.sl = qp->remote_ah_attr.sl;
-		/* Signal completion event if the solicited bit is set. */
-		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-			       (ohdr->bth[0] &
-				cpu_to_be32(1 << 23)) != 0);
-		break;
-
-	case OP(RDMA_WRITE_FIRST):
-	case OP(RDMA_WRITE_ONLY):
-	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
-	rdma_first:
-		/* RETH comes after BTH */
-		if (!header_in_data)
-			reth = &ohdr->u.rc.reth;
-		else {
-			reth = (struct ib_reth *)data;
-			data += sizeof(*reth);
-		}
-		hdrsize += sizeof(*reth);
-		qp->r_len = be32_to_cpu(reth->length);
-		qp->r_rcv_len = 0;
-		if (qp->r_len != 0) {
-			u32 rkey = be32_to_cpu(reth->rkey);
-			u64 vaddr = be64_to_cpu(reth->vaddr);
-			int ok;
-
-			/* Check rkey */
-			ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
-					   vaddr, rkey,
-					   IB_ACCESS_REMOTE_WRITE);
-			if (unlikely(!ok)) {
-				dev->n_pkt_drops++;
-				goto done;
-			}
-		} else {
-			qp->r_sge.sg_list = NULL;
-			qp->r_sge.sge.mr = NULL;
-			qp->r_sge.sge.vaddr = NULL;
-			qp->r_sge.sge.length = 0;
-			qp->r_sge.sge.sge_length = 0;
-		}
-		if (unlikely(!(qp->qp_access_flags &
-			       IB_ACCESS_REMOTE_WRITE))) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		if (opcode == OP(RDMA_WRITE_ONLY))
-			goto rdma_last;
-		else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-			goto rdma_last_imm;
-		/* FALLTHROUGH */
-	case OP(RDMA_WRITE_MIDDLE):
-		/* Check for invalid length PMTU or posted rwqe len. */
-		if (unlikely(tlen != (hdrsize + pmtu + 4))) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		qp->r_rcv_len += pmtu;
-		if (unlikely(qp->r_rcv_len > qp->r_len)) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		ipath_copy_sge(&qp->r_sge, data, pmtu);
-		break;
-
-	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-	rdma_last_imm:
-		if (header_in_data) {
-			wc.ex.imm_data = *(__be32 *) data;
-			data += sizeof(__be32);
-		} else {
-			/* Immediate data comes after BTH */
-			wc.ex.imm_data = ohdr->u.imm_data;
-		}
-		hdrsize += 4;
-		wc.wc_flags = IB_WC_WITH_IMM;
-
-		/* Get the number of bytes the message was padded by. */
-		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-		/* Check for invalid length. */
-		/* XXX LAST len should be >= 1 */
-		if (unlikely(tlen < (hdrsize + pad + 4))) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		/* Don't count the CRC. */
-		tlen -= (hdrsize + pad + 4);
-		if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		if (qp->r_flags & IPATH_R_REUSE_SGE)
-			qp->r_flags &= ~IPATH_R_REUSE_SGE;
-		else if (!ipath_get_rwqe(qp, 1)) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		wc.byte_len = qp->r_len;
-		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-		goto last_imm;
-
-	case OP(RDMA_WRITE_LAST):
-	rdma_last:
-		/* Get the number of bytes the message was padded by. */
-		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-		/* Check for invalid length. */
-		/* XXX LAST len should be >= 1 */
-		if (unlikely(tlen < (hdrsize + pad + 4))) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		/* Don't count the CRC. */
-		tlen -= (hdrsize + pad + 4);
-		if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
-			dev->n_pkt_drops++;
-			goto done;
-		}
-		ipath_copy_sge(&qp->r_sge, data, tlen);
-		break;
-
-	default:
-		/* Drop packet for unknown opcodes. */
-		dev->n_pkt_drops++;
-		goto done;
-	}
-	qp->r_psn++;
-	qp->r_state = opcode;
-done:
-	return;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
deleted file mode 100644
index e8a2a91..0000000
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ /dev/null
@@ -1,580 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/sched.h>
-#include <rdma/ib_smi.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/**
- * ipath_ud_loopback - handle send on loopback QPs
- * @sqp: the sending QP
- * @swqe: the send work request
- *
- * This is called from ipath_make_ud_req() to forward a WQE addressed
- * to the same HCA.
- * Note that the receive interrupt handler may be calling ipath_ud_rcv()
- * while this is being called.
- */
-static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
-{
-	struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
-	struct ipath_qp *qp;
-	struct ib_ah_attr *ah_attr;
-	unsigned long flags;
-	struct ipath_rq *rq;
-	struct ipath_srq *srq;
-	struct ipath_sge_state rsge;
-	struct ipath_sge *sge;
-	struct ipath_rwq *wq;
-	struct ipath_rwqe *wqe;
-	void (*handler)(struct ib_event *, void *);
-	struct ib_wc wc;
-	u32 tail;
-	u32 rlen;
-	u32 length;
-
-	qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
-	if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-		dev->n_pkt_drops++;
-		goto done;
-	}
-
-	/*
-	 * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
-	 * Qkeys with the high order bit set mean use the
-	 * qkey from the QP context instead of the WR (see 10.2.5).
-	 */
-	if (unlikely(qp->ibqp.qp_num &&
-		     ((int) swqe->wr.wr.ud.remote_qkey < 0 ?
-		      sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) {
-		/* XXX OK to lose a count once in a while. */
-		dev->qkey_violations++;
-		dev->n_pkt_drops++;
-		goto drop;
-	}
-
-	/*
-	 * A GRH is expected to precede the data even if not
-	 * present on the wire.
-	 */
-	length = swqe->length;
-	memset(&wc, 0, sizeof wc);
-	wc.byte_len = length + sizeof(struct ib_grh);
-
-	if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = swqe->wr.ex.imm_data;
-	}
-
-	/*
-	 * This would be a lot simpler if we could call ipath_get_rwqe()
-	 * but that uses state that the receive interrupt handler uses
-	 * so we would need to lock out receive interrupts while doing
-	 * local loopback.
-	 */
-	if (qp->ibqp.srq) {
-		srq = to_isrq(qp->ibqp.srq);
-		handler = srq->ibsrq.event_handler;
-		rq = &srq->rq;
-	} else {
-		srq = NULL;
-		handler = NULL;
-		rq = &qp->r_rq;
-	}
-
-	/*
-	 * Get the next work request entry to find where to put the data.
-	 * Note that it is safe to drop the lock after changing rq->tail
-	 * since ipath_post_receive() won't fill the empty slot.
-	 */
-	spin_lock_irqsave(&rq->lock, flags);
-	wq = rq->wq;
-	tail = wq->tail;
-	/* Validate tail before using it since it is user writable. */
-	if (tail >= rq->size)
-		tail = 0;
-	if (unlikely(tail == wq->head)) {
-		spin_unlock_irqrestore(&rq->lock, flags);
-		dev->n_pkt_drops++;
-		goto drop;
-	}
-	wqe = get_rwqe_ptr(rq, tail);
-	rsge.sg_list = qp->r_ud_sg_list;
-	if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
-		spin_unlock_irqrestore(&rq->lock, flags);
-		dev->n_pkt_drops++;
-		goto drop;
-	}
-	/* Silently drop packets which are too big. */
-	if (wc.byte_len > rlen) {
-		spin_unlock_irqrestore(&rq->lock, flags);
-		dev->n_pkt_drops++;
-		goto drop;
-	}
-	if (++tail >= rq->size)
-		tail = 0;
-	wq->tail = tail;
-	wc.wr_id = wqe->wr_id;
-	if (handler) {
-		u32 n;
-
-		/*
-		 * validate head pointer value and compute
-		 * the number of remaining WQEs.
-		 */
-		n = wq->head;
-		if (n >= rq->size)
-			n = 0;
-		if (n < tail)
-			n += rq->size - tail;
-		else
-			n -= tail;
-		if (n < srq->limit) {
-			struct ib_event ev;
-
-			srq->limit = 0;
-			spin_unlock_irqrestore(&rq->lock, flags);
-			ev.device = qp->ibqp.device;
-			ev.element.srq = qp->ibqp.srq;
-			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-			handler(&ev, srq->ibsrq.srq_context);
-		} else
-			spin_unlock_irqrestore(&rq->lock, flags);
-	} else
-		spin_unlock_irqrestore(&rq->lock, flags);
-
-	ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
-	if (ah_attr->ah_flags & IB_AH_GRH) {
-		ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
-		wc.wc_flags |= IB_WC_GRH;
-	} else
-		ipath_skip_sge(&rsge, sizeof(struct ib_grh));
-	sge = swqe->sg_list;
-	while (length) {
-		u32 len = sge->length;
-
-		if (len > length)
-			len = length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
-		ipath_copy_sge(&rsge, sge->vaddr, len);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (--swqe->wr.num_sge)
-				sge++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
-			if (++sge->n >= IPATH_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-		length -= len;
-	}
-	wc.status = IB_WC_SUCCESS;
-	wc.opcode = IB_WC_RECV;
-	wc.qp = &qp->ibqp;
-	wc.src_qp = sqp->ibqp.qp_num;
-	/* XXX do we know which pkey matched? Only needed for GSI. */
-	wc.pkey_index = 0;
-	wc.slid = dev->dd->ipath_lid |
-		(ah_attr->src_path_bits &
-		 ((1 << dev->dd->ipath_lmc) - 1));
-	wc.sl = ah_attr->sl;
-	wc.dlid_path_bits =
-		ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
-	wc.port_num = 1;
-	/* Signal completion event if the solicited bit is set. */
-	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-		       swqe->wr.send_flags & IB_SEND_SOLICITED);
-drop:
-	if (atomic_dec_and_test(&qp->refcount))
-		wake_up(&qp->wait);
-done:;
-}
-
-/**
- * ipath_make_ud_req - construct a UD request packet
- * @qp: the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_ud_req(struct ipath_qp *qp)
-{
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	struct ipath_other_headers *ohdr;
-	struct ib_ah_attr *ah_attr;
-	struct ipath_swqe *wqe;
-	unsigned long flags;
-	u32 nwords;
-	u32 extra_bytes;
-	u32 bth0;
-	u16 lrh0;
-	u16 lid;
-	int ret = 0;
-	int next_cur;
-
-	spin_lock_irqsave(&qp->s_lock, flags);
-
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
-		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-			goto bail;
-		/* We are in the error state, flush the work request. */
-		if (qp->s_last == qp->s_head)
-			goto bail;
-		/* If DMAs are in progress, we can't flush immediately. */
-		if (atomic_read(&qp->s_dma_busy)) {
-			qp->s_flags |= IPATH_S_WAIT_DMA;
-			goto bail;
-		}
-		wqe = get_swqe_ptr(qp, qp->s_last);
-		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-		goto done;
-	}
-
-	if (qp->s_cur == qp->s_head)
-		goto bail;
-
-	wqe = get_swqe_ptr(qp, qp->s_cur);
-	next_cur = qp->s_cur + 1;
-	if (next_cur >= qp->s_size)
-		next_cur = 0;
-
-	/* Construct the header. */
-	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
-	if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
-		if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
-			dev->n_multicast_xmit++;
-		else
-			dev->n_unicast_xmit++;
-	} else {
-		dev->n_unicast_xmit++;
-		lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
-		if (unlikely(lid == dev->dd->ipath_lid)) {
-			/*
-			 * If DMAs are in progress, we can't generate
-			 * a completion for the loopback packet since
-			 * it would be out of order.
-			 * XXX Instead of waiting, we could queue a
-			 * zero length descriptor so we get a callback.
-			 */
-			if (atomic_read(&qp->s_dma_busy)) {
-				qp->s_flags |= IPATH_S_WAIT_DMA;
-				goto bail;
-			}
-			qp->s_cur = next_cur;
-			spin_unlock_irqrestore(&qp->s_lock, flags);
-			ipath_ud_loopback(qp, wqe);
-			spin_lock_irqsave(&qp->s_lock, flags);
-			ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
-			goto done;
-		}
-	}
-
-	qp->s_cur = next_cur;
-	extra_bytes = -wqe->length & 3;
-	nwords = (wqe->length + extra_bytes) >> 2;
-
-	/* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
-	qp->s_hdrwords = 7;
-	qp->s_cur_size = wqe->length;
-	qp->s_cur_sge = &qp->s_sge;
-	qp->s_dmult = ah_attr->static_rate;
-	qp->s_wqe = wqe;
-	qp->s_sge.sge = wqe->sg_list[0];
-	qp->s_sge.sg_list = wqe->sg_list + 1;
-	qp->s_sge.num_sge = wqe->wr.num_sge;
-
-	if (ah_attr->ah_flags & IB_AH_GRH) {
-		/* Header size in 32-bit words. */
-		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
-						 &ah_attr->grh,
-						 qp->s_hdrwords, nwords);
-		lrh0 = IPATH_LRH_GRH;
-		ohdr = &qp->s_hdr.u.l.oth;
-		/*
-		 * Don't worry about sending to locally attached multicast
-		 * QPs.  It is unspecified by the spec. what happens.
-		 */
-	} else {
-		/* Header size in 32-bit words. */
-		lrh0 = IPATH_LRH_BTH;
-		ohdr = &qp->s_hdr.u.oth;
-	}
-	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-		qp->s_hdrwords++;
-		ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
-		bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
-	} else
-		bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
-	lrh0 |= ah_attr->sl << 4;
-	if (qp->ibqp.qp_type == IB_QPT_SMI)
-		lrh0 |= 0xF000;	/* Set VL (see ch. 13.5.3.1) */
-	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
-	qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);	/* DEST LID */
-	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
-					   SIZE_OF_CRC);
-	lid = dev->dd->ipath_lid;
-	if (lid) {
-		lid |= ah_attr->src_path_bits &
-			((1 << dev->dd->ipath_lmc) - 1);
-		qp->s_hdr.lrh[3] = cpu_to_be16(lid);
-	} else
-		qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
-	if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-		bth0 |= 1 << 23;
-	bth0 |= extra_bytes << 20;
-	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
-		ipath_get_pkey(dev->dd, qp->s_pkey_index);
-	ohdr->bth[0] = cpu_to_be32(bth0);
-	/*
-	 * Use the multicast QP if the destination LID is a multicast LID.
-	 */
-	ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
-		ah_attr->dlid != IPATH_PERMISSIVE_LID ?
-		cpu_to_be32(IPATH_MULTICAST_QPN) :
-		cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
-	ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
-	/*
-	 * Qkeys with the high order bit set mean use the
-	 * qkey from the QP context instead of the WR (see 10.2.5).
-	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
-					 qp->qkey : wqe->wr.wr.ud.remote_qkey);
-	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
-
-done:
-	ret = 1;
-	goto unlock;
-
-bail:
-	qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-	return ret;
-}
-
-/**
- * ipath_ud_rcv - receive an incoming UD packet
- * @dev: the device the packet came in on
- * @hdr: the packet header
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP the packet came on
- *
- * This is called from ipath_qp_rcv() to process an incoming UD packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-	struct ipath_other_headers *ohdr;
-	int opcode;
-	u32 hdrsize;
-	u32 pad;
-	struct ib_wc wc;
-	u32 qkey;
-	u32 src_qp;
-	u16 dlid;
-	int header_in_data;
-
-	/* Check for GRH */
-	if (!has_grh) {
-		ohdr = &hdr->u.oth;
-		hdrsize = 8 + 12 + 8;	/* LRH + BTH + DETH */
-		qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-		src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
-		header_in_data = 0;
-	} else {
-		ohdr = &hdr->u.l.oth;
-		hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
-		/*
-		 * The header with GRH is 68 bytes and the core driver sets
-		 * the eager header buffer size to 56 bytes so the last 12
-		 * bytes of the IB header is in the data buffer.
-		 */
-		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-		if (header_in_data) {
-			qkey = be32_to_cpu(((__be32 *) data)[1]);
-			src_qp = be32_to_cpu(((__be32 *) data)[2]);
-			data += 12;
-		} else {
-			qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-			src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
-		}
-	}
-	src_qp &= IPATH_QPN_MASK;
-
-	/*
-	 * Check that the permissive LID is only used on QP0
-	 * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
-	 */
-	if (qp->ibqp.qp_num) {
-		if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
-			     hdr->lrh[3] == IB_LID_PERMISSIVE)) {
-			dev->n_pkt_drops++;
-			goto bail;
-		}
-		if (unlikely(qkey != qp->qkey)) {
-			/* XXX OK to lose a count once in a while. */
-			dev->qkey_violations++;
-			dev->n_pkt_drops++;
-			goto bail;
-		}
-	} else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
-		   hdr->lrh[3] == IB_LID_PERMISSIVE) {
-		struct ib_smp *smp = (struct ib_smp *) data;
-
-		if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-			dev->n_pkt_drops++;
-			goto bail;
-		}
-	}
-
-	/*
-	 * The opcode is in the low byte when its in network order
-	 * (top byte when in host order).
-	 */
-	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-	if (qp->ibqp.qp_num > 1 &&
-	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
-		if (header_in_data) {
-			wc.ex.imm_data = *(__be32 *) data;
-			data += sizeof(__be32);
-		} else
-			wc.ex.imm_data = ohdr->u.ud.imm_data;
-		wc.wc_flags = IB_WC_WITH_IMM;
-		hdrsize += sizeof(u32);
-	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
-		wc.ex.imm_data = 0;
-		wc.wc_flags = 0;
-	} else {
-		dev->n_pkt_drops++;
-		goto bail;
-	}
-
-	/* Get the number of bytes the message was padded by. */
-	pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-	if (unlikely(tlen < (hdrsize + pad + 4))) {
-		/* Drop incomplete packets. */
-		dev->n_pkt_drops++;
-		goto bail;
-	}
-	tlen -= hdrsize + pad + 4;
-
-	/* Drop invalid MAD packets (see 13.5.3.1). */
-	if (unlikely((qp->ibqp.qp_num == 0 &&
-		      (tlen != 256 ||
-		       (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
-		     (qp->ibqp.qp_num == 1 &&
-		      (tlen != 256 ||
-		       (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
-		dev->n_pkt_drops++;
-		goto bail;
-	}
-
-	/*
-	 * A GRH is expected to precede the data even if not
-	 * present on the wire.
-	 */
-	wc.byte_len = tlen + sizeof(struct ib_grh);
-
-	/*
-	 * Get the next work request entry to find where to put the data.
-	 */
-	if (qp->r_flags & IPATH_R_REUSE_SGE)
-		qp->r_flags &= ~IPATH_R_REUSE_SGE;
-	else if (!ipath_get_rwqe(qp, 0)) {
-		/*
-		 * Count VL15 packets dropped due to no receive buffer.
-		 * Otherwise, count them as buffer overruns since usually,
-		 * the HW will be able to receive packets even if there are
-		 * no QPs with posted receive buffers.
-		 */
-		if (qp->ibqp.qp_num == 0)
-			dev->n_vl15_dropped++;
-		else
-			dev->rcv_errors++;
-		goto bail;
-	}
-	/* Silently drop packets which are too big. */
-	if (wc.byte_len > qp->r_len) {
-		qp->r_flags |= IPATH_R_REUSE_SGE;
-		dev->n_pkt_drops++;
-		goto bail;
-	}
-	if (has_grh) {
-		ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
-			       sizeof(struct ib_grh));
-		wc.wc_flags |= IB_WC_GRH;
-	} else
-		ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
-	ipath_copy_sge(&qp->r_sge, data,
-		       wc.byte_len - sizeof(struct ib_grh));
-	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-		goto bail;
-	wc.wr_id = qp->r_wr_id;
-	wc.status = IB_WC_SUCCESS;
-	wc.opcode = IB_WC_RECV;
-	wc.vendor_err = 0;
-	wc.qp = &qp->ibqp;
-	wc.src_qp = src_qp;
-	/* XXX do we know which pkey matched? Only needed for GSI. */
-	wc.pkey_index = 0;
-	wc.slid = be16_to_cpu(hdr->lrh[3]);
-	wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
-	dlid = be16_to_cpu(hdr->lrh[1]);
-	/*
-	 * Save the LMC lower bits if the destination LID is a unicast LID.
-	 */
-	wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
-		dlid & ((1 << dev->dd->ipath_lmc) - 1);
-	wc.port_num = 1;
-	/* Signal completion event if the solicited bit is set. */
-	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-		       (ohdr->bth[0] &
-			cpu_to_be32(1 << 23)) != 0);
-
-bail:;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
deleted file mode 100644
index 1da1252..0000000
--- a/drivers/infiniband/hw/ipath/ipath_user_pages.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/mm.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-#include "ipath_kernel.h"
-
-static void __ipath_release_user_pages(struct page **p, size_t num_pages,
-				   int dirty)
-{
-	size_t i;
-
-	for (i = 0; i < num_pages; i++) {
-		ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
-			   (unsigned long) num_pages, p[i]);
-		if (dirty)
-			set_page_dirty_lock(p[i]);
-		put_page(p[i]);
-	}
-}
-
-/* call with current->mm->mmap_sem held */
-static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
-				  struct page **p)
-{
-	unsigned long lock_limit;
-	size_t got;
-	int ret;
-
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-	if (num_pages > lock_limit) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
-		   (unsigned long) num_pages, start_page);
-
-	for (got = 0; got < num_pages; got += ret) {
-		ret = get_user_pages(current, current->mm,
-				     start_page + got * PAGE_SIZE,
-				     num_pages - got, 1, 1,
-				     p + got, NULL);
-		if (ret < 0)
-			goto bail_release;
-	}
-
-	current->mm->pinned_vm += num_pages;
-
-	ret = 0;
-	goto bail;
-
-bail_release:
-	__ipath_release_user_pages(p, got, 0);
-bail:
-	return ret;
-}
-
-/**
- * ipath_map_page - a safety wrapper around pci_map_page()
- *
- * A dma_addr of all 0's is interpreted by the chip as "disabled".
- * Unfortunately, it can also be a valid dma_addr returned on some
- * architectures.
- *
- * The powerpc iommu assigns dma_addrs in ascending order, so we don't
- * have to bother with retries or mapping a dummy page to insure we
- * don't just get the same mapping again.
- *
- * I'm sure we won't be so lucky with other iommu's, so FIXME.
- */
-dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
-	unsigned long offset, size_t size, int direction)
-{
-	dma_addr_t phys;
-
-	phys = pci_map_page(hwdev, page, offset, size, direction);
-
-	if (phys == 0) {
-		pci_unmap_page(hwdev, phys, size, direction);
-		phys = pci_map_page(hwdev, page, offset, size, direction);
-		/*
-		 * FIXME: If we get 0 again, we should keep this page,
-		 * map another, then free the 0 page.
-		 */
-	}
-
-	return phys;
-}
-
-/**
- * ipath_map_single - a safety wrapper around pci_map_single()
- *
- * Same idea as ipath_map_page().
- */
-dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
-	int direction)
-{
-	dma_addr_t phys;
-
-	phys = pci_map_single(hwdev, ptr, size, direction);
-
-	if (phys == 0) {
-		pci_unmap_single(hwdev, phys, size, direction);
-		phys = pci_map_single(hwdev, ptr, size, direction);
-		/*
-		 * FIXME: If we get 0 again, we should keep this page,
-		 * map another, then free the 0 page.
-		 */
-	}
-
-	return phys;
-}
-
-/**
- * ipath_get_user_pages - lock user pages into memory
- * @start_page: the start page
- * @num_pages: the number of pages
- * @p: the output page structures
- *
- * This function takes a given start page (page aligned user virtual
- * address) and pins it and the following specified number of pages.  For
- * now, num_pages is always 1, but that will probably change at some point
- * (because caller is doing expected sends on a single virtually contiguous
- * buffer, so we can do all pages at once).
- */
-int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
-			 struct page **p)
-{
-	int ret;
-
-	down_write(&current->mm->mmap_sem);
-
-	ret = __ipath_get_user_pages(start_page, num_pages, p);
-
-	up_write(&current->mm->mmap_sem);
-
-	return ret;
-}
-
-void ipath_release_user_pages(struct page **p, size_t num_pages)
-{
-	down_write(&current->mm->mmap_sem);
-
-	__ipath_release_user_pages(p, num_pages, 1);
-
-	current->mm->pinned_vm -= num_pages;
-
-	up_write(&current->mm->mmap_sem);
-}
-
-struct ipath_user_pages_work {
-	struct work_struct work;
-	struct mm_struct *mm;
-	unsigned long num_pages;
-};
-
-static void user_pages_account(struct work_struct *_work)
-{
-	struct ipath_user_pages_work *work =
-		container_of(_work, struct ipath_user_pages_work, work);
-
-	down_write(&work->mm->mmap_sem);
-	work->mm->pinned_vm -= work->num_pages;
-	up_write(&work->mm->mmap_sem);
-	mmput(work->mm);
-	kfree(work);
-}
-
-void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
-{
-	struct ipath_user_pages_work *work;
-	struct mm_struct *mm;
-
-	__ipath_release_user_pages(p, num_pages, 1);
-
-	mm = get_task_mm(current);
-	if (!mm)
-		return;
-
-	work = kmalloc(sizeof(*work), GFP_KERNEL);
-	if (!work)
-		goto bail_mm;
-
-	INIT_WORK(&work->work, user_pages_account);
-	work->mm = mm;
-	work->num_pages = num_pages;
-
-	queue_work(ib_wq, &work->work);
-	return;
-
-bail_mm:
-	mmput(mm);
-	return;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
deleted file mode 100644
index cc04b7b..0000000
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c
+++ /dev/null
@@ -1,875 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/dmapool.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/highmem.h>
-#include <linux/io.h>
-#include <linux/uio.h>
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-
-#include "ipath_kernel.h"
-#include "ipath_user_sdma.h"
-
-/* minimum size of header */
-#define IPATH_USER_SDMA_MIN_HEADER_LENGTH	64
-/* expected size of headers (for dma_pool) */
-#define IPATH_USER_SDMA_EXP_HEADER_LENGTH	64
-/* length mask in PBC (lower 11 bits) */
-#define IPATH_PBC_LENGTH_MASK			((1 << 11) - 1)
-
-struct ipath_user_sdma_pkt {
-	u8 naddr;		/* dimension of addr (1..3) ... */
-	u32 counter;		/* sdma pkts queued counter for this entry */
-	u64 added;		/* global descq number of entries */
-
-	struct {
-		u32 offset;			/* offset for kvaddr, addr */
-		u32 length;			/* length in page */
-		u8  put_page;			/* should we put_page? */
-		u8  dma_mapped;			/* is page dma_mapped? */
-		struct page *page;		/* may be NULL (coherent mem) */
-		void *kvaddr;			/* FIXME: only for pio hack */
-		dma_addr_t addr;
-	} addr[4];   /* max pages, any more and we coalesce */
-	struct list_head list;	/* list element */
-};
-
-struct ipath_user_sdma_queue {
-	/*
-	 * pkts sent to dma engine are queued on this
-	 * list head.  the type of the elements of this
-	 * list are struct ipath_user_sdma_pkt...
-	 */
-	struct list_head sent;
-
-	/* headers with expected length are allocated from here... */
-	char header_cache_name[64];
-	struct dma_pool *header_cache;
-
-	/* packets are allocated from the slab cache... */
-	char pkt_slab_name[64];
-	struct kmem_cache *pkt_slab;
-
-	/* as packets go on the queued queue, they are counted... */
-	u32 counter;
-	u32 sent_counter;
-
-	/* dma page table */
-	struct rb_root dma_pages_root;
-
-	/* protect everything above... */
-	struct mutex lock;
-};
-
-struct ipath_user_sdma_queue *
-ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport)
-{
-	struct ipath_user_sdma_queue *pq =
-		kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL);
-
-	if (!pq)
-		goto done;
-
-	pq->counter = 0;
-	pq->sent_counter = 0;
-	INIT_LIST_HEAD(&pq->sent);
-
-	mutex_init(&pq->lock);
-
-	snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
-		 "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport);
-	pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
-					 sizeof(struct ipath_user_sdma_pkt),
-					 0, 0, NULL);
-
-	if (!pq->pkt_slab)
-		goto err_kfree;
-
-	snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
-		 "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport);
-	pq->header_cache = dma_pool_create(pq->header_cache_name,
-					   dev,
-					   IPATH_USER_SDMA_EXP_HEADER_LENGTH,
-					   4, 0);
-	if (!pq->header_cache)
-		goto err_slab;
-
-	pq->dma_pages_root = RB_ROOT;
-
-	goto done;
-
-err_slab:
-	kmem_cache_destroy(pq->pkt_slab);
-err_kfree:
-	kfree(pq);
-	pq = NULL;
-
-done:
-	return pq;
-}
-
-static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt,
-				      int i, size_t offset, size_t len,
-				      int put_page, int dma_mapped,
-				      struct page *page,
-				      void *kvaddr, dma_addr_t dma_addr)
-{
-	pkt->addr[i].offset = offset;
-	pkt->addr[i].length = len;
-	pkt->addr[i].put_page = put_page;
-	pkt->addr[i].dma_mapped = dma_mapped;
-	pkt->addr[i].page = page;
-	pkt->addr[i].kvaddr = kvaddr;
-	pkt->addr[i].addr = dma_addr;
-}
-
-static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt,
-					u32 counter, size_t offset,
-					size_t len, int dma_mapped,
-					struct page *page,
-					void *kvaddr, dma_addr_t dma_addr)
-{
-	pkt->naddr = 1;
-	pkt->counter = counter;
-	ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
-				  kvaddr, dma_addr);
-}
-
-/* we've too many pages in the iovec, coalesce to a single page */
-static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
-				    struct ipath_user_sdma_pkt *pkt,
-				    const struct iovec *iov,
-				    unsigned long niov) {
-	int ret = 0;
-	struct page *page = alloc_page(GFP_KERNEL);
-	void *mpage_save;
-	char *mpage;
-	int i;
-	int len = 0;
-	dma_addr_t dma_addr;
-
-	if (!page) {
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	mpage = kmap(page);
-	mpage_save = mpage;
-	for (i = 0; i < niov; i++) {
-		int cfur;
-
-		cfur = copy_from_user(mpage,
-				      iov[i].iov_base, iov[i].iov_len);
-		if (cfur) {
-			ret = -EFAULT;
-			goto free_unmap;
-		}
-
-		mpage += iov[i].iov_len;
-		len += iov[i].iov_len;
-	}
-
-	dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
-				DMA_TO_DEVICE);
-	if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-		ret = -ENOMEM;
-		goto free_unmap;
-	}
-
-	ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
-				  dma_addr);
-	pkt->naddr = 2;
-
-	goto done;
-
-free_unmap:
-	kunmap(page);
-	__free_page(page);
-done:
-	return ret;
-}
-
-/* how many pages in this iovec element? */
-static int ipath_user_sdma_num_pages(const struct iovec *iov)
-{
-	const unsigned long addr  = (unsigned long) iov->iov_base;
-	const unsigned long  len  = iov->iov_len;
-	const unsigned long spage = addr & PAGE_MASK;
-	const unsigned long epage = (addr + len - 1) & PAGE_MASK;
-
-	return 1 + ((epage - spage) >> PAGE_SHIFT);
-}
-
-/* truncate length to page boundary */
-static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len)
-{
-	const unsigned long offset = addr & ~PAGE_MASK;
-
-	return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
-}
-
-static void ipath_user_sdma_free_pkt_frag(struct device *dev,
-					  struct ipath_user_sdma_queue *pq,
-					  struct ipath_user_sdma_pkt *pkt,
-					  int frag)
-{
-	const int i = frag;
-
-	if (pkt->addr[i].page) {
-		if (pkt->addr[i].dma_mapped)
-			dma_unmap_page(dev,
-				       pkt->addr[i].addr,
-				       pkt->addr[i].length,
-				       DMA_TO_DEVICE);
-
-		if (pkt->addr[i].kvaddr)
-			kunmap(pkt->addr[i].page);
-
-		if (pkt->addr[i].put_page)
-			put_page(pkt->addr[i].page);
-		else
-			__free_page(pkt->addr[i].page);
-	} else if (pkt->addr[i].kvaddr)
-		/* free coherent mem from cache... */
-		dma_pool_free(pq->header_cache,
-			      pkt->addr[i].kvaddr, pkt->addr[i].addr);
-}
-
-/* return number of pages pinned... */
-static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
-				     struct ipath_user_sdma_pkt *pkt,
-				     unsigned long addr, int tlen, int npages)
-{
-	struct page *pages[2];
-	int j;
-	int ret;
-
-	ret = get_user_pages_fast(addr, npages, 0, pages);
-	if (ret != npages) {
-		int i;
-
-		for (i = 0; i < ret; i++)
-			put_page(pages[i]);
-
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	for (j = 0; j < npages; j++) {
-		/* map the pages... */
-		const int flen =
-			ipath_user_sdma_page_length(addr, tlen);
-		dma_addr_t dma_addr =
-			dma_map_page(&dd->pcidev->dev,
-				     pages[j], 0, flen, DMA_TO_DEVICE);
-		unsigned long fofs = addr & ~PAGE_MASK;
-
-		if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-			ret = -ENOMEM;
-			goto done;
-		}
-
-		ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
-					  pages[j], kmap(pages[j]),
-					  dma_addr);
-
-		pkt->naddr++;
-		addr += flen;
-		tlen -= flen;
-	}
-
-done:
-	return ret;
-}
-
-static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd,
-				   struct ipath_user_sdma_queue *pq,
-				   struct ipath_user_sdma_pkt *pkt,
-				   const struct iovec *iov,
-				   unsigned long niov)
-{
-	int ret = 0;
-	unsigned long idx;
-
-	for (idx = 0; idx < niov; idx++) {
-		const int npages = ipath_user_sdma_num_pages(iov + idx);
-		const unsigned long addr = (unsigned long) iov[idx].iov_base;
-
-		ret = ipath_user_sdma_pin_pages(dd, pkt,
-						addr, iov[idx].iov_len,
-						npages);
-		if (ret < 0)
-			goto free_pkt;
-	}
-
-	goto done;
-
-free_pkt:
-	for (idx = 0; idx < pkt->naddr; idx++)
-		ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
-
-done:
-	return ret;
-}
-
-static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd,
-					struct ipath_user_sdma_queue *pq,
-					struct ipath_user_sdma_pkt *pkt,
-					const struct iovec *iov,
-					unsigned long niov, int npages)
-{
-	int ret = 0;
-
-	if (npages >= ARRAY_SIZE(pkt->addr))
-		ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov);
-	else
-		ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
-
-	return ret;
-}
-
-/* free a packet list -- return counter value of last packet */
-static void ipath_user_sdma_free_pkt_list(struct device *dev,
-					  struct ipath_user_sdma_queue *pq,
-					  struct list_head *list)
-{
-	struct ipath_user_sdma_pkt *pkt, *pkt_next;
-
-	list_for_each_entry_safe(pkt, pkt_next, list, list) {
-		int i;
-
-		for (i = 0; i < pkt->naddr; i++)
-			ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i);
-
-		kmem_cache_free(pq->pkt_slab, pkt);
-	}
-}
-
-/*
- * copy headers, coalesce etc -- pq->lock must be held
- *
- * we queue all the packets to list, returning the
- * number of bytes total.  list must be empty initially,
- * as, if there is an error we clean it...
- */
-static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
-				      struct ipath_user_sdma_queue *pq,
-				      struct list_head *list,
-				      const struct iovec *iov,
-				      unsigned long niov,
-				      int maxpkts)
-{
-	unsigned long idx = 0;
-	int ret = 0;
-	int npkts = 0;
-	struct page *page = NULL;
-	__le32 *pbc;
-	dma_addr_t dma_addr;
-	struct ipath_user_sdma_pkt *pkt = NULL;
-	size_t len;
-	size_t nw;
-	u32 counter = pq->counter;
-	int dma_mapped = 0;
-
-	while (idx < niov && npkts < maxpkts) {
-		const unsigned long addr = (unsigned long) iov[idx].iov_base;
-		const unsigned long idx_save = idx;
-		unsigned pktnw;
-		unsigned pktnwc;
-		int nfrags = 0;
-		int npages = 0;
-		int cfur;
-
-		dma_mapped = 0;
-		len = iov[idx].iov_len;
-		nw = len >> 2;
-		page = NULL;
-
-		pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
-		if (!pkt) {
-			ret = -ENOMEM;
-			goto free_list;
-		}
-
-		if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH ||
-		    len > PAGE_SIZE || len & 3 || addr & 3) {
-			ret = -EINVAL;
-			goto free_pkt;
-		}
-
-		if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH)
-			pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
-					     &dma_addr);
-		else
-			pbc = NULL;
-
-		if (!pbc) {
-			page = alloc_page(GFP_KERNEL);
-			if (!page) {
-				ret = -ENOMEM;
-				goto free_pkt;
-			}
-			pbc = kmap(page);
-		}
-
-		cfur = copy_from_user(pbc, iov[idx].iov_base, len);
-		if (cfur) {
-			ret = -EFAULT;
-			goto free_pbc;
-		}
-
-		/*
-		 * this assignment is a bit strange.  it's because the
-		 * the pbc counts the number of 32 bit words in the full
-		 * packet _except_ the first word of the pbc itself...
-		 */
-		pktnwc = nw - 1;
-
-		/*
-		 * pktnw computation yields the number of 32 bit words
-		 * that the caller has indicated in the PBC.  note that
-		 * this is one less than the total number of words that
-		 * goes to the send DMA engine as the first 32 bit word
-		 * of the PBC itself is not counted.  Armed with this count,
-		 * we can verify that the packet is consistent with the
-		 * iovec lengths.
-		 */
-		pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK;
-		if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
-			ret = -EINVAL;
-			goto free_pbc;
-		}
-
-
-		idx++;
-		while (pktnwc < pktnw && idx < niov) {
-			const size_t slen = iov[idx].iov_len;
-			const unsigned long faddr =
-				(unsigned long) iov[idx].iov_base;
-
-			if (slen & 3 || faddr & 3 || !slen ||
-			    slen > PAGE_SIZE) {
-				ret = -EINVAL;
-				goto free_pbc;
-			}
-
-			npages++;
-			if ((faddr & PAGE_MASK) !=
-			    ((faddr + slen - 1) & PAGE_MASK))
-				npages++;
-
-			pktnwc += slen >> 2;
-			idx++;
-			nfrags++;
-		}
-
-		if (pktnwc != pktnw) {
-			ret = -EINVAL;
-			goto free_pbc;
-		}
-
-		if (page) {
-			dma_addr = dma_map_page(&dd->pcidev->dev,
-						page, 0, len, DMA_TO_DEVICE);
-			if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-				ret = -ENOMEM;
-				goto free_pbc;
-			}
-
-			dma_mapped = 1;
-		}
-
-		ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
-					    page, pbc, dma_addr);
-
-		if (nfrags) {
-			ret = ipath_user_sdma_init_payload(dd, pq, pkt,
-							   iov + idx_save + 1,
-							   nfrags, npages);
-			if (ret < 0)
-				goto free_pbc_dma;
-		}
-
-		counter++;
-		npkts++;
-
-		list_add_tail(&pkt->list, list);
-	}
-
-	ret = idx;
-	goto done;
-
-free_pbc_dma:
-	if (dma_mapped)
-		dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
-free_pbc:
-	if (page) {
-		kunmap(page);
-		__free_page(page);
-	} else
-		dma_pool_free(pq->header_cache, pbc, dma_addr);
-free_pkt:
-	kmem_cache_free(pq->pkt_slab, pkt);
-free_list:
-	ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
-done:
-	return ret;
-}
-
-static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
-						 u32 c)
-{
-	pq->sent_counter = c;
-}
-
-/* try to clean out queue -- needs pq->lock */
-static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
-				       struct ipath_user_sdma_queue *pq)
-{
-	struct list_head free_list;
-	struct ipath_user_sdma_pkt *pkt;
-	struct ipath_user_sdma_pkt *pkt_prev;
-	int ret = 0;
-
-	INIT_LIST_HEAD(&free_list);
-
-	list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
-		s64 descd = dd->ipath_sdma_descq_removed - pkt->added;
-
-		if (descd < 0)
-			break;
-
-		list_move_tail(&pkt->list, &free_list);
-
-		/* one more packet cleaned */
-		ret++;
-	}
-
-	if (!list_empty(&free_list)) {
-		u32 counter;
-
-		pkt = list_entry(free_list.prev,
-				 struct ipath_user_sdma_pkt, list);
-		counter = pkt->counter;
-
-		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
-		ipath_user_sdma_set_complete_counter(pq, counter);
-	}
-
-	return ret;
-}
-
-void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq)
-{
-	if (!pq)
-		return;
-
-	kmem_cache_destroy(pq->pkt_slab);
-	dma_pool_destroy(pq->header_cache);
-	kfree(pq);
-}
-
-/* clean descriptor queue, returns > 0 if some elements cleaned */
-static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd)
-{
-	int ret;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-	ret = ipath_sdma_make_progress(dd);
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-	return ret;
-}
-
-/* we're in close, drain packets so that we can cleanup successfully... */
-void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
-				 struct ipath_user_sdma_queue *pq)
-{
-	int i;
-
-	if (!pq)
-		return;
-
-	for (i = 0; i < 100; i++) {
-		mutex_lock(&pq->lock);
-		if (list_empty(&pq->sent)) {
-			mutex_unlock(&pq->lock);
-			break;
-		}
-		ipath_user_sdma_hwqueue_clean(dd);
-		ipath_user_sdma_queue_clean(dd, pq);
-		mutex_unlock(&pq->lock);
-		msleep(10);
-	}
-
-	if (!list_empty(&pq->sent)) {
-		struct list_head free_list;
-
-		printk(KERN_INFO "drain: lists not empty: forcing!\n");
-		INIT_LIST_HEAD(&free_list);
-		mutex_lock(&pq->lock);
-		list_splice_init(&pq->sent, &free_list);
-		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
-		mutex_unlock(&pq->lock);
-	}
-}
-
-static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
-					   u64 addr, u64 dwlen, u64 dwoffset)
-{
-	return cpu_to_le64(/* SDmaPhyAddr[31:0] */
-			   ((addr & 0xfffffffcULL) << 32) |
-			   /* SDmaGeneration[1:0] */
-			   ((dd->ipath_sdma_generation & 3ULL) << 30) |
-			   /* SDmaDwordCount[10:0] */
-			   ((dwlen & 0x7ffULL) << 16) |
-			   /* SDmaBufOffset[12:2] */
-			   (dwoffset & 0x7ffULL));
-}
-
-static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
-{
-	return descq | cpu_to_le64(1ULL << 12);
-}
-
-static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
-{
-					      /* last */  /* dma head */
-	return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
-}
-
-static inline __le64 ipath_sdma_make_desc1(u64 addr)
-{
-	/* SDmaPhyAddr[47:32] */
-	return cpu_to_le64(addr >> 32);
-}
-
-static void ipath_user_sdma_send_frag(struct ipath_devdata *dd,
-				      struct ipath_user_sdma_pkt *pkt, int idx,
-				      unsigned ofs, u16 tail)
-{
-	const u64 addr = (u64) pkt->addr[idx].addr +
-		(u64) pkt->addr[idx].offset;
-	const u64 dwlen = (u64) pkt->addr[idx].length / 4;
-	__le64 *descqp;
-	__le64 descq0;
-
-	descqp = &dd->ipath_sdma_descq[tail].qw[0];
-
-	descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs);
-	if (idx == 0)
-		descq0 = ipath_sdma_make_first_desc0(descq0);
-	if (idx == pkt->naddr - 1)
-		descq0 = ipath_sdma_make_last_desc0(descq0);
-
-	descqp[0] = descq0;
-	descqp[1] = ipath_sdma_make_desc1(addr);
-}
-
-/* pq->lock must be held, get packets on the wire... */
-static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
-				     struct ipath_user_sdma_queue *pq,
-				     struct list_head *pktlist)
-{
-	int ret = 0;
-	unsigned long flags;
-	u16 tail;
-
-	if (list_empty(pktlist))
-		return 0;
-
-	if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE)))
-		return -ECOMM;
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-	if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
-		ret = -ECOMM;
-		goto unlock;
-	}
-
-	tail = dd->ipath_sdma_descq_tail;
-	while (!list_empty(pktlist)) {
-		struct ipath_user_sdma_pkt *pkt =
-			list_entry(pktlist->next, struct ipath_user_sdma_pkt,
-				   list);
-		int i;
-		unsigned ofs = 0;
-		u16 dtail = tail;
-
-		if (pkt->naddr > ipath_sdma_descq_freecnt(dd))
-			goto unlock_check_tail;
-
-		for (i = 0; i < pkt->naddr; i++) {
-			ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail);
-			ofs += pkt->addr[i].length >> 2;
-
-			if (++tail == dd->ipath_sdma_descq_cnt) {
-				tail = 0;
-				++dd->ipath_sdma_generation;
-			}
-		}
-
-		if ((ofs<<2) > dd->ipath_ibmaxlen) {
-			ipath_dbg("packet size %X > ibmax %X, fail\n",
-				ofs<<2, dd->ipath_ibmaxlen);
-			ret = -EMSGSIZE;
-			goto unlock;
-		}
-
-		/*
-		 * if the packet is >= 2KB mtu equivalent, we have to use
-		 * the large buffers, and have to mark each descriptor as
-		 * part of a large buffer packet.
-		 */
-		if (ofs >= IPATH_SMALLBUF_DWORDS) {
-			for (i = 0; i < pkt->naddr; i++) {
-				dd->ipath_sdma_descq[dtail].qw[0] |=
-					cpu_to_le64(1ULL << 14);
-				if (++dtail == dd->ipath_sdma_descq_cnt)
-					dtail = 0;
-			}
-		}
-
-		dd->ipath_sdma_descq_added += pkt->naddr;
-		pkt->added = dd->ipath_sdma_descq_added;
-		list_move_tail(&pkt->list, &pq->sent);
-		ret++;
-	}
-
-unlock_check_tail:
-	/* advance the tail on the chip if necessary */
-	if (dd->ipath_sdma_descq_tail != tail) {
-		wmb();
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
-		dd->ipath_sdma_descq_tail = tail;
-	}
-
-unlock:
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-	return ret;
-}
-
-int ipath_user_sdma_writev(struct ipath_devdata *dd,
-			   struct ipath_user_sdma_queue *pq,
-			   const struct iovec *iov,
-			   unsigned long dim)
-{
-	int ret = 0;
-	struct list_head list;
-	int npkts = 0;
-
-	INIT_LIST_HEAD(&list);
-
-	mutex_lock(&pq->lock);
-
-	if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) {
-		ipath_user_sdma_hwqueue_clean(dd);
-		ipath_user_sdma_queue_clean(dd, pq);
-	}
-
-	while (dim) {
-		const int mxp = 8;
-
-		ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
-		if (ret <= 0)
-			goto done_unlock;
-		else {
-			dim -= ret;
-			iov += ret;
-		}
-
-		/* force packets onto the sdma hw queue... */
-		if (!list_empty(&list)) {
-			/*
-			 * lazily clean hw queue.  the 4 is a guess of about
-			 * how many sdma descriptors a packet will take (it
-			 * doesn't have to be perfect).
-			 */
-			if (ipath_sdma_descq_freecnt(dd) < ret * 4) {
-				ipath_user_sdma_hwqueue_clean(dd);
-				ipath_user_sdma_queue_clean(dd, pq);
-			}
-
-			ret = ipath_user_sdma_push_pkts(dd, pq, &list);
-			if (ret < 0)
-				goto done_unlock;
-			else {
-				npkts += ret;
-				pq->counter += ret;
-
-				if (!list_empty(&list))
-					goto done_unlock;
-			}
-		}
-	}
-
-done_unlock:
-	if (!list_empty(&list))
-		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
-	mutex_unlock(&pq->lock);
-
-	return (ret < 0) ? ret : npkts;
-}
-
-int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
-				  struct ipath_user_sdma_queue *pq)
-{
-	int ret = 0;
-
-	mutex_lock(&pq->lock);
-	ipath_user_sdma_hwqueue_clean(dd);
-	ret = ipath_user_sdma_queue_clean(dd, pq);
-	mutex_unlock(&pq->lock);
-
-	return ret;
-}
-
-u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
-{
-	return pq->sent_counter;
-}
-
-u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
-{
-	return pq->counter;
-}
-
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/drivers/infiniband/hw/ipath/ipath_user_sdma.h
deleted file mode 100644
index fc76316..0000000
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/device.h>
-
-struct ipath_user_sdma_queue;
-
-struct ipath_user_sdma_queue *
-ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
-void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq);
-
-int ipath_user_sdma_writev(struct ipath_devdata *dd,
-			   struct ipath_user_sdma_queue *pq,
-			   const struct iovec *iov,
-			   unsigned long dim);
-
-int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
-				  struct ipath_user_sdma_queue *pq);
-
-void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
-				 struct ipath_user_sdma_queue *pq);
-
-u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
-u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
deleted file mode 100644
index 30ba49c..0000000
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ /dev/null
@@ -1,2364 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/ib_mad.h>
-#include <rdma/ib_user_verbs.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/utsname.h>
-#include <linux/rculist.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-static unsigned int ib_ipath_qp_table_size = 251;
-module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
-MODULE_PARM_DESC(qp_table_size, "QP table size");
-
-unsigned int ib_ipath_lkey_table_size = 12;
-module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
-		   S_IRUGO);
-MODULE_PARM_DESC(lkey_table_size,
-		 "LKEY table size in bits (2^n, 1 <= n <= 23)");
-
-static unsigned int ib_ipath_max_pds = 0xFFFF;
-module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_pds,
-		 "Maximum number of protection domains to support");
-
-static unsigned int ib_ipath_max_ahs = 0xFFFF;
-module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
-
-unsigned int ib_ipath_max_cqes = 0x2FFFF;
-module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_cqes,
-		 "Maximum number of completion queue entries to support");
-
-unsigned int ib_ipath_max_cqs = 0x1FFFF;
-module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
-
-unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
-module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
-		   S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
-
-unsigned int ib_ipath_max_qps = 16384;
-module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
-
-unsigned int ib_ipath_max_sges = 0x60;
-module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
-
-unsigned int ib_ipath_max_mcast_grps = 16384;
-module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
-		   S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_mcast_grps,
-		 "Maximum number of multicast groups to support");
-
-unsigned int ib_ipath_max_mcast_qp_attached = 16;
-module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
-		   uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_mcast_qp_attached,
-		 "Maximum number of attached QPs to support");
-
-unsigned int ib_ipath_max_srqs = 1024;
-module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
-
-unsigned int ib_ipath_max_srq_sges = 128;
-module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
-		   uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
-
-unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
-module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
-		   uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
-
-static unsigned int ib_ipath_disable_sma;
-module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(disable_sma, "Disable the SMA");
-
-/*
- * Note that it is OK to post send work requests in the SQE and ERR
- * states; ipath_do_send() will process them and generate error
- * completions as per IB 1.2 C10-96.
- */
-const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
-	[IB_QPS_RESET] = 0,
-	[IB_QPS_INIT] = IPATH_POST_RECV_OK,
-	[IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
-	[IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
-	    IPATH_PROCESS_NEXT_SEND_OK,
-	[IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
-	[IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-	    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
-	[IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
-	    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
-};
-
-struct ipath_ucontext {
-	struct ib_ucontext ibucontext;
-};
-
-static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
-						  *ibucontext)
-{
-	return container_of(ibucontext, struct ipath_ucontext, ibucontext);
-}
-
-/*
- * Translate ib_wr_opcode into ib_wc_opcode.
- */
-const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
-	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
-	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
-	[IB_WR_SEND] = IB_WC_SEND,
-	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
-	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
-	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
-};
-
-/*
- * System image GUID.
- */
-static __be64 sys_image_guid;
-
-/**
- * ipath_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- */
-void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
-{
-	struct ipath_sge *sge = &ss->sge;
-
-	while (length) {
-		u32 len = sge->length;
-
-		if (len > length)
-			len = length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
-		memcpy(sge->vaddr, data, len);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (--ss->num_sge)
-				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
-			if (++sge->n >= IPATH_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-		data += len;
-		length -= len;
-	}
-}
-
-/**
- * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
- * @ss: the SGE state
- * @length: the number of bytes to skip
- */
-void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
-{
-	struct ipath_sge *sge = &ss->sge;
-
-	while (length) {
-		u32 len = sge->length;
-
-		if (len > length)
-			len = length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (--ss->num_sge)
-				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
-			if (++sge->n >= IPATH_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-		length -= len;
-	}
-}
-
-/*
- * Count the number of DMA descriptors needed to send length bytes of data.
- * Don't modify the ipath_sge_state to get the count.
- * Return zero if any of the segments is not aligned.
- */
-static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
-{
-	struct ipath_sge *sg_list = ss->sg_list;
-	struct ipath_sge sge = ss->sge;
-	u8 num_sge = ss->num_sge;
-	u32 ndesc = 1;	/* count the header */
-
-	while (length) {
-		u32 len = sge.length;
-
-		if (len > length)
-			len = length;
-		if (len > sge.sge_length)
-			len = sge.sge_length;
-		BUG_ON(len == 0);
-		if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
-		    (len != length && (len & (sizeof(u32) - 1)))) {
-			ndesc = 0;
-			break;
-		}
-		ndesc++;
-		sge.vaddr += len;
-		sge.length -= len;
-		sge.sge_length -= len;
-		if (sge.sge_length == 0) {
-			if (--num_sge)
-				sge = *sg_list++;
-		} else if (sge.length == 0 && sge.mr != NULL) {
-			if (++sge.n >= IPATH_SEGSZ) {
-				if (++sge.m >= sge.mr->mapsz)
-					break;
-				sge.n = 0;
-			}
-			sge.vaddr =
-				sge.mr->map[sge.m]->segs[sge.n].vaddr;
-			sge.length =
-				sge.mr->map[sge.m]->segs[sge.n].length;
-		}
-		length -= len;
-	}
-	return ndesc;
-}
-
-/*
- * Copy from the SGEs to the data buffer.
- */
-static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
-				u32 length)
-{
-	struct ipath_sge *sge = &ss->sge;
-
-	while (length) {
-		u32 len = sge->length;
-
-		if (len > length)
-			len = length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
-		memcpy(data, sge->vaddr, len);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (--ss->num_sge)
-				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
-			if (++sge->n >= IPATH_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-		data += len;
-		length -= len;
-	}
-}
-
-/**
- * ipath_post_one_send - post one RC, UC, or UD send work request
- * @qp: the QP to post on
- * @wr: the work request to send
- */
-static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
-{
-	struct ipath_swqe *wqe;
-	u32 next;
-	int i;
-	int j;
-	int acc;
-	int ret;
-	unsigned long flags;
-	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-
-	spin_lock_irqsave(&qp->s_lock, flags);
-
-	if (qp->ibqp.qp_type != IB_QPT_SMI &&
-	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-		ret = -ENETDOWN;
-		goto bail;
-	}
-
-	/* Check that state is OK to post send. */
-	if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
-		goto bail_inval;
-
-	/* IB spec says that num_sge == 0 is OK. */
-	if (wr->num_sge > qp->s_max_sge)
-		goto bail_inval;
-
-	/*
-	 * Don't allow RDMA reads or atomic operations on UC or
-	 * undefined operations.
-	 * Make sure buffer is large enough to hold the result for atomics.
-	 */
-	if (qp->ibqp.qp_type == IB_QPT_UC) {
-		if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
-			goto bail_inval;
-	} else if (qp->ibqp.qp_type == IB_QPT_UD) {
-		/* Check UD opcode */
-		if (wr->opcode != IB_WR_SEND &&
-		    wr->opcode != IB_WR_SEND_WITH_IMM)
-			goto bail_inval;
-		/* Check UD destination address PD */
-		if (qp->ibqp.pd != wr->wr.ud.ah->pd)
-			goto bail_inval;
-	} else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
-		goto bail_inval;
-	else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
-		   (wr->num_sge == 0 ||
-		    wr->sg_list[0].length < sizeof(u64) ||
-		    wr->sg_list[0].addr & (sizeof(u64) - 1)))
-		goto bail_inval;
-	else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
-		goto bail_inval;
-
-	next = qp->s_head + 1;
-	if (next >= qp->s_size)
-		next = 0;
-	if (next == qp->s_last) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	wqe = get_swqe_ptr(qp, qp->s_head);
-	wqe->wr = *wr;
-	wqe->length = 0;
-	if (wr->num_sge) {
-		acc = wr->opcode >= IB_WR_RDMA_READ ?
-			IB_ACCESS_LOCAL_WRITE : 0;
-		for (i = 0, j = 0; i < wr->num_sge; i++) {
-			u32 length = wr->sg_list[i].length;
-			int ok;
-
-			if (length == 0)
-				continue;
-			ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
-					   &wr->sg_list[i], acc);
-			if (!ok)
-				goto bail_inval;
-			wqe->length += length;
-			j++;
-		}
-		wqe->wr.num_sge = j;
-	}
-	if (qp->ibqp.qp_type == IB_QPT_UC ||
-	    qp->ibqp.qp_type == IB_QPT_RC) {
-		if (wqe->length > 0x80000000U)
-			goto bail_inval;
-	} else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
-		goto bail_inval;
-	wqe->ssn = qp->s_ssn++;
-	qp->s_head = next;
-
-	ret = 0;
-	goto bail;
-
-bail_inval:
-	ret = -EINVAL;
-bail:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-	return ret;
-}
-
-/**
- * ipath_post_send - post a send on a QP
- * @ibqp: the QP to post the send on
- * @wr: the list of work requests to post
- * @bad_wr: the first bad WR is put here
- *
- * This may be called from interrupt context.
- */
-static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
-			   struct ib_send_wr **bad_wr)
-{
-	struct ipath_qp *qp = to_iqp(ibqp);
-	int err = 0;
-
-	for (; wr; wr = wr->next) {
-		err = ipath_post_one_send(qp, wr);
-		if (err) {
-			*bad_wr = wr;
-			goto bail;
-		}
-	}
-
-	/* Try to do the send work in the caller's context. */
-	ipath_do_send((unsigned long) qp);
-
-bail:
-	return err;
-}
-
-/**
- * ipath_post_receive - post a receive on a QP
- * @ibqp: the QP to post the receive on
- * @wr: the WR to post
- * @bad_wr: the first bad WR is put here
- *
- * This may be called from interrupt context.
- */
-static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
-			      struct ib_recv_wr **bad_wr)
-{
-	struct ipath_qp *qp = to_iqp(ibqp);
-	struct ipath_rwq *wq = qp->r_rq.wq;
-	unsigned long flags;
-	int ret;
-
-	/* Check that state is OK to post receive. */
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
-		*bad_wr = wr;
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	for (; wr; wr = wr->next) {
-		struct ipath_rwqe *wqe;
-		u32 next;
-		int i;
-
-		if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
-			*bad_wr = wr;
-			ret = -EINVAL;
-			goto bail;
-		}
-
-		spin_lock_irqsave(&qp->r_rq.lock, flags);
-		next = wq->head + 1;
-		if (next >= qp->r_rq.size)
-			next = 0;
-		if (next == wq->tail) {
-			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
-			*bad_wr = wr;
-			ret = -ENOMEM;
-			goto bail;
-		}
-
-		wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
-		wqe->wr_id = wr->wr_id;
-		wqe->num_sge = wr->num_sge;
-		for (i = 0; i < wr->num_sge; i++)
-			wqe->sg_list[i] = wr->sg_list[i];
-		/* Make sure queue entry is written before the head index. */
-		smp_wmb();
-		wq->head = next;
-		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
-	}
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_qp_rcv - processing an incoming packet on a QP
- * @dev: the device the packet came on
- * @hdr: the packet header
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP the packet came on
- *
- * This is called from ipath_ib_rcv() to process an incoming packet
- * for the given QP.
- * Called at interrupt level.
- */
-static void ipath_qp_rcv(struct ipath_ibdev *dev,
-			 struct ipath_ib_header *hdr, int has_grh,
-			 void *data, u32 tlen, struct ipath_qp *qp)
-{
-	/* Check for valid receive state. */
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-		dev->n_pkt_drops++;
-		return;
-	}
-
-	switch (qp->ibqp.qp_type) {
-	case IB_QPT_SMI:
-	case IB_QPT_GSI:
-		if (ib_ipath_disable_sma)
-			break;
-		/* FALLTHROUGH */
-	case IB_QPT_UD:
-		ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
-		break;
-
-	case IB_QPT_RC:
-		ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
-		break;
-
-	case IB_QPT_UC:
-		ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
-		break;
-
-	default:
-		break;
-	}
-}
-
-/**
- * ipath_ib_rcv - process an incoming packet
- * @arg: the device pointer
- * @rhdr: the header of the packet
- * @data: the packet data
- * @tlen: the packet length
- *
- * This is called from ipath_kreceive() to process an incoming packet at
- * interrupt level. Tlen is the length of the header + data + CRC in bytes.
- */
-void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
-		  u32 tlen)
-{
-	struct ipath_ib_header *hdr = rhdr;
-	struct ipath_other_headers *ohdr;
-	struct ipath_qp *qp;
-	u32 qp_num;
-	int lnh;
-	u8 opcode;
-	u16 lid;
-
-	if (unlikely(dev == NULL))
-		goto bail;
-
-	if (unlikely(tlen < 24)) {	/* LRH+BTH+CRC */
-		dev->rcv_errors++;
-		goto bail;
-	}
-
-	/* Check for a valid destination LID (see ch. 7.11.1). */
-	lid = be16_to_cpu(hdr->lrh[1]);
-	if (lid < IPATH_MULTICAST_LID_BASE) {
-		lid &= ~((1 << dev->dd->ipath_lmc) - 1);
-		if (unlikely(lid != dev->dd->ipath_lid)) {
-			dev->rcv_errors++;
-			goto bail;
-		}
-	}
-
-	/* Check for GRH */
-	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-	if (lnh == IPATH_LRH_BTH)
-		ohdr = &hdr->u.oth;
-	else if (lnh == IPATH_LRH_GRH)
-		ohdr = &hdr->u.l.oth;
-	else {
-		dev->rcv_errors++;
-		goto bail;
-	}
-
-	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
-	dev->opstats[opcode].n_bytes += tlen;
-	dev->opstats[opcode].n_packets++;
-
-	/* Get the destination QP number. */
-	qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
-	if (qp_num == IPATH_MULTICAST_QPN) {
-		struct ipath_mcast *mcast;
-		struct ipath_mcast_qp *p;
-
-		if (lnh != IPATH_LRH_GRH) {
-			dev->n_pkt_drops++;
-			goto bail;
-		}
-		mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
-		if (mcast == NULL) {
-			dev->n_pkt_drops++;
-			goto bail;
-		}
-		dev->n_multicast_rcv++;
-		list_for_each_entry_rcu(p, &mcast->qp_list, list)
-			ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
-		/*
-		 * Notify ipath_multicast_detach() if it is waiting for us
-		 * to finish.
-		 */
-		if (atomic_dec_return(&mcast->refcount) <= 1)
-			wake_up(&mcast->wait);
-	} else {
-		qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
-		if (qp) {
-			dev->n_unicast_rcv++;
-			ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
-				     tlen, qp);
-			/*
-			 * Notify ipath_destroy_qp() if it is waiting
-			 * for us to finish.
-			 */
-			if (atomic_dec_and_test(&qp->refcount))
-				wake_up(&qp->wait);
-		} else
-			dev->n_pkt_drops++;
-	}
-
-bail:;
-}
-
-/**
- * ipath_ib_timer - verbs timer
- * @arg: the device pointer
- *
- * This is called from ipath_do_rcv_timer() at interrupt level to check for
- * QPs which need retransmits and to collect performance numbers.
- */
-static void ipath_ib_timer(struct ipath_ibdev *dev)
-{
-	struct ipath_qp *resend = NULL;
-	struct ipath_qp *rnr = NULL;
-	struct list_head *last;
-	struct ipath_qp *qp;
-	unsigned long flags;
-
-	if (dev == NULL)
-		return;
-
-	spin_lock_irqsave(&dev->pending_lock, flags);
-	/* Start filling the next pending queue. */
-	if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
-		dev->pending_index = 0;
-	/* Save any requests still in the new queue, they have timed out. */
-	last = &dev->pending[dev->pending_index];
-	while (!list_empty(last)) {
-		qp = list_entry(last->next, struct ipath_qp, timerwait);
-		list_del_init(&qp->timerwait);
-		qp->timer_next = resend;
-		resend = qp;
-		atomic_inc(&qp->refcount);
-	}
-	last = &dev->rnrwait;
-	if (!list_empty(last)) {
-		qp = list_entry(last->next, struct ipath_qp, timerwait);
-		if (--qp->s_rnr_timeout == 0) {
-			do {
-				list_del_init(&qp->timerwait);
-				qp->timer_next = rnr;
-				rnr = qp;
-				atomic_inc(&qp->refcount);
-				if (list_empty(last))
-					break;
-				qp = list_entry(last->next, struct ipath_qp,
-						timerwait);
-			} while (qp->s_rnr_timeout == 0);
-		}
-	}
-	/*
-	 * We should only be in the started state if pma_sample_start != 0
-	 */
-	if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
-	    --dev->pma_sample_start == 0) {
-		dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
-		ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
-					&dev->ipath_rword,
-					&dev->ipath_spkts,
-					&dev->ipath_rpkts,
-					&dev->ipath_xmit_wait);
-	}
-	if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
-		if (dev->pma_sample_interval == 0) {
-			u64 ta, tb, tc, td, te;
-
-			dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
-			ipath_snapshot_counters(dev->dd, &ta, &tb,
-						&tc, &td, &te);
-
-			dev->ipath_sword = ta - dev->ipath_sword;
-			dev->ipath_rword = tb - dev->ipath_rword;
-			dev->ipath_spkts = tc - dev->ipath_spkts;
-			dev->ipath_rpkts = td - dev->ipath_rpkts;
-			dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
-		}
-		else
-			dev->pma_sample_interval--;
-	}
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-	/* XXX What if timer fires again while this is running? */
-	while (resend != NULL) {
-		qp = resend;
-		resend = qp->timer_next;
-
-		spin_lock_irqsave(&qp->s_lock, flags);
-		if (qp->s_last != qp->s_tail &&
-		    ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
-			dev->n_timeouts++;
-			ipath_restart_rc(qp, qp->s_last_psn + 1);
-		}
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-
-		/* Notify ipath_destroy_qp() if it is waiting. */
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
-	}
-	while (rnr != NULL) {
-		qp = rnr;
-		rnr = qp->timer_next;
-
-		spin_lock_irqsave(&qp->s_lock, flags);
-		if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
-			ipath_schedule_send(qp);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-
-		/* Notify ipath_destroy_qp() if it is waiting. */
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
-	}
-}
-
-static void update_sge(struct ipath_sge_state *ss, u32 length)
-{
-	struct ipath_sge *sge = &ss->sge;
-
-	sge->vaddr += length;
-	sge->length -= length;
-	sge->sge_length -= length;
-	if (sge->sge_length == 0) {
-		if (--ss->num_sge)
-			*sge = *ss->sg_list++;
-	} else if (sge->length == 0 && sge->mr != NULL) {
-		if (++sge->n >= IPATH_SEGSZ) {
-			if (++sge->m >= sge->mr->mapsz)
-				return;
-			sge->n = 0;
-		}
-		sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
-		sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
-	}
-}
-
-#ifdef __LITTLE_ENDIAN
-static inline u32 get_upper_bits(u32 data, u32 shift)
-{
-	return data >> shift;
-}
-
-static inline u32 set_upper_bits(u32 data, u32 shift)
-{
-	return data << shift;
-}
-
-static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
-{
-	data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
-	data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
-	return data;
-}
-#else
-static inline u32 get_upper_bits(u32 data, u32 shift)
-{
-	return data << shift;
-}
-
-static inline u32 set_upper_bits(u32 data, u32 shift)
-{
-	return data >> shift;
-}
-
-static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
-{
-	data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
-	data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
-	return data;
-}
-#endif
-
-static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
-		    u32 length, unsigned flush_wc)
-{
-	u32 extra = 0;
-	u32 data = 0;
-	u32 last;
-
-	while (1) {
-		u32 len = ss->sge.length;
-		u32 off;
-
-		if (len > length)
-			len = length;
-		if (len > ss->sge.sge_length)
-			len = ss->sge.sge_length;
-		BUG_ON(len == 0);
-		/* If the source address is not aligned, try to align it. */
-		off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
-		if (off) {
-			u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
-					    ~(sizeof(u32) - 1));
-			u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
-			u32 y;
-
-			y = sizeof(u32) - off;
-			if (len > y)
-				len = y;
-			if (len + extra >= sizeof(u32)) {
-				data |= set_upper_bits(v, extra *
-						       BITS_PER_BYTE);
-				len = sizeof(u32) - extra;
-				if (len == length) {
-					last = data;
-					break;
-				}
-				__raw_writel(data, piobuf);
-				piobuf++;
-				extra = 0;
-				data = 0;
-			} else {
-				/* Clear unused upper bytes */
-				data |= clear_upper_bytes(v, len, extra);
-				if (len == length) {
-					last = data;
-					break;
-				}
-				extra += len;
-			}
-		} else if (extra) {
-			/* Source address is aligned. */
-			u32 *addr = (u32 *) ss->sge.vaddr;
-			int shift = extra * BITS_PER_BYTE;
-			int ushift = 32 - shift;
-			u32 l = len;
-
-			while (l >= sizeof(u32)) {
-				u32 v = *addr;
-
-				data |= set_upper_bits(v, shift);
-				__raw_writel(data, piobuf);
-				data = get_upper_bits(v, ushift);
-				piobuf++;
-				addr++;
-				l -= sizeof(u32);
-			}
-			/*
-			 * We still have 'extra' number of bytes leftover.
-			 */
-			if (l) {
-				u32 v = *addr;
-
-				if (l + extra >= sizeof(u32)) {
-					data |= set_upper_bits(v, shift);
-					len -= l + extra - sizeof(u32);
-					if (len == length) {
-						last = data;
-						break;
-					}
-					__raw_writel(data, piobuf);
-					piobuf++;
-					extra = 0;
-					data = 0;
-				} else {
-					/* Clear unused upper bytes */
-					data |= clear_upper_bytes(v, l,
-								  extra);
-					if (len == length) {
-						last = data;
-						break;
-					}
-					extra += l;
-				}
-			} else if (len == length) {
-				last = data;
-				break;
-			}
-		} else if (len == length) {
-			u32 w;
-
-			/*
-			 * Need to round up for the last dword in the
-			 * packet.
-			 */
-			w = (len + 3) >> 2;
-			__iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
-			piobuf += w - 1;
-			last = ((u32 *) ss->sge.vaddr)[w - 1];
-			break;
-		} else {
-			u32 w = len >> 2;
-
-			__iowrite32_copy(piobuf, ss->sge.vaddr, w);
-			piobuf += w;
-
-			extra = len & (sizeof(u32) - 1);
-			if (extra) {
-				u32 v = ((u32 *) ss->sge.vaddr)[w];
-
-				/* Clear unused upper bytes */
-				data = clear_upper_bytes(v, extra, 0);
-			}
-		}
-		update_sge(ss, len);
-		length -= len;
-	}
-	/* Update address before sending packet. */
-	update_sge(ss, length);
-	if (flush_wc) {
-		/* must flush early everything before trigger word */
-		ipath_flush_wc();
-		__raw_writel(last, piobuf);
-		/* be sure trigger word is written */
-		ipath_flush_wc();
-	} else
-		__raw_writel(last, piobuf);
-}
-
-/*
- * Convert IB rate to delay multiplier.
- */
-unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
-{
-	switch (rate) {
-	case IB_RATE_2_5_GBPS: return 8;
-	case IB_RATE_5_GBPS:   return 4;
-	case IB_RATE_10_GBPS:  return 2;
-	case IB_RATE_20_GBPS:  return 1;
-	default:	       return 0;
-	}
-}
-
-/*
- * Convert delay multiplier to IB rate
- */
-static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
-{
-	switch (mult) {
-	case 8:  return IB_RATE_2_5_GBPS;
-	case 4:  return IB_RATE_5_GBPS;
-	case 2:  return IB_RATE_10_GBPS;
-	case 1:  return IB_RATE_20_GBPS;
-	default: return IB_RATE_PORT_CURRENT;
-	}
-}
-
-static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
-{
-	struct ipath_verbs_txreq *tx = NULL;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev->pending_lock, flags);
-	if (!list_empty(&dev->txreq_free)) {
-		struct list_head *l = dev->txreq_free.next;
-
-		list_del(l);
-		tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
-	}
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
-	return tx;
-}
-
-static inline void put_txreq(struct ipath_ibdev *dev,
-			     struct ipath_verbs_txreq *tx)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev->pending_lock, flags);
-	list_add(&tx->txreq.list, &dev->txreq_free);
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
-}
-
-static void sdma_complete(void *cookie, int status)
-{
-	struct ipath_verbs_txreq *tx = cookie;
-	struct ipath_qp *qp = tx->qp;
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	unsigned long flags;
-	enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
-		IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
-
-	if (atomic_dec_and_test(&qp->s_dma_busy)) {
-		spin_lock_irqsave(&qp->s_lock, flags);
-		if (tx->wqe)
-			ipath_send_complete(qp, tx->wqe, ibs);
-		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
-		     qp->s_last != qp->s_head) ||
-		    (qp->s_flags & IPATH_S_WAIT_DMA))
-			ipath_schedule_send(qp);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-		wake_up(&qp->wait_dma);
-	} else if (tx->wqe) {
-		spin_lock_irqsave(&qp->s_lock, flags);
-		ipath_send_complete(qp, tx->wqe, ibs);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-	}
-
-	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
-		kfree(tx->txreq.map_addr);
-	put_txreq(dev, tx);
-
-	if (atomic_dec_and_test(&qp->refcount))
-		wake_up(&qp->wait);
-}
-
-static void decrement_dma_busy(struct ipath_qp *qp)
-{
-	unsigned long flags;
-
-	if (atomic_dec_and_test(&qp->s_dma_busy)) {
-		spin_lock_irqsave(&qp->s_lock, flags);
-		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
-		     qp->s_last != qp->s_head) ||
-		    (qp->s_flags & IPATH_S_WAIT_DMA))
-			ipath_schedule_send(qp);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-		wake_up(&qp->wait_dma);
-	}
-}
-
-/*
- * Compute the number of clock cycles of delay before sending the next packet.
- * The multipliers reflect the number of clocks for the fastest rate so
- * one tick at 4xDDR is 8 ticks at 1xSDR.
- * If the destination port will take longer to receive a packet than
- * the outgoing link can send it, we need to delay sending the next packet
- * by the difference in time it takes the receiver to receive and the sender
- * to send this packet.
- * Note that this delay is always correct for UC and RC but not always
- * optimal for UD. For UD, the destination HCA can be different for each
- * packet, in which case, we could send packets to a different destination
- * while "waiting" for the delay. The overhead for doing this without
- * HW support is more than just paying the cost of delaying some packets
- * unnecessarily.
- */
-static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
-{
-	return (rcv_mult > snd_mult) ?
-		(plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
-}
-
-static int ipath_verbs_send_dma(struct ipath_qp *qp,
-				struct ipath_ib_header *hdr, u32 hdrwords,
-				struct ipath_sge_state *ss, u32 len,
-				u32 plen, u32 dwords)
-{
-	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	struct ipath_devdata *dd = dev->dd;
-	struct ipath_verbs_txreq *tx;
-	u32 *piobuf;
-	u32 control;
-	u32 ndesc;
-	int ret;
-
-	tx = qp->s_tx;
-	if (tx) {
-		qp->s_tx = NULL;
-		/* resend previously constructed packet */
-		atomic_inc(&qp->s_dma_busy);
-		ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
-		if (ret) {
-			qp->s_tx = tx;
-			decrement_dma_busy(qp);
-		}
-		goto bail;
-	}
-
-	tx = get_txreq(dev);
-	if (!tx) {
-		ret = -EBUSY;
-		goto bail;
-	}
-
-	/*
-	 * Get the saved delay count we computed for the previous packet
-	 * and save the delay count for this packet to be used next time
-	 * we get here.
-	 */
-	control = qp->s_pkt_delay;
-	qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
-
-	tx->qp = qp;
-	atomic_inc(&qp->refcount);
-	tx->wqe = qp->s_wqe;
-	tx->txreq.callback = sdma_complete;
-	tx->txreq.callback_cookie = tx;
-	tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
-		IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
-	if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
-		tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
-
-	/* VL15 packets bypass credit check */
-	if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
-		control |= 1ULL << 31;
-		tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
-	}
-
-	if (len) {
-		/*
-		 * Don't try to DMA if it takes more descriptors than
-		 * the queue holds.
-		 */
-		ndesc = ipath_count_sge(ss, len);
-		if (ndesc >= dd->ipath_sdma_descq_cnt)
-			ndesc = 0;
-	} else
-		ndesc = 1;
-	if (ndesc) {
-		tx->hdr.pbc[0] = cpu_to_le32(plen);
-		tx->hdr.pbc[1] = cpu_to_le32(control);
-		memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
-		tx->txreq.sg_count = ndesc;
-		tx->map_len = (hdrwords + 2) << 2;
-		tx->txreq.map_addr = &tx->hdr;
-		atomic_inc(&qp->s_dma_busy);
-		ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
-		if (ret) {
-			/* save ss and length in dwords */
-			tx->ss = ss;
-			tx->len = dwords;
-			qp->s_tx = tx;
-			decrement_dma_busy(qp);
-		}
-		goto bail;
-	}
-
-	/* Allocate a buffer and copy the header and payload to it. */
-	tx->map_len = (plen + 1) << 2;
-	piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
-	if (unlikely(piobuf == NULL)) {
-		ret = -EBUSY;
-		goto err_tx;
-	}
-	tx->txreq.map_addr = piobuf;
-	tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
-	tx->txreq.sg_count = 1;
-
-	*piobuf++ = (__force u32) cpu_to_le32(plen);
-	*piobuf++ = (__force u32) cpu_to_le32(control);
-	memcpy(piobuf, hdr, hdrwords << 2);
-	ipath_copy_from_sge(piobuf + hdrwords, ss, len);
-
-	atomic_inc(&qp->s_dma_busy);
-	ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
-	/*
-	 * If we couldn't queue the DMA request, save the info
-	 * and try again later rather than destroying the
-	 * buffer and undoing the side effects of the copy.
-	 */
-	if (ret) {
-		tx->ss = NULL;
-		tx->len = 0;
-		qp->s_tx = tx;
-		decrement_dma_busy(qp);
-	}
-	dev->n_unaligned++;
-	goto bail;
-
-err_tx:
-	if (atomic_dec_and_test(&qp->refcount))
-		wake_up(&qp->wait);
-	put_txreq(dev, tx);
-bail:
-	return ret;
-}
-
-static int ipath_verbs_send_pio(struct ipath_qp *qp,
-				struct ipath_ib_header *ibhdr, u32 hdrwords,
-				struct ipath_sge_state *ss, u32 len,
-				u32 plen, u32 dwords)
-{
-	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-	u32 *hdr = (u32 *) ibhdr;
-	u32 __iomem *piobuf;
-	unsigned flush_wc;
-	u32 control;
-	int ret;
-	unsigned long flags;
-
-	piobuf = ipath_getpiobuf(dd, plen, NULL);
-	if (unlikely(piobuf == NULL)) {
-		ret = -EBUSY;
-		goto bail;
-	}
-
-	/*
-	 * Get the saved delay count we computed for the previous packet
-	 * and save the delay count for this packet to be used next time
-	 * we get here.
-	 */
-	control = qp->s_pkt_delay;
-	qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
-
-	/* VL15 packets bypass credit check */
-	if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
-		control |= 1ULL << 31;
-
-	/*
-	 * Write the length to the control qword plus any needed flags.
-	 * We have to flush after the PBC for correctness on some cpus
-	 * or WC buffer can be written out of order.
-	 */
-	writeq(((u64) control << 32) | plen, piobuf);
-	piobuf += 2;
-
-	flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
-	if (len == 0) {
-		/*
-		 * If there is just the header portion, must flush before
-		 * writing last word of header for correctness, and after
-		 * the last header word (trigger word).
-		 */
-		if (flush_wc) {
-			ipath_flush_wc();
-			__iowrite32_copy(piobuf, hdr, hdrwords - 1);
-			ipath_flush_wc();
-			__raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
-			ipath_flush_wc();
-		} else
-			__iowrite32_copy(piobuf, hdr, hdrwords);
-		goto done;
-	}
-
-	if (flush_wc)
-		ipath_flush_wc();
-	__iowrite32_copy(piobuf, hdr, hdrwords);
-	piobuf += hdrwords;
-
-	/* The common case is aligned and contained in one segment. */
-	if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
-		   !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
-		u32 *addr = (u32 *) ss->sge.vaddr;
-
-		/* Update address before sending packet. */
-		update_sge(ss, len);
-		if (flush_wc) {
-			__iowrite32_copy(piobuf, addr, dwords - 1);
-			/* must flush early everything before trigger word */
-			ipath_flush_wc();
-			__raw_writel(addr[dwords - 1], piobuf + dwords - 1);
-			/* be sure trigger word is written */
-			ipath_flush_wc();
-		} else
-			__iowrite32_copy(piobuf, addr, dwords);
-		goto done;
-	}
-	copy_io(piobuf, ss, len, flush_wc);
-done:
-	if (qp->s_wqe) {
-		spin_lock_irqsave(&qp->s_lock, flags);
-		ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-	}
-	ret = 0;
-bail:
-	return ret;
-}
-
-/**
- * ipath_verbs_send - send a packet
- * @qp: the QP to send on
- * @hdr: the packet header
- * @hdrwords: the number of 32-bit words in the header
- * @ss: the SGE to send
- * @len: the length of the packet in bytes
- */
-int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
-		     u32 hdrwords, struct ipath_sge_state *ss, u32 len)
-{
-	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-	u32 plen;
-	int ret;
-	u32 dwords = (len + 3) >> 2;
-
-	/*
-	 * Calculate the send buffer trigger address.
-	 * The +1 counts for the pbc control dword following the pbc length.
-	 */
-	plen = hdrwords + dwords + 1;
-
-	/*
-	 * VL15 packets (IB_QPT_SMI) will always use PIO, so we
-	 * can defer SDMA restart until link goes ACTIVE without
-	 * worrying about just how we got there.
-	 */
-	if (qp->ibqp.qp_type == IB_QPT_SMI ||
-	    !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
-		ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
-					   plen, dwords);
-	else
-		ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
-					   plen, dwords);
-
-	return ret;
-}
-
-int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
-			    u64 *rwords, u64 *spkts, u64 *rpkts,
-			    u64 *xmit_wait)
-{
-	int ret;
-
-	if (!(dd->ipath_flags & IPATH_INITTED)) {
-		/* no hardware, freeze, etc. */
-		ret = -EINVAL;
-		goto bail;
-	}
-	*swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
-	*rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
-	*spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
-	*rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
-	*xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_get_counters - get various chip counters
- * @dd: the infinipath device
- * @cntrs: counters are placed here
- *
- * Return the counters needed by recv_pma_get_portcounters().
- */
-int ipath_get_counters(struct ipath_devdata *dd,
-		       struct ipath_verbs_counters *cntrs)
-{
-	struct ipath_cregs const *crp = dd->ipath_cregs;
-	int ret;
-
-	if (!(dd->ipath_flags & IPATH_INITTED)) {
-		/* no hardware, freeze, etc. */
-		ret = -EINVAL;
-		goto bail;
-	}
-	cntrs->symbol_error_counter =
-		ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
-	cntrs->link_error_recovery_counter =
-		ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
-	/*
-	 * The link downed counter counts when the other side downs the
-	 * connection.  We add in the number of times we downed the link
-	 * due to local link integrity errors to compensate.
-	 */
-	cntrs->link_downed_counter =
-		ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
-	cntrs->port_rcv_errors =
-		ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
-		ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
-		ipath_snap_cntr(dd, crp->cr_portovflcnt) +
-		ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
-		ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
-		ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
-		ipath_snap_cntr(dd, crp->cr_erricrccnt) +
-		ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
-		ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
-		ipath_snap_cntr(dd, crp->cr_badformatcnt) +
-		dd->ipath_rxfc_unsupvl_errs;
-	if (crp->cr_rxotherlocalphyerrcnt)
-		cntrs->port_rcv_errors +=
-			ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
-	if (crp->cr_rxvlerrcnt)
-		cntrs->port_rcv_errors +=
-			ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
-	cntrs->port_rcv_remphys_errors =
-		ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
-	cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
-	cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
-	cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
-	cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
-	cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
-	cntrs->local_link_integrity_errors =
-		crp->cr_locallinkintegrityerrcnt ?
-		ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
-		((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
-		 dd->ipath_lli_errs : dd->ipath_lli_errors);
-	cntrs->excessive_buffer_overrun_errors =
-		crp->cr_excessbufferovflcnt ?
-		ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
-		dd->ipath_overrun_thresh_errs;
-	cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
-		ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_ib_piobufavail - callback when a PIO buffer is available
- * @arg: the device pointer
- *
- * This is called from ipath_intr() at interrupt level when a PIO buffer is
- * available after ipath_verbs_send() returned an error that no buffers were
- * available.  Return 1 if we consumed all the PIO buffers and we still have
- * QPs waiting for buffers (for now, just restart the send tasklet and
- * return zero).
- */
-int ipath_ib_piobufavail(struct ipath_ibdev *dev)
-{
-	struct list_head *list;
-	struct ipath_qp *qplist;
-	struct ipath_qp *qp;
-	unsigned long flags;
-
-	if (dev == NULL)
-		goto bail;
-
-	list = &dev->piowait;
-	qplist = NULL;
-
-	spin_lock_irqsave(&dev->pending_lock, flags);
-	while (!list_empty(list)) {
-		qp = list_entry(list->next, struct ipath_qp, piowait);
-		list_del_init(&qp->piowait);
-		qp->pio_next = qplist;
-		qplist = qp;
-		atomic_inc(&qp->refcount);
-	}
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-	while (qplist != NULL) {
-		qp = qplist;
-		qplist = qp->pio_next;
-
-		spin_lock_irqsave(&qp->s_lock, flags);
-		if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
-			ipath_schedule_send(qp);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-
-		/* Notify ipath_destroy_qp() if it is waiting. */
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
-	}
-
-bail:
-	return 0;
-}
-
-static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-			      struct ib_udata *uhw)
-{
-	struct ipath_ibdev *dev = to_idev(ibdev);
-
-	if (uhw->inlen || uhw->outlen)
-		return -EINVAL;
-
-	memset(props, 0, sizeof(*props));
-
-	props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
-		IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
-		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
-		IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
-	props->page_size_cap = PAGE_SIZE;
-	props->vendor_id =
-		IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
-	props->vendor_part_id = dev->dd->ipath_deviceid;
-	props->hw_ver = dev->dd->ipath_pcirev;
-
-	props->sys_image_guid = dev->sys_image_guid;
-
-	props->max_mr_size = ~0ull;
-	props->max_qp = ib_ipath_max_qps;
-	props->max_qp_wr = ib_ipath_max_qp_wrs;
-	props->max_sge = ib_ipath_max_sges;
-	props->max_cq = ib_ipath_max_cqs;
-	props->max_ah = ib_ipath_max_ahs;
-	props->max_cqe = ib_ipath_max_cqes;
-	props->max_mr = dev->lk_table.max;
-	props->max_fmr = dev->lk_table.max;
-	props->max_map_per_fmr = 32767;
-	props->max_pd = ib_ipath_max_pds;
-	props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
-	props->max_qp_init_rd_atom = 255;
-	/* props->max_res_rd_atom */
-	props->max_srq = ib_ipath_max_srqs;
-	props->max_srq_wr = ib_ipath_max_srq_wrs;
-	props->max_srq_sge = ib_ipath_max_srq_sges;
-	/* props->local_ca_ack_delay */
-	props->atomic_cap = IB_ATOMIC_GLOB;
-	props->max_pkeys = ipath_get_npkeys(dev->dd);
-	props->max_mcast_grp = ib_ipath_max_mcast_grps;
-	props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
-	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
-		props->max_mcast_grp;
-
-	return 0;
-}
-
-const u8 ipath_cvt_physportstate[32] = {
-	[INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
-	[INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
-	[INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
-	[INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
-	[INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
-	[INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
-	[INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
-		IB_PHYSPORTSTATE_CFG_TRAIN,
-	[INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
-		IB_PHYSPORTSTATE_CFG_TRAIN,
-	[INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
-		IB_PHYSPORTSTATE_CFG_TRAIN,
-	[INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
-	[INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
-		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-	[INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
-		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-	[INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
-		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-	[0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
-	[0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
-	[0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
-	[0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
-	[0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
-	[0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
-	[0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
-	[0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
-};
-
-u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
-{
-	return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
-}
-
-static int ipath_query_port(struct ib_device *ibdev,
-			    u8 port, struct ib_port_attr *props)
-{
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_devdata *dd = dev->dd;
-	enum ib_mtu mtu;
-	u16 lid = dd->ipath_lid;
-	u64 ibcstat;
-
-	memset(props, 0, sizeof(*props));
-	props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
-	props->lmc = dd->ipath_lmc;
-	props->sm_lid = dev->sm_lid;
-	props->sm_sl = dev->sm_sl;
-	ibcstat = dd->ipath_lastibcstat;
-	/* map LinkState to IB portinfo values.  */
-	props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
-
-	/* See phys_state_show() */
-	props->phys_state = /* MEA: assumes shift == 0 */
-		ipath_cvt_physportstate[dd->ipath_lastibcstat &
-		dd->ibcs_lts_mask];
-	props->port_cap_flags = dev->port_cap_flags;
-	props->gid_tbl_len = 1;
-	props->max_msg_sz = 0x80000000;
-	props->pkey_tbl_len = ipath_get_npkeys(dd);
-	props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
-		dev->z_pkey_violations;
-	props->qkey_viol_cntr = dev->qkey_violations;
-	props->active_width = dd->ipath_link_width_active;
-	/* See rate_show() */
-	props->active_speed = dd->ipath_link_speed_active;
-	props->max_vl_num = 1;		/* VLCap = VL0 */
-	props->init_type_reply = 0;
-
-	props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
-	switch (dd->ipath_ibmtu) {
-	case 4096:
-		mtu = IB_MTU_4096;
-		break;
-	case 2048:
-		mtu = IB_MTU_2048;
-		break;
-	case 1024:
-		mtu = IB_MTU_1024;
-		break;
-	case 512:
-		mtu = IB_MTU_512;
-		break;
-	case 256:
-		mtu = IB_MTU_256;
-		break;
-	default:
-		mtu = IB_MTU_2048;
-	}
-	props->active_mtu = mtu;
-	props->subnet_timeout = dev->subnet_timeout;
-
-	return 0;
-}
-
-static int ipath_modify_device(struct ib_device *device,
-			       int device_modify_mask,
-			       struct ib_device_modify *device_modify)
-{
-	int ret;
-
-	if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
-				   IB_DEVICE_MODIFY_NODE_DESC)) {
-		ret = -EOPNOTSUPP;
-		goto bail;
-	}
-
-	if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
-		memcpy(device->node_desc, device_modify->node_desc, 64);
-
-	if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
-		to_idev(device)->sys_image_guid =
-			cpu_to_be64(device_modify->sys_image_guid);
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-static int ipath_modify_port(struct ib_device *ibdev,
-			     u8 port, int port_modify_mask,
-			     struct ib_port_modify *props)
-{
-	struct ipath_ibdev *dev = to_idev(ibdev);
-
-	dev->port_cap_flags |= props->set_port_cap_mask;
-	dev->port_cap_flags &= ~props->clr_port_cap_mask;
-	if (port_modify_mask & IB_PORT_SHUTDOWN)
-		ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
-	if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
-		dev->qkey_violations = 0;
-	return 0;
-}
-
-static int ipath_query_gid(struct ib_device *ibdev, u8 port,
-			   int index, union ib_gid *gid)
-{
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	int ret;
-
-	if (index >= 1) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	gid->global.subnet_prefix = dev->gid_prefix;
-	gid->global.interface_id = dev->dd->ipath_guid;
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
-				    struct ib_ucontext *context,
-				    struct ib_udata *udata)
-{
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	struct ipath_pd *pd;
-	struct ib_pd *ret;
-
-	/*
-	 * This is actually totally arbitrary.	Some correctness tests
-	 * assume there's a maximum number of PDs that can be allocated.
-	 * We don't actually have this limit, but we fail the test if
-	 * we allow allocations of more than we report for this value.
-	 */
-
-	pd = kmalloc(sizeof *pd, GFP_KERNEL);
-	if (!pd) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
-
-	spin_lock(&dev->n_pds_lock);
-	if (dev->n_pds_allocated == ib_ipath_max_pds) {
-		spin_unlock(&dev->n_pds_lock);
-		kfree(pd);
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
-
-	dev->n_pds_allocated++;
-	spin_unlock(&dev->n_pds_lock);
-
-	/* ib_alloc_pd() will initialize pd->ibpd. */
-	pd->user = udata != NULL;
-
-	ret = &pd->ibpd;
-
-bail:
-	return ret;
-}
-
-static int ipath_dealloc_pd(struct ib_pd *ibpd)
-{
-	struct ipath_pd *pd = to_ipd(ibpd);
-	struct ipath_ibdev *dev = to_idev(ibpd->device);
-
-	spin_lock(&dev->n_pds_lock);
-	dev->n_pds_allocated--;
-	spin_unlock(&dev->n_pds_lock);
-
-	kfree(pd);
-
-	return 0;
-}
-
-/**
- * ipath_create_ah - create an address handle
- * @pd: the protection domain
- * @ah_attr: the attributes of the AH
- *
- * This may be called from interrupt context.
- */
-static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
-				     struct ib_ah_attr *ah_attr)
-{
-	struct ipath_ah *ah;
-	struct ib_ah *ret;
-	struct ipath_ibdev *dev = to_idev(pd->device);
-	unsigned long flags;
-
-	/* A multicast address requires a GRH (see ch. 8.4.1). */
-	if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
-	    ah_attr->dlid != IPATH_PERMISSIVE_LID &&
-	    !(ah_attr->ah_flags & IB_AH_GRH)) {
-		ret = ERR_PTR(-EINVAL);
-		goto bail;
-	}
-
-	if (ah_attr->dlid == 0) {
-		ret = ERR_PTR(-EINVAL);
-		goto bail;
-	}
-
-	if (ah_attr->port_num < 1 ||
-	    ah_attr->port_num > pd->device->phys_port_cnt) {
-		ret = ERR_PTR(-EINVAL);
-		goto bail;
-	}
-
-	ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-	if (!ah) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
-
-	spin_lock_irqsave(&dev->n_ahs_lock, flags);
-	if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
-		spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-		kfree(ah);
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
-
-	dev->n_ahs_allocated++;
-	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-
-	/* ib_create_ah() will initialize ah->ibah. */
-	ah->attr = *ah_attr;
-	ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
-
-	ret = &ah->ibah;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_destroy_ah - destroy an address handle
- * @ibah: the AH to destroy
- *
- * This may be called from interrupt context.
- */
-static int ipath_destroy_ah(struct ib_ah *ibah)
-{
-	struct ipath_ibdev *dev = to_idev(ibah->device);
-	struct ipath_ah *ah = to_iah(ibah);
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev->n_ahs_lock, flags);
-	dev->n_ahs_allocated--;
-	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-
-	kfree(ah);
-
-	return 0;
-}
-
-static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
-{
-	struct ipath_ah *ah = to_iah(ibah);
-
-	*ah_attr = ah->attr;
-	ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
-
-	return 0;
-}
-
-/**
- * ipath_get_npkeys - return the size of the PKEY table for port 0
- * @dd: the infinipath device
- */
-unsigned ipath_get_npkeys(struct ipath_devdata *dd)
-{
-	return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
-}
-
-/**
- * ipath_get_pkey - return the indexed PKEY from the port PKEY table
- * @dd: the infinipath device
- * @index: the PKEY index
- */
-unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
-{
-	unsigned ret;
-
-	/* always a kernel port, no locking needed */
-	if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
-		ret = 0;
-	else
-		ret = dd->ipath_pd[0]->port_pkeys[index];
-
-	return ret;
-}
-
-static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-			    u16 *pkey)
-{
-	struct ipath_ibdev *dev = to_idev(ibdev);
-	int ret;
-
-	if (index >= ipath_get_npkeys(dev->dd)) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	*pkey = ipath_get_pkey(dev->dd, index);
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
- * ipath_alloc_ucontext - allocate a ucontest
- * @ibdev: the infiniband device
- * @udata: not used by the InfiniPath driver
- */
-
-static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
-						struct ib_udata *udata)
-{
-	struct ipath_ucontext *context;
-	struct ib_ucontext *ret;
-
-	context = kmalloc(sizeof *context, GFP_KERNEL);
-	if (!context) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
-
-	ret = &context->ibucontext;
-
-bail:
-	return ret;
-}
-
-static int ipath_dealloc_ucontext(struct ib_ucontext *context)
-{
-	kfree(to_iucontext(context));
-	return 0;
-}
-
-static int ipath_verbs_register_sysfs(struct ib_device *dev);
-
-static void __verbs_timer(unsigned long arg)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *) arg;
-
-	/* Handle verbs layer timeouts. */
-	ipath_ib_timer(dd->verbs_dev);
-
-	mod_timer(&dd->verbs_timer, jiffies + 1);
-}
-
-static int enable_timer(struct ipath_devdata *dd)
-{
-	/*
-	 * Early chips had a design flaw where the chip and kernel idea
-	 * of the tail register don't always agree, and therefore we won't
-	 * get an interrupt on the next packet received.
-	 * If the board supports per packet receive interrupts, use it.
-	 * Otherwise, the timer function periodically checks for packets
-	 * to cover this case.
-	 * Either way, the timer is needed for verbs layer related
-	 * processing.
-	 */
-	if (dd->ipath_flags & IPATH_GPIO_INTR) {
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
-				 0x2074076542310ULL);
-		/* Enable GPIO bit 2 interrupt */
-		dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-				 dd->ipath_gpio_mask);
-	}
-
-	init_timer(&dd->verbs_timer);
-	dd->verbs_timer.function = __verbs_timer;
-	dd->verbs_timer.data = (unsigned long)dd;
-	dd->verbs_timer.expires = jiffies + 1;
-	add_timer(&dd->verbs_timer);
-
-	return 0;
-}
-
-static int disable_timer(struct ipath_devdata *dd)
-{
-	/* Disable GPIO bit 2 interrupt */
-	if (dd->ipath_flags & IPATH_GPIO_INTR) {
-                /* Disable GPIO bit 2 interrupt */
-		dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-				 dd->ipath_gpio_mask);
-		/*
-		 * We might want to undo changes to debugportselect,
-		 * but how?
-		 */
-	}
-
-	del_timer_sync(&dd->verbs_timer);
-
-	return 0;
-}
-
-static int ipath_port_immutable(struct ib_device *ibdev, u8 port_num,
-			        struct ib_port_immutable *immutable)
-{
-	struct ib_port_attr attr;
-	int err;
-
-	err = ipath_query_port(ibdev, port_num, &attr);
-	if (err)
-		return err;
-
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
-	immutable->gid_tbl_len = attr.gid_tbl_len;
-	immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
-	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
-
-	return 0;
-}
-
-/**
- * ipath_register_ib_device - register our device with the infiniband core
- * @dd: the device data structure
- * Return the allocated ipath_ibdev pointer or NULL on error.
- */
-int ipath_register_ib_device(struct ipath_devdata *dd)
-{
-	struct ipath_verbs_counters cntrs;
-	struct ipath_ibdev *idev;
-	struct ib_device *dev;
-	struct ipath_verbs_txreq *tx;
-	unsigned i;
-	int ret;
-
-	idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
-	if (idev == NULL) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	dev = &idev->ibdev;
-
-	if (dd->ipath_sdma_descq_cnt) {
-		tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
-			     GFP_KERNEL);
-		if (tx == NULL) {
-			ret = -ENOMEM;
-			goto err_tx;
-		}
-	} else
-		tx = NULL;
-	idev->txreq_bufs = tx;
-
-	/* Only need to initialize non-zero fields. */
-	spin_lock_init(&idev->n_pds_lock);
-	spin_lock_init(&idev->n_ahs_lock);
-	spin_lock_init(&idev->n_cqs_lock);
-	spin_lock_init(&idev->n_qps_lock);
-	spin_lock_init(&idev->n_srqs_lock);
-	spin_lock_init(&idev->n_mcast_grps_lock);
-
-	spin_lock_init(&idev->qp_table.lock);
-	spin_lock_init(&idev->lk_table.lock);
-	idev->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE);
-	/* Set the prefix to the default value (see ch. 4.1.1) */
-	idev->gid_prefix = cpu_to_be64(0xfe80000000000000ULL);
-
-	ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
-	if (ret)
-		goto err_qp;
-
-	/*
-	 * The top ib_ipath_lkey_table_size bits are used to index the
-	 * table.  The lower 8 bits can be owned by the user (copied from
-	 * the LKEY).  The remaining bits act as a generation number or tag.
-	 */
-	idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
-	idev->lk_table.table = kzalloc(idev->lk_table.max *
-				       sizeof(*idev->lk_table.table),
-				       GFP_KERNEL);
-	if (idev->lk_table.table == NULL) {
-		ret = -ENOMEM;
-		goto err_lk;
-	}
-	INIT_LIST_HEAD(&idev->pending_mmaps);
-	spin_lock_init(&idev->pending_lock);
-	idev->mmap_offset = PAGE_SIZE;
-	spin_lock_init(&idev->mmap_offset_lock);
-	INIT_LIST_HEAD(&idev->pending[0]);
-	INIT_LIST_HEAD(&idev->pending[1]);
-	INIT_LIST_HEAD(&idev->pending[2]);
-	INIT_LIST_HEAD(&idev->piowait);
-	INIT_LIST_HEAD(&idev->rnrwait);
-	INIT_LIST_HEAD(&idev->txreq_free);
-	idev->pending_index = 0;
-	idev->port_cap_flags =
-		IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
-	if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
-		idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
-	idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
-	idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
-	idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
-	idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
-	idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
-
-	/* Snapshot current HW counters to "clear" them. */
-	ipath_get_counters(dd, &cntrs);
-	idev->z_symbol_error_counter = cntrs.symbol_error_counter;
-	idev->z_link_error_recovery_counter =
-		cntrs.link_error_recovery_counter;
-	idev->z_link_downed_counter = cntrs.link_downed_counter;
-	idev->z_port_rcv_errors = cntrs.port_rcv_errors;
-	idev->z_port_rcv_remphys_errors =
-		cntrs.port_rcv_remphys_errors;
-	idev->z_port_xmit_discards = cntrs.port_xmit_discards;
-	idev->z_port_xmit_data = cntrs.port_xmit_data;
-	idev->z_port_rcv_data = cntrs.port_rcv_data;
-	idev->z_port_xmit_packets = cntrs.port_xmit_packets;
-	idev->z_port_rcv_packets = cntrs.port_rcv_packets;
-	idev->z_local_link_integrity_errors =
-		cntrs.local_link_integrity_errors;
-	idev->z_excessive_buffer_overrun_errors =
-		cntrs.excessive_buffer_overrun_errors;
-	idev->z_vl15_dropped = cntrs.vl15_dropped;
-
-	for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
-		list_add(&tx->txreq.list, &idev->txreq_free);
-
-	/*
-	 * The system image GUID is supposed to be the same for all
-	 * IB HCAs in a single system but since there can be other
-	 * device types in the system, we can't be sure this is unique.
-	 */
-	if (!sys_image_guid)
-		sys_image_guid = dd->ipath_guid;
-	idev->sys_image_guid = sys_image_guid;
-	idev->ib_unit = dd->ipath_unit;
-	idev->dd = dd;
-
-	strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
-	dev->owner = THIS_MODULE;
-	dev->node_guid = dd->ipath_guid;
-	dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
-	dev->uverbs_cmd_mask =
-		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
-		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
-		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
-		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
-		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
-		(1ull << IB_USER_VERBS_CMD_CREATE_AH)		|
-		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)		|
-		(1ull << IB_USER_VERBS_CMD_QUERY_AH)		|
-		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
-		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
-		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
-		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
-		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
-		(1ull << IB_USER_VERBS_CMD_POLL_CQ)		|
-		(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)	|
-		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
-		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
-		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
-		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
-		(1ull << IB_USER_VERBS_CMD_POST_SEND)		|
-		(1ull << IB_USER_VERBS_CMD_POST_RECV)		|
-		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
-		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
-		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
-		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
-		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
-		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
-		(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
-	dev->node_type = RDMA_NODE_IB_CA;
-	dev->phys_port_cnt = 1;
-	dev->num_comp_vectors = 1;
-	dev->dma_device = &dd->pcidev->dev;
-	dev->query_device = ipath_query_device;
-	dev->modify_device = ipath_modify_device;
-	dev->query_port = ipath_query_port;
-	dev->modify_port = ipath_modify_port;
-	dev->query_pkey = ipath_query_pkey;
-	dev->query_gid = ipath_query_gid;
-	dev->alloc_ucontext = ipath_alloc_ucontext;
-	dev->dealloc_ucontext = ipath_dealloc_ucontext;
-	dev->alloc_pd = ipath_alloc_pd;
-	dev->dealloc_pd = ipath_dealloc_pd;
-	dev->create_ah = ipath_create_ah;
-	dev->destroy_ah = ipath_destroy_ah;
-	dev->query_ah = ipath_query_ah;
-	dev->create_srq = ipath_create_srq;
-	dev->modify_srq = ipath_modify_srq;
-	dev->query_srq = ipath_query_srq;
-	dev->destroy_srq = ipath_destroy_srq;
-	dev->create_qp = ipath_create_qp;
-	dev->modify_qp = ipath_modify_qp;
-	dev->query_qp = ipath_query_qp;
-	dev->destroy_qp = ipath_destroy_qp;
-	dev->post_send = ipath_post_send;
-	dev->post_recv = ipath_post_receive;
-	dev->post_srq_recv = ipath_post_srq_receive;
-	dev->create_cq = ipath_create_cq;
-	dev->destroy_cq = ipath_destroy_cq;
-	dev->resize_cq = ipath_resize_cq;
-	dev->poll_cq = ipath_poll_cq;
-	dev->req_notify_cq = ipath_req_notify_cq;
-	dev->get_dma_mr = ipath_get_dma_mr;
-	dev->reg_phys_mr = ipath_reg_phys_mr;
-	dev->reg_user_mr = ipath_reg_user_mr;
-	dev->dereg_mr = ipath_dereg_mr;
-	dev->alloc_fmr = ipath_alloc_fmr;
-	dev->map_phys_fmr = ipath_map_phys_fmr;
-	dev->unmap_fmr = ipath_unmap_fmr;
-	dev->dealloc_fmr = ipath_dealloc_fmr;
-	dev->attach_mcast = ipath_multicast_attach;
-	dev->detach_mcast = ipath_multicast_detach;
-	dev->process_mad = ipath_process_mad;
-	dev->mmap = ipath_mmap;
-	dev->dma_ops = &ipath_dma_mapping_ops;
-	dev->get_port_immutable = ipath_port_immutable;
-
-	snprintf(dev->node_desc, sizeof(dev->node_desc),
-		 IPATH_IDSTR " %s", init_utsname()->nodename);
-
-	ret = ib_register_device(dev, NULL);
-	if (ret)
-		goto err_reg;
-
-	ret = ipath_verbs_register_sysfs(dev);
-	if (ret)
-		goto err_class;
-
-	enable_timer(dd);
-
-	goto bail;
-
-err_class:
-	ib_unregister_device(dev);
-err_reg:
-	kfree(idev->lk_table.table);
-err_lk:
-	kfree(idev->qp_table.table);
-err_qp:
-	kfree(idev->txreq_bufs);
-err_tx:
-	ib_dealloc_device(dev);
-	ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
-	idev = NULL;
-
-bail:
-	dd->verbs_dev = idev;
-	return ret;
-}
-
-void ipath_unregister_ib_device(struct ipath_ibdev *dev)
-{
-	struct ib_device *ibdev = &dev->ibdev;
-	u32 qps_inuse;
-
-	ib_unregister_device(ibdev);
-
-	disable_timer(dev->dd);
-
-	if (!list_empty(&dev->pending[0]) ||
-	    !list_empty(&dev->pending[1]) ||
-	    !list_empty(&dev->pending[2]))
-		ipath_dev_err(dev->dd, "pending list not empty!\n");
-	if (!list_empty(&dev->piowait))
-		ipath_dev_err(dev->dd, "piowait list not empty!\n");
-	if (!list_empty(&dev->rnrwait))
-		ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
-	if (!ipath_mcast_tree_empty())
-		ipath_dev_err(dev->dd, "multicast table memory leak!\n");
-	/*
-	 * Note that ipath_unregister_ib_device() can be called before all
-	 * the QPs are destroyed!
-	 */
-	qps_inuse = ipath_free_all_qps(&dev->qp_table);
-	if (qps_inuse)
-		ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
-			qps_inuse);
-	kfree(dev->qp_table.table);
-	kfree(dev->lk_table.table);
-	kfree(dev->txreq_bufs);
-	ib_dealloc_device(ibdev);
-}
-
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
-{
-	struct ipath_ibdev *dev =
-		container_of(device, struct ipath_ibdev, ibdev.dev);
-
-	return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
-}
-
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
-{
-	struct ipath_ibdev *dev =
-		container_of(device, struct ipath_ibdev, ibdev.dev);
-	int ret;
-
-	ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
-	if (ret < 0)
-		goto bail;
-	strcat(buf, "\n");
-	ret = strlen(buf);
-
-bail:
-	return ret;
-}
-
-static ssize_t show_stats(struct device *device, struct device_attribute *attr,
-			  char *buf)
-{
-	struct ipath_ibdev *dev =
-		container_of(device, struct ipath_ibdev, ibdev.dev);
-	int i;
-	int len;
-
-	len = sprintf(buf,
-		      "RC resends  %d\n"
-		      "RC no QACK  %d\n"
-		      "RC ACKs     %d\n"
-		      "RC SEQ NAKs %d\n"
-		      "RC RDMA seq %d\n"
-		      "RC RNR NAKs %d\n"
-		      "RC OTH NAKs %d\n"
-		      "RC timeouts %d\n"
-		      "RC RDMA dup %d\n"
-		      "piobuf wait %d\n"
-		      "unaligned   %d\n"
-		      "PKT drops   %d\n"
-		      "WQE errs    %d\n",
-		      dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
-		      dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
-		      dev->n_other_naks, dev->n_timeouts,
-		      dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
-		      dev->n_pkt_drops, dev->n_wqe_errs);
-	for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
-		const struct ipath_opcode_stats *si = &dev->opstats[i];
-
-		if (!si->n_packets && !si->n_bytes)
-			continue;
-		len += sprintf(buf + len, "%02x %llu/%llu\n", i,
-			       (unsigned long long) si->n_packets,
-			       (unsigned long long) si->n_bytes);
-	}
-	return len;
-}
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
-
-static struct device_attribute *ipath_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id,
-	&dev_attr_stats
-};
-
-static int ipath_verbs_register_sysfs(struct ib_device *dev)
-{
-	int i;
-	int ret;
-
-	for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
-		ret = device_create_file(&dev->dev,
-				       ipath_class_attributes[i]);
-		if (ret)
-			goto bail;
-	}
-	return 0;
-bail:
-	for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
-		device_remove_file(&dev->dev, ipath_class_attributes[i]);
-	return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
deleted file mode 100644
index ec167e5..0000000
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ /dev/null
@@ -1,939 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IPATH_VERBS_H
-#define IPATH_VERBS_H
-
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/kref.h>
-#include <rdma/ib_pack.h>
-#include <rdma/ib_user_verbs.h>
-
-#include "ipath_kernel.h"
-
-#define IPATH_MAX_RDMA_ATOMIC	4
-
-#define QPN_MAX                 (1 << 24)
-#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
-
-/*
- * Increment this value if any changes that break userspace ABI
- * compatibility are made.
- */
-#define IPATH_UVERBS_ABI_VERSION       2
-
-/*
- * Define an ib_cq_notify value that is not valid so we know when CQ
- * notifications are armed.
- */
-#define IB_CQ_NONE	(IB_CQ_NEXT_COMP + 1)
-
-/* AETH NAK opcode values */
-#define IB_RNR_NAK			0x20
-#define IB_NAK_PSN_ERROR		0x60
-#define IB_NAK_INVALID_REQUEST		0x61
-#define IB_NAK_REMOTE_ACCESS_ERROR	0x62
-#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
-#define IB_NAK_INVALID_RD_REQUEST	0x64
-
-/* Flags for checking QP state (see ib_ipath_state_ops[]) */
-#define IPATH_POST_SEND_OK		0x01
-#define IPATH_POST_RECV_OK		0x02
-#define IPATH_PROCESS_RECV_OK		0x04
-#define IPATH_PROCESS_SEND_OK		0x08
-#define IPATH_PROCESS_NEXT_SEND_OK	0x10
-#define IPATH_FLUSH_SEND		0x20
-#define IPATH_FLUSH_RECV		0x40
-#define IPATH_PROCESS_OR_FLUSH_SEND \
-	(IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
-
-/* IB Performance Manager status values */
-#define IB_PMA_SAMPLE_STATUS_DONE	0x00
-#define IB_PMA_SAMPLE_STATUS_STARTED	0x01
-#define IB_PMA_SAMPLE_STATUS_RUNNING	0x02
-
-/* Mandatory IB performance counter select values. */
-#define IB_PMA_PORT_XMIT_DATA	cpu_to_be16(0x0001)
-#define IB_PMA_PORT_RCV_DATA	cpu_to_be16(0x0002)
-#define IB_PMA_PORT_XMIT_PKTS	cpu_to_be16(0x0003)
-#define IB_PMA_PORT_RCV_PKTS	cpu_to_be16(0x0004)
-#define IB_PMA_PORT_XMIT_WAIT	cpu_to_be16(0x0005)
-
-struct ib_reth {
-	__be64 vaddr;
-	__be32 rkey;
-	__be32 length;
-} __attribute__ ((packed));
-
-struct ib_atomic_eth {
-	__be32 vaddr[2];	/* unaligned so access as 2 32-bit words */
-	__be32 rkey;
-	__be64 swap_data;
-	__be64 compare_data;
-} __attribute__ ((packed));
-
-struct ipath_other_headers {
-	__be32 bth[3];
-	union {
-		struct {
-			__be32 deth[2];
-			__be32 imm_data;
-		} ud;
-		struct {
-			struct ib_reth reth;
-			__be32 imm_data;
-		} rc;
-		struct {
-			__be32 aeth;
-			__be32 atomic_ack_eth[2];
-		} at;
-		__be32 imm_data;
-		__be32 aeth;
-		struct ib_atomic_eth atomic_eth;
-	} u;
-} __attribute__ ((packed));
-
-/*
- * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
- * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
- * will be in the eager header buffer.  The remaining 12 or 16 bytes
- * are in the data buffer.
- */
-struct ipath_ib_header {
-	__be16 lrh[4];
-	union {
-		struct {
-			struct ib_grh grh;
-			struct ipath_other_headers oth;
-		} l;
-		struct ipath_other_headers oth;
-	} u;
-} __attribute__ ((packed));
-
-struct ipath_pio_header {
-	__le32 pbc[2];
-	struct ipath_ib_header hdr;
-} __attribute__ ((packed));
-
-/*
- * There is one struct ipath_mcast for each multicast GID.
- * All attached QPs are then stored as a list of
- * struct ipath_mcast_qp.
- */
-struct ipath_mcast_qp {
-	struct list_head list;
-	struct ipath_qp *qp;
-};
-
-struct ipath_mcast {
-	struct rb_node rb_node;
-	union ib_gid mgid;
-	struct list_head qp_list;
-	wait_queue_head_t wait;
-	atomic_t refcount;
-	int n_attached;
-};
-
-/* Protection domain */
-struct ipath_pd {
-	struct ib_pd ibpd;
-	int user;		/* non-zero if created from user space */
-};
-
-/* Address Handle */
-struct ipath_ah {
-	struct ib_ah ibah;
-	struct ib_ah_attr attr;
-};
-
-/*
- * This structure is used by ipath_mmap() to validate an offset
- * when an mmap() request is made.  The vm_area_struct then uses
- * this as its vm_private_data.
- */
-struct ipath_mmap_info {
-	struct list_head pending_mmaps;
-	struct ib_ucontext *context;
-	void *obj;
-	__u64 offset;
-	struct kref ref;
-	unsigned size;
-};
-
-/*
- * This structure is used to contain the head pointer, tail pointer,
- * and completion queue entries as a single memory allocation so
- * it can be mmap'ed into user space.
- */
-struct ipath_cq_wc {
-	u32 head;		/* index of next entry to fill */
-	u32 tail;		/* index of next ib_poll_cq() entry */
-	union {
-		/* these are actually size ibcq.cqe + 1 */
-		struct ib_uverbs_wc uqueue[0];
-		struct ib_wc kqueue[0];
-	};
-};
-
-/*
- * The completion queue structure.
- */
-struct ipath_cq {
-	struct ib_cq ibcq;
-	struct tasklet_struct comptask;
-	spinlock_t lock;
-	u8 notify;
-	u8 triggered;
-	struct ipath_cq_wc *queue;
-	struct ipath_mmap_info *ip;
-};
-
-/*
- * A segment is a linear region of low physical memory.
- * XXX Maybe we should use phys addr here and kmap()/kunmap().
- * Used by the verbs layer.
- */
-struct ipath_seg {
-	void *vaddr;
-	size_t length;
-};
-
-/* The number of ipath_segs that fit in a page. */
-#define IPATH_SEGSZ     (PAGE_SIZE / sizeof (struct ipath_seg))
-
-struct ipath_segarray {
-	struct ipath_seg segs[IPATH_SEGSZ];
-};
-
-struct ipath_mregion {
-	struct ib_pd *pd;	/* shares refcnt of ibmr.pd */
-	u64 user_base;		/* User's address for this region */
-	u64 iova;		/* IB start address of this region */
-	size_t length;
-	u32 lkey;
-	u32 offset;		/* offset (bytes) to start of region */
-	int access_flags;
-	u32 max_segs;		/* number of ipath_segs in all the arrays */
-	u32 mapsz;		/* size of the map array */
-	struct ipath_segarray *map[0];	/* the segments */
-};
-
-/*
- * These keep track of the copy progress within a memory region.
- * Used by the verbs layer.
- */
-struct ipath_sge {
-	struct ipath_mregion *mr;
-	void *vaddr;		/* kernel virtual address of segment */
-	u32 sge_length;		/* length of the SGE */
-	u32 length;		/* remaining length of the segment */
-	u16 m;			/* current index: mr->map[m] */
-	u16 n;			/* current index: mr->map[m]->segs[n] */
-};
-
-/* Memory region */
-struct ipath_mr {
-	struct ib_mr ibmr;
-	struct ib_umem *umem;
-	struct ipath_mregion mr;	/* must be last */
-};
-
-/*
- * Send work request queue entry.
- * The size of the sg_list is determined when the QP is created and stored
- * in qp->s_max_sge.
- */
-struct ipath_swqe {
-	struct ib_send_wr wr;	/* don't use wr.sg_list */
-	u32 psn;		/* first packet sequence number */
-	u32 lpsn;		/* last packet sequence number */
-	u32 ssn;		/* send sequence number */
-	u32 length;		/* total length of data in sg_list */
-	struct ipath_sge sg_list[0];
-};
-
-/*
- * Receive work request queue entry.
- * The size of the sg_list is determined when the QP (or SRQ) is created
- * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
- */
-struct ipath_rwqe {
-	u64 wr_id;
-	u8 num_sge;
-	struct ib_sge sg_list[0];
-};
-
-/*
- * This structure is used to contain the head pointer, tail pointer,
- * and receive work queue entries as a single memory allocation so
- * it can be mmap'ed into user space.
- * Note that the wq array elements are variable size so you can't
- * just index into the array to get the N'th element;
- * use get_rwqe_ptr() instead.
- */
-struct ipath_rwq {
-	u32 head;		/* new work requests posted to the head */
-	u32 tail;		/* receives pull requests from here. */
-	struct ipath_rwqe wq[0];
-};
-
-struct ipath_rq {
-	struct ipath_rwq *wq;
-	spinlock_t lock;
-	u32 size;		/* size of RWQE array */
-	u8 max_sge;
-};
-
-struct ipath_srq {
-	struct ib_srq ibsrq;
-	struct ipath_rq rq;
-	struct ipath_mmap_info *ip;
-	/* send signal when number of RWQEs < limit */
-	u32 limit;
-};
-
-struct ipath_sge_state {
-	struct ipath_sge *sg_list;      /* next SGE to be used if any */
-	struct ipath_sge sge;   /* progress state for the current SGE */
-	u8 num_sge;
-	u8 static_rate;
-};
-
-/*
- * This structure holds the information that the send tasklet needs
- * to send a RDMA read response or atomic operation.
- */
-struct ipath_ack_entry {
-	u8 opcode;
-	u8 sent;
-	u32 psn;
-	union {
-		struct ipath_sge_state rdma_sge;
-		u64 atomic_data;
-	};
-};
-
-/*
- * Variables prefixed with s_ are for the requester (sender).
- * Variables prefixed with r_ are for the responder (receiver).
- * Variables prefixed with ack_ are for responder replies.
- *
- * Common variables are protected by both r_rq.lock and s_lock in that order
- * which only happens in modify_qp() or changing the QP 'state'.
- */
-struct ipath_qp {
-	struct ib_qp ibqp;
-	struct ipath_qp *next;		/* link list for QPN hash table */
-	struct ipath_qp *timer_next;	/* link list for ipath_ib_timer() */
-	struct ipath_qp *pio_next;	/* link for ipath_ib_piobufavail() */
-	struct list_head piowait;	/* link for wait PIO buf */
-	struct list_head timerwait;	/* link for waiting for timeouts */
-	struct ib_ah_attr remote_ah_attr;
-	struct ipath_ib_header s_hdr;	/* next packet header to send */
-	atomic_t refcount;
-	wait_queue_head_t wait;
-	wait_queue_head_t wait_dma;
-	struct tasklet_struct s_task;
-	struct ipath_mmap_info *ip;
-	struct ipath_sge_state *s_cur_sge;
-	struct ipath_verbs_txreq *s_tx;
-	struct ipath_sge_state s_sge;	/* current send request data */
-	struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
-	struct ipath_sge_state s_ack_rdma_sge;
-	struct ipath_sge_state s_rdma_read_sge;
-	struct ipath_sge_state r_sge;	/* current receive data */
-	spinlock_t s_lock;
-	atomic_t s_dma_busy;
-	u16 s_pkt_delay;
-	u16 s_hdrwords;		/* size of s_hdr in 32 bit words */
-	u32 s_cur_size;		/* size of send packet in bytes */
-	u32 s_len;		/* total length of s_sge */
-	u32 s_rdma_read_len;	/* total length of s_rdma_read_sge */
-	u32 s_next_psn;		/* PSN for next request */
-	u32 s_last_psn;		/* last response PSN processed */
-	u32 s_psn;		/* current packet sequence number */
-	u32 s_ack_rdma_psn;	/* PSN for sending RDMA read responses */
-	u32 s_ack_psn;		/* PSN for acking sends and RDMA writes */
-	u32 s_rnr_timeout;	/* number of milliseconds for RNR timeout */
-	u32 r_ack_psn;		/* PSN for next ACK or atomic ACK */
-	u64 r_wr_id;		/* ID for current receive WQE */
-	unsigned long r_aflags;
-	u32 r_len;		/* total length of r_sge */
-	u32 r_rcv_len;		/* receive data len processed */
-	u32 r_psn;		/* expected rcv packet sequence number */
-	u32 r_msn;		/* message sequence number */
-	u8 state;		/* QP state */
-	u8 s_state;		/* opcode of last packet sent */
-	u8 s_ack_state;		/* opcode of packet to ACK */
-	u8 s_nak_state;		/* non-zero if NAK is pending */
-	u8 r_state;		/* opcode of last packet received */
-	u8 r_nak_state;		/* non-zero if NAK is pending */
-	u8 r_min_rnr_timer;	/* retry timeout value for RNR NAKs */
-	u8 r_flags;
-	u8 r_max_rd_atomic;	/* max number of RDMA read/atomic to receive */
-	u8 r_head_ack_queue;	/* index into s_ack_queue[] */
-	u8 qp_access_flags;
-	u8 s_max_sge;		/* size of s_wq->sg_list */
-	u8 s_retry_cnt;		/* number of times to retry */
-	u8 s_rnr_retry_cnt;
-	u8 s_retry;		/* requester retry counter */
-	u8 s_rnr_retry;		/* requester RNR retry counter */
-	u8 s_pkey_index;	/* PKEY index to use */
-	u8 s_max_rd_atomic;	/* max number of RDMA read/atomic to send */
-	u8 s_num_rd_atomic;	/* number of RDMA read/atomic pending */
-	u8 s_tail_ack_queue;	/* index into s_ack_queue[] */
-	u8 s_flags;
-	u8 s_dmult;
-	u8 s_draining;
-	u8 timeout;		/* Timeout for this QP */
-	enum ib_mtu path_mtu;
-	u32 remote_qpn;
-	u32 qkey;		/* QKEY for this QP (for UD or RD) */
-	u32 s_size;		/* send work queue size */
-	u32 s_head;		/* new entries added here */
-	u32 s_tail;		/* next entry to process */
-	u32 s_cur;		/* current work queue entry */
-	u32 s_last;		/* last un-ACK'ed entry */
-	u32 s_ssn;		/* SSN of tail entry */
-	u32 s_lsn;		/* limit sequence number (credit) */
-	struct ipath_swqe *s_wq;	/* send work queue */
-	struct ipath_swqe *s_wqe;
-	struct ipath_sge *r_ud_sg_list;
-	struct ipath_rq r_rq;		/* receive work queue */
-	struct ipath_sge r_sg_list[0];	/* verified SGEs */
-};
-
-/*
- * Atomic bit definitions for r_aflags.
- */
-#define IPATH_R_WRID_VALID	0
-
-/*
- * Bit definitions for r_flags.
- */
-#define IPATH_R_REUSE_SGE	0x01
-#define IPATH_R_RDMAR_SEQ	0x02
-
-/*
- * Bit definitions for s_flags.
- *
- * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
- *			   before processing the next SWQE
- * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
- *			   before processing the next SWQE
- * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
- * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
- * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
- *		      next send completion entry not via send DMA.
- */
-#define IPATH_S_SIGNAL_REQ_WR	0x01
-#define IPATH_S_FENCE_PENDING	0x02
-#define IPATH_S_RDMAR_PENDING	0x04
-#define IPATH_S_ACK_PENDING	0x08
-#define IPATH_S_BUSY		0x10
-#define IPATH_S_WAITING		0x20
-#define IPATH_S_WAIT_SSN_CREDIT	0x40
-#define IPATH_S_WAIT_DMA	0x80
-
-#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
-	IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
-
-#define IPATH_PSN_CREDIT	512
-
-/*
- * Since struct ipath_swqe is not a fixed size, we can't simply index into
- * struct ipath_qp.s_wq.  This function does the array index computation.
- */
-static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
-					      unsigned n)
-{
-	return (struct ipath_swqe *)((char *)qp->s_wq +
-				     (sizeof(struct ipath_swqe) +
-				      qp->s_max_sge *
-				      sizeof(struct ipath_sge)) * n);
-}
-
-/*
- * Since struct ipath_rwqe is not a fixed size, we can't simply index into
- * struct ipath_rwq.wq.  This function does the array index computation.
- */
-static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
-					      unsigned n)
-{
-	return (struct ipath_rwqe *)
-		((char *) rq->wq->wq +
-		 (sizeof(struct ipath_rwqe) +
-		  rq->max_sge * sizeof(struct ib_sge)) * n);
-}
-
-/*
- * QPN-map pages start out as NULL, they get allocated upon
- * first use and are never deallocated. This way,
- * large bitmaps are not allocated unless large numbers of QPs are used.
- */
-struct qpn_map {
-	atomic_t n_free;
-	void *page;
-};
-
-struct ipath_qp_table {
-	spinlock_t lock;
-	u32 last;		/* last QP number allocated */
-	u32 max;		/* size of the hash table */
-	u32 nmaps;		/* size of the map table */
-	struct ipath_qp **table;
-	/* bit map of free numbers */
-	struct qpn_map map[QPNMAP_ENTRIES];
-};
-
-struct ipath_lkey_table {
-	spinlock_t lock;
-	u32 next;		/* next unused index (speeds search) */
-	u32 gen;		/* generation count */
-	u32 max;		/* size of the table */
-	struct ipath_mregion **table;
-};
-
-struct ipath_opcode_stats {
-	u64 n_packets;		/* number of packets */
-	u64 n_bytes;		/* total number of bytes */
-};
-
-struct ipath_ibdev {
-	struct ib_device ibdev;
-	struct ipath_devdata *dd;
-	struct list_head pending_mmaps;
-	spinlock_t mmap_offset_lock;
-	u32 mmap_offset;
-	int ib_unit;		/* This is the device number */
-	u16 sm_lid;		/* in host order */
-	u8 sm_sl;
-	u8 mkeyprot;
-	/* non-zero when timer is set */
-	unsigned long mkey_lease_timeout;
-
-	/* The following fields are really per port. */
-	struct ipath_qp_table qp_table;
-	struct ipath_lkey_table lk_table;
-	struct list_head pending[3];	/* FIFO of QPs waiting for ACKs */
-	struct list_head piowait;	/* list for wait PIO buf */
-	struct list_head txreq_free;
-	void *txreq_bufs;
-	/* list of QPs waiting for RNR timer */
-	struct list_head rnrwait;
-	spinlock_t pending_lock;
-	__be64 sys_image_guid;	/* in network order */
-	__be64 gid_prefix;	/* in network order */
-	__be64 mkey;
-
-	u32 n_pds_allocated;	/* number of PDs allocated for device */
-	spinlock_t n_pds_lock;
-	u32 n_ahs_allocated;	/* number of AHs allocated for device */
-	spinlock_t n_ahs_lock;
-	u32 n_cqs_allocated;	/* number of CQs allocated for device */
-	spinlock_t n_cqs_lock;
-	u32 n_qps_allocated;	/* number of QPs allocated for device */
-	spinlock_t n_qps_lock;
-	u32 n_srqs_allocated;	/* number of SRQs allocated for device */
-	spinlock_t n_srqs_lock;
-	u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
-	spinlock_t n_mcast_grps_lock;
-
-	u64 ipath_sword;	/* total dwords sent (sample result) */
-	u64 ipath_rword;	/* total dwords received (sample result) */
-	u64 ipath_spkts;	/* total packets sent (sample result) */
-	u64 ipath_rpkts;	/* total packets received (sample result) */
-	/* # of ticks no data sent (sample result) */
-	u64 ipath_xmit_wait;
-	u64 rcv_errors;		/* # of packets with SW detected rcv errs */
-	u64 n_unicast_xmit;	/* total unicast packets sent */
-	u64 n_unicast_rcv;	/* total unicast packets received */
-	u64 n_multicast_xmit;	/* total multicast packets sent */
-	u64 n_multicast_rcv;	/* total multicast packets received */
-	u64 z_symbol_error_counter;		/* starting count for PMA */
-	u64 z_link_error_recovery_counter;	/* starting count for PMA */
-	u64 z_link_downed_counter;		/* starting count for PMA */
-	u64 z_port_rcv_errors;			/* starting count for PMA */
-	u64 z_port_rcv_remphys_errors;		/* starting count for PMA */
-	u64 z_port_xmit_discards;		/* starting count for PMA */
-	u64 z_port_xmit_data;			/* starting count for PMA */
-	u64 z_port_rcv_data;			/* starting count for PMA */
-	u64 z_port_xmit_packets;		/* starting count for PMA */
-	u64 z_port_rcv_packets;			/* starting count for PMA */
-	u32 z_pkey_violations;			/* starting count for PMA */
-	u32 z_local_link_integrity_errors;	/* starting count for PMA */
-	u32 z_excessive_buffer_overrun_errors;	/* starting count for PMA */
-	u32 z_vl15_dropped;			/* starting count for PMA */
-	u32 n_rc_resends;
-	u32 n_rc_acks;
-	u32 n_rc_qacks;
-	u32 n_seq_naks;
-	u32 n_rdma_seq;
-	u32 n_rnr_naks;
-	u32 n_other_naks;
-	u32 n_timeouts;
-	u32 n_pkt_drops;
-	u32 n_vl15_dropped;
-	u32 n_wqe_errs;
-	u32 n_rdma_dup_busy;
-	u32 n_piowait;
-	u32 n_unaligned;
-	u32 port_cap_flags;
-	u32 pma_sample_start;
-	u32 pma_sample_interval;
-	__be16 pma_counter_select[5];
-	u16 pma_tag;
-	u16 qkey_violations;
-	u16 mkey_violations;
-	u16 mkey_lease_period;
-	u16 pending_index;	/* which pending queue is active */
-	u8 pma_sample_status;
-	u8 subnet_timeout;
-	u8 vl_high_limit;
-	struct ipath_opcode_stats opstats[128];
-};
-
-struct ipath_verbs_counters {
-	u64 symbol_error_counter;
-	u64 link_error_recovery_counter;
-	u64 link_downed_counter;
-	u64 port_rcv_errors;
-	u64 port_rcv_remphys_errors;
-	u64 port_xmit_discards;
-	u64 port_xmit_data;
-	u64 port_rcv_data;
-	u64 port_xmit_packets;
-	u64 port_rcv_packets;
-	u32 local_link_integrity_errors;
-	u32 excessive_buffer_overrun_errors;
-	u32 vl15_dropped;
-};
-
-struct ipath_verbs_txreq {
-	struct ipath_qp         *qp;
-	struct ipath_swqe       *wqe;
-	u32                      map_len;
-	u32                      len;
-	struct ipath_sge_state  *ss;
-	struct ipath_pio_header  hdr;
-	struct ipath_sdma_txreq  txreq;
-};
-
-static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
-{
-	return container_of(ibmr, struct ipath_mr, ibmr);
-}
-
-static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
-{
-	return container_of(ibpd, struct ipath_pd, ibpd);
-}
-
-static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
-{
-	return container_of(ibah, struct ipath_ah, ibah);
-}
-
-static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
-{
-	return container_of(ibcq, struct ipath_cq, ibcq);
-}
-
-static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
-{
-	return container_of(ibsrq, struct ipath_srq, ibsrq);
-}
-
-static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
-{
-	return container_of(ibqp, struct ipath_qp, ibqp);
-}
-
-static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
-{
-	return container_of(ibdev, struct ipath_ibdev, ibdev);
-}
-
-/*
- * This must be called with s_lock held.
- */
-static inline void ipath_schedule_send(struct ipath_qp *qp)
-{
-	if (qp->s_flags & IPATH_S_ANY_WAIT)
-		qp->s_flags &= ~IPATH_S_ANY_WAIT;
-	if (!(qp->s_flags & IPATH_S_BUSY))
-		tasklet_hi_schedule(&qp->s_task);
-}
-
-int ipath_process_mad(struct ib_device *ibdev,
-		      int mad_flags,
-		      u8 port_num,
-		      const struct ib_wc *in_wc,
-		      const struct ib_grh *in_grh,
-		      const struct ib_mad_hdr *in, size_t in_mad_size,
-		      struct ib_mad_hdr *out, size_t *out_mad_size,
-		      u16 *out_mad_pkey_index);
-
-/*
- * Compare the lower 24 bits of the two values.
- * Returns an integer <, ==, or > than zero.
- */
-static inline int ipath_cmp24(u32 a, u32 b)
-{
-	return (((int) a) - ((int) b)) << 8;
-}
-
-struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
-
-int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
-			    u64 *rwords, u64 *spkts, u64 *rpkts,
-			    u64 *xmit_wait);
-
-int ipath_get_counters(struct ipath_devdata *dd,
-		       struct ipath_verbs_counters *cntrs);
-
-int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
-
-int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
-
-int ipath_mcast_tree_empty(void);
-
-__be32 ipath_compute_aeth(struct ipath_qp *qp);
-
-struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
-
-struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
-			      struct ib_qp_init_attr *init_attr,
-			      struct ib_udata *udata);
-
-int ipath_destroy_qp(struct ib_qp *ibqp);
-
-int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
-
-int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		    int attr_mask, struct ib_udata *udata);
-
-int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		   int attr_mask, struct ib_qp_init_attr *init_attr);
-
-unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
-
-int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
-
-void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
-
-unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
-
-int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
-		     u32 hdrwords, struct ipath_sge_state *ss, u32 len);
-
-void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
-
-void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
-
-void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
-
-void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
-
-int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
-
-void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
-		     struct ipath_mregion *mr);
-
-void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
-
-int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
-		  struct ib_sge *sge, int acc);
-
-int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
-		  u32 len, u64 vaddr, u32 rkey, int acc);
-
-int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
-			   struct ib_recv_wr **bad_wr);
-
-struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
-				struct ib_srq_init_attr *srq_init_attr,
-				struct ib_udata *udata);
-
-int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-		     enum ib_srq_attr_mask attr_mask,
-		     struct ib_udata *udata);
-
-int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-
-int ipath_destroy_srq(struct ib_srq *ibsrq);
-
-void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
-
-int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
-
-struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
-			      const struct ib_cq_init_attr *attr,
-			      struct ib_ucontext *context,
-			      struct ib_udata *udata);
-
-int ipath_destroy_cq(struct ib_cq *ibcq);
-
-int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
-
-int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
-
-struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
-
-struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
-				struct ib_phys_buf *buffer_list,
-				int num_phys_buf, int acc, u64 *iova_start);
-
-struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-				u64 virt_addr, int mr_access_flags,
-				struct ib_udata *udata);
-
-int ipath_dereg_mr(struct ib_mr *ibmr);
-
-struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
-			       struct ib_fmr_attr *fmr_attr);
-
-int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
-		       int list_len, u64 iova);
-
-int ipath_unmap_fmr(struct list_head *fmr_list);
-
-int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
-
-void ipath_release_mmap_info(struct kref *ref);
-
-struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
-					       u32 size,
-					       struct ib_ucontext *context,
-					       void *obj);
-
-void ipath_update_mmap_info(struct ipath_ibdev *dev,
-			    struct ipath_mmap_info *ip,
-			    u32 size, void *obj);
-
-int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-
-void ipath_insert_rnr_queue(struct ipath_qp *qp);
-
-int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
-		   u32 *lengthp, struct ipath_sge_state *ss);
-
-int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
-
-u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
-		   struct ib_global_route *grh, u32 hwords, u32 nwords);
-
-void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
-			   struct ipath_other_headers *ohdr,
-			   u32 bth0, u32 bth2);
-
-void ipath_do_send(unsigned long data);
-
-void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
-			 enum ib_wc_status status);
-
-int ipath_make_rc_req(struct ipath_qp *qp);
-
-int ipath_make_uc_req(struct ipath_qp *qp);
-
-int ipath_make_ud_req(struct ipath_qp *qp);
-
-int ipath_register_ib_device(struct ipath_devdata *);
-
-void ipath_unregister_ib_device(struct ipath_ibdev *);
-
-void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32);
-
-int ipath_ib_piobufavail(struct ipath_ibdev *);
-
-unsigned ipath_get_npkeys(struct ipath_devdata *);
-
-u32 ipath_get_cr_errpkey(struct ipath_devdata *);
-
-unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
-
-extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
-
-/*
- * Below converts HCA-specific LinkTrainingState to IB PhysPortState
- * values.
- */
-extern const u8 ipath_cvt_physportstate[];
-#define IB_PHYSPORTSTATE_SLEEP 1
-#define IB_PHYSPORTSTATE_POLL 2
-#define IB_PHYSPORTSTATE_DISABLED 3
-#define IB_PHYSPORTSTATE_CFG_TRAIN 4
-#define IB_PHYSPORTSTATE_LINKUP 5
-#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
-
-extern const int ib_ipath_state_ops[];
-
-extern unsigned int ib_ipath_lkey_table_size;
-
-extern unsigned int ib_ipath_max_cqes;
-
-extern unsigned int ib_ipath_max_cqs;
-
-extern unsigned int ib_ipath_max_qp_wrs;
-
-extern unsigned int ib_ipath_max_qps;
-
-extern unsigned int ib_ipath_max_sges;
-
-extern unsigned int ib_ipath_max_mcast_grps;
-
-extern unsigned int ib_ipath_max_mcast_qp_attached;
-
-extern unsigned int ib_ipath_max_srqs;
-
-extern unsigned int ib_ipath_max_srq_sges;
-
-extern unsigned int ib_ipath_max_srq_wrs;
-
-extern const u32 ib_ipath_rnr_table[];
-
-extern struct ib_dma_mapping_ops ipath_dma_mapping_ops;
-
-#endif				/* IPATH_VERBS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
deleted file mode 100644
index 6216ea9..0000000
--- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/rculist.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#include "ipath_verbs.h"
-
-/*
- * Global table of GID to attached QPs.
- * The table is global to all ipath devices since a send from one QP/device
- * needs to be locally routed to any locally attached QPs on the same
- * or different device.
- */
-static struct rb_root mcast_tree;
-static DEFINE_SPINLOCK(mcast_lock);
-
-/**
- * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
- * @qp: the QP to link
- */
-static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
-{
-	struct ipath_mcast_qp *mqp;
-
-	mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
-	if (!mqp)
-		goto bail;
-
-	mqp->qp = qp;
-	atomic_inc(&qp->refcount);
-
-bail:
-	return mqp;
-}
-
-static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
-{
-	struct ipath_qp *qp = mqp->qp;
-
-	/* Notify ipath_destroy_qp() if it is waiting. */
-	if (atomic_dec_and_test(&qp->refcount))
-		wake_up(&qp->wait);
-
-	kfree(mqp);
-}
-
-/**
- * ipath_mcast_alloc - allocate the multicast GID structure
- * @mgid: the multicast GID
- *
- * A list of QPs will be attached to this structure.
- */
-static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
-{
-	struct ipath_mcast *mcast;
-
-	mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
-	if (!mcast)
-		goto bail;
-
-	mcast->mgid = *mgid;
-	INIT_LIST_HEAD(&mcast->qp_list);
-	init_waitqueue_head(&mcast->wait);
-	atomic_set(&mcast->refcount, 0);
-	mcast->n_attached = 0;
-
-bail:
-	return mcast;
-}
-
-static void ipath_mcast_free(struct ipath_mcast *mcast)
-{
-	struct ipath_mcast_qp *p, *tmp;
-
-	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
-		ipath_mcast_qp_free(p);
-
-	kfree(mcast);
-}
-
-/**
- * ipath_mcast_find - search the global table for the given multicast GID
- * @mgid: the multicast GID to search for
- *
- * Returns NULL if not found.
- *
- * The caller is responsible for decrementing the reference count if found.
- */
-struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
-{
-	struct rb_node *n;
-	unsigned long flags;
-	struct ipath_mcast *mcast;
-
-	spin_lock_irqsave(&mcast_lock, flags);
-	n = mcast_tree.rb_node;
-	while (n) {
-		int ret;
-
-		mcast = rb_entry(n, struct ipath_mcast, rb_node);
-
-		ret = memcmp(mgid->raw, mcast->mgid.raw,
-			     sizeof(union ib_gid));
-		if (ret < 0)
-			n = n->rb_left;
-		else if (ret > 0)
-			n = n->rb_right;
-		else {
-			atomic_inc(&mcast->refcount);
-			spin_unlock_irqrestore(&mcast_lock, flags);
-			goto bail;
-		}
-	}
-	spin_unlock_irqrestore(&mcast_lock, flags);
-
-	mcast = NULL;
-
-bail:
-	return mcast;
-}
-
-/**
- * ipath_mcast_add - insert mcast GID into table and attach QP struct
- * @mcast: the mcast GID table
- * @mqp: the QP to attach
- *
- * Return zero if both were added.  Return EEXIST if the GID was already in
- * the table but the QP was added.  Return ESRCH if the QP was already
- * attached and neither structure was added.
- */
-static int ipath_mcast_add(struct ipath_ibdev *dev,
-			   struct ipath_mcast *mcast,
-			   struct ipath_mcast_qp *mqp)
-{
-	struct rb_node **n = &mcast_tree.rb_node;
-	struct rb_node *pn = NULL;
-	int ret;
-
-	spin_lock_irq(&mcast_lock);
-
-	while (*n) {
-		struct ipath_mcast *tmcast;
-		struct ipath_mcast_qp *p;
-
-		pn = *n;
-		tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
-
-		ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
-			     sizeof(union ib_gid));
-		if (ret < 0) {
-			n = &pn->rb_left;
-			continue;
-		}
-		if (ret > 0) {
-			n = &pn->rb_right;
-			continue;
-		}
-
-		/* Search the QP list to see if this is already there. */
-		list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
-			if (p->qp == mqp->qp) {
-				ret = ESRCH;
-				goto bail;
-			}
-		}
-		if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) {
-			ret = ENOMEM;
-			goto bail;
-		}
-
-		tmcast->n_attached++;
-
-		list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
-		ret = EEXIST;
-		goto bail;
-	}
-
-	spin_lock(&dev->n_mcast_grps_lock);
-	if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) {
-		spin_unlock(&dev->n_mcast_grps_lock);
-		ret = ENOMEM;
-		goto bail;
-	}
-
-	dev->n_mcast_grps_allocated++;
-	spin_unlock(&dev->n_mcast_grps_lock);
-
-	mcast->n_attached++;
-
-	list_add_tail_rcu(&mqp->list, &mcast->qp_list);
-
-	atomic_inc(&mcast->refcount);
-	rb_link_node(&mcast->rb_node, pn, n);
-	rb_insert_color(&mcast->rb_node, &mcast_tree);
-
-	ret = 0;
-
-bail:
-	spin_unlock_irq(&mcast_lock);
-
-	return ret;
-}
-
-int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-	struct ipath_qp *qp = to_iqp(ibqp);
-	struct ipath_ibdev *dev = to_idev(ibqp->device);
-	struct ipath_mcast *mcast;
-	struct ipath_mcast_qp *mqp;
-	int ret;
-
-	/*
-	 * Allocate data structures since its better to do this outside of
-	 * spin locks and it will most likely be needed.
-	 */
-	mcast = ipath_mcast_alloc(gid);
-	if (mcast == NULL) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-	mqp = ipath_mcast_qp_alloc(qp);
-	if (mqp == NULL) {
-		ipath_mcast_free(mcast);
-		ret = -ENOMEM;
-		goto bail;
-	}
-	switch (ipath_mcast_add(dev, mcast, mqp)) {
-	case ESRCH:
-		/* Neither was used: can't attach the same QP twice. */
-		ipath_mcast_qp_free(mqp);
-		ipath_mcast_free(mcast);
-		ret = -EINVAL;
-		goto bail;
-	case EEXIST:		/* The mcast wasn't used */
-		ipath_mcast_free(mcast);
-		break;
-	case ENOMEM:
-		/* Exceeded the maximum number of mcast groups. */
-		ipath_mcast_qp_free(mqp);
-		ipath_mcast_free(mcast);
-		ret = -ENOMEM;
-		goto bail;
-	default:
-		break;
-	}
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-	struct ipath_qp *qp = to_iqp(ibqp);
-	struct ipath_ibdev *dev = to_idev(ibqp->device);
-	struct ipath_mcast *mcast = NULL;
-	struct ipath_mcast_qp *p, *tmp;
-	struct rb_node *n;
-	int last = 0;
-	int ret;
-
-	spin_lock_irq(&mcast_lock);
-
-	/* Find the GID in the mcast table. */
-	n = mcast_tree.rb_node;
-	while (1) {
-		if (n == NULL) {
-			spin_unlock_irq(&mcast_lock);
-			ret = -EINVAL;
-			goto bail;
-		}
-
-		mcast = rb_entry(n, struct ipath_mcast, rb_node);
-		ret = memcmp(gid->raw, mcast->mgid.raw,
-			     sizeof(union ib_gid));
-		if (ret < 0)
-			n = n->rb_left;
-		else if (ret > 0)
-			n = n->rb_right;
-		else
-			break;
-	}
-
-	/* Search the QP list. */
-	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
-		if (p->qp != qp)
-			continue;
-		/*
-		 * We found it, so remove it, but don't poison the forward
-		 * link until we are sure there are no list walkers.
-		 */
-		list_del_rcu(&p->list);
-		mcast->n_attached--;
-
-		/* If this was the last attached QP, remove the GID too. */
-		if (list_empty(&mcast->qp_list)) {
-			rb_erase(&mcast->rb_node, &mcast_tree);
-			last = 1;
-		}
-		break;
-	}
-
-	spin_unlock_irq(&mcast_lock);
-
-	if (p) {
-		/*
-		 * Wait for any list walkers to finish before freeing the
-		 * list element.
-		 */
-		wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
-		ipath_mcast_qp_free(p);
-	}
-	if (last) {
-		atomic_dec(&mcast->refcount);
-		wait_event(mcast->wait, !atomic_read(&mcast->refcount));
-		ipath_mcast_free(mcast);
-		spin_lock_irq(&dev->n_mcast_grps_lock);
-		dev->n_mcast_grps_allocated--;
-		spin_unlock_irq(&dev->n_mcast_grps_lock);
-	}
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-int ipath_mcast_tree_empty(void)
-{
-	return mcast_tree.rb_node == NULL;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
deleted file mode 100644
index 1a7e20a..0000000
--- a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file is conditionally built on PowerPC only.  Otherwise weak symbol
- * versions of the functions exported from here are used.
- */
-
-#include "ipath_kernel.h"
-
-/**
- * ipath_enable_wc - enable write combining for MMIO writes to the device
- * @dd: infinipath device
- *
- * Nothing to do on PowerPC, so just return without error.
- */
-int ipath_enable_wc(struct ipath_devdata *dd)
-{
-	return 0;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
deleted file mode 100644
index 7b6e4c8..0000000
--- a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file is conditionally built on x86_64 only.  Otherwise weak symbol
- * versions of the functions exported from here are used.
- */
-
-#include <linux/pci.h>
-#include <asm/processor.h>
-
-#include "ipath_kernel.h"
-
-/**
- * ipath_enable_wc - enable write combining for MMIO writes to the device
- * @dd: infinipath device
- *
- * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
- * write combining.
- */
-int ipath_enable_wc(struct ipath_devdata *dd)
-{
-	int ret = 0;
-	u64 pioaddr, piolen;
-	unsigned bits;
-	const unsigned long addr = pci_resource_start(dd->pcidev, 0);
-	const size_t len = pci_resource_len(dd->pcidev, 0);
-
-	/*
-	 * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
-	 * chip.  Linux (possibly the hardware) requires it to be on a power
-	 * of 2 address matching the length (which has to be a power of 2).
-	 * For rev1, that means the base address, for rev2, it will be just
-	 * the PIO buffers themselves.
-	 * For chips with two sets of buffers, the calculations are
-	 * somewhat more complicated; we need to sum, and the piobufbase
-	 * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
-	 * The buffers are still packed, so a single range covers both.
-	 */
-	if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */
-		unsigned long pio2kbase, pio4kbase;
-		pio2kbase = dd->ipath_piobufbase & 0xffffffffUL;
-		pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL;
-		if (pio2kbase < pio4kbase) { /* all, for now */
-			pioaddr = addr + pio2kbase;
-			piolen = pio4kbase - pio2kbase +
-				dd->ipath_piobcnt4k * dd->ipath_4kalign;
-		} else {
-			pioaddr = addr + pio4kbase;
-			piolen = pio2kbase - pio4kbase +
-				dd->ipath_piobcnt2k * dd->ipath_palign;
-		}
-	} else {  /* single buffer size (2K, currently) */
-		pioaddr = addr + dd->ipath_piobufbase;
-		piolen = dd->ipath_piobcnt2k * dd->ipath_palign +
-			dd->ipath_piobcnt4k * dd->ipath_4kalign;
-	}
-
-	for (bits = 0; !(piolen & (1ULL << bits)); bits++)
-		/* do nothing */ ;
-
-	if (piolen != (1ULL << bits)) {
-		piolen >>= bits;
-		while (piolen >>= 1)
-			bits++;
-		piolen = 1ULL << (bits + 1);
-	}
-	if (pioaddr & (piolen - 1)) {
-		u64 atmp;
-		ipath_dbg("pioaddr %llx not on right boundary for size "
-			  "%llx, fixing\n",
-			  (unsigned long long) pioaddr,
-			  (unsigned long long) piolen);
-		atmp = pioaddr & ~(piolen - 1);
-		if (atmp < addr || (atmp + piolen) > (addr + len)) {
-			ipath_dev_err(dd, "No way to align address/size "
-				      "(%llx/%llx), no WC mtrr\n",
-				      (unsigned long long) atmp,
-				      (unsigned long long) piolen << 1);
-			ret = -ENODEV;
-		} else {
-			ipath_dbg("changing WC base from %llx to %llx, "
-				  "len from %llx to %llx\n",
-				  (unsigned long long) pioaddr,
-				  (unsigned long long) atmp,
-				  (unsigned long long) piolen,
-				  (unsigned long long) piolen << 1);
-			pioaddr = atmp;
-			piolen <<= 1;
-		}
-	}
-
-	if (!ret) {
-		dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen);
-		if (dd->wc_cookie < 0) {
-			ipath_dev_err(dd, "Seting mtrr failed on PIO buffers\n");
-			ret = -ENODEV;
-		} else if (dd->wc_cookie == 0)
-			ipath_cdbg(VERBOSE, "Set mtrr for chip to WC not needed\n");
-		else
-			ipath_cdbg(VERBOSE, "Set mtrr for chip to WC\n");
-	}
-
-	return ret;
-}
-
-/**
- * ipath_disable_wc - disable write combining for MMIO writes to the device
- * @dd: infinipath device
- */
-void ipath_disable_wc(struct ipath_devdata *dd)
-{
-	arch_phys_wc_del(dd->wc_cookie);
-}
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
index 0bff438..e582553 100644
--- a/drivers/staging/rdma/Kconfig
+++ b/drivers/staging/rdma/Kconfig
@@ -22,4 +22,6 @@ menuconfig STAGING_RDMA
 # Please keep entries in alphabetic order
 if STAGING_RDMA
 
+source "drivers/staging/rdma/ipath/Kconfig"
+
 endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
index b5e94f1..484dac7 100644
--- a/drivers/staging/rdma/Makefile
+++ b/drivers/staging/rdma/Makefile
@@ -1 +1,2 @@
 # Entries for RDMA_STAGING tree
+obj-$(CONFIG_INFINIBAND_IPATH)	+= ipath/
diff --git a/drivers/staging/rdma/ipath/Kconfig b/drivers/staging/rdma/ipath/Kconfig
new file mode 100644
index 0000000..041ce06
--- /dev/null
+++ b/drivers/staging/rdma/ipath/Kconfig
@@ -0,0 +1,16 @@
+config INFINIBAND_IPATH
+	tristate "QLogic HTX HCA support"
+	depends on 64BIT && NET && HT_IRQ
+	---help---
+	This is a driver for the deprecated QLogic Hyper-Transport
+	IB host channel adapter (model QHT7140),
+	including InfiniBand verbs support.  This driver allows these
+	devices to be used with both kernel upper level protocols such
+	as IP-over-InfiniBand as well as with userspace applications
+	(in conjunction with InfiniBand userspace access).
+	For QLogic PCIe QLE based cards, use the QIB driver instead.
+
+	If you have this hardware you will need to boot with PAT disabled
+	on your x86-64 systems, use the nopat kernel parameter.
+
+	Note that this driver will soon be removed entirely from the kernel.
diff --git a/drivers/staging/rdma/ipath/Makefile b/drivers/staging/rdma/ipath/Makefile
new file mode 100644
index 0000000..4496f28
--- /dev/null
+++ b/drivers/staging/rdma/ipath/Makefile
@@ -0,0 +1,37 @@
+ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \
+	-DIPATH_KERN_TYPE=0
+
+obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
+
+ib_ipath-y := \
+	ipath_cq.o \
+	ipath_diag.o \
+	ipath_dma.o \
+	ipath_driver.o \
+	ipath_eeprom.o \
+	ipath_file_ops.o \
+	ipath_fs.o \
+	ipath_init_chip.o \
+	ipath_intr.o \
+	ipath_keys.o \
+	ipath_mad.o \
+	ipath_mmap.o \
+	ipath_mr.o \
+	ipath_qp.o \
+	ipath_rc.o \
+	ipath_ruc.o \
+	ipath_sdma.o \
+	ipath_srq.o \
+	ipath_stats.o \
+	ipath_sysfs.o \
+	ipath_uc.o \
+	ipath_ud.o \
+	ipath_user_pages.o \
+	ipath_user_sdma.o \
+	ipath_verbs_mcast.o \
+	ipath_verbs.o
+
+ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o
+
+ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
+ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
diff --git a/drivers/staging/rdma/ipath/TODO b/drivers/staging/rdma/ipath/TODO
new file mode 100644
index 0000000..cb00158
--- /dev/null
+++ b/drivers/staging/rdma/ipath/TODO
@@ -0,0 +1,5 @@
+The ipath driver has been moved to staging in preparation for its removal in a
+few releases. The driver will be deleted during the 4.6 merge window.
+
+Contact Dennis Dalessandro <dennis.dalessandro@intel.com> and
+Cc: linux-rdma@vger.kernel.org
diff --git a/drivers/staging/rdma/ipath/ipath_common.h b/drivers/staging/rdma/ipath/ipath_common.h
new file mode 100644
index 0000000..28cfe97
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_common.h
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_COMMON_H
+#define _IPATH_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+
+/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
+#define IPATH_SRC_OUI_1 0x00
+#define IPATH_SRC_OUI_2 0x11
+#define IPATH_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in faspath code
+ * _IPATH_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
+ */
+
+/*
+ * The value in the BTH QP field that InfiniPath uses to differentiate
+ * an infinipath protocol IB packet vs standard IB transport
+ */
+#define IPATH_KD_QP 0x656b79
+
+/*
+ * valid states passed to ipath_set_linkstate() user call
+ */
+#define IPATH_IB_LINKDOWN		0
+#define IPATH_IB_LINKARM		1
+#define IPATH_IB_LINKACTIVE		2
+#define IPATH_IB_LINKDOWN_ONLY		3
+#define IPATH_IB_LINKDOWN_SLEEP		4
+#define IPATH_IB_LINKDOWN_DISABLE	5
+#define IPATH_IB_LINK_LOOPBACK	6 /* enable local loopback */
+#define IPATH_IB_LINK_EXTERNAL	7 /* normal, disable local loopback */
+#define IPATH_IB_LINK_NO_HRTBT	8 /* disable Heartbeat, e.g. for loopback */
+#define IPATH_IB_LINK_HRTBT	9 /* enable heartbeat, normal, non-loopback */
+
+/*
+ * These 3 values (SDR and DDR may be ORed for auto-speed
+ * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
+ * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
+ * are also the the possible values for ipath_link_speed_enabled and active
+ * The values were chosen to match values used within the IB spec.
+ */
+#define IPATH_IB_SDR 1
+#define IPATH_IB_DDR 2
+
+/*
+ * stats maintained by the driver.  For now, at least, this is global
+ * to all minor devices.
+ */
+struct infinipath_stats {
+	/* number of interrupts taken */
+	__u64 sps_ints;
+	/* number of interrupts for errors */
+	__u64 sps_errints;
+	/* number of errors from chip (not incl. packet errors or CRC) */
+	__u64 sps_errs;
+	/* number of packet errors from chip other than CRC */
+	__u64 sps_pkterrs;
+	/* number of packets with CRC errors (ICRC and VCRC) */
+	__u64 sps_crcerrs;
+	/* number of hardware errors reported (parity, etc.) */
+	__u64 sps_hwerrs;
+	/* number of times IB link changed state unexpectedly */
+	__u64 sps_iblink;
+	__u64 sps_unused; /* was fastrcvint, no longer implemented */
+	/* number of kernel (port0) packets received */
+	__u64 sps_port0pkts;
+	/* number of "ethernet" packets sent by driver */
+	__u64 sps_ether_spkts;
+	/* number of "ethernet" packets received by driver */
+	__u64 sps_ether_rpkts;
+	/* number of SMA packets sent by driver. Obsolete. */
+	__u64 sps_sma_spkts;
+	/* number of SMA packets received by driver. Obsolete. */
+	__u64 sps_sma_rpkts;
+	/* number of times all ports rcvhdrq was full and packet dropped */
+	__u64 sps_hdrqfull;
+	/* number of times all ports egrtid was full and packet dropped */
+	__u64 sps_etidfull;
+	/*
+	 * number of times we tried to send from driver, but no pio buffers
+	 * avail
+	 */
+	__u64 sps_nopiobufs;
+	/* number of ports currently open */
+	__u64 sps_ports;
+	/* list of pkeys (other than default) accepted (0 means not set) */
+	__u16 sps_pkeys[4];
+	__u16 sps_unused16[4]; /* available; maintaining compatible layout */
+	/* number of user ports per chip (not IB ports) */
+	__u32 sps_nports;
+	/* not our interrupt, or already handled */
+	__u32 sps_nullintr;
+	/* max number of packets handled per receive call */
+	__u32 sps_maxpkts_call;
+	/* avg number of packets handled per receive call */
+	__u32 sps_avgpkts_call;
+	/* total number of pages locked */
+	__u64 sps_pagelocks;
+	/* total number of pages unlocked */
+	__u64 sps_pageunlocks;
+	/*
+	 * Number of packets dropped in kernel other than errors (ether
+	 * packets if ipath not configured, etc.)
+	 */
+	__u64 sps_krdrops;
+	__u64 sps_txeparity; /* PIO buffer parity error, recovered */
+	/* pad for future growth */
+	__u64 __sps_pad[45];
+};
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.
+ */
+#define IPATH_STATUS_INITTED       0x1	/* basic initialization done */
+#define IPATH_STATUS_DISABLED      0x2	/* hardware disabled */
+/* Device has been disabled via admin request */
+#define IPATH_STATUS_ADMIN_DISABLED    0x4
+/* Chip has been found and initted */
+#define IPATH_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define IPATH_STATUS_IB_READY     0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define IPATH_STATUS_IB_CONF      0x80
+/* no link established, probably no cable */
+#define IPATH_STATUS_IB_NOCABLE  0x100
+/* A Fatal hardware error has occurred. */
+#define IPATH_STATUS_HWERROR     0x200
+
+/*
+ * The list of usermode accessible registers.  Also see Reg_* later in file.
+ */
+typedef enum _ipath_ureg {
+	/* (RO)  DMA RcvHdr to be used next. */
+	ur_rcvhdrtail = 0,
+	/* (RW)  RcvHdr entry to be processed next by host. */
+	ur_rcvhdrhead = 1,
+	/* (RO)  Index of next Eager index to use. */
+	ur_rcvegrindextail = 2,
+	/* (RW)  Eager TID to be processed next */
+	ur_rcvegrindexhead = 3,
+	/* For internal use only; max register number. */
+	_IPATH_UregMax
+} ipath_ureg;
+
+/* bit values for spi_runtime_flags */
+#define IPATH_RUNTIME_HT	0x1
+#define IPATH_RUNTIME_PCIE	0x2
+#define IPATH_RUNTIME_FORCE_WC_ORDER	0x4
+#define IPATH_RUNTIME_RCVHDR_COPY	0x8
+#define IPATH_RUNTIME_MASTER	0x10
+#define IPATH_RUNTIME_NODMA_RTAIL 0x80
+#define IPATH_RUNTIME_SDMA	      0x200
+#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
+#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
+
+/*
+ * This structure is returned by ipath_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explict pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit * bit kernel requires the user code
+ * to have matching offsets
+ */
+struct ipath_base_info {
+	/* version of hardware, for feature checking. */
+	__u32 spi_hw_version;
+	/* version of software, for feature checking. */
+	__u32 spi_sw_version;
+	/* InfiniPath port assigned, goes into sent packets */
+	__u16 spi_port;
+	__u16 spi_subport;
+	/*
+	 * IB MTU, packets IB data must be less than this.
+	 * The MTU is in bytes, and will be a multiple of 4 bytes.
+	 */
+	__u32 spi_mtu;
+	/*
+	 * Size of a PIO buffer.  Any given packet's total size must be less
+	 * than this (in words).  Included is the starting control word, so
+	 * if 513 is returned, then total pkt size is 512 words or less.
+	 */
+	__u32 spi_piosize;
+	/* size of the TID cache in infinipath, in entries */
+	__u32 spi_tidcnt;
+	/* size of the TID Eager list in infinipath, in entries */
+	__u32 spi_tidegrcnt;
+	/* size of a single receive header queue entry in words. */
+	__u32 spi_rcvhdrent_size;
+	/*
+	 * Count of receive header queue entries allocated.
+	 * This may be less than the spu_rcvhdrcnt passed in!.
+	 */
+	__u32 spi_rcvhdr_cnt;
+
+	/* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
+	__u32 spi_runtime_flags;
+
+	/* address where receive buffer queue is mapped into */
+	__u64 spi_rcvhdr_base;
+
+	/* user program. */
+
+	/* base address of eager TID receive buffers. */
+	__u64 spi_rcv_egrbufs;
+
+	/* Allocated by initialization code, not by protocol. */
+
+	/*
+	 * Size of each TID buffer in host memory, starting at
+	 * spi_rcv_egrbufs.  The buffers are virtually contiguous.
+	 */
+	__u32 spi_rcv_egrbufsize;
+	/*
+	 * The special QP (queue pair) value that identifies an infinipath
+	 * protocol packet from standard IB packets.  More, probably much
+	 * more, to be added.
+	 */
+	__u32 spi_qpair;
+
+	/*
+	 * User register base for init code, not to be used directly by
+	 * protocol or applications.
+	 */
+	__u64 __spi_uregbase;
+	/*
+	 * Maximum buffer size in bytes that can be used in a single TID
+	 * entry (assuming the buffer is aligned to this boundary).  This is
+	 * the minimum of what the hardware and software support Guaranteed
+	 * to be a power of 2.
+	 */
+	__u32 spi_tid_maxsize;
+	/*
+	 * alignment of each pio send buffer (byte count
+	 * to add to spi_piobufbase to get to second buffer)
+	 */
+	__u32 spi_pioalign;
+	/*
+	 * The index of the first pio buffer available to this process;
+	 * needed to do lookup in spi_pioavailaddr; not added to
+	 * spi_piobufbase.
+	 */
+	__u32 spi_pioindex;
+	 /* number of buffers mapped for this process */
+	__u32 spi_piocnt;
+
+	/*
+	 * Base address of writeonly pio buffers for this process.
+	 * Each buffer has spi_piosize words, and is aligned on spi_pioalign
+	 * boundaries.  spi_piocnt buffers are mapped from this address
+	 */
+	__u64 spi_piobufbase;
+
+	/*
+	 * Base address of readonly memory copy of the pioavail registers.
+	 * There are 2 bits for each buffer.
+	 */
+	__u64 spi_pioavailaddr;
+
+	/*
+	 * Address where driver updates a copy of the interface and driver
+	 * status (IPATH_STATUS_*) as a 64 bit value.  It's followed by a
+	 * string indicating hardware error, if there was one.
+	 */
+	__u64 spi_status;
+
+	/* number of chip ports available to user processes */
+	__u32 spi_nports;
+	/* unit number of chip we are using */
+	__u32 spi_unit;
+	/* num bufs in each contiguous set */
+	__u32 spi_rcv_egrperchunk;
+	/* size in bytes of each contiguous set */
+	__u32 spi_rcv_egrchunksize;
+	/* total size of mmap to cover full rcvegrbuffers */
+	__u32 spi_rcv_egrbuftotlen;
+	__u32 spi_filler_for_align;
+	/* address of readonly memory copy of the rcvhdrq tail register. */
+	__u64 spi_rcvhdr_tailaddr;
+
+	/* shared memory pages for subports if port is shared */
+	__u64 spi_subport_uregbase;
+	__u64 spi_subport_rcvegrbuf;
+	__u64 spi_subport_rcvhdr_base;
+
+	/* shared memory page for hardware port if it is shared */
+	__u64 spi_port_uregbase;
+	__u64 spi_port_rcvegrbuf;
+	__u64 spi_port_rcvhdr_base;
+	__u64 spi_port_rcvhdr_tailaddr;
+
+} __attribute__ ((aligned(8)));
+
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of ipath_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way.  The driver must be the same or higher
+ * for initialization to succeed.  In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define IPATH_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * a within a major version, however if user software is larger
+ * than driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define IPATH_USER_SWMINOR 6
+
+#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
+
+#define IPATH_KERN_TYPE 0
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or from the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of ipath_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
+
+/*
+ * This structure is passed to ipath_userinit() to tell the driver where
+ * user code buffers are, sizes, etc.   The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility.  It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct ipath_user_info {
+	/*
+	 * version of user software, to detect compatibility issues.
+	 * Should be set to IPATH_USER_SWVERSION.
+	 */
+	__u32 spu_userversion;
+
+	/* desired number of receive header queue entries */
+	__u32 spu_rcvhdrcnt;
+
+	/* size of struct base_info to write to */
+	__u32 spu_base_info_size;
+
+	/*
+	 * number of words in KD protocol header
+	 * This tells InfiniPath how many words to copy to rcvhdrq.  If 0,
+	 * kernel uses a default.  Once set, attempts to set any other value
+	 * are an error (EAGAIN) until driver is reloaded.
+	 */
+	__u32 spu_rcvhdrsize;
+
+	/*
+	 * If two or more processes wish to share a port, each process
+	 * must set the spu_subport_cnt and spu_subport_id to the same
+	 * values.  The only restriction on the spu_subport_id is that
+	 * it be unique for a given node.
+	 */
+	__u16 spu_subport_cnt;
+	__u16 spu_subport_id;
+
+	__u32 spu_unused; /* kept for compatible layout */
+
+	/*
+	 * address of struct base_info to write to
+	 */
+	__u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
+
+/* User commands. */
+
+#define IPATH_CMD_MIN		16
+
+#define __IPATH_CMD_USER_INIT	16	/* old set up userspace (for old user code) */
+#define IPATH_CMD_PORT_INFO	17	/* find out what resources we got */
+#define IPATH_CMD_RECV_CTRL	18	/* control receipt of packets */
+#define IPATH_CMD_TID_UPDATE	19	/* update expected TID entries */
+#define IPATH_CMD_TID_FREE	20	/* free expected TID entries */
+#define IPATH_CMD_SET_PART_KEY	21	/* add partition key */
+#define __IPATH_CMD_SLAVE_INFO	22	/* return info on slave processes (for old user code) */
+#define IPATH_CMD_ASSIGN_PORT	23	/* allocate HCA and port */
+#define IPATH_CMD_USER_INIT 	24	/* set up userspace */
+#define IPATH_CMD_UNUSED_1	25
+#define IPATH_CMD_UNUSED_2	26
+#define IPATH_CMD_PIOAVAILUPD	27	/* force an update of PIOAvail reg */
+#define IPATH_CMD_POLL_TYPE	28	/* set the kind of polling we want */
+#define IPATH_CMD_ARMLAUNCH_CTRL	29 /* armlaunch detection control */
+/* 30 is unused */
+#define IPATH_CMD_SDMA_INFLIGHT 31	/* sdma inflight counter request */
+#define IPATH_CMD_SDMA_COMPLETE 32	/* sdma completion counter request */
+
+/*
+ * Poll types
+ */
+#define IPATH_POLL_TYPE_URGENT	 0x01
+#define IPATH_POLL_TYPE_OVERFLOW 0x02
+
+struct ipath_port_info {
+	__u32 num_active;	/* number of active units */
+	__u32 unit;		/* unit (chip) assigned to caller */
+	__u16 port;		/* port on unit assigned to caller */
+	__u16 subport;		/* subport on unit assigned to caller */
+	__u16 num_ports;	/* number of ports available on unit */
+	__u16 num_subports;	/* number of subports opened on port */
+};
+
+struct ipath_tid_info {
+	__u32 tidcnt;
+	/* make structure same size in 32 and 64 bit */
+	__u32 tid__unused;
+	/* virtual address of first page in transfer */
+	__u64 tidvaddr;
+	/* pointer (same size 32/64 bit) to __u16 tid array */
+	__u64 tidlist;
+
+	/*
+	 * pointer (same size 32/64 bit) to bitmap of TIDs used
+	 * for this call; checked for being large enough at open
+	 */
+	__u64 tidmap;
+};
+
+struct ipath_cmd {
+	__u32 type;			/* command type */
+	union {
+		struct ipath_tid_info tid_info;
+		struct ipath_user_info user_info;
+
+		/*
+		 * address in userspace where we should put the sdma
+		 * inflight counter
+		 */
+		__u64 sdma_inflight;
+		/*
+		 * address in userspace where we should put the sdma
+		 * completion counter
+		 */
+		__u64 sdma_complete;
+		/* address in userspace of struct ipath_port_info to
+		   write result to */
+		__u64 port_info;
+		/* enable/disable receipt of packets */
+		__u32 recv_ctrl;
+		/* enable/disable armlaunch errors (non-zero to enable) */
+		__u32 armlaunch_ctrl;
+		/* partition key to set */
+		__u16 part_key;
+		/* user address of __u32 bitmask of active slaves */
+		__u64 slave_mask_addr;
+		/* type of polling we want */
+		__u16 poll_type;
+	} cmd;
+};
+
+struct ipath_iovec {
+	/* Pointer to data, but same size 32 and 64 bit */
+	__u64 iov_base;
+
+	/*
+	 * Length of data; don't need 64 bits, but want
+	 * ipath_sendpkt to remain same size as before 32 bit changes, so...
+	 */
+	__u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send.  Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the ipath_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __ipath_sendpkt {
+	__u32 sps_flags;	/* flags for packet (TBD) */
+	__u32 sps_cnt;		/* number of entries to use in sps_iov */
+	/* array of iov's describing packet. TEMPORARY */
+	struct ipath_iovec sps_iov[4];
+};
+
+/*
+ * diagnostics can send a packet by "writing" one of the following
+ * two structs to diag data special file
+ * The first is the legacy version for backward compatibility
+ */
+struct ipath_diag_pkt {
+	__u32 unit;
+	__u64 data;
+	__u32 len;
+};
+
+/* The second diag_pkt struct is the expanded version that allows
+ * more control over the packet, specifically, by allowing a custom
+ * pbc (+ static rate) qword, so that special modes and deliberate
+ * changes to CRCs can be used. The elements were also re-ordered
+ * for better alignment and to avoid padding issues.
+ */
+struct ipath_diag_xpkt {
+	__u64 data;
+	__u64 pbc_wd;
+	__u32 unit;
+	__u32 len;
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define IPATH_FLASH_VERSION 2
+struct ipath_flash {
+	/* flash layout version (IPATH_FLASH_VERSION) */
+	__u8 if_fversion;
+	/* checksum protecting if_length bytes */
+	__u8 if_csum;
+	/*
+	 * valid length (in use, protected by if_csum), including
+	 * if_fversion and if_csum themselves)
+	 */
+	__u8 if_length;
+	/* the GUID, in network order */
+	__u8 if_guid[8];
+	/* number of GUIDs to use, starting from if_guid */
+	__u8 if_numguid;
+	/* the (last 10 characters of) board serial number, in ASCII */
+	char if_serial[12];
+	/* board mfg date (YYYYMMDD ASCII) */
+	char if_mfgdate[8];
+	/* last board rework/test date (YYYYMMDD ASCII) */
+	char if_testdate[8];
+	/* logging of error counts, TBD */
+	__u8 if_errcntp[4];
+	/* powered on hours, updated at driver unload */
+	__u8 if_powerhour[2];
+	/* ASCII free-form comment field */
+	char if_comment[32];
+	/* Backwards compatible prefix for longer QLogic Serial Numbers */
+	char if_sprefix[4];
+	/* 82 bytes used, min flash size is 128 bytes */
+	__u8 if_future[46];
+};
+
+/*
+ * These are the counters implemented in the chip, and are listed in order.
+ * The InterCaps naming is taken straight from the chip spec.
+ */
+struct infinipath_counters {
+	__u64 LBIntCnt;
+	__u64 LBFlowStallCnt;
+	__u64 TxSDmaDescCnt;	/* was Reserved1 */
+	__u64 TxUnsupVLErrCnt;
+	__u64 TxDataPktCnt;
+	__u64 TxFlowPktCnt;
+	__u64 TxDwordCnt;
+	__u64 TxLenErrCnt;
+	__u64 TxMaxMinLenErrCnt;
+	__u64 TxUnderrunCnt;
+	__u64 TxFlowStallCnt;
+	__u64 TxDroppedPktCnt;
+	__u64 RxDroppedPktCnt;
+	__u64 RxDataPktCnt;
+	__u64 RxFlowPktCnt;
+	__u64 RxDwordCnt;
+	__u64 RxLenErrCnt;
+	__u64 RxMaxMinLenErrCnt;
+	__u64 RxICRCErrCnt;
+	__u64 RxVCRCErrCnt;
+	__u64 RxFlowCtrlErrCnt;
+	__u64 RxBadFormatCnt;
+	__u64 RxLinkProblemCnt;
+	__u64 RxEBPCnt;
+	__u64 RxLPCRCErrCnt;
+	__u64 RxBufOvflCnt;
+	__u64 RxTIDFullErrCnt;
+	__u64 RxTIDValidErrCnt;
+	__u64 RxPKeyMismatchCnt;
+	__u64 RxP0HdrEgrOvflCnt;
+	__u64 RxP1HdrEgrOvflCnt;
+	__u64 RxP2HdrEgrOvflCnt;
+	__u64 RxP3HdrEgrOvflCnt;
+	__u64 RxP4HdrEgrOvflCnt;
+	__u64 RxP5HdrEgrOvflCnt;
+	__u64 RxP6HdrEgrOvflCnt;
+	__u64 RxP7HdrEgrOvflCnt;
+	__u64 RxP8HdrEgrOvflCnt;
+	__u64 RxP9HdrEgrOvflCnt;	/* was Reserved6 */
+	__u64 RxP10HdrEgrOvflCnt;	/* was Reserved7 */
+	__u64 RxP11HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP12HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP13HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP14HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP15HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP16HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 IBStatusChangeCnt;
+	__u64 IBLinkErrRecoveryCnt;
+	__u64 IBLinkDownedCnt;
+	__u64 IBSymbolErrCnt;
+	/* The following are new for IBA7220 */
+	__u64 RxVL15DroppedPktCnt;
+	__u64 RxOtherLocalPhyErrCnt;
+	__u64 PcieRetryBufDiagQwordCnt;
+	__u64 ExcessBufferOvflCnt;
+	__u64 LocalLinkIntegrityErrCnt;
+	__u64 RxVlErrCnt;
+	__u64 RxDlidFltrCnt;
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * ipath_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
+#define INFINIPATH_RHF_LENGTH_SHIFT 0
+#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
+#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
+#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
+#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
+#define INFINIPATH_RHF_SEQ_MASK 0xF
+#define INFINIPATH_RHF_SEQ_SHIFT 0
+#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
+#define INFINIPATH_RHF_H_ICRCERR   0x80000000
+#define INFINIPATH_RHF_H_VCRCERR   0x40000000
+#define INFINIPATH_RHF_H_PARITYERR 0x20000000
+#define INFINIPATH_RHF_H_LENERR    0x10000000
+#define INFINIPATH_RHF_H_MTUERR    0x08000000
+#define INFINIPATH_RHF_H_IHDRERR   0x04000000
+#define INFINIPATH_RHF_H_TIDERR    0x02000000
+#define INFINIPATH_RHF_H_MKERR     0x01000000
+#define INFINIPATH_RHF_H_IBERR     0x00800000
+#define INFINIPATH_RHF_H_ERR_MASK  0xFF800000
+#define INFINIPATH_RHF_L_USE_EGR   0x80000000
+#define INFINIPATH_RHF_L_SWA       0x00008000
+#define INFINIPATH_RHF_L_SWB       0x00004000
+
+/* infinipath header fields */
+#define INFINIPATH_I_VERS_MASK 0xF
+#define INFINIPATH_I_VERS_SHIFT 28
+#define INFINIPATH_I_PORT_MASK 0xF
+#define INFINIPATH_I_PORT_SHIFT 24
+#define INFINIPATH_I_TID_MASK 0x7FF
+#define INFINIPATH_I_TID_SHIFT 13
+#define INFINIPATH_I_OFFSET_MASK 0x1FFF
+#define INFINIPATH_I_OFFSET_SHIFT 0
+
+/* K_PktFlags bits */
+#define INFINIPATH_KPF_INTR 0x1
+#define INFINIPATH_KPF_SUBPORT_MASK 0x3
+#define INFINIPATH_KPF_SUBPORT_SHIFT 1
+
+#define INFINIPATH_MAX_SUBPORT	4
+
+/* SendPIO per-buffer control */
+#define INFINIPATH_SP_TEST    0x40
+#define INFINIPATH_SP_TESTEBP 0x20
+#define INFINIPATH_SP_TRIGGER_SHIFT  15
+
+/* SendPIOAvail bits */
+#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
+#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
+
+/* infinipath header format */
+struct ipath_header {
+	/*
+	 * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
+	 * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
+	 * Port 4, TID 11, offset 13.
+	 */
+	__le32 ver_port_tid_offset;
+	__le16 chksum;
+	__le16 pkt_flags;
+};
+
+/* infinipath user message header format.
+ * This structure contains the first 4 fields common to all protocols
+ * that employ infinipath.
+ */
+struct ipath_message_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+	/* fields below this point are in host byte order */
+	struct ipath_header iph;
+	__u8 sub_opcode;
+};
+
+/* infinipath ethernet header format */
+struct ether_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+	struct ipath_header iph;
+	__u8 sub_opcode;
+	__u8 cmd;
+	__be16 lid;
+	__u16 mac[3];
+	__u8 frag_num;
+	__u8 seq_num;
+	__le32 len;
+	/* MUST be of word size due to PIO write requirements */
+	__le32 csum;
+	__le16 csum_offset;
+	__le16 flags;
+	__u16 first_2_bytes;
+	__u8 unused[2];		/* currently unused */
+};
+
+
+/* IB - LRH header consts */
+#define IPATH_LRH_GRH 0x0003	/* 1. word of IB LRH - next header: GRH */
+#define IPATH_LRH_BTH 0x0002	/* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define IPATH_DEFAULT_P_KEY 0xFFFF
+#define IPATH_PERMISSIVE_LID 0xFFFF
+#define IPATH_AETH_CREDIT_SHIFT 24
+#define IPATH_AETH_CREDIT_MASK 0x1F
+#define IPATH_AETH_CREDIT_INVAL 0x1F
+#define IPATH_PSN_MASK 0xFFFFFF
+#define IPATH_MSN_MASK 0xFFFFFF
+#define IPATH_QPN_MASK 0xFFFFFF
+#define IPATH_MULTICAST_LID_BASE 0xC000
+#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
+#define IPATH_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from infinipath) */
+#define RCVHQ_RCV_TYPE_EXPECTED  0
+#define RCVHQ_RCV_TYPE_EAGER     1
+#define RCVHQ_RCV_TYPE_NON_KD    2
+#define RCVHQ_RCV_TYPE_ERROR     3
+
+
+/* sub OpCodes - ith4x  */
+#define IPATH_ITH4X_OPCODE_ENCAP 0x81
+#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
+
+#define IPATH_HEADER_QUEUE_WORDS 9
+
+/* functions for extracting fields from rcvhdrq entries for the driver.
+ */
+static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf)
+{
+	return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK;
+}
+
+static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf)
+{
+	return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
+	    & INFINIPATH_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf)
+{
+	return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
+		& INFINIPATH_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 ipath_hdrget_index(const __le32 * rbuf)
+{
+	return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
+	    & INFINIPATH_RHF_EGRINDEX_MASK;
+}
+
+static inline __u32 ipath_hdrget_seq(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT)
+		& INFINIPATH_RHF_SEQ_MASK;
+}
+
+static inline __u32 ipath_hdrget_offset(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT)
+		& INFINIPATH_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf)
+{
+	return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR;
+}
+
+static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword)
+{
+	return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
+	    & INFINIPATH_I_VERS_MASK;
+}
+
+#endif				/* _IPATH_COMMON_H */
diff --git a/drivers/staging/rdma/ipath/ipath_cq.c b/drivers/staging/rdma/ipath/ipath_cq.c
new file mode 100644
index 0000000..e9dd911
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_cq.c
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @sig: true if @entry is a solicitated entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
+{
+	struct ipath_cq_wc *wc;
+	unsigned long flags;
+	u32 head;
+	u32 next;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	/*
+	 * Note that the head pointer might be writable by user processes.
+	 * Take care to verify it is a sane value.
+	 */
+	wc = cq->queue;
+	head = wc->head;
+	if (head >= (unsigned) cq->ibcq.cqe) {
+		head = cq->ibcq.cqe;
+		next = 0;
+	} else
+		next = head + 1;
+	if (unlikely(next == wc->tail)) {
+		spin_unlock_irqrestore(&cq->lock, flags);
+		if (cq->ibcq.event_handler) {
+			struct ib_event ev;
+
+			ev.device = cq->ibcq.device;
+			ev.element.cq = &cq->ibcq;
+			ev.event = IB_EVENT_CQ_ERR;
+			cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+		}
+		return;
+	}
+	if (cq->ip) {
+		wc->uqueue[head].wr_id = entry->wr_id;
+		wc->uqueue[head].status = entry->status;
+		wc->uqueue[head].opcode = entry->opcode;
+		wc->uqueue[head].vendor_err = entry->vendor_err;
+		wc->uqueue[head].byte_len = entry->byte_len;
+		wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
+		wc->uqueue[head].qp_num = entry->qp->qp_num;
+		wc->uqueue[head].src_qp = entry->src_qp;
+		wc->uqueue[head].wc_flags = entry->wc_flags;
+		wc->uqueue[head].pkey_index = entry->pkey_index;
+		wc->uqueue[head].slid = entry->slid;
+		wc->uqueue[head].sl = entry->sl;
+		wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+		wc->uqueue[head].port_num = entry->port_num;
+		/* Make sure entry is written before the head index. */
+		smp_wmb();
+	} else
+		wc->kqueue[head] = *entry;
+	wc->head = next;
+
+	if (cq->notify == IB_CQ_NEXT_COMP ||
+	    (cq->notify == IB_CQ_SOLICITED && solicited)) {
+		cq->notify = IB_CQ_NONE;
+		cq->triggered++;
+		/*
+		 * This will cause send_complete() to be called in
+		 * another thread.
+		 */
+		tasklet_hi_schedule(&cq->comptask);
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	if (entry->status != IB_WC_SUCCESS)
+		to_idev(cq->ibcq.device)->n_wqe_errs++;
+}
+
+/**
+ * ipath_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context.  Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+	struct ipath_cq *cq = to_icq(ibcq);
+	struct ipath_cq_wc *wc;
+	unsigned long flags;
+	int npolled;
+	u32 tail;
+
+	/* The kernel can only poll a kernel completion queue */
+	if (cq->ip) {
+		npolled = -EINVAL;
+		goto bail;
+	}
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	wc = cq->queue;
+	tail = wc->tail;
+	if (tail > (u32) cq->ibcq.cqe)
+		tail = (u32) cq->ibcq.cqe;
+	for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+		if (tail == wc->head)
+			break;
+		/* The kernel doesn't need a RMB since it has the lock. */
+		*entry = wc->kqueue[tail];
+		if (tail >= cq->ibcq.cqe)
+			tail = 0;
+		else
+			tail++;
+	}
+	wc->tail = tail;
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+	return npolled;
+}
+
+static void send_complete(unsigned long data)
+{
+	struct ipath_cq *cq = (struct ipath_cq *)data;
+
+	/*
+	 * The completion handler will most likely rearm the notification
+	 * and poll for all pending entries.  If a new completion entry
+	 * is added while we are in this routine, tasklet_hi_schedule()
+	 * won't call us again until we return so we check triggered to
+	 * see if we need to call the handler again.
+	 */
+	for (;;) {
+		u8 triggered = cq->triggered;
+
+		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+		if (cq->triggered == triggered)
+			return;
+	}
+}
+
+/**
+ * ipath_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @attr: creation attributes
+ * @context: unused by the InfiniPath driver
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
+			      const struct ib_cq_init_attr *attr,
+			      struct ib_ucontext *context,
+			      struct ib_udata *udata)
+{
+	int entries = attr->cqe;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cq *cq;
+	struct ipath_cq_wc *wc;
+	struct ib_cq *ret;
+	u32 sz;
+
+	if (attr->flags)
+		return ERR_PTR(-EINVAL);
+
+	if (entries < 1 || entries > ib_ipath_max_cqes) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	/* Allocate the completion queue structure. */
+	cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto done;
+	}
+
+	/*
+	 * Allocate the completion queue entries and head/tail pointers.
+	 * This is allocated separately so that it can be resized and
+	 * also mapped into user space.
+	 * We need to use vmalloc() in order to support mmap and large
+	 * numbers of entries.
+	 */
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+	else
+		sz += sizeof(struct ib_wc) * (entries + 1);
+	wc = vmalloc_user(sz);
+	if (!wc) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_cq;
+	}
+
+	/*
+	 * Return the address of the WC as the offset to mmap.
+	 * See ipath_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+
+		cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
+		if (!cq->ip) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_wc;
+		}
+
+		err = ib_copy_to_udata(udata, &cq->ip->offset,
+				       sizeof(cq->ip->offset));
+		if (err) {
+			ret = ERR_PTR(err);
+			goto bail_ip;
+		}
+	} else
+		cq->ip = NULL;
+
+	spin_lock(&dev->n_cqs_lock);
+	if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
+		spin_unlock(&dev->n_cqs_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_cqs_allocated++;
+	spin_unlock(&dev->n_cqs_lock);
+
+	if (cq->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	/*
+	 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+	 * The number of entries should be >= the number requested or return
+	 * an error.
+	 */
+	cq->ibcq.cqe = entries;
+	cq->notify = IB_CQ_NONE;
+	cq->triggered = 0;
+	spin_lock_init(&cq->lock);
+	tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
+	wc->head = 0;
+	wc->tail = 0;
+	cq->queue = wc;
+
+	ret = &cq->ibcq;
+
+	goto done;
+
+bail_ip:
+	kfree(cq->ip);
+bail_wc:
+	vfree(wc);
+bail_cq:
+	kfree(cq);
+done:
+	return ret;
+}
+
+/**
+ * ipath_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int ipath_destroy_cq(struct ib_cq *ibcq)
+{
+	struct ipath_ibdev *dev = to_idev(ibcq->device);
+	struct ipath_cq *cq = to_icq(ibcq);
+
+	tasklet_kill(&cq->comptask);
+	spin_lock(&dev->n_cqs_lock);
+	dev->n_cqs_allocated--;
+	spin_unlock(&dev->n_cqs_lock);
+	if (cq->ip)
+		kref_put(&cq->ip->ref, ipath_release_mmap_info);
+	else
+		vfree(cq->queue);
+	kfree(cq);
+
+	return 0;
+}
+
+/**
+ * ipath_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context.  Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+	struct ipath_cq *cq = to_icq(ibcq);
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+	/*
+	 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+	 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+	 */
+	if (cq->notify != IB_CQ_NEXT_COMP)
+		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+	if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+	    cq->queue->head != cq->queue->tail)
+		ret = 1;
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return ret;
+}
+
+/**
+ * ipath_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Returns 0 for success.
+ */
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+	struct ipath_cq *cq = to_icq(ibcq);
+	struct ipath_cq_wc *old_wc;
+	struct ipath_cq_wc *wc;
+	u32 head, tail, n;
+	int ret;
+	u32 sz;
+
+	if (cqe < 1 || cqe > ib_ipath_max_cqes) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Need to use vmalloc() if we want to support large #s of entries.
+	 */
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+	else
+		sz += sizeof(struct ib_wc) * (cqe + 1);
+	wc = vmalloc_user(sz);
+	if (!wc) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	/* Check that we can write the offset to mmap. */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		__u64 offset = 0;
+
+		ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+		if (ret)
+			goto bail_free;
+	}
+
+	spin_lock_irq(&cq->lock);
+	/*
+	 * Make sure head and tail are sane since they
+	 * might be user writable.
+	 */
+	old_wc = cq->queue;
+	head = old_wc->head;
+	if (head > (u32) cq->ibcq.cqe)
+		head = (u32) cq->ibcq.cqe;
+	tail = old_wc->tail;
+	if (tail > (u32) cq->ibcq.cqe)
+		tail = (u32) cq->ibcq.cqe;
+	if (head < tail)
+		n = cq->ibcq.cqe + 1 + head - tail;
+	else
+		n = head - tail;
+	if (unlikely((u32)cqe < n)) {
+		ret = -EINVAL;
+		goto bail_unlock;
+	}
+	for (n = 0; tail != head; n++) {
+		if (cq->ip)
+			wc->uqueue[n] = old_wc->uqueue[tail];
+		else
+			wc->kqueue[n] = old_wc->kqueue[tail];
+		if (tail == (u32) cq->ibcq.cqe)
+			tail = 0;
+		else
+			tail++;
+	}
+	cq->ibcq.cqe = cqe;
+	wc->head = n;
+	wc->tail = 0;
+	cq->queue = wc;
+	spin_unlock_irq(&cq->lock);
+
+	vfree(old_wc);
+
+	if (cq->ip) {
+		struct ipath_ibdev *dev = to_idev(ibcq->device);
+		struct ipath_mmap_info *ip = cq->ip;
+
+		ipath_update_mmap_info(dev, ip, sz, wc);
+
+		/*
+		 * Return the offset to mmap.
+		 * See ipath_mmap() for details.
+		 */
+		if (udata && udata->outlen >= sizeof(__u64)) {
+			ret = ib_copy_to_udata(udata, &ip->offset,
+					       sizeof(ip->offset));
+			if (ret)
+				goto bail;
+		}
+
+		spin_lock_irq(&dev->pending_lock);
+		if (list_empty(&ip->pending_mmaps))
+			list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = 0;
+	goto bail;
+
+bail_unlock:
+	spin_unlock_irq(&cq->lock);
+bail_free:
+	vfree(wc);
+bail:
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_debug.h b/drivers/staging/rdma/ipath/ipath_debug.h
new file mode 100644
index 0000000..65926cd
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_debug.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_DEBUG_H
+#define _IPATH_DEBUG_H
+
+#ifndef _IPATH_DEBUGGING	/* debugging enabled or not */
+#define _IPATH_DEBUGGING 1
+#endif
+
+#if _IPATH_DEBUGGING
+
+/*
+ * Mask values for debugging.  The scheme allows us to compile out any
+ * of the debug tracing stuff, and if compiled in, to enable or disable
+ * dynamically.  This can be set at modprobe time also:
+ *      modprobe infinipath.ko infinipath_debug=7
+ */
+
+#define __IPATH_INFO        0x1	/* generic low verbosity stuff */
+#define __IPATH_DBG         0x2	/* generic debug */
+#define __IPATH_TRSAMPLE    0x8	/* generate trace buffer sample entries */
+/* leave some low verbosity spots open */
+#define __IPATH_VERBDBG     0x40	/* very verbose debug */
+#define __IPATH_PKTDBG      0x80	/* print packet data */
+/* print process startup (init)/exit messages */
+#define __IPATH_PROCDBG     0x100
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG       0x200
+#define __IPATH_ERRPKTDBG   0x400
+#define __IPATH_USER_SEND   0x1000	/* use user mode send */
+#define __IPATH_KERNEL_SEND 0x2000	/* use kernel mode send */
+#define __IPATH_EPKTDBG     0x4000	/* print ethernet packet data */
+#define __IPATH_IPATHDBG    0x10000	/* Ethernet (IPATH) gen debug */
+#define __IPATH_IPATHWARN   0x20000	/* Ethernet (IPATH) warnings */
+#define __IPATH_IPATHERR    0x40000	/* Ethernet (IPATH) errors */
+#define __IPATH_IPATHPD     0x80000	/* Ethernet (IPATH) packet dump */
+#define __IPATH_IPATHTABLE  0x100000	/* Ethernet (IPATH) table dump */
+#define __IPATH_LINKVERBDBG 0x200000	/* very verbose linkchange debug */
+
+#else				/* _IPATH_DEBUGGING */
+
+/*
+ * define all of these even with debugging off, for the few places that do
+ * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the
+ * compiler eliminate the code
+ */
+
+#define __IPATH_INFO      0x0	/* generic low verbosity stuff */
+#define __IPATH_DBG       0x0	/* generic debug */
+#define __IPATH_TRSAMPLE  0x0	/* generate trace buffer sample entries */
+#define __IPATH_VERBDBG   0x0	/* very verbose debug */
+#define __IPATH_PKTDBG    0x0	/* print packet data */
+#define __IPATH_PROCDBG   0x0	/* process startup (init)/exit messages */
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG     0x0
+#define __IPATH_EPKTDBG   0x0	/* print ethernet packet data */
+#define __IPATH_IPATHDBG  0x0	/* Ethernet (IPATH) table dump on */
+#define __IPATH_IPATHWARN 0x0	/* Ethernet (IPATH) warnings on   */
+#define __IPATH_IPATHERR  0x0	/* Ethernet (IPATH) errors on   */
+#define __IPATH_IPATHPD   0x0	/* Ethernet (IPATH) packet dump on   */
+#define __IPATH_IPATHTABLE 0x0	/* Ethernet (IPATH) packet dump on   */
+#define __IPATH_LINKVERBDBG 0x0	/* very verbose linkchange debug */
+
+#endif				/* _IPATH_DEBUGGING */
+
+#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
+
+#endif				/* _IPATH_DEBUG_H */
diff --git a/drivers/staging/rdma/ipath/ipath_diag.c b/drivers/staging/rdma/ipath/ipath_diag.c
new file mode 100644
index 0000000..45802e9
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_diag.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains support for diagnostic functions.  It is accessed by
+ * opening the ipath_diag device, normally minor number 129.  Diagnostic use
+ * of the InfiniPath chip may render the chip or board unusable until the
+ * driver is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/export.h>
+#include <asm/uaccess.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+int ipath_diag_inuse;
+static int diag_set_link;
+
+static int ipath_diag_open(struct inode *in, struct file *fp);
+static int ipath_diag_release(struct inode *in, struct file *fp);
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+			       size_t count, loff_t *off);
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+				size_t count, loff_t *off);
+
+static const struct file_operations diag_file_ops = {
+	.owner = THIS_MODULE,
+	.write = ipath_diag_write,
+	.read = ipath_diag_read,
+	.open = ipath_diag_open,
+	.release = ipath_diag_release,
+	.llseek = default_llseek,
+};
+
+static ssize_t ipath_diagpkt_write(struct file *fp,
+				   const char __user *data,
+				   size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+	.owner = THIS_MODULE,
+	.write = ipath_diagpkt_write,
+	.llseek = noop_llseek,
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev *diagpkt_cdev;
+static struct device *diagpkt_dev;
+
+int ipath_diag_add(struct ipath_devdata *dd)
+{
+	char name[16];
+	int ret = 0;
+
+	if (atomic_inc_return(&diagpkt_count) == 1) {
+		ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR,
+				      "ipath_diagpkt", &diagpkt_file_ops,
+				      &diagpkt_cdev, &diagpkt_dev);
+
+		if (ret) {
+			ipath_dev_err(dd, "Couldn't create ipath_diagpkt "
+				      "device: %d", ret);
+			goto done;
+		}
+	}
+
+	snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit);
+
+	ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name,
+			      &diag_file_ops, &dd->diag_cdev,
+			      &dd->diag_dev);
+	if (ret)
+		ipath_dev_err(dd, "Couldn't create %s device: %d",
+			      name, ret);
+
+done:
+	return ret;
+}
+
+void ipath_diag_remove(struct ipath_devdata *dd)
+{
+	if (atomic_dec_and_test(&diagpkt_count))
+		ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev);
+
+	ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev);
+}
+
+/**
+ * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy (multiple of 32 bits)
+ *
+ * This function also localizes all chip memory accesses.
+ * The copy should be written such that we read full cacheline packets
+ * from the chip.  This is usually used for a single qword
+ *
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
+			     const void __iomem *caddr, size_t count)
+{
+	const u64 __iomem *reg_addr = caddr;
+	const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+	int ret;
+
+	/* not very efficient, but it works for now */
+	if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	while (reg_addr < reg_end) {
+		u64 data = readq(reg_addr);
+		if (copy_to_user(uaddr, &data, sizeof(u64))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		reg_addr++;
+		uaddr += sizeof(u64);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: the number of bytes to copy (multiple of 32 bits)
+ *
+ * This is usually used for a single qword
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+
+static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
+			      const void __user *uaddr, size_t count)
+{
+	u64 __iomem *reg_addr = caddr;
+	const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+	int ret;
+
+	/* not very efficient, but it works for now */
+	if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	while (reg_addr < reg_end) {
+		u64 data;
+		if (copy_from_user(&data, uaddr, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		writeq(data, reg_addr);
+
+		reg_addr++;
+		uaddr += sizeof(u64);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy
+ *
+ * read 32 bit values, not 64 bit; for memories that only
+ * support 32 bit reads; usually a single dword.
+ */
+static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
+			     const void __iomem *caddr, size_t count)
+{
+	const u32 __iomem *reg_addr = caddr;
+	const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+	int ret;
+
+	if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+	    reg_end > (u32 __iomem *) dd->ipath_kregend) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	/* not very efficient, but it works for now */
+	while (reg_addr < reg_end) {
+		u32 data = readl(reg_addr);
+		if (copy_to_user(uaddr, &data, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+
+		reg_addr++;
+		uaddr += sizeof(u32);
+
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: number of bytes to copy
+ *
+ * write 32 bit values, not 64 bit; for memories that only
+ * support 32 bit write; usually a single dword.
+ */
+
+static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
+			      const void __user *uaddr, size_t count)
+{
+	u32 __iomem *reg_addr = caddr;
+	const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+	int ret;
+
+	if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+	    reg_end > (u32 __iomem *) dd->ipath_kregend) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	while (reg_addr < reg_end) {
+		u32 data;
+		if (copy_from_user(&data, uaddr, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		writel(data, reg_addr);
+
+		reg_addr++;
+		uaddr += sizeof(u32);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+static int ipath_diag_open(struct inode *in, struct file *fp)
+{
+	int unit = iminor(in) - IPATH_DIAG_MINOR_BASE;
+	struct ipath_devdata *dd;
+	int ret;
+
+	mutex_lock(&ipath_mutex);
+
+	if (ipath_diag_inuse) {
+		ret = -EBUSY;
+		goto bail;
+	}
+
+	dd = ipath_lookup(unit);
+
+	if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) ||
+	    !dd->ipath_kregbase) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	fp->private_data = dd;
+	ipath_diag_inuse = -2;
+	diag_set_link = 0;
+	ret = 0;
+
+	/* Only expose a way to reset the device if we
+	   make it into diag mode. */
+	ipath_expose_reset(&dd->pcidev->dev);
+
+bail:
+	mutex_unlock(&ipath_mutex);
+
+	return ret;
+}
+
+/**
+ * ipath_diagpkt_write - write an IB packet
+ * @fp: the diag data device file pointer
+ * @data: ipath_diag_pkt structure saying where to get the packet
+ * @count: size of data to write
+ * @off: unused by this code
+ */
+static ssize_t ipath_diagpkt_write(struct file *fp,
+				   const char __user *data,
+				   size_t count, loff_t *off)
+{
+	u32 __iomem *piobuf;
+	u32 plen, pbufn, maxlen_reserve;
+	struct ipath_diag_pkt odp;
+	struct ipath_diag_xpkt dp;
+	u32 *tmpbuf = NULL;
+	struct ipath_devdata *dd;
+	ssize_t ret = 0;
+	u64 val;
+	u32 l_state, lt_state; /* LinkState, LinkTrainingState */
+
+
+	if (count == sizeof(dp)) {
+		if (copy_from_user(&dp, data, sizeof(dp))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+	} else if (count == sizeof(odp)) {
+		if (copy_from_user(&odp, data, sizeof(odp))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		dp.len = odp.len;
+		dp.unit = odp.unit;
+		dp.data = odp.data;
+		dp.pbc_wd = 0;
+	} else {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* send count must be an exact number of dwords */
+	if (dp.len & 3) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	plen = dp.len >> 2;
+
+	dd = ipath_lookup(dp.unit);
+	if (!dd || !(dd->ipath_flags & IPATH_PRESENT) ||
+	    !dd->ipath_kregbase) {
+		ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n",
+			   dp.unit);
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	if (ipath_diag_inuse && !diag_set_link &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		diag_set_link = 1;
+		ipath_cdbg(VERBOSE, "Trying to set to set link active for "
+			   "diag pkt\n");
+		ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+		ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+	}
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit);
+		ret = -ENODEV;
+		goto bail;
+	}
+	/*
+	 * Want to skip check for l_state if using custom PBC,
+	 * because we might be trying to force an SM packet out.
+	 * first-cut, skip _all_ state checking in that case.
+	 */
+	val = ipath_ib_state(dd, dd->ipath_lastibcstat);
+	lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+	l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+	if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP ||
+	    (val != dd->ib_init && val != dd->ib_arm &&
+	    val != dd->ib_active))) {
+		ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n",
+			   dd->ipath_unit, (unsigned long long) val);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * need total length before first word written, plus 2 Dwords. One Dword
+	 * is for padding so we get the full user data when not aligned on
+	 * a word boundary. The other Dword is to make sure we have room for the
+	 * ICRC which gets tacked on later.
+	 */
+	maxlen_reserve = 2 * sizeof(u32);
+	if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {
+		ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n",
+			  dp.len, dd->ipath_ibmaxlen);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	plen = sizeof(u32) + dp.len;
+
+	tmpbuf = vmalloc(plen);
+	if (!tmpbuf) {
+		dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, "
+			 "failing\n");
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	if (copy_from_user(tmpbuf,
+			   (const void __user *) (unsigned long) dp.data,
+			   dp.len)) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	plen >>= 2;		/* in dwords */
+
+	piobuf = ipath_getpiobuf(dd, plen, &pbufn);
+	if (!piobuf) {
+		ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n",
+			   dd->ipath_unit);
+		ret = -EBUSY;
+		goto bail;
+	}
+	/* disarm it just to be extra sure */
+	ipath_disarm_piobufs(dd, pbufn, 1);
+
+	if (ipath_debug & __IPATH_PKTDBG)
+		ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n",
+			   dd->ipath_unit, plen - 1, pbufn);
+
+	if (dp.pbc_wd == 0)
+		dp.pbc_wd = plen;
+	writeq(dp.pbc_wd, piobuf);
+	/*
+	 * Copy all by the trigger word, then flush, so it's written
+	 * to chip before trigger word, then write trigger word, then
+	 * flush again, so packet is sent.
+	 */
+	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+		ipath_flush_wc();
+		__iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);
+		ipath_flush_wc();
+		__raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
+	} else
+		__iowrite32_copy(piobuf + 2, tmpbuf, plen);
+
+	ipath_flush_wc();
+
+	ret = sizeof(dp);
+
+bail:
+	vfree(tmpbuf);
+	return ret;
+}
+
+static int ipath_diag_release(struct inode *in, struct file *fp)
+{
+	mutex_lock(&ipath_mutex);
+	ipath_diag_inuse = 0;
+	fp->private_data = NULL;
+	mutex_unlock(&ipath_mutex);
+	return 0;
+}
+
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+			       size_t count, loff_t *off)
+{
+	struct ipath_devdata *dd = fp->private_data;
+	void __iomem *kreg_base;
+	ssize_t ret;
+
+	kreg_base = dd->ipath_kregbase;
+
+	if (count == 0)
+		ret = 0;
+	else if ((count % 4) || (*off % 4))
+		/* address or length is not 32-bit aligned, hence invalid */
+		ret = -EINVAL;
+	else if (ipath_diag_inuse < 1 && (*off || count != 8))
+		ret = -EINVAL;  /* prevent cat /dev/ipath_diag* */
+	else if ((count % 8) || (*off % 8))
+		/* address or length not 64-bit aligned; do 32-bit reads */
+		ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
+	else
+		ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
+
+	if (ret >= 0) {
+		*off += count;
+		ret = count;
+		if (ipath_diag_inuse == -2)
+			ipath_diag_inuse++;
+	}
+
+	return ret;
+}
+
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+				size_t count, loff_t *off)
+{
+	struct ipath_devdata *dd = fp->private_data;
+	void __iomem *kreg_base;
+	ssize_t ret;
+
+	kreg_base = dd->ipath_kregbase;
+
+	if (count == 0)
+		ret = 0;
+	else if ((count % 4) || (*off % 4))
+		/* address or length is not 32-bit aligned, hence invalid */
+		ret = -EINVAL;
+	else if ((ipath_diag_inuse == -1 && (*off || count != 8)) ||
+		 ipath_diag_inuse == -2)  /* read qw off 0, write qw off 0 */
+		ret = -EINVAL;  /* before any other write allowed */
+	else if ((count % 8) || (*off % 8))
+		/* address or length not 64-bit aligned; do 32-bit writes */
+		ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
+	else
+		ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
+
+	if (ret >= 0) {
+		*off += count;
+		ret = count;
+		if (ipath_diag_inuse == -1)
+			ipath_diag_inuse = 1; /* all read/write OK now */
+	}
+
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_dma.c b/drivers/staging/rdma/ipath/ipath_dma.c
new file mode 100644
index 0000000..123a8c0
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_dma.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2006 QLogic, Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 ipath_dma_map_single(struct ib_device *dev,
+			        void *cpu_addr, size_t size,
+			        enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+	return (u64) cpu_addr;
+}
+
+static void ipath_dma_unmap_single(struct ib_device *dev,
+				   u64 addr, size_t size,
+				   enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static u64 ipath_dma_map_page(struct ib_device *dev,
+			      struct page *page,
+			      unsigned long offset,
+			      size_t size,
+			      enum dma_data_direction direction)
+{
+	u64 addr;
+
+	BUG_ON(!valid_dma_direction(direction));
+
+	if (offset + size > PAGE_SIZE) {
+		addr = BAD_DMA_ADDRESS;
+		goto done;
+	}
+
+	addr = (u64) page_address(page);
+	if (addr)
+		addr += offset;
+	/* TODO: handle highmem pages */
+
+done:
+	return addr;
+}
+
+static void ipath_dma_unmap_page(struct ib_device *dev,
+				 u64 addr, size_t size,
+				 enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+			int nents, enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	u64 addr;
+	int i;
+	int ret = nents;
+
+	BUG_ON(!valid_dma_direction(direction));
+
+	for_each_sg(sgl, sg, nents, i) {
+		addr = (u64) page_address(sg_page(sg));
+		/* TODO: handle highmem pages */
+		if (!addr) {
+			ret = 0;
+			break;
+		}
+		sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+		sg->dma_length = sg->length;
+#endif
+	}
+	return ret;
+}
+
+static void ipath_unmap_sg(struct ib_device *dev,
+			   struct scatterlist *sg, int nents,
+			   enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static void ipath_sync_single_for_cpu(struct ib_device *dev,
+				      u64 addr,
+				      size_t size,
+				      enum dma_data_direction dir)
+{
+}
+
+static void ipath_sync_single_for_device(struct ib_device *dev,
+					 u64 addr,
+					 size_t size,
+					 enum dma_data_direction dir)
+{
+}
+
+static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size,
+				      u64 *dma_handle, gfp_t flag)
+{
+	struct page *p;
+	void *addr = NULL;
+
+	p = alloc_pages(flag, get_order(size));
+	if (p)
+		addr = page_address(p);
+	if (dma_handle)
+		*dma_handle = (u64) addr;
+	return addr;
+}
+
+static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,
+				    void *cpu_addr, u64 dma_handle)
+{
+	free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops ipath_dma_mapping_ops = {
+	.mapping_error = ipath_mapping_error,
+	.map_single = ipath_dma_map_single,
+	.unmap_single = ipath_dma_unmap_single,
+	.map_page = ipath_dma_map_page,
+	.unmap_page = ipath_dma_unmap_page,
+	.map_sg = ipath_map_sg,
+	.unmap_sg = ipath_unmap_sg,
+	.sync_single_for_cpu = ipath_sync_single_for_cpu,
+	.sync_single_for_device = ipath_sync_single_for_device,
+	.alloc_coherent = ipath_dma_alloc_coherent,
+	.free_coherent = ipath_dma_free_coherent
+};
diff --git a/drivers/staging/rdma/ipath/ipath_driver.c b/drivers/staging/rdma/ipath/ipath_driver.c
new file mode 100644
index 0000000..871dbe5
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_driver.c
@@ -0,0 +1,2789 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#ifdef CONFIG_X86_64
+#include <asm/pat.h>
+#endif
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+
+static void ipath_update_pio_bufs(struct ipath_devdata *);
+
+const char *ipath_get_unit_name(int unit)
+{
+	static char iname[16];
+	snprintf(iname, sizeof iname, "infinipath%u", unit);
+	return iname;
+}
+
+#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
+#define PFX IPATH_DRV_NAME ": "
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the init code.
+ */
+const char ib_ipath_version[] = IPATH_IDSTR "\n";
+
+static struct idr unit_table;
+DEFINE_SPINLOCK(ipath_devs_lock);
+LIST_HEAD(ipath_dev_list);
+
+wait_queue_head_t ipath_state_wait;
+
+unsigned ipath_debug = __IPATH_INFO;
+
+module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "mask for debug prints");
+EXPORT_SYMBOL_GPL(ipath_debug);
+
+unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
+module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
+MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
+
+static unsigned ipath_hol_timeout_ms = 13000;
+module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
+MODULE_PARM_DESC(hol_timeout_ms,
+	"duration of user app suspension after link failure");
+
+unsigned ipath_linkrecovery = 1;
+module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("QLogic <support@qlogic.com>");
+MODULE_DESCRIPTION("QLogic InfiniPath driver");
+
+/*
+ * Table to translate the LINKTRAININGSTATE portion of
+ * IBCStatus to a human-readable form.
+ */
+const char *ipath_ibcstatus_str[] = {
+	"Disabled",
+	"LinkUp",
+	"PollActive",
+	"PollQuiet",
+	"SleepDelay",
+	"SleepQuiet",
+	"LState6",		/* unused */
+	"LState7",		/* unused */
+	"CfgDebounce",
+	"CfgRcvfCfg",
+	"CfgWaitRmt",
+	"CfgIdle",
+	"RecovRetrain",
+	"CfgTxRevLane",		/* unused before IBA7220 */
+	"RecovWaitRmt",
+	"RecovIdle",
+	/* below were added for IBA7220 */
+	"CfgEnhanced",
+	"CfgTest",
+	"CfgWaitRmtTest",
+	"CfgWaitCfgEnhanced",
+	"SendTS_T",
+	"SendTstIdles",
+	"RcvTS_T",
+	"SendTst_TS1s",
+	"LTState18", "LTState19", "LTState1A", "LTState1B",
+	"LTState1C", "LTState1D", "LTState1E", "LTState1F"
+};
+
+static void ipath_remove_one(struct pci_dev *);
+static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
+
+/* Only needed for registration, nothing else needs this info */
+#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
+#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
+
+/* Number of seconds before our card status check...  */
+#define STATUS_TIMEOUT 60
+
+static const struct pci_device_id ipath_pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
+
+static struct pci_driver ipath_driver = {
+	.name = IPATH_DRV_NAME,
+	.probe = ipath_init_one,
+	.remove = ipath_remove_one,
+	.id_table = ipath_pci_tbl,
+	.driver = {
+		.groups = ipath_driver_attr_groups,
+	},
+};
+
+static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
+			     u32 *bar0, u32 *bar1)
+{
+	int ret;
+
+	ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
+	if (ret)
+		ipath_dev_err(dd, "failed to read bar0 before enable: "
+			      "error %d\n", -ret);
+
+	ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
+	if (ret)
+		ipath_dev_err(dd, "failed to read bar1 before enable: "
+			      "error %d\n", -ret);
+
+	ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
+}
+
+static void ipath_free_devdata(struct pci_dev *pdev,
+			       struct ipath_devdata *dd)
+{
+	unsigned long flags;
+
+	pci_set_drvdata(pdev, NULL);
+
+	if (dd->ipath_unit != -1) {
+		spin_lock_irqsave(&ipath_devs_lock, flags);
+		idr_remove(&unit_table, dd->ipath_unit);
+		list_del(&dd->ipath_list);
+		spin_unlock_irqrestore(&ipath_devs_lock, flags);
+	}
+	vfree(dd);
+}
+
+static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
+{
+	unsigned long flags;
+	struct ipath_devdata *dd;
+	int ret;
+
+	dd = vzalloc(sizeof(*dd));
+	if (!dd) {
+		dd = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+	dd->ipath_unit = -1;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+
+	ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Could not allocate unit ID: error %d\n", -ret);
+		ipath_free_devdata(pdev, dd);
+		dd = ERR_PTR(ret);
+		goto bail_unlock;
+	}
+	dd->ipath_unit = ret;
+
+	dd->pcidev = pdev;
+	pci_set_drvdata(pdev, dd);
+
+	list_add(&dd->ipath_list, &ipath_dev_list);
+
+bail_unlock:
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+	idr_preload_end();
+bail:
+	return dd;
+}
+
+static inline struct ipath_devdata *__ipath_lookup(int unit)
+{
+	return idr_find(&unit_table, unit);
+}
+
+struct ipath_devdata *ipath_lookup(int unit)
+{
+	struct ipath_devdata *dd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+	dd = __ipath_lookup(unit);
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+	return dd;
+}
+
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
+{
+	int nunits, npresent, nup;
+	struct ipath_devdata *dd;
+	unsigned long flags;
+	int maxports;
+
+	nunits = npresent = nup = maxports = 0;
+
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+
+	list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+		nunits++;
+		if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
+			npresent++;
+		if (dd->ipath_lid &&
+		    !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
+					 | IPATH_LINKUNK)))
+			nup++;
+		if (dd->ipath_cfgports > maxports)
+			maxports = dd->ipath_cfgports;
+	}
+
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+	if (npresentp)
+		*npresentp = npresent;
+	if (nupp)
+		*nupp = nup;
+	if (maxportsp)
+		*maxportsp = maxports;
+
+	return nunits;
+}
+
+/*
+ * These next two routines are placeholders in case we don't have per-arch
+ * code for controlling write combining.  If explicit control of write
+ * combining is not available, performance will probably be awful.
+ */
+
+int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
+{
+	return -EOPNOTSUPP;
+}
+
+void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
+{
+}
+
+/*
+ * Perform a PIO buffer bandwidth write test, to verify proper system
+ * configuration.  Even when all the setup calls work, occasionally
+ * BIOS or other issues can prevent write combining from working, or
+ * can cause other bandwidth problems to the chip.
+ *
+ * This test simply writes the same buffer over and over again, and
+ * measures close to the peak bandwidth to the chip (not testing
+ * data bandwidth to the wire).   On chips that use an address-based
+ * trigger to send packets to the wire, this is easy.  On chips that
+ * use a count to trigger, we want to make sure that the packet doesn't
+ * go out on the wire, or trigger flow control checks.
+ */
+static void ipath_verify_pioperf(struct ipath_devdata *dd)
+{
+	u32 pbnum, cnt, lcnt;
+	u32 __iomem *piobuf;
+	u32 *addr;
+	u64 msecs, emsecs;
+
+	piobuf = ipath_getpiobuf(dd, 0, &pbnum);
+	if (!piobuf) {
+		dev_info(&dd->pcidev->dev,
+			"No PIObufs for checking perf, skipping\n");
+		return;
+	}
+
+	/*
+	 * Enough to give us a reasonable test, less than piobuf size, and
+	 * likely multiple of store buffer length.
+	 */
+	cnt = 1024;
+
+	addr = vmalloc(cnt);
+	if (!addr) {
+		dev_info(&dd->pcidev->dev,
+			"Couldn't get memory for checking PIO perf,"
+			" skipping\n");
+		goto done;
+	}
+
+	preempt_disable();  /* we want reasonably accurate elapsed time */
+	msecs = 1 + jiffies_to_msecs(jiffies);
+	for (lcnt = 0; lcnt < 10000U; lcnt++) {
+		/* wait until we cross msec boundary */
+		if (jiffies_to_msecs(jiffies) >= msecs)
+			break;
+		udelay(1);
+	}
+
+	ipath_disable_armlaunch(dd);
+
+	/*
+	 * length 0, no dwords actually sent, and mark as VL15
+	 * on chips where that may matter (due to IB flowcontrol)
+	 */
+	if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
+		writeq(1UL << 63, piobuf);
+	else
+		writeq(0, piobuf);
+	ipath_flush_wc();
+
+	/*
+	 * this is only roughly accurate, since even with preempt we
+	 * still take interrupts that could take a while.   Running for
+	 * >= 5 msec seems to get us "close enough" to accurate values
+	 */
+	msecs = jiffies_to_msecs(jiffies);
+	for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
+		__iowrite32_copy(piobuf + 64, addr, cnt >> 2);
+		emsecs = jiffies_to_msecs(jiffies) - msecs;
+	}
+
+	/* 1 GiB/sec, slightly over IB SDR line rate */
+	if (lcnt < (emsecs * 1024U))
+		ipath_dev_err(dd,
+			"Performance problem: bandwidth to PIO buffers is "
+			"only %u MiB/sec\n",
+			lcnt / (u32) emsecs);
+	else
+		ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
+			lcnt / (u32) emsecs);
+
+	preempt_enable();
+
+	vfree(addr);
+
+done:
+	/* disarm piobuf, so it's available again */
+	ipath_disarm_piobufs(dd, pbnum, 1);
+	ipath_enable_armlaunch(dd);
+}
+
+static void cleanup_device(struct ipath_devdata *dd);
+
+static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int ret, len, j;
+	struct ipath_devdata *dd;
+	unsigned long long addr;
+	u32 bar0 = 0, bar1 = 0;
+
+#ifdef CONFIG_X86_64
+	if (pat_enabled()) {
+		pr_warn("ipath needs PAT disabled, boot with nopat kernel parameter\n");
+		ret = -ENODEV;
+		goto bail;
+	}
+#endif
+
+	dd = ipath_alloc_devdata(pdev);
+	if (IS_ERR(dd)) {
+		ret = PTR_ERR(dd);
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Could not allocate devdata: error %d\n", -ret);
+		goto bail;
+	}
+
+	ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
+
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		/* This can happen iff:
+		 *
+		 * We did a chip reset, and then failed to reprogram the
+		 * BAR, or the chip reset due to an internal error.  We then
+		 * unloaded the driver and reloaded it.
+		 *
+		 * Both reset cases set the BAR back to initial state.  For
+		 * the latter case, the AER sticky error bit at offset 0x718
+		 * should be set, but the Linux kernel doesn't yet know
+		 * about that, it appears.  If the original BAR was retained
+		 * in the kernel data structures, this may be OK.
+		 */
+		ipath_dev_err(dd, "enable unit %d failed: error %d\n",
+			      dd->ipath_unit, -ret);
+		goto bail_devdata;
+	}
+	addr = pci_resource_start(pdev, 0);
+	len = pci_resource_len(pdev, 0);
+	ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
+		   "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
+		   ent->device, ent->driver_data);
+
+	read_bars(dd, pdev, &bar0, &bar1);
+
+	if (!bar1 && !(bar0 & ~0xf)) {
+		if (addr) {
+			dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
+				 "rewriting as %llx\n", addr);
+			ret = pci_write_config_dword(
+				pdev, PCI_BASE_ADDRESS_0, addr);
+			if (ret) {
+				ipath_dev_err(dd, "rewrite of BAR0 "
+					      "failed: err %d\n", -ret);
+				goto bail_disable;
+			}
+			ret = pci_write_config_dword(
+				pdev, PCI_BASE_ADDRESS_1, addr >> 32);
+			if (ret) {
+				ipath_dev_err(dd, "rewrite of BAR1 "
+					      "failed: err %d\n", -ret);
+				goto bail_disable;
+			}
+		} else {
+			ipath_dev_err(dd, "BAR is 0 (probable RESET), "
+				      "not usable until reboot\n");
+			ret = -ENODEV;
+			goto bail_disable;
+		}
+	}
+
+	ret = pci_request_regions(pdev, IPATH_DRV_NAME);
+	if (ret) {
+		dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
+			 "err %d\n", dd->ipath_unit, -ret);
+		goto bail_disable;
+	}
+
+	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (ret) {
+		/*
+		 * if the 64 bit setup fails, try 32 bit.  Some systems
+		 * do not setup 64 bit maps on systems with 2GB or less
+		 * memory installed.
+		 */
+		ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (ret) {
+			dev_info(&pdev->dev,
+				"Unable to set DMA mask for unit %u: %d\n",
+				dd->ipath_unit, ret);
+			goto bail_regions;
+		}
+		else {
+			ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
+			ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+			if (ret)
+				dev_info(&pdev->dev,
+					"Unable to set DMA consistent mask "
+					"for unit %u: %d\n",
+					dd->ipath_unit, ret);
+
+		}
+	}
+	else {
+		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		if (ret)
+			dev_info(&pdev->dev,
+				"Unable to set DMA consistent mask "
+				"for unit %u: %d\n",
+				dd->ipath_unit, ret);
+	}
+
+	pci_set_master(pdev);
+
+	/*
+	 * Save BARs to rewrite after device reset.  Save all 64 bits of
+	 * BAR, just in case.
+	 */
+	dd->ipath_pcibar0 = addr;
+	dd->ipath_pcibar1 = addr >> 32;
+	dd->ipath_deviceid = ent->device;	/* save for later use */
+	dd->ipath_vendorid = ent->vendor;
+
+	/* setup the chip-specific functions, as early as possible. */
+	switch (ent->device) {
+	case PCI_DEVICE_ID_INFINIPATH_HT:
+		ipath_init_iba6110_funcs(dd);
+		break;
+
+	default:
+		ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
+			      "failing\n", ent->device);
+		return -ENODEV;
+	}
+
+	for (j = 0; j < 6; j++) {
+		if (!pdev->resource[j].start)
+			continue;
+		ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
+			   j, &pdev->resource[j],
+			   (unsigned long long)pci_resource_len(pdev, j));
+	}
+
+	if (!addr) {
+		ipath_dev_err(dd, "No valid address in BAR 0!\n");
+		ret = -ENODEV;
+		goto bail_regions;
+	}
+
+	dd->ipath_pcirev = pdev->revision;
+
+#if defined(__powerpc__)
+	/* There isn't a generic way to specify writethrough mappings */
+	dd->ipath_kregbase = __ioremap(addr, len,
+		(_PAGE_NO_CACHE|_PAGE_WRITETHRU));
+#else
+	/* XXX: split this properly to enable on PAT */
+	dd->ipath_kregbase = ioremap_nocache(addr, len);
+#endif
+
+	if (!dd->ipath_kregbase) {
+		ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
+			  addr);
+		ret = -ENOMEM;
+		goto bail_iounmap;
+	}
+	dd->ipath_kregend = (u64 __iomem *)
+		((void __iomem *)dd->ipath_kregbase + len);
+	dd->ipath_physaddr = addr;	/* used for io_remap, etc. */
+	/* for user mmap */
+	ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
+		   addr, dd->ipath_kregbase);
+
+	if (dd->ipath_f_bus(dd, pdev))
+		ipath_dev_err(dd, "Failed to setup config space; "
+			      "continuing anyway\n");
+
+	/*
+	 * set up our interrupt handler; IRQF_SHARED probably not needed,
+	 * since MSI interrupts shouldn't be shared but won't  hurt for now.
+	 * check 0 irq after we return from chip-specific bus setup, since
+	 * that can affect this due to setup
+	 */
+	if (!dd->ipath_irq)
+		ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
+			      "work\n");
+	else {
+		ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
+				  IPATH_DRV_NAME, dd);
+		if (ret) {
+			ipath_dev_err(dd, "Couldn't setup irq handler, "
+				      "irq=%d: %d\n", dd->ipath_irq, ret);
+			goto bail_iounmap;
+		}
+	}
+
+	ret = ipath_init_chip(dd, 0);	/* do the chip-specific init */
+	if (ret)
+		goto bail_irqsetup;
+
+	ret = ipath_enable_wc(dd);
+
+	if (ret)
+		ret = 0;
+
+	ipath_verify_pioperf(dd);
+
+	ipath_device_create_group(&pdev->dev, dd);
+	ipathfs_add_device(dd);
+	ipath_user_add(dd);
+	ipath_diag_add(dd);
+	ipath_register_ib_device(dd);
+
+	goto bail;
+
+bail_irqsetup:
+	cleanup_device(dd);
+
+	if (dd->ipath_irq)
+		dd->ipath_f_free_irq(dd);
+
+	if (dd->ipath_f_cleanup)
+		dd->ipath_f_cleanup(dd);
+
+bail_iounmap:
+	iounmap((volatile void __iomem *) dd->ipath_kregbase);
+
+bail_regions:
+	pci_release_regions(pdev);
+
+bail_disable:
+	pci_disable_device(pdev);
+
+bail_devdata:
+	ipath_free_devdata(pdev, dd);
+
+bail:
+	return ret;
+}
+
+static void cleanup_device(struct ipath_devdata *dd)
+{
+	int port;
+	struct ipath_portdata **tmp;
+	unsigned long flags;
+
+	if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+		/* can't do anything more with chip; needs re-init */
+		*dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+		if (dd->ipath_kregbase) {
+			/*
+			 * if we haven't already cleaned up before these are
+			 * to ensure any register reads/writes "fail" until
+			 * re-init
+			 */
+			dd->ipath_kregbase = NULL;
+			dd->ipath_uregbase = 0;
+			dd->ipath_sregbase = 0;
+			dd->ipath_cregbase = 0;
+			dd->ipath_kregsize = 0;
+		}
+		ipath_disable_wc(dd);
+	}
+
+	if (dd->ipath_spectriggerhit)
+		dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
+			 dd->ipath_spectriggerhit);
+
+	if (dd->ipath_pioavailregs_dma) {
+		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+				  (void *) dd->ipath_pioavailregs_dma,
+				  dd->ipath_pioavailregs_phys);
+		dd->ipath_pioavailregs_dma = NULL;
+	}
+	if (dd->ipath_dummy_hdrq) {
+		dma_free_coherent(&dd->pcidev->dev,
+			dd->ipath_pd[0]->port_rcvhdrq_size,
+			dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
+		dd->ipath_dummy_hdrq = NULL;
+	}
+
+	if (dd->ipath_pageshadow) {
+		struct page **tmpp = dd->ipath_pageshadow;
+		dma_addr_t *tmpd = dd->ipath_physshadow;
+		int i, cnt = 0;
+
+		ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
+			   "locked\n");
+		for (port = 0; port < dd->ipath_cfgports; port++) {
+			int port_tidbase = port * dd->ipath_rcvtidcnt;
+			int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+			for (i = port_tidbase; i < maxtid; i++) {
+				if (!tmpp[i])
+					continue;
+				pci_unmap_page(dd->pcidev, tmpd[i],
+					PAGE_SIZE, PCI_DMA_FROMDEVICE);
+				ipath_release_user_pages(&tmpp[i], 1);
+				tmpp[i] = NULL;
+				cnt++;
+			}
+		}
+		if (cnt) {
+			ipath_stats.sps_pageunlocks += cnt;
+			ipath_cdbg(VERBOSE, "There were still %u expTID "
+				   "entries locked\n", cnt);
+		}
+		if (ipath_stats.sps_pagelocks ||
+		    ipath_stats.sps_pageunlocks)
+			ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
+				   "unlocked via ipath_m{un}lock\n",
+				   (unsigned long long)
+				   ipath_stats.sps_pagelocks,
+				   (unsigned long long)
+				   ipath_stats.sps_pageunlocks);
+
+		ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
+			   dd->ipath_pageshadow);
+		tmpp = dd->ipath_pageshadow;
+		dd->ipath_pageshadow = NULL;
+		vfree(tmpp);
+
+		dd->ipath_egrtidbase = NULL;
+	}
+
+	/*
+	 * free any resources still in use (usually just kernel ports)
+	 * at unload; we do for portcnt, because that's what we allocate.
+	 * We acquire lock to be really paranoid that ipath_pd isn't being
+	 * accessed from some interrupt-related code (that should not happen,
+	 * but best to be sure).
+	 */
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	tmp = dd->ipath_pd;
+	dd->ipath_pd = NULL;
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+	for (port = 0; port < dd->ipath_portcnt; port++) {
+		struct ipath_portdata *pd = tmp[port];
+		tmp[port] = NULL; /* debugging paranoia */
+		ipath_free_pddata(dd, pd);
+	}
+	kfree(tmp);
+}
+
+static void ipath_remove_one(struct pci_dev *pdev)
+{
+	struct ipath_devdata *dd = pci_get_drvdata(pdev);
+
+	ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
+
+	/*
+	 * disable the IB link early, to be sure no new packets arrive, which
+	 * complicates the shutdown process
+	 */
+	ipath_shutdown_device(dd);
+
+	flush_workqueue(ib_wq);
+
+	if (dd->verbs_dev)
+		ipath_unregister_ib_device(dd->verbs_dev);
+
+	ipath_diag_remove(dd);
+	ipath_user_remove(dd);
+	ipathfs_remove_device(dd);
+	ipath_device_remove_group(&pdev->dev, dd);
+
+	ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
+		   "unit %u\n", dd, (u32) dd->ipath_unit);
+
+	cleanup_device(dd);
+
+	/*
+	 * turn off rcv, send, and interrupts for all ports, all drivers
+	 * should also hard reset the chip here?
+	 * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+	 * for all versions of the driver, if they were allocated
+	 */
+	if (dd->ipath_irq) {
+		ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
+			   dd->ipath_unit, dd->ipath_irq);
+		dd->ipath_f_free_irq(dd);
+	} else
+		ipath_dbg("irq is 0, not doing free_irq "
+			  "for unit %u\n", dd->ipath_unit);
+	/*
+	 * we check for NULL here, because it's outside
+	 * the kregbase check, and we need to call it
+	 * after the free_irq.	Thus it's possible that
+	 * the function pointers were never initialized.
+	 */
+	if (dd->ipath_f_cleanup)
+		/* clean up chip-specific stuff */
+		dd->ipath_f_cleanup(dd);
+
+	ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
+	iounmap((volatile void __iomem *) dd->ipath_kregbase);
+	pci_release_regions(pdev);
+	ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
+	pci_disable_device(pdev);
+
+	ipath_free_devdata(pdev, dd);
+}
+
+/* general driver use */
+DEFINE_MUTEX(ipath_mutex);
+
+static DEFINE_SPINLOCK(ipath_pioavail_lock);
+
+/**
+ * ipath_disarm_piobufs - cancel a range of PIO buffers
+ * @dd: the infinipath device
+ * @first: the first PIO buffer to cancel
+ * @cnt: the number of PIO buffers to cancel
+ *
+ * cancel a range of PIO buffers, used when they might be armed, but
+ * not triggered.  Used at init to ensure buffer state, and also user
+ * process close, in case it died while writing to a PIO buffer
+ * Also after errors.
+ */
+void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
+			  unsigned cnt)
+{
+	unsigned i, last = first + cnt;
+	unsigned long flags;
+
+	ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
+	for (i = first; i < last; i++) {
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		/*
+		 * The disarm-related bits are write-only, so it
+		 * is ok to OR them in with our copy of sendctrl
+		 * while we hold the lock.
+		 */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl | INFINIPATH_S_DISARM |
+			(i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
+		/* can't disarm bufs back-to-back per iba7220 spec */
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+	}
+	/* on some older chips, update may not happen after cancel */
+	ipath_force_pio_avail_update(dd);
+}
+
+/**
+ * ipath_wait_linkstate - wait for an IB link state change to occur
+ * @dd: the infinipath device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * wait up to msecs milliseconds for IB link state change to occur for
+ * now, take the easy polling route.  Currently used only by
+ * ipath_set_linkstate.  Returns 0 if state reached, otherwise
+ * -ETIMEDOUT state can have multiple states set, for any of several
+ * transitions.
+ */
+int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
+{
+	dd->ipath_state_wanted = state;
+	wait_event_interruptible_timeout(ipath_state_wait,
+					 (dd->ipath_flags & state),
+					 msecs_to_jiffies(msecs));
+	dd->ipath_state_wanted = 0;
+
+	if (!(dd->ipath_flags & state)) {
+		u64 val;
+		ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
+			   " ms\n",
+			   /* test INIT ahead of DOWN, both can be set */
+			   (state & IPATH_LINKINIT) ? "INIT" :
+			   ((state & IPATH_LINKDOWN) ? "DOWN" :
+			    ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
+			   msecs);
+		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+		ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
+			   (unsigned long long) ipath_read_kreg64(
+				   dd, dd->ipath_kregs->kr_ibcctrl),
+			   (unsigned long long) val,
+			   ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
+	}
+	return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
+}
+
+static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
+	char *buf, size_t blen)
+{
+	static const struct {
+		ipath_err_t err;
+		const char *msg;
+	} errs[] = {
+		{ INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
+		{ INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
+		{ INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
+		{ INFINIPATH_E_SDMABASE, "SDmaBase" },
+		{ INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
+		{ INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
+		{ INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
+		{ INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
+		{ INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
+		{ INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
+		{ INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
+		{ INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
+	};
+	int i;
+	int expected;
+	size_t bidx = 0;
+
+	for (i = 0; i < ARRAY_SIZE(errs); i++) {
+		expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
+			test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+		if ((err & errs[i].err) && !expected)
+			bidx += snprintf(buf + bidx, blen - bidx,
+					 "%s ", errs[i].msg);
+	}
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print * it or not depending on "normal packet errors" vs everything
+ * else.   Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+	ipath_err_t err)
+{
+	int iserr = 1;
+	*buf = '\0';
+	if (err & INFINIPATH_E_PKTERRS) {
+		if (!(err & ~INFINIPATH_E_PKTERRS))
+			iserr = 0; // if only packet errors.
+		if (ipath_debug & __IPATH_ERRPKTDBG) {
+			if (err & INFINIPATH_E_REBP)
+				strlcat(buf, "EBP ", blen);
+			if (err & INFINIPATH_E_RVCRC)
+				strlcat(buf, "VCRC ", blen);
+			if (err & INFINIPATH_E_RICRC) {
+				strlcat(buf, "CRC ", blen);
+				// clear for check below, so only once
+				err &= INFINIPATH_E_RICRC;
+			}
+			if (err & INFINIPATH_E_RSHORTPKTLEN)
+				strlcat(buf, "rshortpktlen ", blen);
+			if (err & INFINIPATH_E_SDROPPEDDATAPKT)
+				strlcat(buf, "sdroppeddatapkt ", blen);
+			if (err & INFINIPATH_E_SPKTLEN)
+				strlcat(buf, "spktlen ", blen);
+		}
+		if ((err & INFINIPATH_E_RICRC) &&
+			!(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
+			strlcat(buf, "CRC ", blen);
+		if (!iserr)
+			goto done;
+	}
+	if (err & INFINIPATH_E_RHDRLEN)
+		strlcat(buf, "rhdrlen ", blen);
+	if (err & INFINIPATH_E_RBADTID)
+		strlcat(buf, "rbadtid ", blen);
+	if (err & INFINIPATH_E_RBADVERSION)
+		strlcat(buf, "rbadversion ", blen);
+	if (err & INFINIPATH_E_RHDR)
+		strlcat(buf, "rhdr ", blen);
+	if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
+		strlcat(buf, "sendspecialtrigger ", blen);
+	if (err & INFINIPATH_E_RLONGPKTLEN)
+		strlcat(buf, "rlongpktlen ", blen);
+	if (err & INFINIPATH_E_RMAXPKTLEN)
+		strlcat(buf, "rmaxpktlen ", blen);
+	if (err & INFINIPATH_E_RMINPKTLEN)
+		strlcat(buf, "rminpktlen ", blen);
+	if (err & INFINIPATH_E_SMINPKTLEN)
+		strlcat(buf, "sminpktlen ", blen);
+	if (err & INFINIPATH_E_RFORMATERR)
+		strlcat(buf, "rformaterr ", blen);
+	if (err & INFINIPATH_E_RUNSUPVL)
+		strlcat(buf, "runsupvl ", blen);
+	if (err & INFINIPATH_E_RUNEXPCHAR)
+		strlcat(buf, "runexpchar ", blen);
+	if (err & INFINIPATH_E_RIBFLOW)
+		strlcat(buf, "ribflow ", blen);
+	if (err & INFINIPATH_E_SUNDERRUN)
+		strlcat(buf, "sunderrun ", blen);
+	if (err & INFINIPATH_E_SPIOARMLAUNCH)
+		strlcat(buf, "spioarmlaunch ", blen);
+	if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
+		strlcat(buf, "sunexperrpktnum ", blen);
+	if (err & INFINIPATH_E_SDROPPEDSMPPKT)
+		strlcat(buf, "sdroppedsmppkt ", blen);
+	if (err & INFINIPATH_E_SMAXPKTLEN)
+		strlcat(buf, "smaxpktlen ", blen);
+	if (err & INFINIPATH_E_SUNSUPVL)
+		strlcat(buf, "sunsupVL ", blen);
+	if (err & INFINIPATH_E_INVALIDADDR)
+		strlcat(buf, "invalidaddr ", blen);
+	if (err & INFINIPATH_E_RRCVEGRFULL)
+		strlcat(buf, "rcvegrfull ", blen);
+	if (err & INFINIPATH_E_RRCVHDRFULL)
+		strlcat(buf, "rcvhdrfull ", blen);
+	if (err & INFINIPATH_E_IBSTATUSCHANGED)
+		strlcat(buf, "ibcstatuschg ", blen);
+	if (err & INFINIPATH_E_RIBLOSTLINK)
+		strlcat(buf, "riblostlink ", blen);
+	if (err & INFINIPATH_E_HARDWARE)
+		strlcat(buf, "hardware ", blen);
+	if (err & INFINIPATH_E_RESET)
+		strlcat(buf, "reset ", blen);
+	if (err & INFINIPATH_E_SDMAERRS)
+		decode_sdma_errs(dd, err, buf, blen);
+	if (err & INFINIPATH_E_INVALIDEEPCMD)
+		strlcat(buf, "invalideepromcmd ", blen);
+done:
+	return iserr;
+}
+
+/**
+ * get_rhf_errstring - decode RHF errors
+ * @err: the err number
+ * @msg: the output buffer
+ * @len: the length of the output buffer
+ *
+ * only used one place now, may want more later
+ */
+static void get_rhf_errstring(u32 err, char *msg, size_t len)
+{
+	/* if no errors, and so don't need to check what's first */
+	*msg = '\0';
+
+	if (err & INFINIPATH_RHF_H_ICRCERR)
+		strlcat(msg, "icrcerr ", len);
+	if (err & INFINIPATH_RHF_H_VCRCERR)
+		strlcat(msg, "vcrcerr ", len);
+	if (err & INFINIPATH_RHF_H_PARITYERR)
+		strlcat(msg, "parityerr ", len);
+	if (err & INFINIPATH_RHF_H_LENERR)
+		strlcat(msg, "lenerr ", len);
+	if (err & INFINIPATH_RHF_H_MTUERR)
+		strlcat(msg, "mtuerr ", len);
+	if (err & INFINIPATH_RHF_H_IHDRERR)
+		/* infinipath hdr checksum error */
+		strlcat(msg, "ipathhdrerr ", len);
+	if (err & INFINIPATH_RHF_H_TIDERR)
+		strlcat(msg, "tiderr ", len);
+	if (err & INFINIPATH_RHF_H_MKERR)
+		/* bad port, offset, etc. */
+		strlcat(msg, "invalid ipathhdr ", len);
+	if (err & INFINIPATH_RHF_H_IBERR)
+		strlcat(msg, "iberr ", len);
+	if (err & INFINIPATH_RHF_L_SWA)
+		strlcat(msg, "swA ", len);
+	if (err & INFINIPATH_RHF_L_SWB)
+		strlcat(msg, "swB ", len);
+}
+
+/**
+ * ipath_get_egrbuf - get an eager buffer
+ * @dd: the infinipath device
+ * @bufnum: the eager buffer to get
+ *
+ * must only be called if ipath_pd[port] is known to be allocated
+ */
+static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
+{
+	return dd->ipath_port0_skbinfo ?
+		(void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
+}
+
+/**
+ * ipath_alloc_skb - allocate an skb and buffer with possible constraints
+ * @dd: the infinipath device
+ * @gfp_mask: the sk_buff SFP mask
+ */
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
+				gfp_t gfp_mask)
+{
+	struct sk_buff *skb;
+	u32 len;
+
+	/*
+	 * Only fully supported way to handle this is to allocate lots
+	 * extra, align as needed, and then do skb_reserve().  That wastes
+	 * a lot of memory...  I'll have to hack this into infinipath_copy
+	 * also.
+	 */
+
+	/*
+	 * We need 2 extra bytes for ipath_ether data sent in the
+	 * key header.  In order to keep everything dword aligned,
+	 * we'll reserve 4 bytes.
+	 */
+	len = dd->ipath_ibmaxlen + 4;
+
+	if (dd->ipath_flags & IPATH_4BYTE_TID) {
+		/* We need a 2KB multiple alignment, and there is no way
+		 * to do it except to allocate extra and then skb_reserve
+		 * enough to bring it up to the right alignment.
+		 */
+		len += 2047;
+	}
+
+	skb = __dev_alloc_skb(len, gfp_mask);
+	if (!skb) {
+		ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
+			      len);
+		goto bail;
+	}
+
+	skb_reserve(skb, 4);
+
+	if (dd->ipath_flags & IPATH_4BYTE_TID) {
+		u32 una = (unsigned long)skb->data & 2047;
+		if (una)
+			skb_reserve(skb, 2048 - una);
+	}
+
+bail:
+	return skb;
+}
+
+static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
+			     u32 eflags,
+			     u32 l,
+			     u32 etail,
+			     __le32 *rhf_addr,
+			     struct ipath_message_header *hdr)
+{
+	char emsg[128];
+
+	get_rhf_errstring(eflags, emsg, sizeof emsg);
+	ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
+		   "tlen=%x opcode=%x egridx=%x: %s\n",
+		   eflags, l,
+		   ipath_hdrget_rcv_type(rhf_addr),
+		   ipath_hdrget_length_in_bytes(rhf_addr),
+		   be32_to_cpu(hdr->bth[0]) >> 24,
+		   etail, emsg);
+
+	/* Count local link integrity errors. */
+	if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
+		u8 n = (dd->ipath_ibcctrl >>
+			INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+			INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+
+		if (++dd->ipath_lli_counter > n) {
+			dd->ipath_lli_counter = 0;
+			dd->ipath_lli_errors++;
+		}
+	}
+}
+
+/*
+ * ipath_kreceive - receive a packet
+ * @pd: the infinipath port
+ *
+ * called from interrupt handler for errors or receive interrupt
+ */
+void ipath_kreceive(struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	__le32 *rhf_addr;
+	void *ebuf;
+	const u32 rsize = dd->ipath_rcvhdrentsize;	/* words */
+	const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize;	/* words */
+	u32 etail = -1, l, hdrqtail;
+	struct ipath_message_header *hdr;
+	u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
+	static u64 totcalls;	/* stats, may eventually remove */
+	int last;
+
+	l = pd->port_head;
+	rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
+	if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+		u32 seq = ipath_hdrget_seq(rhf_addr);
+
+		if (seq != pd->port_seq_cnt)
+			goto bail;
+		hdrqtail = 0;
+	} else {
+		hdrqtail = ipath_get_rcvhdrtail(pd);
+		if (l == hdrqtail)
+			goto bail;
+		smp_rmb();
+	}
+
+reloop:
+	for (last = 0, i = 1; !last; i += !last) {
+		hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
+		eflags = ipath_hdrget_err_flags(rhf_addr);
+		etype = ipath_hdrget_rcv_type(rhf_addr);
+		/* total length */
+		tlen = ipath_hdrget_length_in_bytes(rhf_addr);
+		ebuf = NULL;
+		if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
+		    ipath_hdrget_use_egr_buf(rhf_addr) :
+		    (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
+			/*
+			 * It turns out that the chip uses an eager buffer
+			 * for all non-expected packets, whether it "needs"
+			 * one or not.  So always get the index, but don't
+			 * set ebuf (so we try to copy data) unless the
+			 * length requires it.
+			 */
+			etail = ipath_hdrget_index(rhf_addr);
+			updegr = 1;
+			if (tlen > sizeof(*hdr) ||
+			    etype == RCVHQ_RCV_TYPE_NON_KD)
+				ebuf = ipath_get_egrbuf(dd, etail);
+		}
+
+		/*
+		 * both tiderr and ipathhdrerr are set for all plain IB
+		 * packets; only ipathhdrerr should be set.
+		 */
+
+		if (etype != RCVHQ_RCV_TYPE_NON_KD &&
+		    etype != RCVHQ_RCV_TYPE_ERROR &&
+		    ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
+		    IPS_PROTO_VERSION)
+			ipath_cdbg(PKT, "Bad InfiniPath protocol version "
+				   "%x\n", etype);
+
+		if (unlikely(eflags))
+			ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
+		else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+			ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
+			if (dd->ipath_lli_counter)
+				dd->ipath_lli_counter--;
+		} else if (etype == RCVHQ_RCV_TYPE_EAGER) {
+			u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
+			u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
+			ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
+				   "qp=%x), len %x; ignored\n",
+				   etype, opcode, qp, tlen);
+		}
+		else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
+			ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
+				  be32_to_cpu(hdr->bth[0]) >> 24);
+		else {
+			/*
+			 * error packet, type of error unknown.
+			 * Probably type 3, but we don't know, so don't
+			 * even try to print the opcode, etc.
+			 * Usually caused by a "bad packet", that has no
+			 * BTH, when the LRH says it should.
+			 */
+			ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
+				  " %x, len %x hdrq+%x rhf: %Lx\n",
+				  etail, tlen, l, (unsigned long long)
+				  le64_to_cpu(*(__le64 *) rhf_addr));
+			if (ipath_debug & __IPATH_ERRPKTDBG) {
+				u32 j, *d, dw = rsize-2;
+				if (rsize > (tlen>>2))
+					dw = tlen>>2;
+				d = (u32 *)hdr;
+				printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
+					dw);
+				for (j = 0; j < dw; j++)
+					printk(KERN_DEBUG "%8x%s", d[j],
+						(j%8) == 7 ? "\n" : " ");
+				printk(KERN_DEBUG ".\n");
+			}
+		}
+		l += rsize;
+		if (l >= maxcnt)
+			l = 0;
+		rhf_addr = (__le32 *) pd->port_rcvhdrq +
+			l + dd->ipath_rhf_offset;
+		if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+			u32 seq = ipath_hdrget_seq(rhf_addr);
+
+			if (++pd->port_seq_cnt > 13)
+				pd->port_seq_cnt = 1;
+			if (seq != pd->port_seq_cnt)
+				last = 1;
+		} else if (l == hdrqtail)
+			last = 1;
+		/*
+		 * update head regs on last packet, and every 16 packets.
+		 * Reduce bus traffic, while still trying to prevent
+		 * rcvhdrq overflows, for when the queue is nearly full
+		 */
+		if (last || !(i & 0xf)) {
+			u64 lval = l;
+
+			/* request IBA6120 and 7220 interrupt only on last */
+			if (last)
+				lval |= dd->ipath_rhdrhead_intr_off;
+			ipath_write_ureg(dd, ur_rcvhdrhead, lval,
+				pd->port_port);
+			if (updegr) {
+				ipath_write_ureg(dd, ur_rcvegrindexhead,
+						 etail, pd->port_port);
+				updegr = 0;
+			}
+		}
+	}
+
+	if (!dd->ipath_rhdrhead_intr_off && !reloop &&
+	    !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+		/* IBA6110 workaround; we can have a race clearing chip
+		 * interrupt with another interrupt about to be delivered,
+		 * and can clear it before it is delivered on the GPIO
+		 * workaround.  By doing the extra check here for the
+		 * in-memory tail register updating while we were doing
+		 * earlier packets, we "almost" guarantee we have covered
+		 * that case.
+		 */
+		u32 hqtail = ipath_get_rcvhdrtail(pd);
+		if (hqtail != hdrqtail) {
+			hdrqtail = hqtail;
+			reloop = 1; /* loop 1 extra time at most */
+			goto reloop;
+		}
+	}
+
+	pkttot += i;
+
+	pd->port_head = l;
+
+	if (pkttot > ipath_stats.sps_maxpkts_call)
+		ipath_stats.sps_maxpkts_call = pkttot;
+	ipath_stats.sps_port0pkts += pkttot;
+	ipath_stats.sps_avgpkts_call =
+		ipath_stats.sps_port0pkts / ++totcalls;
+
+bail:;
+}
+
+/**
+ * ipath_update_pio_bufs - update shadow copy of the PIO availability map
+ * @dd: the infinipath device
+ *
+ * called whenever our local copy indicates we have run out of send buffers
+ * NOTE: This can be called from interrupt context by some code
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+	int i;
+	const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
+
+	/* If the generation (check) bits have changed, then we update the
+	 * busy bit for the corresponding PIO buffer.  This algorithm will
+	 * modify positions to the value they already have in some cases
+	 * (i.e., no change), but it's faster than changing only the bits
+	 * that have changed.
+	 *
+	 * We would like to do this atomicly, to avoid spinlocks in the
+	 * critical send path, but that's not really possible, given the
+	 * type of changes, and that this routine could be called on
+	 * multiple cpu's simultaneously, so we lock in this routine only,
+	 * to avoid conflicting updates; all we change is the shadow, and
+	 * it's a single 64 bit memory location, so by definition the update
+	 * is atomic in terms of what other cpu's can see in testing the
+	 * bits.  The spin_lock overhead isn't too bad, since it only
+	 * happens when all buffers are in use, so only cpu overhead, not
+	 * latency or bandwidth is affected.
+	 */
+	if (!dd->ipath_pioavailregs_dma) {
+		ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
+		return;
+	}
+	if (ipath_debug & __IPATH_VERBDBG) {
+		/* only if packet debug and verbose */
+		volatile __le64 *dma = dd->ipath_pioavailregs_dma;
+		unsigned long *shadow = dd->ipath_pioavailshadow;
+
+		ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
+			   "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
+			   "s3=%lx\n",
+			   (unsigned long long) le64_to_cpu(dma[0]),
+			   shadow[0],
+			   (unsigned long long) le64_to_cpu(dma[1]),
+			   shadow[1],
+			   (unsigned long long) le64_to_cpu(dma[2]),
+			   shadow[2],
+			   (unsigned long long) le64_to_cpu(dma[3]),
+			   shadow[3]);
+		if (piobregs > 4)
+			ipath_cdbg(
+				PKT, "2nd group, dma4=%llx shad4=%lx, "
+				"d5=%llx s5=%lx, d6=%llx s6=%lx, "
+				"d7=%llx s7=%lx\n",
+				(unsigned long long) le64_to_cpu(dma[4]),
+				shadow[4],
+				(unsigned long long) le64_to_cpu(dma[5]),
+				shadow[5],
+				(unsigned long long) le64_to_cpu(dma[6]),
+				shadow[6],
+				(unsigned long long) le64_to_cpu(dma[7]),
+				shadow[7]);
+	}
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (i = 0; i < piobregs; i++) {
+		u64 pchbusy, pchg, piov, pnew;
+		/*
+		 * Chip Errata: bug 6641; even and odd qwords>3 are swapped
+		 */
+		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
+		else
+			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
+		pchg = dd->ipath_pioavailkernel[i] &
+			~(dd->ipath_pioavailshadow[i] ^ piov);
+		pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
+		if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
+			pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
+			pnew |= piov & pchbusy;
+			dd->ipath_pioavailshadow[i] = pnew;
+		}
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/*
+ * used to force update of pioavailshadow if we can't get a pio buffer.
+ * Needed primarily due to exitting freeze mode after recovering
+ * from errors.  Done lazily, because it's safer (known to not
+ * be writing pio buffers).
+ */
+static void ipath_reset_availshadow(struct ipath_devdata *dd)
+{
+	int i, im;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (i = 0; i < dd->ipath_pioavregs; i++) {
+		u64 val, oldval;
+		/* deal with 6110 chip bug on high register #s */
+		im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+			i ^ 1 : i;
+		val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
+		/*
+		 * busy out the buffers not in the kernel avail list,
+		 * without changing the generation bits.
+		 */
+		oldval = dd->ipath_pioavailshadow[i];
+		dd->ipath_pioavailshadow[i] = val |
+			((~dd->ipath_pioavailkernel[i] <<
+			INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
+			0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
+		if (oldval != dd->ipath_pioavailshadow[i])
+			ipath_dbg("shadow[%d] was %Lx, now %lx\n",
+				i, (unsigned long long) oldval,
+				dd->ipath_pioavailshadow[i]);
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/**
+ * ipath_setrcvhdrsize - set the receive header size
+ * @dd: the infinipath device
+ * @rhdrsize: the receive header size
+ *
+ * called from user init code, and also layered driver init
+ */
+int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
+{
+	int ret = 0;
+
+	if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
+		if (dd->ipath_rcvhdrsize != rhdrsize) {
+			dev_info(&dd->pcidev->dev,
+				 "Error: can't set protocol header "
+				 "size %u, already %u\n",
+				 rhdrsize, dd->ipath_rcvhdrsize);
+			ret = -EAGAIN;
+		} else
+			ipath_cdbg(VERBOSE, "Reuse same protocol header "
+				   "size %u\n", dd->ipath_rcvhdrsize);
+	} else if (rhdrsize > (dd->ipath_rcvhdrentsize -
+			       (sizeof(u64) / sizeof(u32)))) {
+		ipath_dbg("Error: can't set protocol header size %u "
+			  "(> max %u)\n", rhdrsize,
+			  dd->ipath_rcvhdrentsize -
+			  (u32) (sizeof(u64) / sizeof(u32)));
+		ret = -EOVERFLOW;
+	} else {
+		dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
+		dd->ipath_rcvhdrsize = rhdrsize;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+				 dd->ipath_rcvhdrsize);
+		ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
+			   dd->ipath_rcvhdrsize);
+	}
+	return ret;
+}
+
+/*
+ * debugging code and stats updates if no pio buffers available.
+ */
+static noinline void no_pio_bufs(struct ipath_devdata *dd)
+{
+	unsigned long *shadow = dd->ipath_pioavailshadow;
+	__le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
+
+	dd->ipath_upd_pio_shadow = 1;
+
+	/*
+	 * not atomic, but if we lose a stat count in a while, that's OK
+	 */
+	ipath_stats.sps_nopiobufs++;
+	if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+		ipath_force_pio_avail_update(dd); /* at start */
+		ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
+			"%llx %llx %llx %llx\n"
+			"ipath  shadow:  %lx %lx %lx %lx\n",
+			dd->ipath_consec_nopiobuf,
+			(unsigned long)get_cycles(),
+			(unsigned long long) le64_to_cpu(dma[0]),
+			(unsigned long long) le64_to_cpu(dma[1]),
+			(unsigned long long) le64_to_cpu(dma[2]),
+			(unsigned long long) le64_to_cpu(dma[3]),
+			shadow[0], shadow[1], shadow[2], shadow[3]);
+		/*
+		 * 4 buffers per byte, 4 registers above, cover rest
+		 * below
+		 */
+		if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
+		    (sizeof(shadow[0]) * 4 * 4))
+			ipath_dbg("2nd group: dmacopy: "
+				  "%llx %llx %llx %llx\n"
+				  "ipath  shadow:  %lx %lx %lx %lx\n",
+				  (unsigned long long)le64_to_cpu(dma[4]),
+				  (unsigned long long)le64_to_cpu(dma[5]),
+				  (unsigned long long)le64_to_cpu(dma[6]),
+				  (unsigned long long)le64_to_cpu(dma[7]),
+				  shadow[4], shadow[5], shadow[6], shadow[7]);
+
+		/* at end, so update likely happened */
+		ipath_reset_availshadow(dd);
+	}
+}
+
+/*
+ * common code for normal driver pio buffer allocation, and reserved
+ * allocation.
+ *
+ * do appropriate marking as busy, etc.
+ * returns buffer number if one found (>=0), negative number is error.
+ */
+static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
+	u32 *pbufnum, u32 first, u32 last, u32 firsti)
+{
+	int i, j, updated = 0;
+	unsigned piobcnt;
+	unsigned long flags;
+	unsigned long *shadow = dd->ipath_pioavailshadow;
+	u32 __iomem *buf;
+
+	piobcnt = last - first;
+	if (dd->ipath_upd_pio_shadow) {
+		/*
+		 * Minor optimization.  If we had no buffers on last call,
+		 * start out by doing the update; continue and do scan even
+		 * if no buffers were updated, to be paranoid
+		 */
+		ipath_update_pio_bufs(dd);
+		updated++;
+		i = first;
+	} else
+		i = firsti;
+rescan:
+	/*
+	 * while test_and_set_bit() is atomic, we do that and then the
+	 * change_bit(), and the pair is not.  See if this is the cause
+	 * of the remaining armlaunch errors.
+	 */
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (j = 0; j < piobcnt; j++, i++) {
+		if (i >= last)
+			i = first;
+		if (__test_and_set_bit((2 * i) + 1, shadow))
+			continue;
+		/* flip generation bit */
+		__change_bit(2 * i, shadow);
+		break;
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+	if (j == piobcnt) {
+		if (!updated) {
+			/*
+			 * first time through; shadow exhausted, but may be
+			 * buffers available, try an update and then rescan.
+			 */
+			ipath_update_pio_bufs(dd);
+			updated++;
+			i = first;
+			goto rescan;
+		} else if (updated == 1 && piobcnt <=
+			((dd->ipath_sendctrl
+			>> INFINIPATH_S_UPDTHRESH_SHIFT) &
+			INFINIPATH_S_UPDTHRESH_MASK)) {
+			/*
+			 * for chips supporting and using the update
+			 * threshold we need to force an update of the
+			 * in-memory copy if the count is less than the
+			 * thershold, then check one more time.
+			 */
+			ipath_force_pio_avail_update(dd);
+			ipath_update_pio_bufs(dd);
+			updated++;
+			i = first;
+			goto rescan;
+		}
+
+		no_pio_bufs(dd);
+		buf = NULL;
+	} else {
+		if (i < dd->ipath_piobcnt2k)
+			buf = (u32 __iomem *) (dd->ipath_pio2kbase +
+					       i * dd->ipath_palign);
+		else
+			buf = (u32 __iomem *)
+				(dd->ipath_pio4kbase +
+				 (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
+		if (pbufnum)
+			*pbufnum = i;
+	}
+
+	return buf;
+}
+
+/**
+ * ipath_getpiobuf - find an available pio buffer
+ * @dd: the infinipath device
+ * @plen: the size of the PIO buffer needed in 32-bit words
+ * @pbufnum: the buffer number is placed here
+ */
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
+{
+	u32 __iomem *buf;
+	u32 pnum, nbufs;
+	u32 first, lasti;
+
+	if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
+		first = dd->ipath_piobcnt2k;
+		lasti = dd->ipath_lastpioindexl;
+	} else {
+		first = 0;
+		lasti = dd->ipath_lastpioindex;
+	}
+	nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+	buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
+
+	if (buf) {
+		/*
+		 * Set next starting place.  It's just an optimization,
+		 * it doesn't matter who wins on this, so no locking
+		 */
+		if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+			dd->ipath_lastpioindexl = pnum + 1;
+		else
+			dd->ipath_lastpioindex = pnum + 1;
+		if (dd->ipath_upd_pio_shadow)
+			dd->ipath_upd_pio_shadow = 0;
+		if (dd->ipath_consec_nopiobuf)
+			dd->ipath_consec_nopiobuf = 0;
+		ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
+			   pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
+		if (pbufnum)
+			*pbufnum = pnum;
+
+	}
+	return buf;
+}
+
+/**
+ * ipath_chg_pioavailkernel - change which send buffers are available for kernel
+ * @dd: the infinipath device
+ * @start: the starting send buffer number
+ * @len: the number of send buffers
+ * @avail: true if the buffers are available for kernel use, false otherwise
+ */
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+			      unsigned len, int avail)
+{
+	unsigned long flags;
+	unsigned end, cnt = 0;
+
+	/* There are two bits per send buffer (busy and generation) */
+	start *= 2;
+	end = start + len * 2;
+
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	/* Set or clear the busy bit in the shadow. */
+	while (start < end) {
+		if (avail) {
+			unsigned long dma;
+			int i, im;
+			/*
+			 * the BUSY bit will never be set, because we disarm
+			 * the user buffers before we hand them back to the
+			 * kernel.  We do have to make sure the generation
+			 * bit is set correctly in shadow, since it could
+			 * have changed many times while allocated to user.
+			 * We can't use the bitmap functions on the full
+			 * dma array because it is always little-endian, so
+			 * we have to flip to host-order first.
+			 * BITS_PER_LONG is slightly wrong, since it's
+			 * always 64 bits per register in chip...
+			 * We only work on 64 bit kernels, so that's OK.
+			 */
+			/* deal with 6110 chip bug on high register #s */
+			i = start / BITS_PER_LONG;
+			im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+				i ^ 1 : i;
+			__clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
+				+ start, dd->ipath_pioavailshadow);
+			dma = (unsigned long) le64_to_cpu(
+				dd->ipath_pioavailregs_dma[im]);
+			if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+				+ start) % BITS_PER_LONG, &dma))
+				__set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+					+ start, dd->ipath_pioavailshadow);
+			else
+				__clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+					+ start, dd->ipath_pioavailshadow);
+			__set_bit(start, dd->ipath_pioavailkernel);
+		} else {
+			__set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
+				dd->ipath_pioavailshadow);
+			__clear_bit(start, dd->ipath_pioavailkernel);
+		}
+		start += 2;
+	}
+
+	if (dd->ipath_pioupd_thresh) {
+		end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+		cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+	/*
+	 * When moving buffers from kernel to user, if number assigned to
+	 * the user is less than the pio update threshold, and threshold
+	 * is supported (cnt was computed > 0), drop the update threshold
+	 * so we update at least once per allocated number of buffers.
+	 * In any case, if the kernel buffers are less than the threshold,
+	 * drop the threshold.  We don't bother increasing it, having once
+	 * decreased it, since it would typically just cycle back and forth.
+	 * If we don't decrease below buffers in use, we can wait a long
+	 * time for an update, until some other context uses PIO buffers.
+	 */
+	if (!avail && len < cnt)
+		cnt = len;
+	if (cnt < dd->ipath_pioupd_thresh) {
+		dd->ipath_pioupd_thresh = cnt;
+		ipath_dbg("Decreased pio update threshold to %u\n",
+			dd->ipath_pioupd_thresh);
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
+			<< INFINIPATH_S_UPDTHRESH_SHIFT);
+		dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+			<< INFINIPATH_S_UPDTHRESH_SHIFT;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+	}
+}
+
+/**
+ * ipath_create_rcvhdrq - create a receive header queue
+ * @dd: the infinipath device
+ * @pd: the port data
+ *
+ * this must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int ipath_create_rcvhdrq(struct ipath_devdata *dd,
+			 struct ipath_portdata *pd)
+{
+	int ret = 0;
+
+	if (!pd->port_rcvhdrq) {
+		dma_addr_t phys_hdrqtail;
+		gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+		int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+				sizeof(u32), PAGE_SIZE);
+
+		pd->port_rcvhdrq = dma_alloc_coherent(
+			&dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
+			gfp_flags);
+
+		if (!pd->port_rcvhdrq) {
+			ipath_dev_err(dd, "attempt to allocate %d bytes "
+				      "for port %u rcvhdrq failed\n",
+				      amt, pd->port_port);
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+			pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
+				&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+				GFP_KERNEL);
+			if (!pd->port_rcvhdrtail_kvaddr) {
+				ipath_dev_err(dd, "attempt to allocate 1 page "
+					"for port %u rcvhdrqtailaddr "
+					"failed\n", pd->port_port);
+				ret = -ENOMEM;
+				dma_free_coherent(&dd->pcidev->dev, amt,
+					pd->port_rcvhdrq,
+					pd->port_rcvhdrq_phys);
+				pd->port_rcvhdrq = NULL;
+				goto bail;
+			}
+			pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
+			ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
+				   "physical\n", pd->port_port,
+				   (unsigned long long) phys_hdrqtail);
+		}
+
+		pd->port_rcvhdrq_size = amt;
+
+		ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
+			   "for port %u rcvhdr Q\n",
+			   amt >> PAGE_SHIFT, pd->port_rcvhdrq,
+			   (unsigned long) pd->port_rcvhdrq_phys,
+			   (unsigned long) pd->port_rcvhdrq_size,
+			   pd->port_port);
+	}
+	else
+		ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
+			   "hdrtailaddr@%p %llx physical\n",
+			   pd->port_port, pd->port_rcvhdrq,
+			   (unsigned long long) pd->port_rcvhdrq_phys,
+			   pd->port_rcvhdrtail_kvaddr, (unsigned long long)
+			   pd->port_rcvhdrqtailaddr_phys);
+
+	/* clear for security and sanity on each use */
+	memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
+	if (pd->port_rcvhdrtail_kvaddr)
+		memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+
+	/*
+	 * tell chip each time we init it, even if we are re-using previous
+	 * memory (we zero the register at process close)
+	 */
+	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
+			      pd->port_port, pd->port_rcvhdrqtailaddr_phys);
+	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+			      pd->port_port, pd->port_rcvhdrq_phys);
+
+bail:
+	return ret;
+}
+
+
+/*
+ * Flush all sends that might be in the ready to send state, as well as any
+ * that are in the process of being sent.   Used whenever we need to be
+ * sure the send side is idle.  Cleans up all buffer state by canceling
+ * all pio buffers, and issuing an abort, which cleans up anything in the
+ * launch fifo.  The cancel is superfluous on some chip versions, but
+ * it's safer to always do it.
+ * PIOAvail bits are updated by the chip as if normal send had happened.
+ */
+void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
+{
+	unsigned long flags;
+
+	if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
+		ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
+		goto bail;
+	}
+	/*
+	 * If we have SDMA, and it's not disabled, we have to kick off the
+	 * abort state machine, provided we aren't already aborting.
+	 * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
+	 * we skip the rest of this routine. It is already "in progress"
+	 */
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
+		int skip_cancel;
+		unsigned long *statp = &dd->ipath_sdma_status;
+
+		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+		skip_cancel =
+			test_and_set_bit(IPATH_SDMA_ABORTING, statp)
+			&& !test_bit(IPATH_SDMA_DISABLED, statp);
+		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+		if (skip_cancel)
+			goto bail;
+	}
+
+	ipath_dbg("Cancelling all in-progress send buffers\n");
+
+	/* skip armlaunch errs for a while */
+	dd->ipath_lastcancel = jiffies + HZ / 2;
+
+	/*
+	 * The abort bit is auto-clearing.  We also don't want pioavail
+	 * update happening during this, and we don't want any other
+	 * sends going out, so turn those off for the duration.  We read
+	 * the scratch register to be sure that cancels and the abort
+	 * have taken effect in the chip.  Otherwise two parts are same
+	 * as ipath_force_pio_avail_update()
+	 */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
+		| INFINIPATH_S_PIOENABLE);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+		dd->ipath_sendctrl | INFINIPATH_S_ABORT);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	/* disarm all send buffers */
+	ipath_disarm_piobufs(dd, 0,
+		dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+		set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+
+	if (restore_sendctrl) {
+		/* else done by caller later if needed */
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
+			INFINIPATH_S_PIOENABLE;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl);
+		/* and again, be sure all have hit the chip */
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+	}
+
+	if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
+	    !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
+	    test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
+		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+		/* only wait so long for intr */
+		dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
+		dd->ipath_sdma_reset_wait = 200;
+		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+			tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+	}
+bail:;
+}
+
+/*
+ * Force an update of in-memory copy of the pioavail registers, when
+ * needed for any of a variety of reasons.  We read the scratch register
+ * to make it highly likely that the update will have happened by the
+ * time we return.  If already off (as in cancel_sends above), this
+ * routine is a nop, on the assumption that the caller will "do the
+ * right thing".
+ */
+void ipath_force_pio_avail_update(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	}
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
+				int linitcmd)
+{
+	u64 mod_wd;
+	static const char *what[4] = {
+		[0] = "NOP",
+		[INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
+		[INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
+		[INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
+	};
+
+	if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
+		/*
+		 * If we are told to disable, note that so link-recovery
+		 * code does not attempt to bring us back up.
+		 */
+		preempt_disable();
+		dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+		preempt_enable();
+	} else if (linitcmd) {
+		/*
+		 * Any other linkinitcmd will lead to LINKDOWN and then
+		 * to INIT (if all is well), so clear flag to let
+		 * link-recovery code attempt to bring us back up.
+		 */
+		preempt_disable();
+		dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+		preempt_enable();
+	}
+
+	mod_wd = (linkcmd << dd->ibcc_lc_shift) |
+		(linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+	ipath_cdbg(VERBOSE,
+		"Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
+		dd->ipath_unit, what[linkcmd], linitcmd,
+		ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
+			ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+			 dd->ipath_ibcctrl | mod_wd);
+	/* read from chip so write is flushed */
+	(void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+}
+
+int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
+{
+	u32 lstate;
+	int ret;
+
+	switch (newstate) {
+	case IPATH_IB_LINKDOWN_ONLY:
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINKDOWN:
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+					INFINIPATH_IBCC_LINKINITCMD_POLL);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINKDOWN_SLEEP:
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+					INFINIPATH_IBCC_LINKINITCMD_SLEEP);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINKDOWN_DISABLE:
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+					INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINKARM:
+		if (dd->ipath_flags & IPATH_LINKARMED) {
+			ret = 0;
+			goto bail;
+		}
+		if (!(dd->ipath_flags &
+		      (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
+
+		/*
+		 * Since the port can transition to ACTIVE by receiving
+		 * a non VL 15 packet, wait for either state.
+		 */
+		lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
+		break;
+
+	case IPATH_IB_LINKACTIVE:
+		if (dd->ipath_flags & IPATH_LINKACTIVE) {
+			ret = 0;
+			goto bail;
+		}
+		if (!(dd->ipath_flags & IPATH_LINKARMED)) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
+		lstate = IPATH_LINKACTIVE;
+		break;
+
+	case IPATH_IB_LINK_LOOPBACK:
+		dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
+		dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+
+		/* turn heartbeat off, as it causes loopback to fail */
+		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+				       IPATH_IB_HRTBT_OFF);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINK_EXTERNAL:
+		dev_info(&dd->pcidev->dev,
+			"Disabling IB local loopback (normal)\n");
+		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+				       IPATH_IB_HRTBT_ON);
+		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	/*
+	 * Heartbeat can be explicitly enabled by the user via
+	 * "hrtbt_enable" "file", and if disabled, trying to enable here
+	 * will have no effect.  Implicit changes (heartbeat off when
+	 * loopback on, and vice versa) are included to ease testing.
+	 */
+	case IPATH_IB_LINK_HRTBT:
+		ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+			IPATH_IB_HRTBT_ON);
+		goto bail;
+
+	case IPATH_IB_LINK_NO_HRTBT:
+		ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+			IPATH_IB_HRTBT_OFF);
+		goto bail;
+
+	default:
+		ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
+		ret = -EINVAL;
+		goto bail;
+	}
+	ret = ipath_wait_linkstate(dd, lstate, 2000);
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_set_mtu - set the MTU
+ * @dd: the infinipath device
+ * @arg: the new MTU
+ *
+ * we can handle "any" incoming size, the issue here is whether we
+ * need to restrict our outgoing size.   For now, we don't do any
+ * sanity checking on this, and we don't deal with what happens to
+ * programs that are already running when the size changes.
+ * NOTE: changing the MTU will usually cause the IBC to go back to
+ * link INIT state...
+ */
+int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
+{
+	u32 piosize;
+	int changed = 0;
+	int ret;
+
+	/*
+	 * mtu is IB data payload max.  It's the largest power of 2 less
+	 * than piosize (or even larger, since it only really controls the
+	 * largest we can receive; we can send the max of the mtu and
+	 * piosize).  We check that it's one of the valid IB sizes.
+	 */
+	if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+	    (arg != 4096 || !ipath_mtu4096)) {
+		ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (dd->ipath_ibmtu == arg) {
+		ret = 0;        /* same as current */
+		goto bail;
+	}
+
+	piosize = dd->ipath_ibmaxlen;
+	dd->ipath_ibmtu = arg;
+
+	if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
+		/* Only if it's not the initial value (or reset to it) */
+		if (piosize != dd->ipath_init_ibmaxlen) {
+			if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
+				piosize = dd->ipath_init_ibmaxlen;
+			dd->ipath_ibmaxlen = piosize;
+			changed = 1;
+		}
+	} else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
+		piosize = arg + IPATH_PIO_MAXIBHDR;
+		ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
+			   "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
+			   arg);
+		dd->ipath_ibmaxlen = piosize;
+		changed = 1;
+	}
+
+	if (changed) {
+		u64 ibc = dd->ipath_ibcctrl, ibdw;
+		/*
+		 * update our housekeeping variables, and set IBC max
+		 * size, same as init code; max IBC is max we allow in
+		 * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+		 */
+		dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
+		ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
+		ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
+			 dd->ibcc_mpl_shift);
+		ibc |= ibdw << dd->ibcc_mpl_shift;
+		dd->ipath_ibcctrl = ibc;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+		dd->ipath_f_tidtemplate(dd);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
+{
+	dd->ipath_lid = lid;
+	dd->ipath_lmc = lmc;
+
+	dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
+		(~((1U << lmc) - 1)) << 16);
+
+	dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
+
+	return 0;
+}
+
+
+/**
+ * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
+ * @dd: the infinipath device
+ * @regno: the register number to write
+ * @port: the port containing the register
+ * @value: the value to write
+ *
+ * Registers that vary with the chip implementation constants (port)
+ * use this routine.
+ */
+void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
+			  unsigned port, u64 value)
+{
+	u16 where;
+
+	if (port < dd->ipath_portcnt &&
+	    (regno == dd->ipath_kregs->kr_rcvhdraddr ||
+	     regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
+		where = regno + port;
+	else
+		where = -1;
+
+	ipath_write_kreg(dd, where, value);
+}
+
+/*
+ * Following deal with the "obviously simple" task of overriding the state
+ * of the LEDS, which normally indicate link physical and logical status.
+ * The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts.
+ * and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void ipath_run_led_override(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+	int timeoff;
+	int pidx;
+	u64 lstate, ltstate, val;
+
+	if (!(dd->ipath_flags & IPATH_INITTED))
+		return;
+
+	pidx = dd->ipath_led_override_phase++ & 1;
+	dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
+	timeoff = dd->ipath_led_override_timeoff;
+
+	/*
+	 * below potentially restores the LED values per current status,
+	 * should also possibly setup the traffic-blink register,
+	 * but leave that to per-chip functions.
+	 */
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+	ltstate = ipath_ib_linktrstate(dd, val);
+	lstate = ipath_ib_linkstate(dd, val);
+
+	dd->ipath_f_setextled(dd, lstate, ltstate);
+	mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
+}
+
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
+{
+	int timeoff, freq;
+
+	if (!(dd->ipath_flags & IPATH_INITTED))
+		return;
+
+	/* First check if we are blinking. If not, use 1HZ polling */
+	timeoff = HZ;
+	freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+	if (freq) {
+		/* For blink, set each phase from one nybble of val */
+		dd->ipath_led_override_vals[0] = val & 0xF;
+		dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
+		timeoff = (HZ << 4)/freq;
+	} else {
+		/* Non-blink set both phases the same. */
+		dd->ipath_led_override_vals[0] = val & 0xF;
+		dd->ipath_led_override_vals[1] = val & 0xF;
+	}
+	dd->ipath_led_override_timeoff = timeoff;
+
+	/*
+	 * If the timer has not already been started, do so. Use a "quick"
+	 * timeout so the function will be called soon, to look at our request.
+	 */
+	if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
+		/* Need to start timer */
+		init_timer(&dd->ipath_led_override_timer);
+		dd->ipath_led_override_timer.function =
+						 ipath_run_led_override;
+		dd->ipath_led_override_timer.data = (unsigned long) dd;
+		dd->ipath_led_override_timer.expires = jiffies + 1;
+		add_timer(&dd->ipath_led_override_timer);
+	} else
+		atomic_dec(&dd->ipath_led_override_timer_active);
+}
+
+/**
+ * ipath_shutdown_device - shut down a device
+ * @dd: the infinipath device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.   It does not free any data structures.
+ * Everything it does has to be setup again by ipath_init_chip(dd,1)
+ */
+void ipath_shutdown_device(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+
+	ipath_dbg("Shutting down the device\n");
+
+	ipath_hol_up(dd); /* make sure user processes aren't suspended */
+
+	dd->ipath_flags |= IPATH_LINKUNK;
+	dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
+			     IPATH_LINKINIT | IPATH_LINKARMED |
+			     IPATH_LINKACTIVE);
+	*dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
+				IPATH_STATUS_IB_READY);
+
+	/* mask interrupts, but not errors */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+	dd->ipath_rcvctrl = 0;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+		teardown_sdma(dd);
+
+	/*
+	 * gracefully stop all sends allowing any in progress to trickle out
+	 * first.
+	 */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl = 0;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	/* flush it */
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	/*
+	 * enough for anything that's going to trickle out to have actually
+	 * done so.
+	 */
+	udelay(5);
+
+	dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
+
+	ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+	ipath_cancel_sends(dd, 0);
+
+	/*
+	 * we are shutting down, so tell components that care.  We don't do
+	 * this on just a link state change, much like ethernet, a cable
+	 * unplug, etc. doesn't change driver state
+	 */
+	signal_ib_event(dd, IB_EVENT_PORT_ERR);
+
+	/* disable IBC */
+	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control | INFINIPATH_C_FREEZEMODE);
+
+	/*
+	 * clear SerdesEnable and turn the leds off; do this here because
+	 * we are unloading, so don't count on interrupts to move along
+	 * Turn the LEDs off explicitly for the same reason.
+	 */
+	dd->ipath_f_quiet_serdes(dd);
+
+	/* stop all the timers that might still be running */
+	del_timer_sync(&dd->ipath_hol_timer);
+	if (dd->ipath_stats_timer_active) {
+		del_timer_sync(&dd->ipath_stats_timer);
+		dd->ipath_stats_timer_active = 0;
+	}
+	if (dd->ipath_intrchk_timer.data) {
+		del_timer_sync(&dd->ipath_intrchk_timer);
+		dd->ipath_intrchk_timer.data = 0;
+	}
+	if (atomic_read(&dd->ipath_led_override_timer_active)) {
+		del_timer_sync(&dd->ipath_led_override_timer);
+		atomic_set(&dd->ipath_led_override_timer_active, 0);
+	}
+
+	/*
+	 * clear all interrupts and errors, so that the next time the driver
+	 * is loaded or device is enabled, we know that whatever is set
+	 * happened while we were unloaded
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+			 ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+	ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
+	ipath_update_eeprom_log(dd);
+}
+
+/**
+ * ipath_free_pddata - free a port's allocated data
+ * @dd: the infinipath device
+ * @pd: the portdata structure
+ *
+ * free up any allocated data for a port
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of port data, because it is called after ipath_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ * (The only exception to global state is freeing the port0 port0_skbs.)
+ */
+void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
+{
+	if (!pd)
+		return;
+
+	if (pd->port_rcvhdrq) {
+		ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
+			   "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
+			   (unsigned long) pd->port_rcvhdrq_size);
+		dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
+				  pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
+		pd->port_rcvhdrq = NULL;
+		if (pd->port_rcvhdrtail_kvaddr) {
+			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+					 pd->port_rcvhdrtail_kvaddr,
+					 pd->port_rcvhdrqtailaddr_phys);
+			pd->port_rcvhdrtail_kvaddr = NULL;
+		}
+	}
+	if (pd->port_port && pd->port_rcvegrbuf) {
+		unsigned e;
+
+		for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+			void *base = pd->port_rcvegrbuf[e];
+			size_t size = pd->port_rcvegrbuf_size;
+
+			ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
+				   "chunk %u/%u\n", base,
+				   (unsigned long) size,
+				   e, pd->port_rcvegrbuf_chunks);
+			dma_free_coherent(&dd->pcidev->dev, size,
+				base, pd->port_rcvegrbuf_phys[e]);
+		}
+		kfree(pd->port_rcvegrbuf);
+		pd->port_rcvegrbuf = NULL;
+		kfree(pd->port_rcvegrbuf_phys);
+		pd->port_rcvegrbuf_phys = NULL;
+		pd->port_rcvegrbuf_chunks = 0;
+	} else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
+		unsigned e;
+		struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
+
+		dd->ipath_port0_skbinfo = NULL;
+		ipath_cdbg(VERBOSE, "free closed port %d "
+			   "ipath_port0_skbinfo @ %p\n", pd->port_port,
+			   skbinfo);
+		for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
+			if (skbinfo[e].skb) {
+				pci_unmap_single(dd->pcidev, skbinfo[e].phys,
+						 dd->ipath_ibmaxlen,
+						 PCI_DMA_FROMDEVICE);
+				dev_kfree_skb(skbinfo[e].skb);
+			}
+		vfree(skbinfo);
+	}
+	kfree(pd->port_tid_pg_list);
+	vfree(pd->subport_uregbase);
+	vfree(pd->subport_rcvegrbuf);
+	vfree(pd->subport_rcvhdr_base);
+	kfree(pd);
+}
+
+static int __init infinipath_init(void)
+{
+	int ret;
+
+	if (ipath_debug & __IPATH_DBG)
+		printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
+
+	/*
+	 * These must be called before the driver is registered with
+	 * the PCI subsystem.
+	 */
+	idr_init(&unit_table);
+
+	ret = pci_register_driver(&ipath_driver);
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Unable to register driver: error %d\n", -ret);
+		goto bail_unit;
+	}
+
+	ret = ipath_init_ipathfs();
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
+		       "ipathfs: error %d\n", -ret);
+		goto bail_pci;
+	}
+
+	goto bail;
+
+bail_pci:
+	pci_unregister_driver(&ipath_driver);
+
+bail_unit:
+	idr_destroy(&unit_table);
+
+bail:
+	return ret;
+}
+
+static void __exit infinipath_cleanup(void)
+{
+	ipath_exit_ipathfs();
+
+	ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
+	pci_unregister_driver(&ipath_driver);
+
+	idr_destroy(&unit_table);
+}
+
+/**
+ * ipath_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload).  We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.  For
+ * now, we only allow this if no user ports are open that use chip resources
+ */
+int ipath_reset_device(int unit)
+{
+	int ret, i;
+	struct ipath_devdata *dd = ipath_lookup(unit);
+	unsigned long flags;
+
+	if (!dd) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	if (atomic_read(&dd->ipath_led_override_timer_active)) {
+		/* Need to stop LED timer, _then_ shut off LEDs */
+		del_timer_sync(&dd->ipath_led_override_timer);
+		atomic_set(&dd->ipath_led_override_timer_active, 0);
+	}
+
+	/* Shut off LEDs after we are sure timer is not running */
+	dd->ipath_led_override = LED_OVER_BOTH_OFF;
+	dd->ipath_f_setextled(dd, 0, 0);
+
+	dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
+
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
+		dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
+			 "not initialized or not present\n", unit);
+		ret = -ENXIO;
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	if (dd->ipath_pd)
+		for (i = 1; i < dd->ipath_cfgports; i++) {
+			if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+				continue;
+			spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+			ipath_dbg("unit %u port %d is in use "
+				  "(PID %u cmd %s), can't reset\n",
+				  unit, i,
+				  pid_nr(dd->ipath_pd[i]->port_pid),
+				  dd->ipath_pd[i]->port_comm);
+			ret = -EBUSY;
+			goto bail;
+		}
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+		teardown_sdma(dd);
+
+	dd->ipath_flags &= ~IPATH_INITTED;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+	ret = dd->ipath_f_reset(dd);
+	if (ret == 1) {
+		ipath_dbg("Reinitializing unit %u after reset attempt\n",
+			  unit);
+		ret = ipath_init_chip(dd, 1);
+	} else
+		ret = -EAGAIN;
+	if (ret)
+		ipath_dev_err(dd, "Reinitialize unit %u after "
+			      "reset failed with %d\n", unit, ret);
+	else
+		dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
+			 "resetting\n", unit);
+
+bail:
+	return ret;
+}
+
+/*
+ * send a signal to all the processes that have the driver open
+ * through the normal interfaces (i.e., everything other than diags
+ * interface).  Returns number of signalled processes.
+ */
+static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
+{
+	int i, sub, any = 0;
+	struct pid *pid;
+	unsigned long flags;
+
+	if (!dd->ipath_pd)
+		return 0;
+
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	for (i = 1; i < dd->ipath_cfgports; i++) {
+		if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+			continue;
+		pid = dd->ipath_pd[i]->port_pid;
+		if (!pid)
+			continue;
+
+		dev_info(&dd->pcidev->dev, "context %d in use "
+			  "(PID %u), sending signal %d\n",
+			  i, pid_nr(pid), sig);
+		kill_pid(pid, sig, 1);
+		any++;
+		for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
+			pid = dd->ipath_pd[i]->port_subpid[sub];
+			if (!pid)
+				continue;
+			dev_info(&dd->pcidev->dev, "sub-context "
+				"%d:%d in use (PID %u), sending "
+				"signal %d\n", i, sub, pid_nr(pid), sig);
+			kill_pid(pid, sig, 1);
+			any++;
+		}
+	}
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+	return any;
+}
+
+static void ipath_hol_signal_down(struct ipath_devdata *dd)
+{
+	if (ipath_signal_procs(dd, SIGSTOP))
+		ipath_dbg("Stopped some processes\n");
+	ipath_cancel_sends(dd, 1);
+}
+
+
+static void ipath_hol_signal_up(struct ipath_devdata *dd)
+{
+	if (ipath_signal_procs(dd, SIGCONT))
+		ipath_dbg("Continued some processes\n");
+}
+
+/*
+ * link is down, stop any users processes, and flush pending sends
+ * to prevent HoL blocking, then start the HoL timer that
+ * periodically continues, then stop procs, so they can detect
+ * link down if they want, and do something about it.
+ * Timer may already be running, so use mod_timer, not add_timer.
+ */
+void ipath_hol_down(struct ipath_devdata *dd)
+{
+	dd->ipath_hol_state = IPATH_HOL_DOWN;
+	ipath_hol_signal_down(dd);
+	dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+	dd->ipath_hol_timer.expires = jiffies +
+		msecs_to_jiffies(ipath_hol_timeout_ms);
+	mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
+}
+
+/*
+ * link is up, continue any user processes, and ensure timer
+ * is a nop, if running.  Let timer keep running, if set; it
+ * will nop when it sees the link is up
+ */
+void ipath_hol_up(struct ipath_devdata *dd)
+{
+	ipath_hol_signal_up(dd);
+	dd->ipath_hol_state = IPATH_HOL_UP;
+}
+
+/*
+ * toggle the running/not running state of user proceses
+ * to prevent HoL blocking on chip resources, but still allow
+ * user processes to do link down special case handling.
+ * Should only be called via the timer
+ */
+void ipath_hol_event(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+	if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
+		&& dd->ipath_hol_state != IPATH_HOL_UP) {
+		dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+		ipath_dbg("Stopping processes\n");
+		ipath_hol_signal_down(dd);
+	} else { /* may do "extra" if also in ipath_hol_up() */
+		dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
+		ipath_dbg("Continuing processes\n");
+		ipath_hol_signal_up(dd);
+	}
+	if (dd->ipath_hol_state == IPATH_HOL_UP)
+		ipath_dbg("link's up, don't resched timer\n");
+	else {
+		dd->ipath_hol_timer.expires = jiffies +
+			msecs_to_jiffies(ipath_hol_timeout_ms);
+		mod_timer(&dd->ipath_hol_timer,
+			dd->ipath_hol_timer.expires);
+	}
+}
+
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
+{
+	u64 val;
+
+	if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
+		return -1;
+	if (dd->ipath_rx_pol_inv != new_pol_inv) {
+		dd->ipath_rx_pol_inv = new_pol_inv;
+		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+		val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+			 INFINIPATH_XGXS_RX_POL_SHIFT);
+		val |= ((u64)dd->ipath_rx_pol_inv) <<
+			INFINIPATH_XGXS_RX_POL_SHIFT;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+	}
+	return 0;
+}
+
+/*
+ * Disable and enable the armlaunch error.  Used for PIO bandwidth testing on
+ * the 7220, which is count-based, rather than trigger-based.  Safe for the
+ * driver check, since it's at init.   Not completely safe when used for
+ * user-mode checking, since some error checking can be lost, but not
+ * particularly risky, and only has problematic side-effects in the face of
+ * very buggy user code.  There is no reference counting, but that's also
+ * fine, given the intended use.
+ */
+void ipath_enable_armlaunch(struct ipath_devdata *dd)
+{
+	dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+		INFINIPATH_E_SPIOARMLAUNCH);
+	dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+}
+
+void ipath_disable_armlaunch(struct ipath_devdata *dd)
+{
+	/* so don't re-enable if already set */
+	dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+}
+
+module_init(infinipath_init);
+module_exit(infinipath_cleanup);
diff --git a/drivers/staging/rdma/ipath/ipath_eeprom.c b/drivers/staging/rdma/ipath/ipath_eeprom.c
new file mode 100644
index 0000000..fc71819
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_eeprom.c
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+
+/*
+ * InfiniPath I2C driver for a serial eeprom.  This is not a generic
+ * I2C interface.  For a start, the device we're using (Atmel AT24C11)
+ * doesn't work like a regular I2C device.  It looks like one
+ * electrically, but not logically.  Normal I2C devices have a single
+ * 7-bit or 10-bit I2C address that they respond to.  Valid 7-bit
+ * addresses range from 0x03 to 0x77.  Addresses 0x00 to 0x02 and 0x78
+ * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
+ * call" address.)  The Atmel device, on the other hand, responds to ALL
+ * 7-bit addresses.  It's designed to be the only device on a given I2C
+ * bus.  A 7-bit address corresponds to the memory address within the
+ * Atmel device itself.
+ *
+ * Also, the timing requirements mean more than simple software
+ * bitbanging, with readbacks from chip to ensure timing (simple udelay
+ * is not enough).
+ *
+ * This all means that accessing the device is specialized enough
+ * that using the standard kernel I2C bitbanging interface would be
+ * impossible.  For example, the core I2C eeprom driver expects to find
+ * a device at one or more of a limited set of addresses only.  It doesn't
+ * allow writing to an eeprom.  It also doesn't provide any means of
+ * accessing eeprom contents from within the kernel, only via sysfs.
+ */
+
+/* Added functionality for IBA7220-based cards */
+#define IPATH_EEPROM_DEV_V1 0xA0
+#define IPATH_EEPROM_DEV_V2 0xA2
+#define IPATH_TEMP_DEV 0x98
+#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2)
+#define IPATH_NO_DEV (0xFF)
+
+/*
+ * The number of I2C chains is proliferating. Table below brings
+ * some order to the madness. The basic principle is that the
+ * table is scanned from the top, and a "probe" is made to the
+ * device probe_dev. If that succeeds, the chain is considered
+ * to be of that type, and dd->i2c_chain_type is set to the index+1
+ * of the entry.
+ * The +1 is so static initialization can mean "unknown, do probe."
+ */
+static struct i2c_chain_desc {
+	u8 probe_dev;	/* If seen at probe, chain is this type */
+	u8 eeprom_dev;	/* Dev addr (if any) for EEPROM */
+	u8 temp_dev;	/* Dev Addr (if any) for Temp-sense */
+} i2c_chains[] = {
+	{ IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */
+	{ IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */
+	{ IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */
+	{ IPATH_NO_DEV }
+};
+
+enum i2c_type {
+	i2c_line_scl = 0,
+	i2c_line_sda
+};
+
+enum i2c_state {
+	i2c_line_low = 0,
+	i2c_line_high
+};
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_gpio_set - set a GPIO line
+ * @dd: the infinipath device
+ * @line: the line to set
+ * @new_line_state: the state to set
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.
+ */
+static int i2c_gpio_set(struct ipath_devdata *dd,
+			enum i2c_type line,
+			enum i2c_state new_line_state)
+{
+	u64 out_mask, dir_mask, *gpioval;
+	unsigned long flags = 0;
+
+	gpioval = &dd->ipath_gpio_out;
+
+	if (line == i2c_line_scl) {
+		dir_mask = dd->ipath_gpio_scl;
+		out_mask = (1UL << dd->ipath_gpio_scl_num);
+	} else {
+		dir_mask = dd->ipath_gpio_sda;
+		out_mask = (1UL << dd->ipath_gpio_sda_num);
+	}
+
+	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+	if (new_line_state == i2c_line_high) {
+		/* tri-state the output rather than force high */
+		dd->ipath_extctrl &= ~dir_mask;
+	} else {
+		/* config line to be an output */
+		dd->ipath_extctrl |= dir_mask;
+	}
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+
+	/* set output as well (no real verify) */
+	if (new_line_state == i2c_line_high)
+		*gpioval |= out_mask;
+	else
+		*gpioval &= ~out_mask;
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
+	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+	return 0;
+}
+
+/**
+ * i2c_gpio_get - get a GPIO line state
+ * @dd: the infinipath device
+ * @line: the line to get
+ * @curr_statep: where to put the line state
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.  curr_state is not set on error.
+ */
+static int i2c_gpio_get(struct ipath_devdata *dd,
+			enum i2c_type line,
+			enum i2c_state *curr_statep)
+{
+	u64 read_val, mask;
+	int ret;
+	unsigned long flags = 0;
+
+	/* check args */
+	if (curr_statep == NULL) {
+		ret = 1;
+		goto bail;
+	}
+
+	/* config line to be an input */
+	if (line == i2c_line_scl)
+		mask = dd->ipath_gpio_scl;
+	else
+		mask = dd->ipath_gpio_sda;
+
+	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+	dd->ipath_extctrl &= ~mask;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+	/*
+	 * Below is very unlikely to reflect true input state if Output
+	 * Enable actually changed.
+	 */
+	read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+	if (read_val & mask)
+		*curr_statep = i2c_line_high;
+	else
+		*curr_statep = i2c_line_low;
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the infinipath device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct ipath_devdata *dd)
+{
+	(void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+	rmb();
+}
+
+static void scl_out(struct ipath_devdata *dd, u8 bit)
+{
+	udelay(1);
+	i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
+
+	i2c_wait_for_writes(dd);
+}
+
+static void sda_out(struct ipath_devdata *dd, u8 bit)
+{
+	i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
+
+	i2c_wait_for_writes(dd);
+}
+
+static u8 sda_in(struct ipath_devdata *dd, int wait)
+{
+	enum i2c_state bit;
+
+	if (i2c_gpio_get(dd, i2c_line_sda, &bit))
+		ipath_dbg("get bit failed!\n");
+
+	if (wait)
+		i2c_wait_for_writes(dd);
+
+	return bit == i2c_line_high ? 1U : 0;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the infinipath device
+ */
+static int i2c_ackrcv(struct ipath_devdata *dd)
+{
+	u8 ack_received;
+
+	/* AT ENTRY SCL = LOW */
+	/* change direction, ignore data */
+	ack_received = sda_in(dd, 1);
+	scl_out(dd, i2c_line_high);
+	ack_received = sda_in(dd, 1) == 0;
+	scl_out(dd, i2c_line_low);
+	return ack_received;
+}
+
+/**
+ * rd_byte - read a byte, leaving ACK, STOP, etc up to caller
+ * @dd: the infinipath device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct ipath_devdata *dd)
+{
+	int bit_cntr, data;
+
+	data = 0;
+
+	for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+		data <<= 1;
+		scl_out(dd, i2c_line_high);
+		data |= sda_in(dd, 0);
+		scl_out(dd, i2c_line_low);
+	}
+	return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the infinipath device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct ipath_devdata *dd, u8 data)
+{
+	int bit_cntr;
+	u8 bit;
+
+	for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+		bit = (data >> bit_cntr) & 1;
+		sda_out(dd, bit);
+		scl_out(dd, i2c_line_high);
+		scl_out(dd, i2c_line_low);
+	}
+	return (!i2c_ackrcv(dd)) ? 1 : 0;
+}
+
+static void send_ack(struct ipath_devdata *dd)
+{
+	sda_out(dd, i2c_line_low);
+	scl_out(dd, i2c_line_high);
+	scl_out(dd, i2c_line_low);
+	sda_out(dd, i2c_line_high);
+}
+
+/**
+ * i2c_startcmd - transmit the start condition, followed by address/cmd
+ * @dd: the infinipath device
+ * @offset_dir: direction byte
+ *
+ *      (both clock/data high, clock high, data low while clock is high)
+ */
+static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
+{
+	int res;
+
+	/* issue start sequence */
+	sda_out(dd, i2c_line_high);
+	scl_out(dd, i2c_line_high);
+	sda_out(dd, i2c_line_low);
+	scl_out(dd, i2c_line_low);
+
+	/* issue length and direction byte */
+	res = wr_byte(dd, offset_dir);
+
+	if (res)
+		ipath_cdbg(VERBOSE, "No ack to complete start\n");
+
+	return res;
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the infinipath device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct ipath_devdata *dd)
+{
+	scl_out(dd, i2c_line_low);
+	sda_out(dd, i2c_line_low);
+	scl_out(dd, i2c_line_high);
+	sda_out(dd, i2c_line_high);
+	udelay(2);
+}
+
+/**
+ * eeprom_reset - reset I2C communication
+ * @dd: the infinipath device
+ */
+
+static int eeprom_reset(struct ipath_devdata *dd)
+{
+	int clock_cycles_left = 9;
+	u64 *gpioval = &dd->ipath_gpio_out;
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+	/* Make sure shadows are consistent */
+	dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
+	*gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
+	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+	ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
+		   "is %llx\n", (unsigned long long) *gpioval);
+
+	/*
+	 * This is to get the i2c into a known state, by first going low,
+	 * then tristate sda (and then tristate scl as first thing
+	 * in loop)
+	 */
+	scl_out(dd, i2c_line_low);
+	sda_out(dd, i2c_line_high);
+
+	/* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */
+	while (clock_cycles_left--) {
+		scl_out(dd, i2c_line_high);
+
+		/* SDA seen high, issue START by dropping it while SCL high */
+		if (sda_in(dd, 0)) {
+			sda_out(dd, i2c_line_low);
+			scl_out(dd, i2c_line_low);
+			/* ATMEL spec says must be followed by STOP. */
+			scl_out(dd, i2c_line_high);
+			sda_out(dd, i2c_line_high);
+			ret = 0;
+			goto bail;
+		}
+
+		scl_out(dd, i2c_line_low);
+	}
+
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/*
+ * Probe for I2C device at specified address. Returns 0 for "success"
+ * to match rest of this file.
+ * Leave bus in "reasonable" state for further commands.
+ */
+static int i2c_probe(struct ipath_devdata *dd, int devaddr)
+{
+	int ret = 0;
+
+	ret = eeprom_reset(dd);
+	if (ret) {
+		ipath_dev_err(dd, "Failed reset probing device 0x%02X\n",
+			      devaddr);
+		return ret;
+	}
+	/*
+	 * Reset no longer leaves bus in start condition, so normal
+	 * i2c_startcmd() will do.
+	 */
+	ret = i2c_startcmd(dd, devaddr | READ_CMD);
+	if (ret)
+		ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n",
+			   devaddr);
+	else {
+		/*
+		 * Device did respond. Complete a single-byte read, because some
+		 * devices apparently cannot handle STOP immediately after they
+		 * ACK the start-cmd.
+		 */
+		int data;
+		data = rd_byte(dd);
+		stop_cmd(dd);
+		ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr);
+	}
+	return ret;
+}
+
+/*
+ * Returns the "i2c type". This is a pointer to a struct that describes
+ * the I2C chain on this board. To minimize impact on struct ipath_devdata,
+ * the (small integer) index into the table is actually memoized, rather
+ * then the pointer.
+ * Memoization is because the type is determined on the first call per chip.
+ * An alternative would be to move type determination to early
+ * init code.
+ */
+static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd)
+{
+	int idx;
+
+	/* Get memoized index, from previous successful probes */
+	idx = dd->ipath_i2c_chain_type - 1;
+	if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1))
+		goto done;
+
+	idx = 0;
+	while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) {
+		/* if probe succeeds, this is type */
+		if (!i2c_probe(dd, i2c_chains[idx].probe_dev))
+			break;
+		++idx;
+	}
+
+	/*
+	 * Old EEPROM (first entry) may require a reset after probe,
+	 * rather than being able to "start" after "stop"
+	 */
+	if (idx == 0)
+		eeprom_reset(dd);
+
+	if (i2c_chains[idx].probe_dev == IPATH_NO_DEV)
+		idx = -1;
+	else
+		dd->ipath_i2c_chain_type = idx + 1;
+done:
+	return (idx >= 0) ? i2c_chains + idx : NULL;
+}
+
+static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
+					u8 eeprom_offset, void *buffer, int len)
+{
+	int ret;
+	struct i2c_chain_desc *icd;
+	u8 *bp = buffer;
+
+	ret = 1;
+	icd = ipath_i2c_type(dd);
+	if (!icd)
+		goto bail;
+
+	if (icd->eeprom_dev == IPATH_NO_DEV) {
+		/* legacy not-really-I2C */
+		ipath_cdbg(VERBOSE, "Start command only address\n");
+		eeprom_offset = (eeprom_offset << 1) | READ_CMD;
+		ret = i2c_startcmd(dd, eeprom_offset);
+	} else {
+		/* Actual I2C */
+		ipath_cdbg(VERBOSE, "Start command uses devaddr\n");
+		if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+			ipath_dbg("Failed EEPROM startcmd\n");
+			stop_cmd(dd);
+			ret = 1;
+			goto bail;
+		}
+		ret = wr_byte(dd, eeprom_offset);
+		stop_cmd(dd);
+		if (ret) {
+			ipath_dev_err(dd, "Failed to write EEPROM address\n");
+			ret = 1;
+			goto bail;
+		}
+		ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD);
+	}
+	if (ret) {
+		ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev);
+		stop_cmd(dd);
+		ret = 1;
+		goto bail;
+	}
+
+	/*
+	 * eeprom keeps clocking data out as long as we ack, automatically
+	 * incrementing the address.
+	 */
+	while (len-- > 0) {
+		/* get and store data */
+		*bp++ = rd_byte(dd);
+		/* send ack if not the last byte */
+		if (len)
+			send_ack(dd);
+	}
+
+	stop_cmd(dd);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
+				       const void *buffer, int len)
+{
+	int sub_len;
+	const u8 *bp = buffer;
+	int max_wait_time, i;
+	int ret;
+	struct i2c_chain_desc *icd;
+
+	ret = 1;
+	icd = ipath_i2c_type(dd);
+	if (!icd)
+		goto bail;
+
+	while (len > 0) {
+		if (icd->eeprom_dev == IPATH_NO_DEV) {
+			if (i2c_startcmd(dd,
+					 (eeprom_offset << 1) | WRITE_CMD)) {
+				ipath_dbg("Failed to start cmd offset %u\n",
+					eeprom_offset);
+				goto failed_write;
+			}
+		} else {
+			/* Real I2C */
+			if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+				ipath_dbg("Failed EEPROM startcmd\n");
+				goto failed_write;
+			}
+			ret = wr_byte(dd, eeprom_offset);
+			if (ret) {
+				ipath_dev_err(dd, "Failed to write EEPROM "
+					      "address\n");
+				goto failed_write;
+			}
+		}
+
+		sub_len = min(len, 4);
+		eeprom_offset += sub_len;
+		len -= sub_len;
+
+		for (i = 0; i < sub_len; i++) {
+			if (wr_byte(dd, *bp++)) {
+				ipath_dbg("no ack after byte %u/%u (%u "
+					  "total remain)\n", i, sub_len,
+					  len + sub_len - i);
+				goto failed_write;
+			}
+		}
+
+		stop_cmd(dd);
+
+		/*
+		 * wait for write complete by waiting for a successful
+		 * read (the chip replies with a zero after the write
+		 * cmd completes, and before it writes to the eeprom.
+		 * The startcmd for the read will fail the ack until
+		 * the writes have completed.   We do this inline to avoid
+		 * the debug prints that are in the real read routine
+		 * if the startcmd fails.
+		 * We also use the proper device address, so it doesn't matter
+		 * whether we have real eeprom_dev. legacy likes any address.
+		 */
+		max_wait_time = 100;
+		while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) {
+			stop_cmd(dd);
+			if (!--max_wait_time) {
+				ipath_dbg("Did not get successful read to "
+					  "complete write\n");
+				goto failed_write;
+			}
+		}
+		/* now read (and ignore) the resulting byte */
+		rd_byte(dd);
+		stop_cmd(dd);
+	}
+
+	ret = 0;
+	goto bail;
+
+failed_write:
+	stop_cmd(dd);
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_eeprom_read - receives bytes from the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: address to read from
+ * @buffer: where to store result
+ * @len: number of bytes to receive
+ */
+int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
+			void *buff, int len)
+{
+	int ret;
+
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (!ret) {
+		ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
+		mutex_unlock(&dd->ipath_eep_lock);
+	}
+
+	return ret;
+}
+
+/**
+ * ipath_eeprom_write - writes data to the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: where to place data
+ * @buffer: data to write
+ * @len: number of bytes to write
+ */
+int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
+			const void *buff, int len)
+{
+	int ret;
+
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (!ret) {
+		ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
+		mutex_unlock(&dd->ipath_eep_lock);
+	}
+
+	return ret;
+}
+
+static u8 flash_csum(struct ipath_flash *ifp, int adjust)
+{
+	u8 *ip = (u8 *) ifp;
+	u8 csum = 0, len;
+
+	/*
+	 * Limit length checksummed to max length of actual data.
+	 * Checksum of erased eeprom will still be bad, but we avoid
+	 * reading past the end of the buffer we were passed.
+	 */
+	len = ifp->if_length;
+	if (len > sizeof(struct ipath_flash))
+		len = sizeof(struct ipath_flash);
+	while (len--)
+		csum += *ip++;
+	csum -= ifp->if_csum;
+	csum = ~csum;
+	if (adjust)
+		ifp->if_csum = csum;
+
+	return csum;
+}
+
+/**
+ * ipath_get_guid - get the GUID from the i2c device
+ * @dd: the infinipath device
+ *
+ * We have the capability to use the ipath_nguid field, and get
+ * the guid from the first chip's flash, to use for all of them.
+ */
+void ipath_get_eeprom_info(struct ipath_devdata *dd)
+{
+	void *buf;
+	struct ipath_flash *ifp;
+	__be64 guid;
+	int len, eep_stat;
+	u8 csum, *bguid;
+	int t = dd->ipath_unit;
+	struct ipath_devdata *dd0 = ipath_lookup(0);
+
+	if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
+		u8 oguid;
+		dd->ipath_guid = dd0->ipath_guid;
+		bguid = (u8 *) & dd->ipath_guid;
+
+		oguid = bguid[7];
+		bguid[7] += t;
+		if (oguid > bguid[7]) {
+			if (bguid[6] == 0xff) {
+				if (bguid[5] == 0xff) {
+					ipath_dev_err(
+						dd,
+						"Can't set %s GUID from "
+						"base, wraps to OUI!\n",
+						ipath_get_unit_name(t));
+					dd->ipath_guid = 0;
+					goto bail;
+				}
+				bguid[5]++;
+			}
+			bguid[6]++;
+		}
+		dd->ipath_nguid = 1;
+
+		ipath_dbg("nguid %u, so adding %u to device 0 guid, "
+			  "for %llx\n",
+			  dd0->ipath_nguid, t,
+			  (unsigned long long) be64_to_cpu(dd->ipath_guid));
+		goto bail;
+	}
+
+	/*
+	 * read full flash, not just currently used part, since it may have
+	 * been written with a newer definition
+	 * */
+	len = sizeof(struct ipath_flash);
+	buf = vmalloc(len);
+	if (!buf) {
+		ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+			      "bytes from eeprom for GUID\n", len);
+		goto bail;
+	}
+
+	mutex_lock(&dd->ipath_eep_lock);
+	eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
+	mutex_unlock(&dd->ipath_eep_lock);
+
+	if (eep_stat) {
+		ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
+		goto done;
+	}
+	ifp = (struct ipath_flash *)buf;
+
+	csum = flash_csum(ifp, 0);
+	if (csum != ifp->if_csum) {
+		dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
+			 "0x%x, not 0x%x\n", csum, ifp->if_csum);
+		goto done;
+	}
+	if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
+	    *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
+		ipath_dev_err(dd, "Invalid GUID %llx from flash; "
+			      "ignoring\n",
+			      *(unsigned long long *) ifp->if_guid);
+		/* don't allow GUID if all 0 or all 1's */
+		goto done;
+	}
+
+	/* complain, but allow it */
+	if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
+		dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
+			 "default, probably not correct!\n",
+			 *(unsigned long long *) ifp->if_guid);
+
+	bguid = ifp->if_guid;
+	if (!bguid[0] && !bguid[1] && !bguid[2]) {
+		/* original incorrect GUID format in flash; fix in
+		 * core copy, by shifting up 2 octets; don't need to
+		 * change top octet, since both it and shifted are
+		 * 0.. */
+		bguid[1] = bguid[3];
+		bguid[2] = bguid[4];
+		bguid[3] = bguid[4] = 0;
+		guid = *(__be64 *) ifp->if_guid;
+		ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
+			   "shifting 2 octets\n");
+	} else
+		guid = *(__be64 *) ifp->if_guid;
+	dd->ipath_guid = guid;
+	dd->ipath_nguid = ifp->if_numguid;
+	/*
+	 * Things are slightly complicated by the desire to transparently
+	 * support both the Pathscale 10-digit serial number and the QLogic
+	 * 13-character version.
+	 */
+	if ((ifp->if_fversion > 1) && ifp->if_sprefix[0]
+		&& ((u8 *)ifp->if_sprefix)[0] != 0xFF) {
+		/* This board has a Serial-prefix, which is stored
+		 * elsewhere for backward-compatibility.
+		 */
+		char *snp = dd->ipath_serial;
+		memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
+		snp[sizeof ifp->if_sprefix] = '\0';
+		len = strlen(snp);
+		snp += len;
+		len = (sizeof dd->ipath_serial) - len;
+		if (len > sizeof ifp->if_serial) {
+			len = sizeof ifp->if_serial;
+		}
+		memcpy(snp, ifp->if_serial, len);
+	} else
+		memcpy(dd->ipath_serial, ifp->if_serial,
+		       sizeof ifp->if_serial);
+	if (!strstr(ifp->if_comment, "Tested successfully"))
+		ipath_dev_err(dd, "Board SN %s did not pass functional "
+			"test: %s\n", dd->ipath_serial,
+			ifp->if_comment);
+
+	ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
+		   (unsigned long long) be64_to_cpu(dd->ipath_guid));
+
+	memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
+	/*
+	 * Power-on (actually "active") hours are kept as little-endian value
+	 * in EEPROM, but as seconds in a (possibly as small as 24-bit)
+	 * atomic_t while running.
+	 */
+	atomic_set(&dd->ipath_active_time, 0);
+	dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
+
+done:
+	vfree(buf);
+
+bail:;
+}
+
+/**
+ * ipath_update_eeprom_log - copy active-time and error counters to eeprom
+ * @dd: the infinipath device
+ *
+ * Although the time is kept as seconds in the ipath_devdata struct, it is
+ * rounded to hours for re-write, as we have only 16 bits in EEPROM.
+ * First-cut code reads whole (expected) struct ipath_flash, modifies,
+ * re-writes. Future direction: read/write only what we need, assuming
+ * that the EEPROM had to have been "good enough" for driver init, and
+ * if not, we aren't making it worse.
+ *
+ */
+
+int ipath_update_eeprom_log(struct ipath_devdata *dd)
+{
+	void *buf;
+	struct ipath_flash *ifp;
+	int len, hi_water;
+	uint32_t new_time, new_hrs;
+	u8 csum;
+	int ret, idx;
+	unsigned long flags;
+
+	/* first, check if we actually need to do anything. */
+	ret = 0;
+	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+		if (dd->ipath_eep_st_new_errs[idx]) {
+			ret = 1;
+			break;
+		}
+	}
+	new_time = atomic_read(&dd->ipath_active_time);
+
+	if (ret == 0 && new_time < 3600)
+		return 0;
+
+	/*
+	 * The quick-check above determined that there is something worthy
+	 * of logging, so get current contents and do a more detailed idea.
+	 * read full flash, not just currently used part, since it may have
+	 * been written with a newer definition
+	 */
+	len = sizeof(struct ipath_flash);
+	buf = vmalloc(len);
+	ret = 1;
+	if (!buf) {
+		ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+				"bytes from eeprom for logging\n", len);
+		goto bail;
+	}
+
+	/* Grab semaphore and read current EEPROM. If we get an
+	 * error, let go, but if not, keep it until we finish write.
+	 */
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (ret) {
+		ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
+		goto free_bail;
+	}
+	ret = ipath_eeprom_internal_read(dd, 0, buf, len);
+	if (ret) {
+		mutex_unlock(&dd->ipath_eep_lock);
+		ipath_dev_err(dd, "Unable read EEPROM for logging\n");
+		goto free_bail;
+	}
+	ifp = (struct ipath_flash *)buf;
+
+	csum = flash_csum(ifp, 0);
+	if (csum != ifp->if_csum) {
+		mutex_unlock(&dd->ipath_eep_lock);
+		ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
+				csum, ifp->if_csum);
+		ret = 1;
+		goto free_bail;
+	}
+	hi_water = 0;
+	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+		int new_val = dd->ipath_eep_st_new_errs[idx];
+		if (new_val) {
+			/*
+			 * If we have seen any errors, add to EEPROM values
+			 * We need to saturate at 0xFF (255) and we also
+			 * would need to adjust the checksum if we were
+			 * trying to minimize EEPROM traffic
+			 * Note that we add to actual current count in EEPROM,
+			 * in case it was altered while we were running.
+			 */
+			new_val += ifp->if_errcntp[idx];
+			if (new_val > 0xFF)
+				new_val = 0xFF;
+			if (ifp->if_errcntp[idx] != new_val) {
+				ifp->if_errcntp[idx] = new_val;
+				hi_water = offsetof(struct ipath_flash,
+						if_errcntp) + idx;
+			}
+			/*
+			 * update our shadow (used to minimize EEPROM
+			 * traffic), to match what we are about to write.
+			 */
+			dd->ipath_eep_st_errs[idx] = new_val;
+			dd->ipath_eep_st_new_errs[idx] = 0;
+		}
+	}
+	/*
+	 * now update active-time. We would like to round to the nearest hour
+	 * but unless atomic_t are sure to be proper signed ints we cannot,
+	 * because we need to account for what we "transfer" to EEPROM and
+	 * if we log an hour at 31 minutes, then we would need to set
+	 * active_time to -29 to accurately count the _next_ hour.
+	 */
+	if (new_time >= 3600) {
+		new_hrs = new_time / 3600;
+		atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
+		new_hrs += dd->ipath_eep_hrs;
+		if (new_hrs > 0xFFFF)
+			new_hrs = 0xFFFF;
+		dd->ipath_eep_hrs = new_hrs;
+		if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
+			ifp->if_powerhour[0] = new_hrs & 0xFF;
+			hi_water = offsetof(struct ipath_flash, if_powerhour);
+		}
+		if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
+			ifp->if_powerhour[1] = new_hrs >> 8;
+			hi_water = offsetof(struct ipath_flash, if_powerhour)
+					+ 1;
+		}
+	}
+	/*
+	 * There is a tiny possibility that we could somehow fail to write
+	 * the EEPROM after updating our shadows, but problems from holding
+	 * the spinlock too long are a much bigger issue.
+	 */
+	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+	if (hi_water) {
+		/* we made some change to the data, uopdate cksum and write */
+		csum = flash_csum(ifp, 1);
+		ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
+	}
+	mutex_unlock(&dd->ipath_eep_lock);
+	if (ret)
+		ipath_dev_err(dd, "Failed updating EEPROM\n");
+
+free_bail:
+	vfree(buf);
+bail:
+	return ret;
+
+}
+
+/**
+ * ipath_inc_eeprom_err - increment one of the four error counters
+ * that are logged to EEPROM.
+ * @dd: the infinipath device
+ * @eidx: 0..3, the counter to increment
+ * @incr: how much to add
+ *
+ * Each counter is 8-bits, and saturates at 255 (0xFF). They
+ * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
+ * is called, but it can only be called in a context that allows sleep.
+ * This function can be called even at interrupt level.
+ */
+
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
+{
+	uint new_val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+	new_val = dd->ipath_eep_st_new_errs[eidx] + incr;
+	if (new_val > 255)
+		new_val = 255;
+	dd->ipath_eep_st_new_errs[eidx] = new_val;
+	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+	return;
+}
+
+static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum)
+{
+	int ret;
+	struct i2c_chain_desc *icd;
+
+	ret = -ENOENT;
+
+	icd = ipath_i2c_type(dd);
+	if (!icd)
+		goto bail;
+
+	if (icd->temp_dev == IPATH_NO_DEV) {
+		/* tempsense only exists on new, real-I2C boards */
+		ret = -ENXIO;
+		goto bail;
+	}
+
+	if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+		ipath_dbg("Failed tempsense startcmd\n");
+		stop_cmd(dd);
+		ret = -ENXIO;
+		goto bail;
+	}
+	ret = wr_byte(dd, regnum);
+	stop_cmd(dd);
+	if (ret) {
+		ipath_dev_err(dd, "Failed tempsense WR command %02X\n",
+			      regnum);
+		ret = -ENXIO;
+		goto bail;
+	}
+	if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) {
+		ipath_dbg("Failed tempsense RD startcmd\n");
+		stop_cmd(dd);
+		ret = -ENXIO;
+		goto bail;
+	}
+	/*
+	 * We can only clock out one byte per command, sensibly
+	 */
+	ret = rd_byte(dd);
+	stop_cmd(dd);
+
+bail:
+	return ret;
+}
+
+#define VALID_TS_RD_REG_MASK 0xBF
+
+/**
+ * ipath_tempsense_read - read register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to read from
+ *
+ * returns reg contents (0..255) or < 0 for error
+ */
+int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum)
+{
+	int ret;
+
+	if (regnum > 7)
+		return -EINVAL;
+
+	/* return a bogus value for (the one) register we do not have */
+	if (!((1 << regnum) & VALID_TS_RD_REG_MASK))
+		return 0;
+
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (!ret) {
+		ret = ipath_tempsense_internal_read(dd, regnum);
+		mutex_unlock(&dd->ipath_eep_lock);
+	}
+
+	/*
+	 * There are three possibilities here:
+	 * ret is actual value (0..255)
+	 * ret is -ENXIO or -EINVAL from code in this file
+	 * ret is -EINTR from mutex_lock_interruptible.
+	 */
+	return ret;
+}
+
+static int ipath_tempsense_internal_write(struct ipath_devdata *dd,
+					  u8 regnum, u8 data)
+{
+	int ret = -ENOENT;
+	struct i2c_chain_desc *icd;
+
+	icd = ipath_i2c_type(dd);
+	if (!icd)
+		goto bail;
+
+	if (icd->temp_dev == IPATH_NO_DEV) {
+		/* tempsense only exists on new, real-I2C boards */
+		ret = -ENXIO;
+		goto bail;
+	}
+	if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+		ipath_dbg("Failed tempsense startcmd\n");
+		stop_cmd(dd);
+		ret = -ENXIO;
+		goto bail;
+	}
+	ret = wr_byte(dd, regnum);
+	if (ret) {
+		stop_cmd(dd);
+		ipath_dev_err(dd, "Failed to write tempsense command %02X\n",
+			      regnum);
+		ret = -ENXIO;
+		goto bail;
+	}
+	ret = wr_byte(dd, data);
+	stop_cmd(dd);
+	ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD);
+	if (ret) {
+		ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n",
+			      regnum);
+		ret = -ENXIO;
+	}
+
+bail:
+	return ret;
+}
+
+#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD))
+
+/**
+ * ipath_tempsense_write - write register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to write
+ * @data: data to write
+ *
+ * returns 0 for success or < 0 for error
+ */
+int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data)
+{
+	int ret;
+
+	if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK))
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (!ret) {
+		ret = ipath_tempsense_internal_write(dd, regnum, data);
+		mutex_unlock(&dd->ipath_eep_lock);
+	}
+
+	/*
+	 * There are three possibilities here:
+	 * ret is 0 for success
+	 * ret is -ENXIO or -EINVAL from code in this file
+	 * ret is -EINTR from mutex_lock_interruptible.
+	 */
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c
new file mode 100644
index 0000000..450d159
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_file_ops.c
@@ -0,0 +1,2620 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <asm/pgtable.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+#include "ipath_user_sdma.h"
+
+static int ipath_open(struct inode *, struct file *);
+static int ipath_close(struct inode *, struct file *);
+static ssize_t ipath_write(struct file *, const char __user *, size_t,
+			   loff_t *);
+static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from);
+static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
+static int ipath_mmap(struct file *, struct vm_area_struct *);
+
+/*
+ * This is really, really weird shit - write() and writev() here
+ * have completely unrelated semantics.  Sucky userland ABI,
+ * film at 11.
+ */
+static const struct file_operations ipath_file_ops = {
+	.owner = THIS_MODULE,
+	.write = ipath_write,
+	.write_iter = ipath_write_iter,
+	.open = ipath_open,
+	.release = ipath_close,
+	.poll = ipath_poll,
+	.mmap = ipath_mmap,
+	.llseek = noop_llseek,
+};
+
+/*
+ * Convert kernel virtual addresses to physical addresses so they don't
+ * potentially conflict with the chip addresses used as mmap offsets.
+ * It doesn't really matter what mmap offset we use as long as we can
+ * interpret it correctly.
+ */
+static u64 cvt_kvaddr(void *p)
+{
+	struct page *page;
+	u64 paddr = 0;
+
+	page = vmalloc_to_page(p);
+	if (page)
+		paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+	return paddr;
+}
+
+static int ipath_get_base_info(struct file *fp,
+			       void __user *ubase, size_t ubase_size)
+{
+	struct ipath_portdata *pd = port_fp(fp);
+	int ret = 0;
+	struct ipath_base_info *kinfo = NULL;
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned subport_cnt;
+	int shared, master;
+	size_t sz;
+
+	subport_cnt = pd->port_subport_cnt;
+	if (!subport_cnt) {
+		shared = 0;
+		master = 0;
+		subport_cnt = 1;
+	} else {
+		shared = 1;
+		master = !subport_fp(fp);
+	}
+
+	sz = sizeof(*kinfo);
+	/* If port sharing is not requested, allow the old size structure */
+	if (!shared)
+		sz -= 7 * sizeof(u64);
+	if (ubase_size < sz) {
+		ipath_cdbg(PROC,
+			   "Base size %zu, need %zu (version mismatch?)\n",
+			   ubase_size, sz);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
+	if (kinfo == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	ret = dd->ipath_f_get_base_info(pd, kinfo);
+	if (ret < 0)
+		goto bail;
+
+	kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
+	kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
+	kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
+	kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
+	/*
+	 * have to mmap whole thing
+	 */
+	kinfo->spi_rcv_egrbuftotlen =
+		pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+	kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
+	kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
+		pd->port_rcvegrbuf_chunks;
+	kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
+	if (master)
+		kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
+	/*
+	 * for this use, may be ipath_cfgports summed over all chips that
+	 * are are configured and present
+	 */
+	kinfo->spi_nports = dd->ipath_cfgports;
+	/* unit (chip/board) our port is on */
+	kinfo->spi_unit = dd->ipath_unit;
+	/* for now, only a single page */
+	kinfo->spi_tid_maxsize = PAGE_SIZE;
+
+	/*
+	 * Doing this per port, and based on the skip value, etc.  This has
+	 * to be the actual buffer size, since the protocol code treats it
+	 * as an array.
+	 *
+	 * These have to be set to user addresses in the user code via mmap.
+	 * These values are used on return to user code for the mmap target
+	 * addresses only.  For 32 bit, same 44 bit address problem, so use
+	 * the physical address, not virtual.  Before 2.6.11, using the
+	 * page_address() macro worked, but in 2.6.11, even that returns the
+	 * full 64 bit address (upper bits all 1's).  So far, using the
+	 * physical addresses (or chip offsets, for chip mapping) works, but
+	 * no doubt some future kernel release will change that, and we'll be
+	 * on to yet another method of dealing with this.
+	 */
+	kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
+	kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
+	kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
+	kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
+	kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+		(void *) dd->ipath_statusp -
+		(void *) dd->ipath_pioavailregs_dma;
+	if (!shared) {
+		kinfo->spi_piocnt = pd->port_piocnt;
+		kinfo->spi_piobufbase = (u64) pd->port_piobufs;
+		kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
+			dd->ipath_ureg_align * pd->port_port;
+	} else if (master) {
+		kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
+				    (pd->port_piocnt % subport_cnt);
+		/* Master's PIO buffers are after all the slave's */
+		kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+			dd->ipath_palign *
+			(pd->port_piocnt - kinfo->spi_piocnt);
+	} else {
+		unsigned slave = subport_fp(fp) - 1;
+
+		kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
+		kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+			dd->ipath_palign * kinfo->spi_piocnt * slave;
+	}
+
+	if (shared) {
+		kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
+			dd->ipath_ureg_align * pd->port_port;
+		kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
+		kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
+		kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
+
+		kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
+			PAGE_SIZE * subport_fp(fp));
+
+		kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
+			pd->port_rcvhdrq_size * subport_fp(fp));
+		kinfo->spi_rcvhdr_tailaddr = 0;
+		kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
+			pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
+			subport_fp(fp));
+
+		kinfo->spi_subport_uregbase =
+			cvt_kvaddr(pd->subport_uregbase);
+		kinfo->spi_subport_rcvegrbuf =
+			cvt_kvaddr(pd->subport_rcvegrbuf);
+		kinfo->spi_subport_rcvhdr_base =
+			cvt_kvaddr(pd->subport_rcvhdr_base);
+		ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
+			kinfo->spi_port, kinfo->spi_runtime_flags,
+			(unsigned long long) kinfo->spi_subport_uregbase,
+			(unsigned long long) kinfo->spi_subport_rcvegrbuf,
+			(unsigned long long) kinfo->spi_subport_rcvhdr_base);
+	}
+
+	/*
+	 * All user buffers are 2KB buffers.  If we ever support
+	 * giving 4KB buffers to user processes, this will need some
+	 * work.
+	 */
+	kinfo->spi_pioindex = (kinfo->spi_piobufbase -
+		(dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
+	kinfo->spi_pioalign = dd->ipath_palign;
+
+	kinfo->spi_qpair = IPATH_KD_QP;
+	/*
+	 * user mode PIO buffers are always 2KB, even when 4KB can
+	 * be received, and sent via the kernel; this is ibmaxlen
+	 * for 2K MTU.
+	 */
+	kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
+	kinfo->spi_mtu = dd->ipath_ibmaxlen;	/* maxlen, not ibmtu */
+	kinfo->spi_port = pd->port_port;
+	kinfo->spi_subport = subport_fp(fp);
+	kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
+	kinfo->spi_hw_version = dd->ipath_revision;
+
+	if (master) {
+		kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
+	}
+
+	sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
+	if (copy_to_user(ubase, kinfo, sz))
+		ret = -EFAULT;
+
+bail:
+	kfree(kinfo);
+	return ret;
+}
+
+/**
+ * ipath_tid_update - update a port TID
+ * @pd: the port
+ * @fp: the ipath device file
+ * @ti: the TID information
+ *
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller.   To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
+			    const struct ipath_tid_info *ti)
+{
+	int ret = 0, ntids;
+	u32 tid, porttid, cnt, i, tidcnt, tidoff;
+	u16 *tidlist;
+	struct ipath_devdata *dd = pd->port_dd;
+	u64 physaddr;
+	unsigned long vaddr;
+	u64 __iomem *tidbase;
+	unsigned long tidmap[8];
+	struct page **pagep = NULL;
+	unsigned subport = subport_fp(fp);
+
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	cnt = ti->tidcnt;
+	if (!cnt) {
+		ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
+			  (unsigned long long) ti->tidlist);
+		/*
+		 * Should we treat as success?  likely a bug
+		 */
+		ret = -EFAULT;
+		goto done;
+	}
+	porttid = pd->port_port * dd->ipath_rcvtidcnt;
+	if (!pd->port_subport_cnt) {
+		tidcnt = dd->ipath_rcvtidcnt;
+		tid = pd->port_tidcursor;
+		tidoff = 0;
+	} else if (!subport) {
+		tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+			 (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+		tidoff = dd->ipath_rcvtidcnt - tidcnt;
+		porttid += tidoff;
+		tid = tidcursor_fp(fp);
+	} else {
+		tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+		tidoff = tidcnt * (subport - 1);
+		porttid += tidoff;
+		tid = tidcursor_fp(fp);
+	}
+	if (cnt > tidcnt) {
+		/* make sure it all fits in port_tid_pg_list */
+		dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
+			 "TIDs, only trying max (%u)\n", cnt, tidcnt);
+		cnt = tidcnt;
+	}
+	pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
+	tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
+
+	memset(tidmap, 0, sizeof(tidmap));
+	/* before decrement; chip actual # */
+	ntids = tidcnt;
+	tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+				   dd->ipath_rcvtidbase +
+				   porttid * sizeof(*tidbase));
+
+	ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
+		   pd->port_port, cnt, tid, tidbase);
+
+	/* virtual address of first page in transfer */
+	vaddr = ti->tidvaddr;
+	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+		       cnt * PAGE_SIZE)) {
+		ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
+			  (void *)vaddr, cnt);
+		ret = -EFAULT;
+		goto done;
+	}
+	ret = ipath_get_user_pages(vaddr, cnt, pagep);
+	if (ret) {
+		if (ret == -EBUSY) {
+			ipath_dbg("Failed to lock addr %p, %u pages "
+				  "(already locked)\n",
+				  (void *) vaddr, cnt);
+			/*
+			 * for now, continue, and see what happens but with
+			 * the new implementation, this should never happen,
+			 * unless perhaps the user has mpin'ed the pages
+			 * themselves (something we need to test)
+			 */
+			ret = 0;
+		} else {
+			dev_info(&dd->pcidev->dev,
+				 "Failed to lock addr %p, %u pages: "
+				 "errno %d\n", (void *) vaddr, cnt, -ret);
+			goto done;
+		}
+	}
+	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+		for (; ntids--; tid++) {
+			if (tid == tidcnt)
+				tid = 0;
+			if (!dd->ipath_pageshadow[porttid + tid])
+				break;
+		}
+		if (ntids < 0) {
+			/*
+			 * oops, wrapped all the way through their TIDs,
+			 * and didn't have enough free; see comments at
+			 * start of routine
+			 */
+			ipath_dbg("Not enough free TIDs for %u pages "
+				  "(index %d), failing\n", cnt, i);
+			i--;	/* last tidlist[i] not filled in */
+			ret = -ENOMEM;
+			break;
+		}
+		tidlist[i] = tid + tidoff;
+		ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
+			   "vaddr %lx\n", i, tid + tidoff, vaddr);
+		/* we "know" system pages and TID pages are same size */
+		dd->ipath_pageshadow[porttid + tid] = pagep[i];
+		dd->ipath_physshadow[porttid + tid] = ipath_map_page(
+			dd->pcidev, pagep[i], 0, PAGE_SIZE,
+			PCI_DMA_FROMDEVICE);
+		/*
+		 * don't need atomic or it's overhead
+		 */
+		__set_bit(tid, tidmap);
+		physaddr = dd->ipath_physshadow[porttid + tid];
+		ipath_stats.sps_pagelocks++;
+		ipath_cdbg(VERBOSE,
+			   "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
+			   tid, vaddr, (unsigned long long) physaddr,
+			   pagep[i]);
+		dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
+				    physaddr);
+		/*
+		 * don't check this tid in ipath_portshadow, since we
+		 * just filled it in; start with the next one.
+		 */
+		tid++;
+	}
+
+	if (ret) {
+		u32 limit;
+	cleanup:
+		/* jump here if copy out of updated info failed... */
+		ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
+			  -ret, i, cnt);
+		/* same code that's in ipath_free_tid() */
+		limit = sizeof(tidmap) * BITS_PER_BYTE;
+		if (limit > tidcnt)
+			/* just in case size changes in future */
+			limit = tidcnt;
+		tid = find_first_bit((const unsigned long *)tidmap, limit);
+		for (; tid < limit; tid++) {
+			if (!test_bit(tid, tidmap))
+				continue;
+			if (dd->ipath_pageshadow[porttid + tid]) {
+				ipath_cdbg(VERBOSE, "Freeing TID %u\n",
+					   tid);
+				dd->ipath_f_put_tid(dd, &tidbase[tid],
+						    RCVHQ_RCV_TYPE_EXPECTED,
+						    dd->ipath_tidinvalid);
+				pci_unmap_page(dd->pcidev,
+					dd->ipath_physshadow[porttid + tid],
+					PAGE_SIZE, PCI_DMA_FROMDEVICE);
+				dd->ipath_pageshadow[porttid + tid] = NULL;
+				ipath_stats.sps_pageunlocks++;
+			}
+		}
+		ipath_release_user_pages(pagep, cnt);
+	} else {
+		/*
+		 * Copy the updated array, with ipath_tid's filled in, back
+		 * to user.  Since we did the copy in already, this "should
+		 * never fail" If it does, we have to clean up...
+		 */
+		if (copy_to_user((void __user *)
+				 (unsigned long) ti->tidlist,
+				 tidlist, cnt * sizeof(*tidlist))) {
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+				 tidmap, sizeof tidmap)) {
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (tid == tidcnt)
+			tid = 0;
+		if (!pd->port_subport_cnt)
+			pd->port_tidcursor = tid;
+		else
+			tidcursor_fp(fp) = tid;
+	}
+
+done:
+	if (ret)
+		ipath_dbg("Failed to map %u TID pages, failing with %d\n",
+			  ti->tidcnt, -ret);
+	return ret;
+}
+
+/**
+ * ipath_tid_free - free a port TID
+ * @pd: the port
+ * @subport: the subport
+ * @ti: the TID info
+ *
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.  We check that the TID is in range for this port
+ * but otherwise don't check validity; if user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted.  We do check that the TID was in use, for sanity
+ * We always use our idea of the saved address, not the address that
+ * they pass in to us.
+ */
+
+static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
+			  const struct ipath_tid_info *ti)
+{
+	int ret = 0;
+	u32 tid, porttid, cnt, limit, tidcnt;
+	struct ipath_devdata *dd = pd->port_dd;
+	u64 __iomem *tidbase;
+	unsigned long tidmap[8];
+
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+			   sizeof tidmap)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	porttid = pd->port_port * dd->ipath_rcvtidcnt;
+	if (!pd->port_subport_cnt)
+		tidcnt = dd->ipath_rcvtidcnt;
+	else if (!subport) {
+		tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+			 (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+		porttid += dd->ipath_rcvtidcnt - tidcnt;
+	} else {
+		tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+		porttid += tidcnt * (subport - 1);
+	}
+	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+				   dd->ipath_rcvtidbase +
+				   porttid * sizeof(*tidbase));
+
+	limit = sizeof(tidmap) * BITS_PER_BYTE;
+	if (limit > tidcnt)
+		/* just in case size changes in future */
+		limit = tidcnt;
+	tid = find_first_bit(tidmap, limit);
+	ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
+		   "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
+		   limit, tid, porttid);
+	for (cnt = 0; tid < limit; tid++) {
+		/*
+		 * small optimization; if we detect a run of 3 or so without
+		 * any set, use find_first_bit again.  That's mainly to
+		 * accelerate the case where we wrapped, so we have some at
+		 * the beginning, and some at the end, and a big gap
+		 * in the middle.
+		 */
+		if (!test_bit(tid, tidmap))
+			continue;
+		cnt++;
+		if (dd->ipath_pageshadow[porttid + tid]) {
+			struct page *p;
+			p = dd->ipath_pageshadow[porttid + tid];
+			dd->ipath_pageshadow[porttid + tid] = NULL;
+			ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
+				   pid_nr(pd->port_pid), tid);
+			dd->ipath_f_put_tid(dd, &tidbase[tid],
+					    RCVHQ_RCV_TYPE_EXPECTED,
+					    dd->ipath_tidinvalid);
+			pci_unmap_page(dd->pcidev,
+				dd->ipath_physshadow[porttid + tid],
+				PAGE_SIZE, PCI_DMA_FROMDEVICE);
+			ipath_release_user_pages(&p, 1);
+			ipath_stats.sps_pageunlocks++;
+		} else
+			ipath_dbg("Unused tid %u, ignoring\n", tid);
+	}
+	if (cnt != ti->tidcnt)
+		ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
+			  ti->tidcnt, cnt);
+done:
+	if (ret)
+		ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
+			  ti->tidcnt, -ret);
+	return ret;
+}
+
+/**
+ * ipath_set_part_key - set a partition key
+ * @pd: the port
+ * @key: the key
+ *
+ * We can have up to 4 active at a time (other than the default, which is
+ * always allowed).  This is somewhat tricky, since multiple ports may set
+ * the same key, so we reference count them, and clean up at exit.  All 4
+ * partition keys are packed into a single infinipath register.  It's an
+ * error for a process to set the same pkey multiple times.  We provide no
+ * mechanism to de-allocate a pkey at this time, we may eventually need to
+ * do that.  I've used the atomic operations, and no locking, and only make
+ * a single pass through what's available.  This should be more than
+ * adequate for some time. I'll think about spinlocks or the like if and as
+ * it's necessary.
+ */
+static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	int i, any = 0, pidx = -1;
+	u16 lkey = key & 0x7FFF;
+	int ret;
+
+	if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
+		/* nothing to do; this key always valid */
+		ret = 0;
+		goto bail;
+	}
+
+	ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
+		   "%hx:%x %hx:%x %hx:%x %hx:%x\n",
+		   pd->port_port, key, dd->ipath_pkeys[0],
+		   atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
+		   atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
+		   atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
+		   atomic_read(&dd->ipath_pkeyrefs[3]));
+
+	if (!lkey) {
+		ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
+			   pd->port_port);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Set the full membership bit, because it has to be
+	 * set in the register or the packet, and it seems
+	 * cleaner to set in the register than to force all
+	 * callers to set it. (see bug 4331)
+	 */
+	key |= 0x8000;
+
+	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+		if (!pd->port_pkeys[i] && pidx == -1)
+			pidx = i;
+		if (pd->port_pkeys[i] == key) {
+			ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
+				   "(%x) more than once\n",
+				   pd->port_port, key);
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (pidx == -1) {
+		ipath_dbg("All pkeys for port %u already in use, "
+			  "can't set %x\n", pd->port_port, key);
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i]) {
+			any++;
+			continue;
+		}
+		if (dd->ipath_pkeys[i] == key) {
+			atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
+
+			if (atomic_inc_return(pkrefs) > 1) {
+				pd->port_pkeys[pidx] = key;
+				ipath_cdbg(VERBOSE, "p%u set key %x "
+					   "matches #%d, count now %d\n",
+					   pd->port_port, key, i,
+					   atomic_read(pkrefs));
+				ret = 0;
+				goto bail;
+			} else {
+				/*
+				 * lost race, decrement count, catch below
+				 */
+				atomic_dec(pkrefs);
+				ipath_cdbg(VERBOSE, "Lost race, count was "
+					   "0, after dec, it's %d\n",
+					   atomic_read(pkrefs));
+				any++;
+			}
+		}
+		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+			/*
+			 * It makes no sense to have both the limited and
+			 * full membership PKEY set at the same time since
+			 * the unlimited one will disable the limited one.
+			 */
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (!any) {
+		ipath_dbg("port %u, all pkeys already in use, "
+			  "can't set %x\n", pd->port_port, key);
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i] &&
+		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+			u64 pkey;
+
+			/* for ipathstats, etc. */
+			ipath_stats.sps_pkeys[i] = lkey;
+			pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
+			pkey =
+				(u64) dd->ipath_pkeys[0] |
+				((u64) dd->ipath_pkeys[1] << 16) |
+				((u64) dd->ipath_pkeys[2] << 32) |
+				((u64) dd->ipath_pkeys[3] << 48);
+			ipath_cdbg(PROC, "p%u set key %x in #%d, "
+				   "portidx %d, new pkey reg %llx\n",
+				   pd->port_port, key, i, pidx,
+				   (unsigned long long) pkey);
+			ipath_write_kreg(
+				dd, dd->ipath_kregs->kr_partitionkey, pkey);
+
+			ret = 0;
+			goto bail;
+		}
+	}
+	ipath_dbg("port %u, all pkeys already in use 2nd pass, "
+		  "can't set %x\n", pd->port_port, key);
+	ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_manage_rcvq - manage a port's receive queue
+ * @pd: the port
+ * @subport: the subport
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the port, for use in queue
+ * overflow conditions.  start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
+			     int start_stop)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+
+	ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
+		   start_stop ? "en" : "dis", dd->ipath_unit,
+		   pd->port_port, subport);
+	if (subport)
+		goto bail;
+	/* atomically clear receive enable port. */
+	if (start_stop) {
+		/*
+		 * On enable, force in-memory copy of the tail register to
+		 * 0, so that protocol code doesn't have to worry about
+		 * whether or not the chip has yet updated the in-memory
+		 * copy or not on return from the system call. The chip
+		 * always resets it's tail register back to 0 on a
+		 * transition from disabled to enabled.  This could cause a
+		 * problem if software was broken, and did the enable w/o
+		 * the disable, but eventually the in-memory copy will be
+		 * updated and correct itself, even in the face of software
+		 * bugs.
+		 */
+		if (pd->port_rcvhdrtail_kvaddr)
+			ipath_clear_rcvhdrtail(pd);
+		set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+			&dd->ipath_rcvctrl);
+	} else
+		clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
+			  &dd->ipath_rcvctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+	/* now be sure chip saw it before we return */
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	if (start_stop) {
+		/*
+		 * And try to be sure that tail reg update has happened too.
+		 * This should in theory interlock with the RXE changes to
+		 * the tail register.  Don't assign it to the tail register
+		 * in memory copy, since we could overwrite an update by the
+		 * chip if we did.
+		 */
+		ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+	}
+	/* always; new head should be equal to new tail; see above */
+bail:
+	return 0;
+}
+
+static void ipath_clean_part_key(struct ipath_portdata *pd,
+				 struct ipath_devdata *dd)
+{
+	int i, j, pchanged = 0;
+	u64 oldpkey;
+
+	/* for debugging only */
+	oldpkey = (u64) dd->ipath_pkeys[0] |
+		((u64) dd->ipath_pkeys[1] << 16) |
+		((u64) dd->ipath_pkeys[2] << 32) |
+		((u64) dd->ipath_pkeys[3] << 48);
+
+	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+		if (!pd->port_pkeys[i])
+			continue;
+		ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
+			   pd->port_pkeys[i]);
+		for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
+			/* check for match independent of the global bit */
+			if ((dd->ipath_pkeys[j] & 0x7fff) !=
+			    (pd->port_pkeys[i] & 0x7fff))
+				continue;
+			if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
+				ipath_cdbg(VERBOSE, "p%u clear key "
+					   "%x matches #%d\n",
+					   pd->port_port,
+					   pd->port_pkeys[i], j);
+				ipath_stats.sps_pkeys[j] =
+					dd->ipath_pkeys[j] = 0;
+				pchanged++;
+			}
+			else ipath_cdbg(
+				VERBOSE, "p%u key %x matches #%d, "
+				"but ref still %d\n", pd->port_port,
+				pd->port_pkeys[i], j,
+				atomic_read(&dd->ipath_pkeyrefs[j]));
+			break;
+		}
+		pd->port_pkeys[i] = 0;
+	}
+	if (pchanged) {
+		u64 pkey = (u64) dd->ipath_pkeys[0] |
+			((u64) dd->ipath_pkeys[1] << 16) |
+			((u64) dd->ipath_pkeys[2] << 32) |
+			((u64) dd->ipath_pkeys[3] << 48);
+		ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
+			   "new pkey reg %llx\n", pd->port_port,
+			   (unsigned long long) oldpkey,
+			   (unsigned long long) pkey);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+				 pkey);
+	}
+}
+
+/*
+ * Initialize the port data with the receive buffer sizes
+ * so this can be done while the master port is locked.
+ * Otherwise, there is a race with a slave opening the port
+ * and seeing these fields uninitialized.
+ */
+static void init_user_egr_sizes(struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned egrperchunk, egrcnt, size;
+
+	/*
+	 * to avoid wasting a lot of memory, we allocate 32KB chunks of
+	 * physically contiguous memory, advance through it until used up
+	 * and then allocate more.  Of course, we need memory to store those
+	 * extra pointers, now.  Started out with 256KB, but under heavy
+	 * memory pressure (creating large files and then copying them over
+	 * NFS while doing lots of MPI jobs), we hit some allocation
+	 * failures, even though we can sleep...  (2.6.10) Still get
+	 * failures at 64K.  32K is the lowest we can go without wasting
+	 * additional memory.
+	 */
+	size = 0x8000;
+	egrperchunk = size / dd->ipath_rcvegrbufsize;
+	egrcnt = dd->ipath_rcvegrcnt;
+	pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
+	pd->port_rcvegrbufs_perchunk = egrperchunk;
+	pd->port_rcvegrbuf_size = size;
+}
+
+/**
+ * ipath_create_user_egr - allocate eager TID buffers
+ * @pd: the port to allocate TID buffers for
+ *
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance
+ * This is the user port version
+ *
+ * Allocate the eager TID buffers and program them into infinipath
+ * They are no longer completely contiguous, we do multiple allocation
+ * calls.
+ */
+static int ipath_create_user_egr(struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
+	size_t size;
+	int ret;
+	gfp_t gfp_flags;
+
+	/*
+	 * GFP_USER, but without GFP_FS, so buffer cache can be
+	 * coalesced (we hope); otherwise, even at order 4,
+	 * heavy filesystem activity makes these fail, and we can
+	 * use compound pages.
+	 */
+	gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+	egrcnt = dd->ipath_rcvegrcnt;
+	/* TID number offset for this port */
+	egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
+	egrsize = dd->ipath_rcvegrbufsize;
+	ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
+		   "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
+
+	chunk = pd->port_rcvegrbuf_chunks;
+	egrperchunk = pd->port_rcvegrbufs_perchunk;
+	size = pd->port_rcvegrbuf_size;
+	pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
+				     GFP_KERNEL);
+	if (!pd->port_rcvegrbuf) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	pd->port_rcvegrbuf_phys =
+		kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
+			GFP_KERNEL);
+	if (!pd->port_rcvegrbuf_phys) {
+		ret = -ENOMEM;
+		goto bail_rcvegrbuf;
+	}
+	for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+
+		pd->port_rcvegrbuf[e] = dma_alloc_coherent(
+			&dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
+			gfp_flags);
+
+		if (!pd->port_rcvegrbuf[e]) {
+			ret = -ENOMEM;
+			goto bail_rcvegrbuf_phys;
+		}
+	}
+
+	pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
+
+	for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
+		dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
+		unsigned i;
+
+		for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+			dd->ipath_f_put_tid(dd, e + egroff +
+					    (u64 __iomem *)
+					    ((char __iomem *)
+					     dd->ipath_kregbase +
+					     dd->ipath_rcvegrbase),
+					    RCVHQ_RCV_TYPE_EAGER, pa);
+			pa += egrsize;
+		}
+		cond_resched();	/* don't hog the cpu */
+	}
+
+	ret = 0;
+	goto bail;
+
+bail_rcvegrbuf_phys:
+	for (e = 0; e < pd->port_rcvegrbuf_chunks &&
+		pd->port_rcvegrbuf[e]; e++) {
+		dma_free_coherent(&dd->pcidev->dev, size,
+				  pd->port_rcvegrbuf[e],
+				  pd->port_rcvegrbuf_phys[e]);
+
+	}
+	kfree(pd->port_rcvegrbuf_phys);
+	pd->port_rcvegrbuf_phys = NULL;
+bail_rcvegrbuf:
+	kfree(pd->port_rcvegrbuf);
+	pd->port_rcvegrbuf = NULL;
+bail:
+	return ret;
+}
+
+
+/* common code for the mappings on dma_alloc_coherent mem */
+static int ipath_mmap_mem(struct vm_area_struct *vma,
+	struct ipath_portdata *pd, unsigned len, int write_ok,
+	void *kvaddr, char *what)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned long pfn;
+	int ret;
+
+	if ((vma->vm_end - vma->vm_start) > len) {
+		dev_info(&dd->pcidev->dev,
+		         "FAIL on %s: len %lx > %x\n", what,
+			 vma->vm_end - vma->vm_start, len);
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	if (!write_ok) {
+		if (vma->vm_flags & VM_WRITE) {
+			dev_info(&dd->pcidev->dev,
+				 "%s must be mapped readonly\n", what);
+			ret = -EPERM;
+			goto bail;
+		}
+
+		/* don't allow them to later change with mprotect */
+		vma->vm_flags &= ~VM_MAYWRITE;
+	}
+
+	pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
+	ret = remap_pfn_range(vma, vma->vm_start, pfn,
+			      len, vma->vm_page_prot);
+	if (ret)
+		dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
+			 "bytes r%c failed: %d\n", what, pd->port_port,
+			 pfn, len, write_ok?'w':'o', ret);
+	else
+		ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
+			   "r%c\n", what, pd->port_port, pfn, len,
+			   write_ok?'w':'o');
+bail:
+	return ret;
+}
+
+static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
+		     u64 ureg)
+{
+	unsigned long phys;
+	int ret;
+
+	/*
+	 * This is real hardware, so use io_remap.  This is the mechanism
+	 * for the user process to update the head registers for their port
+	 * in the chip.
+	 */
+	if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+		dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
+			 "%lx > PAGE\n", vma->vm_end - vma->vm_start);
+		ret = -EFAULT;
+	} else {
+		phys = dd->ipath_physaddr + ureg;
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 phys >> PAGE_SHIFT,
+					 vma->vm_end - vma->vm_start,
+					 vma->vm_page_prot);
+	}
+	return ret;
+}
+
+static int mmap_piobufs(struct vm_area_struct *vma,
+			struct ipath_devdata *dd,
+			struct ipath_portdata *pd,
+			unsigned piobufs, unsigned piocnt)
+{
+	unsigned long phys;
+	int ret;
+
+	/*
+	 * When we map the PIO buffers in the chip, we want to map them as
+	 * writeonly, no read possible.   This prevents access to previous
+	 * process data, and catches users who might try to read the i/o
+	 * space due to a bug.
+	 */
+	if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
+		dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
+			 "reqlen %lx > PAGE\n",
+			 vma->vm_end - vma->vm_start);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	phys = dd->ipath_physaddr + piobufs;
+
+#if defined(__powerpc__)
+	/* There isn't a generic way to specify writethrough mappings */
+	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
+	pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
+	pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+#endif
+
+	/*
+	 * don't allow them to later change to readable with mprotect (for when
+	 * not initially mapped readable, as is normally the case)
+	 */
+	vma->vm_flags &= ~VM_MAYREAD;
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+	ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
+				 vma->vm_end - vma->vm_start,
+				 vma->vm_page_prot);
+bail:
+	return ret;
+}
+
+static int mmap_rcvegrbufs(struct vm_area_struct *vma,
+			   struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned long start, size;
+	size_t total_size, i;
+	unsigned long pfn;
+	int ret;
+
+	size = pd->port_rcvegrbuf_size;
+	total_size = pd->port_rcvegrbuf_chunks * size;
+	if ((vma->vm_end - vma->vm_start) > total_size) {
+		dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
+			 "reqlen %lx > actual %lx\n",
+			 vma->vm_end - vma->vm_start,
+			 (unsigned long) total_size);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (vma->vm_flags & VM_WRITE) {
+		dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
+			 "writable (flags=%lx)\n", vma->vm_flags);
+		ret = -EPERM;
+		goto bail;
+	}
+	/* don't allow them to later change to writeable with mprotect */
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	start = vma->vm_start;
+
+	for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
+		pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
+		ret = remap_pfn_range(vma, start, pfn, size,
+				      vma->vm_page_prot);
+		if (ret < 0)
+			goto bail;
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/*
+ * ipath_file_vma_fault - handle a VMA page fault.
+ */
+static int ipath_file_vma_fault(struct vm_area_struct *vma,
+					struct vm_fault *vmf)
+{
+	struct page *page;
+
+	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+	if (!page)
+		return VM_FAULT_SIGBUS;
+	get_page(page);
+	vmf->page = page;
+
+	return 0;
+}
+
+static const struct vm_operations_struct ipath_file_vm_ops = {
+	.fault = ipath_file_vma_fault,
+};
+
+static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
+		       struct ipath_portdata *pd, unsigned subport)
+{
+	unsigned long len;
+	struct ipath_devdata *dd;
+	void *addr;
+	size_t size;
+	int ret = 0;
+
+	/* If the port is not shared, all addresses should be physical */
+	if (!pd->port_subport_cnt)
+		goto bail;
+
+	dd = pd->port_dd;
+	size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+
+	/*
+	 * Each process has all the subport uregbase, rcvhdrq, and
+	 * rcvegrbufs mmapped - as an array for all the processes,
+	 * and also separately for this process.
+	 */
+	if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
+		addr = pd->subport_uregbase;
+		size = PAGE_SIZE * pd->port_subport_cnt;
+	} else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
+		addr = pd->subport_rcvhdr_base;
+		size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
+	} else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
+		addr = pd->subport_rcvegrbuf;
+		size *= pd->port_subport_cnt;
+        } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
+                                        PAGE_SIZE * subport)) {
+                addr = pd->subport_uregbase + PAGE_SIZE * subport;
+                size = PAGE_SIZE;
+        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
+                                pd->port_rcvhdrq_size * subport)) {
+                addr = pd->subport_rcvhdr_base +
+                        pd->port_rcvhdrq_size * subport;
+                size = pd->port_rcvhdrq_size;
+        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
+                               size * subport)) {
+                addr = pd->subport_rcvegrbuf + size * subport;
+                /* rcvegrbufs are read-only on the slave */
+                if (vma->vm_flags & VM_WRITE) {
+                        dev_info(&dd->pcidev->dev,
+                                 "Can't map eager buffers as "
+                                 "writable (flags=%lx)\n", vma->vm_flags);
+                        ret = -EPERM;
+                        goto bail;
+                }
+                /*
+                 * Don't allow permission to later change to writeable
+                 * with mprotect.
+                 */
+                vma->vm_flags &= ~VM_MAYWRITE;
+	} else {
+		goto bail;
+	}
+	len = vma->vm_end - vma->vm_start;
+	if (len > size) {
+		ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
+	vma->vm_ops = &ipath_file_vm_ops;
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_mmap - mmap various structures into user space
+ * @fp: the file pointer
+ * @vma: the VM area
+ *
+ * We use this to have a shared buffer between the kernel and the user code
+ * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
+ * buffers in the chip.  We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct ipath_portdata *pd;
+	struct ipath_devdata *dd;
+	u64 pgaddr, ureg;
+	unsigned piobufs, piocnt;
+	int ret;
+
+	pd = port_fp(fp);
+	if (!pd) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	dd = pd->port_dd;
+
+	/*
+	 * This is the ipath_do_user_init() code, mapping the shared buffers
+	 * into the user process. The address referred to by vm_pgoff is the
+	 * file offset passed via mmap().  For shared ports, this is the
+	 * kernel vmalloc() address of the pages to share with the master.
+	 * For non-shared or master ports, this is a physical address.
+	 * We only do one mmap for each space mapped.
+	 */
+	pgaddr = vma->vm_pgoff << PAGE_SHIFT;
+
+	/*
+	 * Check for 0 in case one of the allocations failed, but user
+	 * called mmap anyway.
+	 */
+	if (!pgaddr)  {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
+		   (unsigned long long) pgaddr, vma->vm_start,
+		   vma->vm_end - vma->vm_start, dd->ipath_unit,
+		   pd->port_port, subport_fp(fp));
+
+	/*
+	 * Physical addresses must fit in 40 bits for our hardware.
+	 * Check for kernel virtual addresses first, anything else must
+	 * match a HW or memory address.
+	 */
+	ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
+	if (ret) {
+		if (ret > 0)
+			ret = 0;
+		goto bail;
+	}
+
+	ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
+	if (!pd->port_subport_cnt) {
+		/* port is not shared */
+		piocnt = pd->port_piocnt;
+		piobufs = pd->port_piobufs;
+	} else if (!subport_fp(fp)) {
+		/* caller is the master */
+		piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
+			 (pd->port_piocnt % pd->port_subport_cnt);
+		piobufs = pd->port_piobufs +
+			dd->ipath_palign * (pd->port_piocnt - piocnt);
+	} else {
+		unsigned slave = subport_fp(fp) - 1;
+
+		/* caller is a slave */
+		piocnt = pd->port_piocnt / pd->port_subport_cnt;
+		piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
+	}
+
+	if (pgaddr == ureg)
+		ret = mmap_ureg(vma, dd, ureg);
+	else if (pgaddr == piobufs)
+		ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
+	else if (pgaddr == dd->ipath_pioavailregs_phys)
+		/* in-memory copy of pioavail registers */
+		ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+			      	     (void *) dd->ipath_pioavailregs_dma,
+				     "pioavail registers");
+	else if (pgaddr == pd->port_rcvegr_phys)
+		ret = mmap_rcvegrbufs(vma, pd);
+	else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
+		/*
+		 * The rcvhdrq itself; readonly except on HT (so have
+		 * to allow writable mapping), multiple pages, contiguous
+		 * from an i/o perspective.
+		 */
+		ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
+				     pd->port_rcvhdrq,
+				     "rcvhdrq");
+	else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
+		/* in-memory copy of rcvhdrq tail register */
+		ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+				     pd->port_rcvhdrtail_kvaddr,
+				     "rcvhdrq tail");
+	else
+		ret = -EINVAL;
+
+	vma->vm_private_data = NULL;
+
+	if (ret < 0)
+		dev_info(&dd->pcidev->dev,
+			 "Failure %d on off %llx len %lx\n",
+			 -ret, (unsigned long long)pgaddr,
+			 vma->vm_end - vma->vm_start);
+bail:
+	return ret;
+}
+
+static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
+{
+	unsigned pollflag = 0;
+
+	if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
+	    pd->port_hdrqfull != pd->port_hdrqfull_poll) {
+		pollflag |= POLLIN | POLLRDNORM;
+		pd->port_hdrqfull_poll = pd->port_hdrqfull;
+	}
+
+	return pollflag;
+}
+
+static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
+				      struct file *fp,
+				      struct poll_table_struct *pt)
+{
+	unsigned pollflag = 0;
+	struct ipath_devdata *dd;
+
+	dd = pd->port_dd;
+
+	/* variable access in ipath_poll_hdrqfull() needs this */
+	rmb();
+	pollflag = ipath_poll_hdrqfull(pd);
+
+	if (pd->port_urgent != pd->port_urgent_poll) {
+		pollflag |= POLLIN | POLLRDNORM;
+		pd->port_urgent_poll = pd->port_urgent;
+	}
+
+	if (!pollflag) {
+		/* this saves a spin_lock/unlock in interrupt handler... */
+		set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
+		/* flush waiting flag so don't miss an event... */
+		wmb();
+		poll_wait(fp, &pd->port_wait, pt);
+	}
+
+	return pollflag;
+}
+
+static unsigned int ipath_poll_next(struct ipath_portdata *pd,
+				    struct file *fp,
+				    struct poll_table_struct *pt)
+{
+	u32 head;
+	u32 tail;
+	unsigned pollflag = 0;
+	struct ipath_devdata *dd;
+
+	dd = pd->port_dd;
+
+	/* variable access in ipath_poll_hdrqfull() needs this */
+	rmb();
+	pollflag = ipath_poll_hdrqfull(pd);
+
+	head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
+	if (pd->port_rcvhdrtail_kvaddr)
+		tail = ipath_get_rcvhdrtail(pd);
+	else
+		tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+
+	if (head != tail)
+		pollflag |= POLLIN | POLLRDNORM;
+	else {
+		/* this saves a spin_lock/unlock in interrupt handler */
+		set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
+		/* flush waiting flag so we don't miss an event */
+		wmb();
+
+		set_bit(pd->port_port + dd->ipath_r_intravail_shift,
+			&dd->ipath_rcvctrl);
+
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+				 dd->ipath_rcvctrl);
+
+		if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
+			ipath_write_ureg(dd, ur_rcvhdrhead,
+					 dd->ipath_rhdrhead_intr_off | head,
+					 pd->port_port);
+
+		poll_wait(fp, &pd->port_wait, pt);
+	}
+
+	return pollflag;
+}
+
+static unsigned int ipath_poll(struct file *fp,
+			       struct poll_table_struct *pt)
+{
+	struct ipath_portdata *pd;
+	unsigned pollflag;
+
+	pd = port_fp(fp);
+	if (!pd)
+		pollflag = 0;
+	else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
+		pollflag = ipath_poll_urgent(pd, fp, pt);
+	else
+		pollflag = ipath_poll_next(pd, fp, pt);
+
+	return pollflag;
+}
+
+static int ipath_supports_subports(int user_swmajor, int user_swminor)
+{
+	/* no subport implementation prior to software version 1.3 */
+	return (user_swmajor > 1) || (user_swminor >= 3);
+}
+
+static int ipath_compatible_subports(int user_swmajor, int user_swminor)
+{
+	/* this code is written long-hand for clarity */
+	if (IPATH_USER_SWMAJOR != user_swmajor) {
+		/* no promise of compatibility if major mismatch */
+		return 0;
+	}
+	if (IPATH_USER_SWMAJOR == 1) {
+		switch (IPATH_USER_SWMINOR) {
+		case 0:
+		case 1:
+		case 2:
+			/* no subport implementation so cannot be compatible */
+			return 0;
+		case 3:
+			/* 3 is only compatible with itself */
+			return user_swminor == 3;
+		default:
+			/* >= 4 are compatible (or are expected to be) */
+			return user_swminor >= 4;
+		}
+	}
+	/* make no promises yet for future major versions */
+	return 0;
+}
+
+static int init_subports(struct ipath_devdata *dd,
+			 struct ipath_portdata *pd,
+			 const struct ipath_user_info *uinfo)
+{
+	int ret = 0;
+	unsigned num_subports;
+	size_t size;
+
+	/*
+	 * If the user is requesting zero subports,
+	 * skip the subport allocation.
+	 */
+	if (uinfo->spu_subport_cnt <= 0)
+		goto bail;
+
+	/* Self-consistency check for ipath_compatible_subports() */
+	if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
+	    !ipath_compatible_subports(IPATH_USER_SWMAJOR,
+				       IPATH_USER_SWMINOR)) {
+		dev_info(&dd->pcidev->dev,
+			 "Inconsistent ipath_compatible_subports()\n");
+		goto bail;
+	}
+
+	/* Check for subport compatibility */
+	if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
+				       uinfo->spu_userversion & 0xffff)) {
+		dev_info(&dd->pcidev->dev,
+			 "Mismatched user version (%d.%d) and driver "
+			 "version (%d.%d) while port sharing. Ensure "
+                         "that driver and library are from the same "
+                         "release.\n",
+			 (int) (uinfo->spu_userversion >> 16),
+                         (int) (uinfo->spu_userversion & 0xffff),
+			 IPATH_USER_SWMAJOR,
+	                 IPATH_USER_SWMINOR);
+		goto bail;
+	}
+	if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	num_subports = uinfo->spu_subport_cnt;
+	pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
+	if (!pd->subport_uregbase) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	/* Note: pd->port_rcvhdrq_size isn't initialized yet. */
+	size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+		     sizeof(u32), PAGE_SIZE) * num_subports;
+	pd->subport_rcvhdr_base = vzalloc(size);
+	if (!pd->subport_rcvhdr_base) {
+		ret = -ENOMEM;
+		goto bail_ureg;
+	}
+
+	pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
+					pd->port_rcvegrbuf_size *
+					num_subports);
+	if (!pd->subport_rcvegrbuf) {
+		ret = -ENOMEM;
+		goto bail_rhdr;
+	}
+
+	pd->port_subport_cnt = uinfo->spu_subport_cnt;
+	pd->port_subport_id = uinfo->spu_subport_id;
+	pd->active_slaves = 1;
+	set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+	goto bail;
+
+bail_rhdr:
+	vfree(pd->subport_rcvhdr_base);
+bail_ureg:
+	vfree(pd->subport_uregbase);
+	pd->subport_uregbase = NULL;
+bail:
+	return ret;
+}
+
+static int try_alloc_port(struct ipath_devdata *dd, int port,
+			  struct file *fp,
+			  const struct ipath_user_info *uinfo)
+{
+	struct ipath_portdata *pd;
+	int ret;
+
+	if (!(pd = dd->ipath_pd[port])) {
+		void *ptmp;
+
+		pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
+
+		/*
+		 * Allocate memory for use in ipath_tid_update() just once
+		 * at open, not per call.  Reduces cost of expected send
+		 * setup.
+		 */
+		ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
+			       dd->ipath_rcvtidcnt * sizeof(struct page **),
+			       GFP_KERNEL);
+		if (!pd || !ptmp) {
+			ipath_dev_err(dd, "Unable to allocate portdata "
+				      "memory, failing open\n");
+			ret = -ENOMEM;
+			kfree(pd);
+			kfree(ptmp);
+			goto bail;
+		}
+		dd->ipath_pd[port] = pd;
+		dd->ipath_pd[port]->port_port = port;
+		dd->ipath_pd[port]->port_dd = dd;
+		dd->ipath_pd[port]->port_tid_pg_list = ptmp;
+		init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
+	}
+	if (!pd->port_cnt) {
+		pd->userversion = uinfo->spu_userversion;
+		init_user_egr_sizes(pd);
+		if ((ret = init_subports(dd, pd, uinfo)) != 0)
+			goto bail;
+		ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
+			   current->comm, current->pid, dd->ipath_unit,
+			   port);
+		pd->port_cnt = 1;
+		port_fp(fp) = pd;
+		pd->port_pid = get_pid(task_pid(current));
+		strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
+		ipath_stats.sps_ports++;
+		ret = 0;
+	} else
+		ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+static inline int usable(struct ipath_devdata *dd)
+{
+	return dd &&
+		(dd->ipath_flags & IPATH_PRESENT) &&
+		dd->ipath_kregbase &&
+		dd->ipath_lid &&
+		!(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
+				     | IPATH_LINKUNK));
+}
+
+static int find_free_port(int unit, struct file *fp,
+			  const struct ipath_user_info *uinfo)
+{
+	struct ipath_devdata *dd = ipath_lookup(unit);
+	int ret, i;
+
+	if (!dd) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	if (!usable(dd)) {
+		ret = -ENETDOWN;
+		goto bail;
+	}
+
+	for (i = 1; i < dd->ipath_cfgports; i++) {
+		ret = try_alloc_port(dd, i, fp, uinfo);
+		if (ret != -EBUSY)
+			goto bail;
+	}
+	ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+static int find_best_unit(struct file *fp,
+			  const struct ipath_user_info *uinfo)
+{
+	int ret = 0, i, prefunit = -1, devmax;
+	int maxofallports, npresent, nup;
+	int ndev;
+
+	devmax = ipath_count_units(&npresent, &nup, &maxofallports);
+
+	/*
+	 * This code is present to allow a knowledgeable person to
+	 * specify the layout of processes to processors before opening
+	 * this driver, and then we'll assign the process to the "closest"
+	 * InfiniPath chip to that processor (we assume reasonable connectivity,
+	 * for now).  This code assumes that if affinity has been set
+	 * before this point, that at most one cpu is set; for now this
+	 * is reasonable.  I check for both cpumask_empty() and cpumask_full(),
+	 * in case some kernel variant sets none of the bits when no
+	 * affinity is set.  2.6.11 and 12 kernels have all present
+	 * cpus set.  Some day we'll have to fix it up further to handle
+	 * a cpu subset.  This algorithm fails for two HT chips connected
+	 * in tunnel fashion.  Eventually this needs real topology
+	 * information.  There may be some issues with dual core numbering
+	 * as well.  This needs more work prior to release.
+	 */
+	if (!cpumask_empty(tsk_cpus_allowed(current)) &&
+	    !cpumask_full(tsk_cpus_allowed(current))) {
+		int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
+		get_online_cpus();
+		for_each_online_cpu(i)
+			if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
+				ipath_cdbg(PROC, "%s[%u] affinity set for "
+					   "cpu %d/%d\n", current->comm,
+					   current->pid, i, ncpus);
+				curcpu = i;
+				nset++;
+			}
+		put_online_cpus();
+		if (curcpu != -1 && nset != ncpus) {
+			if (npresent) {
+				prefunit = curcpu / (ncpus / npresent);
+				ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
+					  "%d cpus/chip, select unit %d\n",
+					  current->comm, current->pid,
+					  npresent, ncpus, ncpus / npresent,
+					  prefunit);
+			}
+		}
+	}
+
+	/*
+	 * user ports start at 1, kernel port is 0
+	 * For now, we do round-robin access across all chips
+	 */
+
+	if (prefunit != -1)
+		devmax = prefunit + 1;
+recheck:
+	for (i = 1; i < maxofallports; i++) {
+		for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
+		     ndev++) {
+			struct ipath_devdata *dd = ipath_lookup(ndev);
+
+			if (!usable(dd))
+				continue; /* can't use this unit */
+			if (i >= dd->ipath_cfgports)
+				/*
+				 * Maxed out on users of this unit. Try
+				 * next.
+				 */
+				continue;
+			ret = try_alloc_port(dd, i, fp, uinfo);
+			if (!ret)
+				goto done;
+		}
+	}
+
+	if (npresent) {
+		if (nup == 0) {
+			ret = -ENETDOWN;
+			ipath_dbg("No ports available (none initialized "
+				  "and ready)\n");
+		} else {
+			if (prefunit > 0) {
+				/* if started above 0, retry from 0 */
+				ipath_cdbg(PROC,
+					   "%s[%u] no ports on prefunit "
+					   "%d, clear and re-check\n",
+					   current->comm, current->pid,
+					   prefunit);
+				devmax = ipath_count_units(NULL, NULL,
+							   NULL);
+				prefunit = -1;
+				goto recheck;
+			}
+			ret = -EBUSY;
+			ipath_dbg("No ports available\n");
+		}
+	} else {
+		ret = -ENXIO;
+		ipath_dbg("No boards found\n");
+	}
+
+done:
+	return ret;
+}
+
+static int find_shared_port(struct file *fp,
+			    const struct ipath_user_info *uinfo)
+{
+	int devmax, ndev, i;
+	int ret = 0;
+
+	devmax = ipath_count_units(NULL, NULL, NULL);
+
+	for (ndev = 0; ndev < devmax; ndev++) {
+		struct ipath_devdata *dd = ipath_lookup(ndev);
+
+		if (!usable(dd))
+			continue;
+		for (i = 1; i < dd->ipath_cfgports; i++) {
+			struct ipath_portdata *pd = dd->ipath_pd[i];
+
+			/* Skip ports which are not yet open */
+			if (!pd || !pd->port_cnt)
+				continue;
+			/* Skip port if it doesn't match the requested one */
+			if (pd->port_subport_id != uinfo->spu_subport_id)
+				continue;
+			/* Verify the sharing process matches the master */
+			if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
+			    pd->userversion != uinfo->spu_userversion ||
+			    pd->port_cnt >= pd->port_subport_cnt) {
+				ret = -EINVAL;
+				goto done;
+			}
+			port_fp(fp) = pd;
+			subport_fp(fp) = pd->port_cnt++;
+			pd->port_subpid[subport_fp(fp)] =
+				get_pid(task_pid(current));
+			tidcursor_fp(fp) = 0;
+			pd->active_slaves |= 1 << subport_fp(fp);
+			ipath_cdbg(PROC,
+				   "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
+				   current->comm, current->pid,
+				   subport_fp(fp),
+				   pd->port_comm, pid_nr(pd->port_pid),
+				   dd->ipath_unit, pd->port_port);
+			ret = 1;
+			goto done;
+		}
+	}
+
+done:
+	return ret;
+}
+
+static int ipath_open(struct inode *in, struct file *fp)
+{
+	/* The real work is performed later in ipath_assign_port() */
+	fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
+	return fp->private_data ? 0 : -ENOMEM;
+}
+
+/* Get port early, so can set affinity prior to memory allocation */
+static int ipath_assign_port(struct file *fp,
+			      const struct ipath_user_info *uinfo)
+{
+	int ret;
+	int i_minor;
+	unsigned swmajor, swminor;
+
+	/* Check to be sure we haven't already initialized this file */
+	if (port_fp(fp)) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	/* for now, if major version is different, bail */
+	swmajor = uinfo->spu_userversion >> 16;
+	if (swmajor != IPATH_USER_SWMAJOR) {
+		ipath_dbg("User major version %d not same as driver "
+			  "major %d\n", uinfo->spu_userversion >> 16,
+			  IPATH_USER_SWMAJOR);
+		ret = -ENODEV;
+		goto done;
+	}
+
+	swminor = uinfo->spu_userversion & 0xffff;
+	if (swminor != IPATH_USER_SWMINOR)
+		ipath_dbg("User minor version %d not same as driver "
+			  "minor %d\n", swminor, IPATH_USER_SWMINOR);
+
+	mutex_lock(&ipath_mutex);
+
+	if (ipath_compatible_subports(swmajor, swminor) &&
+	    uinfo->spu_subport_cnt &&
+	    (ret = find_shared_port(fp, uinfo))) {
+		if (ret > 0)
+			ret = 0;
+		goto done_chk_sdma;
+	}
+
+	i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE;
+	ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
+		   (long)file_inode(fp)->i_rdev, i_minor);
+
+	if (i_minor)
+		ret = find_free_port(i_minor - 1, fp, uinfo);
+	else
+		ret = find_best_unit(fp, uinfo);
+
+done_chk_sdma:
+	if (!ret) {
+		struct ipath_filedata *fd = fp->private_data;
+		const struct ipath_portdata *pd = fd->pd;
+		const struct ipath_devdata *dd = pd->port_dd;
+
+		fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
+						      dd->ipath_unit,
+						      pd->port_port,
+						      fd->subport);
+
+		if (!fd->pq)
+			ret = -ENOMEM;
+	}
+
+	mutex_unlock(&ipath_mutex);
+
+done:
+	return ret;
+}
+
+
+static int ipath_do_user_init(struct file *fp,
+			      const struct ipath_user_info *uinfo)
+{
+	int ret;
+	struct ipath_portdata *pd = port_fp(fp);
+	struct ipath_devdata *dd;
+	u32 head32;
+
+	/* Subports don't need to initialize anything since master did it. */
+	if (subport_fp(fp)) {
+		ret = wait_event_interruptible(pd->port_wait,
+			!test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
+		goto done;
+	}
+
+	dd = pd->port_dd;
+
+	if (uinfo->spu_rcvhdrsize) {
+		ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
+		if (ret)
+			goto done;
+	}
+
+	/* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
+
+	/* some ports may get extra buffers, calculate that here */
+	if (pd->port_port <= dd->ipath_ports_extrabuf)
+		pd->port_piocnt = dd->ipath_pbufsport + 1;
+	else
+		pd->port_piocnt = dd->ipath_pbufsport;
+
+	/* for right now, kernel piobufs are at end, so port 1 is at 0 */
+	if (pd->port_port <= dd->ipath_ports_extrabuf)
+		pd->port_pio_base = (dd->ipath_pbufsport + 1)
+			* (pd->port_port - 1);
+	else
+		pd->port_pio_base = dd->ipath_ports_extrabuf +
+			dd->ipath_pbufsport * (pd->port_port - 1);
+	pd->port_piobufs = dd->ipath_piobufbase +
+		pd->port_pio_base * dd->ipath_palign;
+	ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
+		" first pio %u\n", pd->port_port, pd->port_piobufs,
+		pd->port_piocnt, pd->port_pio_base);
+	ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
+
+	/*
+	 * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+	 * array for time being.  If pd->port_port > chip-supported,
+	 * we need to do extra stuff here to handle by handling overflow
+	 * through port 0, someday
+	 */
+	ret = ipath_create_rcvhdrq(dd, pd);
+	if (!ret)
+		ret = ipath_create_user_egr(pd);
+	if (ret)
+		goto done;
+
+	/*
+	 * set the eager head register for this port to the current values
+	 * of the tail pointers, since we don't know if they were
+	 * updated on last use of the port.
+	 */
+	head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
+	ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
+	pd->port_lastrcvhdrqtail = -1;
+	ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
+		pd->port_port, head32);
+	pd->port_tidcursor = 0;	/* start at beginning after open */
+
+	/* initialize poll variables... */
+	pd->port_urgent = 0;
+	pd->port_urgent_poll = 0;
+	pd->port_hdrqfull_poll = pd->port_hdrqfull;
+
+	/*
+	 * Now enable the port for receive.
+	 * For chips that are set to DMA the tail register to memory
+	 * when they change (and when the update bit transitions from
+	 * 0 to 1.  So for those chips, we turn it off and then back on.
+	 * This will (very briefly) affect any other open ports, but the
+	 * duration is very short, and therefore isn't an issue.  We
+	 * explicitly set the in-memory tail copy to 0 beforehand, so we
+	 * don't have to wait to be sure the DMA update has happened
+	 * (chip resets head/tail to 0 on transition to enable).
+	 */
+	set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+		&dd->ipath_rcvctrl);
+	if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+		if (pd->port_rcvhdrtail_kvaddr)
+			ipath_clear_rcvhdrtail(pd);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			dd->ipath_rcvctrl &
+			~(1ULL << dd->ipath_r_tailupd_shift));
+	}
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+	/* Notify any waiting slaves */
+	if (pd->port_subport_cnt) {
+		clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+		wake_up(&pd->port_wait);
+	}
+done:
+	return ret;
+}
+
+/**
+ * unlock_exptid - unlock any expected TID entries port still had in use
+ * @pd: port
+ *
+ * We don't actually update the chip here, because we do a bulk update
+ * below, using ipath_f_clear_tids.
+ */
+static void unlock_expected_tids(struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
+	int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+
+	ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
+		   pd->port_port);
+	for (i = port_tidbase; i < maxtid; i++) {
+		struct page *ps = dd->ipath_pageshadow[i];
+
+		if (!ps)
+			continue;
+
+		dd->ipath_pageshadow[i] = NULL;
+		pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
+			PAGE_SIZE, PCI_DMA_FROMDEVICE);
+		ipath_release_user_pages_on_close(&ps, 1);
+		cnt++;
+		ipath_stats.sps_pageunlocks++;
+	}
+	if (cnt)
+		ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
+			   pd->port_port, cnt);
+
+	if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
+		ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
+			   (unsigned long long) ipath_stats.sps_pagelocks,
+			   (unsigned long long)
+			   ipath_stats.sps_pageunlocks);
+}
+
+static int ipath_close(struct inode *in, struct file *fp)
+{
+	int ret = 0;
+	struct ipath_filedata *fd;
+	struct ipath_portdata *pd;
+	struct ipath_devdata *dd;
+	unsigned long flags;
+	unsigned port;
+	struct pid *pid;
+
+	ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
+		   (long)in->i_rdev, fp->private_data);
+
+	mutex_lock(&ipath_mutex);
+
+	fd = fp->private_data;
+	fp->private_data = NULL;
+	pd = fd->pd;
+	if (!pd) {
+		mutex_unlock(&ipath_mutex);
+		goto bail;
+	}
+
+	dd = pd->port_dd;
+
+	/* drain user sdma queue */
+	ipath_user_sdma_queue_drain(dd, fd->pq);
+	ipath_user_sdma_queue_destroy(fd->pq);
+
+	if (--pd->port_cnt) {
+		/*
+		 * XXX If the master closes the port before the slave(s),
+		 * revoke the mmap for the eager receive queue so
+		 * the slave(s) don't wait for receive data forever.
+		 */
+		pd->active_slaves &= ~(1 << fd->subport);
+		put_pid(pd->port_subpid[fd->subport]);
+		pd->port_subpid[fd->subport] = NULL;
+		mutex_unlock(&ipath_mutex);
+		goto bail;
+	}
+	/* early; no interrupt users after this */
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	port = pd->port_port;
+	dd->ipath_pd[port] = NULL;
+	pid = pd->port_pid;
+	pd->port_pid = NULL;
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+	if (pd->port_rcvwait_to || pd->port_piowait_to
+	    || pd->port_rcvnowait || pd->port_pionowait) {
+		ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
+			   "%u rcv %u, pio already\n",
+			   pd->port_port, pd->port_rcvwait_to,
+			   pd->port_piowait_to, pd->port_rcvnowait,
+			   pd->port_pionowait);
+		pd->port_rcvwait_to = pd->port_piowait_to =
+			pd->port_rcvnowait = pd->port_pionowait = 0;
+	}
+	if (pd->port_flag) {
+		ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
+			  pd->port_port, pd->port_flag);
+		pd->port_flag = 0;
+	}
+
+	if (dd->ipath_kregbase) {
+		/* atomically clear receive enable port and intr avail. */
+		clear_bit(dd->ipath_r_portenable_shift + port,
+			  &dd->ipath_rcvctrl);
+		clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
+			  &dd->ipath_rcvctrl);
+		ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
+			dd->ipath_rcvctrl);
+		/* and read back from chip to be sure that nothing
+		 * else is in flight when we do the rest */
+		(void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+		/* clean up the pkeys for this port user */
+		ipath_clean_part_key(pd, dd);
+		/*
+		 * be paranoid, and never write 0's to these, just use an
+		 * unused part of the port 0 tail page.  Of course,
+		 * rcvhdraddr points to a large chunk of memory, so this
+		 * could still trash things, but at least it won't trash
+		 * page 0, and by disabling the port, it should stop "soon",
+		 * even if a packet or two is in already in flight after we
+		 * disabled the port.
+		 */
+		ipath_write_kreg_port(dd,
+		        dd->ipath_kregs->kr_rcvhdrtailaddr, port,
+			dd->ipath_dummy_hdrq_phys);
+		ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+			pd->port_port, dd->ipath_dummy_hdrq_phys);
+
+		ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
+		ipath_chg_pioavailkernel(dd, pd->port_pio_base,
+			pd->port_piocnt, 1);
+
+		dd->ipath_f_clear_tids(dd, pd->port_port);
+
+		if (dd->ipath_pageshadow)
+			unlock_expected_tids(pd);
+		ipath_stats.sps_ports--;
+		ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
+			   pd->port_comm, pid_nr(pid),
+			   dd->ipath_unit, port);
+	}
+
+	put_pid(pid);
+	mutex_unlock(&ipath_mutex);
+	ipath_free_pddata(dd, pd); /* after releasing the mutex */
+
+bail:
+	kfree(fd);
+	return ret;
+}
+
+static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
+			   struct ipath_port_info __user *uinfo)
+{
+	struct ipath_port_info info;
+	int nup;
+	int ret;
+	size_t sz;
+
+	(void) ipath_count_units(NULL, &nup, NULL);
+	info.num_active = nup;
+	info.unit = pd->port_dd->ipath_unit;
+	info.port = pd->port_port;
+	info.subport = subport;
+	/* Don't return new fields if old library opened the port. */
+	if (ipath_supports_subports(pd->userversion >> 16,
+				    pd->userversion & 0xffff)) {
+		/* Number of user ports available for this device. */
+		info.num_ports = pd->port_dd->ipath_cfgports - 1;
+		info.num_subports = pd->port_subport_cnt;
+		sz = sizeof(info);
+	} else
+		sz = sizeof(info) - 2 * sizeof(u16);
+
+	if (copy_to_user(uinfo, &info, sz)) {
+		ret = -EFAULT;
+		goto bail;
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int ipath_get_slave_info(struct ipath_portdata *pd,
+				void __user *slave_mask_addr)
+{
+	int ret = 0;
+
+	if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
+		ret = -EFAULT;
+	return ret;
+}
+
+static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
+				   u32 __user *inflightp)
+{
+	const u32 val = ipath_user_sdma_inflight_counter(pq);
+
+	if (put_user(val, inflightp))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ipath_sdma_get_complete(struct ipath_devdata *dd,
+				   struct ipath_user_sdma_queue *pq,
+				   u32 __user *completep)
+{
+	u32 val;
+	int err;
+
+	err = ipath_user_sdma_make_progress(dd, pq);
+	if (err < 0)
+		return err;
+
+	val = ipath_user_sdma_complete_counter(pq);
+	if (put_user(val, completep))
+		return -EFAULT;
+
+	return 0;
+}
+
+static ssize_t ipath_write(struct file *fp, const char __user *data,
+			   size_t count, loff_t *off)
+{
+	const struct ipath_cmd __user *ucmd;
+	struct ipath_portdata *pd;
+	const void __user *src;
+	size_t consumed, copy;
+	struct ipath_cmd cmd;
+	ssize_t ret = 0;
+	void *dest;
+
+	if (count < sizeof(cmd.type)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ucmd = (const struct ipath_cmd __user *) data;
+
+	if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	consumed = sizeof(cmd.type);
+
+	switch (cmd.type) {
+	case IPATH_CMD_ASSIGN_PORT:
+	case __IPATH_CMD_USER_INIT:
+	case IPATH_CMD_USER_INIT:
+		copy = sizeof(cmd.cmd.user_info);
+		dest = &cmd.cmd.user_info;
+		src = &ucmd->cmd.user_info;
+		break;
+	case IPATH_CMD_RECV_CTRL:
+		copy = sizeof(cmd.cmd.recv_ctrl);
+		dest = &cmd.cmd.recv_ctrl;
+		src = &ucmd->cmd.recv_ctrl;
+		break;
+	case IPATH_CMD_PORT_INFO:
+		copy = sizeof(cmd.cmd.port_info);
+		dest = &cmd.cmd.port_info;
+		src = &ucmd->cmd.port_info;
+		break;
+	case IPATH_CMD_TID_UPDATE:
+	case IPATH_CMD_TID_FREE:
+		copy = sizeof(cmd.cmd.tid_info);
+		dest = &cmd.cmd.tid_info;
+		src = &ucmd->cmd.tid_info;
+		break;
+	case IPATH_CMD_SET_PART_KEY:
+		copy = sizeof(cmd.cmd.part_key);
+		dest = &cmd.cmd.part_key;
+		src = &ucmd->cmd.part_key;
+		break;
+	case __IPATH_CMD_SLAVE_INFO:
+		copy = sizeof(cmd.cmd.slave_mask_addr);
+		dest = &cmd.cmd.slave_mask_addr;
+		src = &ucmd->cmd.slave_mask_addr;
+		break;
+	case IPATH_CMD_PIOAVAILUPD:	// force an update of PIOAvail reg
+		copy = 0;
+		src = NULL;
+		dest = NULL;
+		break;
+	case IPATH_CMD_POLL_TYPE:
+		copy = sizeof(cmd.cmd.poll_type);
+		dest = &cmd.cmd.poll_type;
+		src = &ucmd->cmd.poll_type;
+		break;
+	case IPATH_CMD_ARMLAUNCH_CTRL:
+		copy = sizeof(cmd.cmd.armlaunch_ctrl);
+		dest = &cmd.cmd.armlaunch_ctrl;
+		src = &ucmd->cmd.armlaunch_ctrl;
+		break;
+	case IPATH_CMD_SDMA_INFLIGHT:
+		copy = sizeof(cmd.cmd.sdma_inflight);
+		dest = &cmd.cmd.sdma_inflight;
+		src = &ucmd->cmd.sdma_inflight;
+		break;
+	case IPATH_CMD_SDMA_COMPLETE:
+		copy = sizeof(cmd.cmd.sdma_complete);
+		dest = &cmd.cmd.sdma_complete;
+		src = &ucmd->cmd.sdma_complete;
+		break;
+	default:
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (copy) {
+		if ((count - consumed) < copy) {
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		if (copy_from_user(dest, src, copy)) {
+			ret = -EFAULT;
+			goto bail;
+		}
+
+		consumed += copy;
+	}
+
+	pd = port_fp(fp);
+	if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
+		cmd.type != IPATH_CMD_ASSIGN_PORT) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	switch (cmd.type) {
+	case IPATH_CMD_ASSIGN_PORT:
+		ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+		if (ret)
+			goto bail;
+		break;
+	case __IPATH_CMD_USER_INIT:
+		/* backwards compatibility, get port first */
+		ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+		if (ret)
+			goto bail;
+		/* and fall through to current version. */
+	case IPATH_CMD_USER_INIT:
+		ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
+		if (ret)
+			goto bail;
+		ret = ipath_get_base_info(
+			fp, (void __user *) (unsigned long)
+			cmd.cmd.user_info.spu_base_info,
+			cmd.cmd.user_info.spu_base_info_size);
+		break;
+	case IPATH_CMD_RECV_CTRL:
+		ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
+		break;
+	case IPATH_CMD_PORT_INFO:
+		ret = ipath_port_info(pd, subport_fp(fp),
+				      (struct ipath_port_info __user *)
+				      (unsigned long) cmd.cmd.port_info);
+		break;
+	case IPATH_CMD_TID_UPDATE:
+		ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
+		break;
+	case IPATH_CMD_TID_FREE:
+		ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
+		break;
+	case IPATH_CMD_SET_PART_KEY:
+		ret = ipath_set_part_key(pd, cmd.cmd.part_key);
+		break;
+	case __IPATH_CMD_SLAVE_INFO:
+		ret = ipath_get_slave_info(pd,
+					   (void __user *) (unsigned long)
+					   cmd.cmd.slave_mask_addr);
+		break;
+	case IPATH_CMD_PIOAVAILUPD:
+		ipath_force_pio_avail_update(pd->port_dd);
+		break;
+	case IPATH_CMD_POLL_TYPE:
+		pd->poll_type = cmd.cmd.poll_type;
+		break;
+	case IPATH_CMD_ARMLAUNCH_CTRL:
+		if (cmd.cmd.armlaunch_ctrl)
+			ipath_enable_armlaunch(pd->port_dd);
+		else
+			ipath_disable_armlaunch(pd->port_dd);
+		break;
+	case IPATH_CMD_SDMA_INFLIGHT:
+		ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
+					      (u32 __user *) (unsigned long)
+					      cmd.cmd.sdma_inflight);
+		break;
+	case IPATH_CMD_SDMA_COMPLETE:
+		ret = ipath_sdma_get_complete(pd->port_dd,
+					      user_sdma_queue_fp(fp),
+					      (u32 __user *) (unsigned long)
+					      cmd.cmd.sdma_complete);
+		break;
+	}
+
+	if (ret >= 0)
+		ret = consumed;
+
+bail:
+	return ret;
+}
+
+static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *filp = iocb->ki_filp;
+	struct ipath_filedata *fp = filp->private_data;
+	struct ipath_portdata *pd = port_fp(filp);
+	struct ipath_user_sdma_queue *pq = fp->pq;
+
+	if (!iter_is_iovec(from) || !from->nr_segs)
+		return -EINVAL;
+
+	return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs);
+}
+
+static struct class *ipath_class;
+
+static int init_cdev(int minor, char *name, const struct file_operations *fops,
+		     struct cdev **cdevp, struct device **devp)
+{
+	const dev_t dev = MKDEV(IPATH_MAJOR, minor);
+	struct cdev *cdev = NULL;
+	struct device *device = NULL;
+	int ret;
+
+	cdev = cdev_alloc();
+	if (!cdev) {
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Could not allocate cdev for minor %d, %s\n",
+		       minor, name);
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	cdev->owner = THIS_MODULE;
+	cdev->ops = fops;
+	kobject_set_name(&cdev->kobj, name);
+
+	ret = cdev_add(cdev, dev, 1);
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Could not add cdev for minor %d, %s (err %d)\n",
+		       minor, name, -ret);
+		goto err_cdev;
+	}
+
+	device = device_create(ipath_class, NULL, dev, NULL, name);
+
+	if (IS_ERR(device)) {
+		ret = PTR_ERR(device);
+		printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+		       "device for minor %d, %s (err %d)\n",
+		       minor, name, -ret);
+		goto err_cdev;
+	}
+
+	goto done;
+
+err_cdev:
+	cdev_del(cdev);
+	cdev = NULL;
+
+done:
+	if (ret >= 0) {
+		*cdevp = cdev;
+		*devp = device;
+	} else {
+		*cdevp = NULL;
+		*devp = NULL;
+	}
+
+	return ret;
+}
+
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+		    struct cdev **cdevp, struct device **devp)
+{
+	return init_cdev(minor, name, fops, cdevp, devp);
+}
+
+static void cleanup_cdev(struct cdev **cdevp,
+			 struct device **devp)
+{
+	struct device *dev = *devp;
+
+	if (dev) {
+		device_unregister(dev);
+		*devp = NULL;
+	}
+
+	if (*cdevp) {
+		cdev_del(*cdevp);
+		*cdevp = NULL;
+	}
+}
+
+void ipath_cdev_cleanup(struct cdev **cdevp,
+			struct device **devp)
+{
+	cleanup_cdev(cdevp, devp);
+}
+
+static struct cdev *wildcard_cdev;
+static struct device *wildcard_dev;
+
+static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
+
+static int user_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
+		       "chrdev region (err %d)\n", -ret);
+		goto done;
+	}
+
+	ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
+
+	if (IS_ERR(ipath_class)) {
+		ret = PTR_ERR(ipath_class);
+		printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+		       "device class (err %d)\n", -ret);
+		goto bail;
+	}
+
+	goto done;
+bail:
+	unregister_chrdev_region(dev, IPATH_NMINORS);
+done:
+	return ret;
+}
+
+static void user_cleanup(void)
+{
+	if (ipath_class) {
+		class_destroy(ipath_class);
+		ipath_class = NULL;
+	}
+
+	unregister_chrdev_region(dev, IPATH_NMINORS);
+}
+
+static atomic_t user_count = ATOMIC_INIT(0);
+static atomic_t user_setup = ATOMIC_INIT(0);
+
+int ipath_user_add(struct ipath_devdata *dd)
+{
+	char name[10];
+	int ret;
+
+	if (atomic_inc_return(&user_count) == 1) {
+		ret = user_init();
+		if (ret < 0) {
+			ipath_dev_err(dd, "Unable to set up user support: "
+				      "error %d\n", -ret);
+			goto bail;
+		}
+		ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
+				&wildcard_dev);
+		if (ret < 0) {
+			ipath_dev_err(dd, "Could not create wildcard "
+				      "minor: error %d\n", -ret);
+			goto bail_user;
+		}
+
+		atomic_set(&user_setup, 1);
+	}
+
+	snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
+
+	ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
+			&dd->user_cdev, &dd->user_dev);
+	if (ret < 0)
+		ipath_dev_err(dd, "Could not create user minor %d, %s\n",
+			      dd->ipath_unit + 1, name);
+
+	goto bail;
+
+bail_user:
+	user_cleanup();
+bail:
+	return ret;
+}
+
+void ipath_user_remove(struct ipath_devdata *dd)
+{
+	cleanup_cdev(&dd->user_cdev, &dd->user_dev);
+
+	if (atomic_dec_return(&user_count) == 0) {
+		if (atomic_read(&user_setup) == 0)
+			goto bail;
+
+		cleanup_cdev(&wildcard_cdev, &wildcard_dev);
+		user_cleanup();
+
+		atomic_set(&user_setup, 0);
+	}
+bail:
+	return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c
new file mode 100644
index 0000000..25422a3
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_fs.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+#include "ipath_kernel.h"
+
+#define IPATHFS_MAGIC 0x726a77
+
+static struct super_block *ipath_super;
+
+static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, const struct file_operations *fops,
+			 void *data)
+{
+	int error;
+	struct inode *inode = new_inode(dir->i_sb);
+
+	if (!inode) {
+		error = -EPERM;
+		goto bail;
+	}
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_private = data;
+	if (S_ISDIR(mode)) {
+		inode->i_op = &simple_dir_inode_operations;
+		inc_nlink(inode);
+		inc_nlink(dir);
+	}
+
+	inode->i_fop = fops;
+
+	d_instantiate(dentry, inode);
+	error = 0;
+
+bail:
+	return error;
+}
+
+static int create_file(const char *name, umode_t mode,
+		       struct dentry *parent, struct dentry **dentry,
+		       const struct file_operations *fops, void *data)
+{
+	int error;
+
+	mutex_lock(&d_inode(parent)->i_mutex);
+	*dentry = lookup_one_len(name, parent, strlen(name));
+	if (!IS_ERR(*dentry))
+		error = ipathfs_mknod(d_inode(parent), *dentry,
+				      mode, fops, data);
+	else
+		error = PTR_ERR(*dentry);
+	mutex_unlock(&d_inode(parent)->i_mutex);
+
+	return error;
+}
+
+static ssize_t atomic_stats_read(struct file *file, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
+				       sizeof ipath_stats);
+}
+
+static const struct file_operations atomic_stats_ops = {
+	.read = atomic_stats_read,
+	.llseek = default_llseek,
+};
+
+static ssize_t atomic_counters_read(struct file *file, char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct infinipath_counters counters;
+	struct ipath_devdata *dd;
+
+	dd = file_inode(file)->i_private;
+	dd->ipath_f_read_counters(dd, &counters);
+
+	return simple_read_from_buffer(buf, count, ppos, &counters,
+				       sizeof counters);
+}
+
+static const struct file_operations atomic_counters_ops = {
+	.read = atomic_counters_read,
+	.llseek = default_llseek,
+};
+
+static ssize_t flash_read(struct file *file, char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	struct ipath_devdata *dd;
+	ssize_t ret;
+	loff_t pos;
+	char *tmp;
+
+	pos = *ppos;
+
+	if ( pos < 0) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (pos >= sizeof(struct ipath_flash)) {
+		ret = 0;
+		goto bail;
+	}
+
+	if (count > sizeof(struct ipath_flash) - pos)
+		count = sizeof(struct ipath_flash) - pos;
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	dd = file_inode(file)->i_private;
+	if (ipath_eeprom_read(dd, pos, tmp, count)) {
+		ipath_dev_err(dd, "failed to read from flash\n");
+		ret = -ENXIO;
+		goto bail_tmp;
+	}
+
+	if (copy_to_user(buf, tmp, count)) {
+		ret = -EFAULT;
+		goto bail_tmp;
+	}
+
+	*ppos = pos + count;
+	ret = count;
+
+bail_tmp:
+	kfree(tmp);
+
+bail:
+	return ret;
+}
+
+static ssize_t flash_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct ipath_devdata *dd;
+	ssize_t ret;
+	loff_t pos;
+	char *tmp;
+
+	pos = *ppos;
+
+	if (pos != 0) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (count != sizeof(struct ipath_flash)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	if (copy_from_user(tmp, buf, count)) {
+		ret = -EFAULT;
+		goto bail_tmp;
+	}
+
+	dd = file_inode(file)->i_private;
+	if (ipath_eeprom_write(dd, pos, tmp, count)) {
+		ret = -ENXIO;
+		ipath_dev_err(dd, "failed to write to flash\n");
+		goto bail_tmp;
+	}
+
+	*ppos = pos + count;
+	ret = count;
+
+bail_tmp:
+	kfree(tmp);
+
+bail:
+	return ret;
+}
+
+static const struct file_operations flash_ops = {
+	.read = flash_read,
+	.write = flash_write,
+	.llseek = default_llseek,
+};
+
+static int create_device_files(struct super_block *sb,
+			       struct ipath_devdata *dd)
+{
+	struct dentry *dir, *tmp;
+	char unit[10];
+	int ret;
+
+	snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+	ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
+			  &simple_dir_operations, dd);
+	if (ret) {
+		printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
+		goto bail;
+	}
+
+	ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
+			  &atomic_counters_ops, dd);
+	if (ret) {
+		printk(KERN_ERR "create_file(%s/atomic_counters) "
+		       "failed: %d\n", unit, ret);
+		goto bail;
+	}
+
+	ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
+			  &flash_ops, dd);
+	if (ret) {
+		printk(KERN_ERR "create_file(%s/flash) "
+		       "failed: %d\n", unit, ret);
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+static int remove_file(struct dentry *parent, char *name)
+{
+	struct dentry *tmp;
+	int ret;
+
+	tmp = lookup_one_len(name, parent, strlen(name));
+
+	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		goto bail;
+	}
+
+	spin_lock(&tmp->d_lock);
+	if (simple_positive(tmp)) {
+		dget_dlock(tmp);
+		__d_drop(tmp);
+		spin_unlock(&tmp->d_lock);
+		simple_unlink(d_inode(parent), tmp);
+	} else
+		spin_unlock(&tmp->d_lock);
+
+	ret = 0;
+bail:
+	/*
+	 * We don't expect clients to care about the return value, but
+	 * it's there if they need it.
+	 */
+	return ret;
+}
+
+static int remove_device_files(struct super_block *sb,
+			       struct ipath_devdata *dd)
+{
+	struct dentry *dir, *root;
+	char unit[10];
+	int ret;
+
+	root = dget(sb->s_root);
+	mutex_lock(&d_inode(root)->i_mutex);
+	snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+	dir = lookup_one_len(unit, root, strlen(unit));
+
+	if (IS_ERR(dir)) {
+		ret = PTR_ERR(dir);
+		printk(KERN_ERR "Lookup of %s failed\n", unit);
+		goto bail;
+	}
+
+	remove_file(dir, "flash");
+	remove_file(dir, "atomic_counters");
+	d_delete(dir);
+	ret = simple_rmdir(d_inode(root), dir);
+
+bail:
+	mutex_unlock(&d_inode(root)->i_mutex);
+	dput(root);
+	return ret;
+}
+
+static int ipathfs_fill_super(struct super_block *sb, void *data,
+			      int silent)
+{
+	struct ipath_devdata *dd, *tmp;
+	unsigned long flags;
+	int ret;
+
+	static struct tree_descr files[] = {
+		[2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
+		{""},
+	};
+
+	ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
+	if (ret) {
+		printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+
+	list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+		spin_unlock_irqrestore(&ipath_devs_lock, flags);
+		ret = create_device_files(sb, dd);
+		if (ret)
+			goto bail;
+		spin_lock_irqsave(&ipath_devs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+bail:
+	return ret;
+}
+
+static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
+{
+	struct dentry *ret;
+	ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
+	if (!IS_ERR(ret))
+		ipath_super = ret->d_sb;
+	return ret;
+}
+
+static void ipathfs_kill_super(struct super_block *s)
+{
+	kill_litter_super(s);
+	ipath_super = NULL;
+}
+
+int ipathfs_add_device(struct ipath_devdata *dd)
+{
+	int ret;
+
+	if (ipath_super == NULL) {
+		ret = 0;
+		goto bail;
+	}
+
+	ret = create_device_files(ipath_super, dd);
+
+bail:
+	return ret;
+}
+
+int ipathfs_remove_device(struct ipath_devdata *dd)
+{
+	int ret;
+
+	if (ipath_super == NULL) {
+		ret = 0;
+		goto bail;
+	}
+
+	ret = remove_device_files(ipath_super, dd);
+
+bail:
+	return ret;
+}
+
+static struct file_system_type ipathfs_fs_type = {
+	.owner =	THIS_MODULE,
+	.name =		"ipathfs",
+	.mount =	ipathfs_mount,
+	.kill_sb =	ipathfs_kill_super,
+};
+MODULE_ALIAS_FS("ipathfs");
+
+int __init ipath_init_ipathfs(void)
+{
+	return register_filesystem(&ipathfs_fs_type);
+}
+
+void __exit ipath_exit_ipathfs(void)
+{
+	unregister_filesystem(&ipathfs_fs_type);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_iba6110.c b/drivers/staging/rdma/ipath/ipath_iba6110.c
new file mode 100644
index 0000000..7cc3054
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_iba6110.c
@@ -0,0 +1,1940 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains all of the code that is specific to the InfiniPath
+ * HT chip.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/htirq.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_kernel.h"
+#include "ipath_registers.h"
+
+static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
+
+
+/*
+ * This lists the InfiniPath registers, in the actual chip layout.
+ * This structure should never be directly accessed.
+ *
+ * The names are in InterCap form because they're taken straight from
+ * the chip specification.  Since they're only used in this file, they
+ * don't pollute the rest of the source.
+*/
+
+struct _infinipath_do_not_use_kernel_regs {
+	unsigned long long Revision;
+	unsigned long long Control;
+	unsigned long long PageAlign;
+	unsigned long long PortCnt;
+	unsigned long long DebugPortSelect;
+	unsigned long long DebugPort;
+	unsigned long long SendRegBase;
+	unsigned long long UserRegBase;
+	unsigned long long CounterRegBase;
+	unsigned long long Scratch;
+	unsigned long long ReservedMisc1;
+	unsigned long long InterruptConfig;
+	unsigned long long IntBlocked;
+	unsigned long long IntMask;
+	unsigned long long IntStatus;
+	unsigned long long IntClear;
+	unsigned long long ErrorMask;
+	unsigned long long ErrorStatus;
+	unsigned long long ErrorClear;
+	unsigned long long HwErrMask;
+	unsigned long long HwErrStatus;
+	unsigned long long HwErrClear;
+	unsigned long long HwDiagCtrl;
+	unsigned long long MDIO;
+	unsigned long long IBCStatus;
+	unsigned long long IBCCtrl;
+	unsigned long long ExtStatus;
+	unsigned long long ExtCtrl;
+	unsigned long long GPIOOut;
+	unsigned long long GPIOMask;
+	unsigned long long GPIOStatus;
+	unsigned long long GPIOClear;
+	unsigned long long RcvCtrl;
+	unsigned long long RcvBTHQP;
+	unsigned long long RcvHdrSize;
+	unsigned long long RcvHdrCnt;
+	unsigned long long RcvHdrEntSize;
+	unsigned long long RcvTIDBase;
+	unsigned long long RcvTIDCnt;
+	unsigned long long RcvEgrBase;
+	unsigned long long RcvEgrCnt;
+	unsigned long long RcvBufBase;
+	unsigned long long RcvBufSize;
+	unsigned long long RxIntMemBase;
+	unsigned long long RxIntMemSize;
+	unsigned long long RcvPartitionKey;
+	unsigned long long ReservedRcv[10];
+	unsigned long long SendCtrl;
+	unsigned long long SendPIOBufBase;
+	unsigned long long SendPIOSize;
+	unsigned long long SendPIOBufCnt;
+	unsigned long long SendPIOAvailAddr;
+	unsigned long long TxIntMemBase;
+	unsigned long long TxIntMemSize;
+	unsigned long long ReservedSend[9];
+	unsigned long long SendBufferError;
+	unsigned long long SendBufferErrorCONT1;
+	unsigned long long SendBufferErrorCONT2;
+	unsigned long long SendBufferErrorCONT3;
+	unsigned long long ReservedSBE[4];
+	unsigned long long RcvHdrAddr0;
+	unsigned long long RcvHdrAddr1;
+	unsigned long long RcvHdrAddr2;
+	unsigned long long RcvHdrAddr3;
+	unsigned long long RcvHdrAddr4;
+	unsigned long long RcvHdrAddr5;
+	unsigned long long RcvHdrAddr6;
+	unsigned long long RcvHdrAddr7;
+	unsigned long long RcvHdrAddr8;
+	unsigned long long ReservedRHA[7];
+	unsigned long long RcvHdrTailAddr0;
+	unsigned long long RcvHdrTailAddr1;
+	unsigned long long RcvHdrTailAddr2;
+	unsigned long long RcvHdrTailAddr3;
+	unsigned long long RcvHdrTailAddr4;
+	unsigned long long RcvHdrTailAddr5;
+	unsigned long long RcvHdrTailAddr6;
+	unsigned long long RcvHdrTailAddr7;
+	unsigned long long RcvHdrTailAddr8;
+	unsigned long long ReservedRHTA[7];
+	unsigned long long Sync;	/* Software only */
+	unsigned long long Dump;	/* Software only */
+	unsigned long long SimVer;	/* Software only */
+	unsigned long long ReservedSW[5];
+	unsigned long long SerdesConfig0;
+	unsigned long long SerdesConfig1;
+	unsigned long long SerdesStatus;
+	unsigned long long XGXSConfig;
+	unsigned long long ReservedSW2[4];
+};
+
+struct _infinipath_do_not_use_counters {
+	__u64 LBIntCnt;
+	__u64 LBFlowStallCnt;
+	__u64 Reserved1;
+	__u64 TxUnsupVLErrCnt;
+	__u64 TxDataPktCnt;
+	__u64 TxFlowPktCnt;
+	__u64 TxDwordCnt;
+	__u64 TxLenErrCnt;
+	__u64 TxMaxMinLenErrCnt;
+	__u64 TxUnderrunCnt;
+	__u64 TxFlowStallCnt;
+	__u64 TxDroppedPktCnt;
+	__u64 RxDroppedPktCnt;
+	__u64 RxDataPktCnt;
+	__u64 RxFlowPktCnt;
+	__u64 RxDwordCnt;
+	__u64 RxLenErrCnt;
+	__u64 RxMaxMinLenErrCnt;
+	__u64 RxICRCErrCnt;
+	__u64 RxVCRCErrCnt;
+	__u64 RxFlowCtrlErrCnt;
+	__u64 RxBadFormatCnt;
+	__u64 RxLinkProblemCnt;
+	__u64 RxEBPCnt;
+	__u64 RxLPCRCErrCnt;
+	__u64 RxBufOvflCnt;
+	__u64 RxTIDFullErrCnt;
+	__u64 RxTIDValidErrCnt;
+	__u64 RxPKeyMismatchCnt;
+	__u64 RxP0HdrEgrOvflCnt;
+	__u64 RxP1HdrEgrOvflCnt;
+	__u64 RxP2HdrEgrOvflCnt;
+	__u64 RxP3HdrEgrOvflCnt;
+	__u64 RxP4HdrEgrOvflCnt;
+	__u64 RxP5HdrEgrOvflCnt;
+	__u64 RxP6HdrEgrOvflCnt;
+	__u64 RxP7HdrEgrOvflCnt;
+	__u64 RxP8HdrEgrOvflCnt;
+	__u64 Reserved6;
+	__u64 Reserved7;
+	__u64 IBStatusChangeCnt;
+	__u64 IBLinkErrRecoveryCnt;
+	__u64 IBLinkDownedCnt;
+	__u64 IBSymbolErrCnt;
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof( \
+	struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+#define IPATH_CREG_OFFSET(field) (offsetof( \
+	struct _infinipath_do_not_use_counters, field) / sizeof(u64))
+
+static const struct ipath_kregs ipath_ht_kregs = {
+	.kr_control = IPATH_KREG_OFFSET(Control),
+	.kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
+	.kr_debugport = IPATH_KREG_OFFSET(DebugPort),
+	.kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
+	.kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
+	.kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
+	.kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
+	.kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
+	.kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
+	.kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
+	.kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
+	.kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
+	.kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
+	.kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
+	.kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
+	.kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
+	.kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
+	.kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
+	.kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
+	.kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
+	.kr_intclear = IPATH_KREG_OFFSET(IntClear),
+	.kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
+	.kr_intmask = IPATH_KREG_OFFSET(IntMask),
+	.kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
+	.kr_mdio = IPATH_KREG_OFFSET(MDIO),
+	.kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
+	.kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
+	.kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
+	.kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
+	.kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
+	.kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
+	.kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
+	.kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
+	.kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
+	.kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
+	.kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
+	.kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
+	.kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
+	.kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
+	.kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
+	.kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
+	.kr_revision = IPATH_KREG_OFFSET(Revision),
+	.kr_scratch = IPATH_KREG_OFFSET(Scratch),
+	.kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
+	.kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
+	.kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
+	.kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
+	.kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
+	.kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
+	.kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
+	.kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
+	.kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
+	.kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
+	.kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
+	.kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
+	.kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
+	.kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
+	/*
+	 * These should not be used directly via ipath_write_kreg64(),
+	 * use them with ipath_write_kreg64_port(),
+	 */
+	.kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
+	.kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
+};
+
+static const struct ipath_cregs ipath_ht_cregs = {
+	.cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
+	.cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
+	.cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
+	.cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
+	.cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
+	.cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
+	.cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
+	.cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
+	.cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
+	.cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
+	.cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
+	.cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
+	/* calc from Reg_CounterRegBase + offset */
+	.cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
+	.cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
+	.cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
+	.cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
+	.cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
+	.cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
+	.cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
+	.cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
+	.cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
+	.cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
+	.cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
+	.cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
+	.cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
+	.cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
+	.cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
+	.cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
+	.cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
+	.cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
+	.cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
+	.cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
+	.cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
+};
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVURG_SHIFT 0
+#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVAVAIL_SHIFT 12
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
+#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
+#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR   0x0000000000800000ULL
+#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR   0x0000000001000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR   0x0000000002000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR   0x0000000004000000ULL
+#define INFINIPATH_HWE_HTCMISCERR4          0x0000000008000000ULL
+#define INFINIPATH_HWE_HTCMISCERR5          0x0000000010000000ULL
+#define INFINIPATH_HWE_HTCMISCERR6          0x0000000020000000ULL
+#define INFINIPATH_HWE_HTCMISCERR7          0x0000000040000000ULL
+#define INFINIPATH_HWE_HTCBUSTREQPARITYERR  0x0000000080000000ULL
+#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
+#define INFINIPATH_HWE_HTCBUSIREQPARITYERR  0x0000000200000000ULL
+#define INFINIPATH_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
+#define INFINIPATH_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_FBSLIP        0x0200000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_RFSLIP        0x0400000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_FBSLIP        0x0800000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_RFSLIP        0x1000000000000000ULL
+#define INFINIPATH_HWE_SERDESPLLFAILED      0x2000000000000000ULL
+
+#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
+#define IBA6110_IBCS_LINKSTATE_SHIFT 4
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_FREQSEL 0x2
+#define INFINIPATH_EXTS_SERDESSEL 0x4
+#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
+#define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000
+
+
+/* TID entries (memory), HT-only */
+#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL	/* 40 bits valid */
+#define INFINIPATH_RT_VALID 0x8000000000000000ULL
+#define INFINIPATH_RT_ADDR_SHIFT 0
+#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
+#define INFINIPATH_RT_BUFSIZE_SHIFT 48
+
+#define INFINIPATH_R_INTRAVAIL_SHIFT 16
+#define INFINIPATH_R_TAILUPD_SHIFT 31
+
+/* kr_xgxsconfig bits */
+#define INFINIPATH_XGXS_RESET          0x7ULL
+
+/*
+ * masks and bits that are different in different chips, or present only
+ * in one
+ */
+static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
+    INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
+static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
+    INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
+
+static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
+    INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
+    INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
+    INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
+    INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
+
+#define _IPATH_GPIO_SDA_NUM 1
+#define _IPATH_GPIO_SCL_NUM 0
+
+#define IPATH_GPIO_SDA \
+	(1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+#define IPATH_GPIO_SCL \
+	(1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+
+/* keep the code below somewhat more readable; not used elsewhere */
+#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |	\
+				infinipath_hwe_htclnkabyte1crcerr)
+#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr |	\
+				infinipath_hwe_htclnkbbyte1crcerr)
+#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |	\
+				infinipath_hwe_htclnkbbyte0crcerr)
+#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr |	\
+				infinipath_hwe_htclnkbbyte1crcerr)
+
+static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
+			  char *msg, size_t msgl)
+{
+	char bitsmsg[64];
+	ipath_err_t crcbits = hwerrs &
+		(_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
+	/* don't check if 8bit HT */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+		crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
+	/* don't check if 8bit HT */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+		crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
+	/*
+	 * we'll want to ignore link errors on link that is
+	 * not in use, if any.  For now, complain about both
+	 */
+	if (crcbits) {
+		u16 ctrl0, ctrl1;
+		snprintf(bitsmsg, sizeof bitsmsg,
+			 "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
+			 !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
+			 "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
+				    ? "1 (B)" : "0+1 (A+B)"),
+			 !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
+			 : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
+			    "0+1"), (unsigned long long) crcbits);
+		strlcat(msg, bitsmsg, msgl);
+
+		/*
+		 * print extra info for debugging.  slave/primary
+		 * config word 4, 8 (link control 0, 1)
+		 */
+
+		if (pci_read_config_word(dd->pcidev,
+					 dd->ipath_ht_slave_off + 0x4,
+					 &ctrl0))
+			dev_info(&dd->pcidev->dev, "Couldn't read "
+				 "linkctrl0 of slave/primary "
+				 "config block\n");
+		else if (!(ctrl0 & 1 << 6))
+			/* not if EOC bit set */
+			ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
+				  ((ctrl0 >> 8) & 7) ? " CRC" : "",
+				  ((ctrl0 >> 4) & 1) ? "linkfail" :
+				  "");
+		if (pci_read_config_word(dd->pcidev,
+					 dd->ipath_ht_slave_off + 0x8,
+					 &ctrl1))
+			dev_info(&dd->pcidev->dev, "Couldn't read "
+				 "linkctrl1 of slave/primary "
+				 "config block\n");
+		else if (!(ctrl1 & 1 << 6))
+			/* not if EOC bit set */
+			ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
+				  ((ctrl1 >> 8) & 7) ? " CRC" : "",
+				  ((ctrl1 >> 4) & 1) ? "linkfail" :
+				  "");
+
+		/* disable until driver reloaded */
+		dd->ipath_hwerrmask &= ~crcbits;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+				 dd->ipath_hwerrmask);
+		ipath_dbg("HT crc errs: %s\n", msg);
+	} else
+		ipath_dbg("ignoring HT crc errors 0x%llx, "
+			  "not in use\n", (unsigned long long)
+			  (hwerrs & (_IPATH_HTLINK0_CRCBITS |
+				     _IPATH_HTLINK1_CRCBITS)));
+}
+
+/* 6110 specific hardware errors... */
+static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
+	INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
+	INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
+	INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
+	INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
+	INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
+	INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
+	INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
+	INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
+};
+
+#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
+		        INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
+		        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
+#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
+			  << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
+
+static void ipath_ht_txe_recover(struct ipath_devdata *dd)
+{
+	++ipath_stats.sps_txeparity;
+	dev_info(&dd->pcidev->dev,
+		"Recovering from TXE PIO parity error\n");
+}
+
+
+/**
+ * ipath_ht_handle_hwerrors - display hardware errors.
+ * @dd: the infinipath device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Use same msg buffer as regular errors to avoid excessive stack
+ * use.  Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue.  We reuse the same message buffer as
+ * ipath_handle_errors() to avoid excessive stack usage.
+ */
+static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
+				     size_t msgl)
+{
+	ipath_err_t hwerrs;
+	u32 bits, ctrl;
+	int isfatal = 0;
+	char bitsmsg[64];
+	int log_idx;
+
+	hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+
+	if (!hwerrs) {
+		ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
+		/*
+		 * better than printing cofusing messages
+		 * This seems to be related to clearing the crc error, or
+		 * the pll error during init.
+		 */
+		goto bail;
+	} else if (hwerrs == -1LL) {
+		ipath_dev_err(dd, "Read of hardware error status failed "
+			      "(all bits set); ignoring\n");
+		goto bail;
+	}
+	ipath_stats.sps_hwerrs++;
+
+	/* Always clear the error status register, except MEMBISTFAIL,
+	 * regardless of whether we continue or stop using the chip.
+	 * We want that set so we know it failed, even across driver reload.
+	 * We'll still ignore it in the hwerrmask.  We do this partly for
+	 * diagnostics, but also for support */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+			 hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
+
+	hwerrs &= dd->ipath_hwerrmask;
+
+	/* We log some errors to EEPROM, check if we have any of those. */
+	for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
+		if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
+			ipath_inc_eeprom_err(dd, log_idx, 1);
+
+	/*
+	 * make sure we get this much out, unless told to be quiet,
+	 * it's a parity error we may recover from,
+	 * or it's occurred within the last 5 seconds
+	 */
+	if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
+		RXE_EAGER_PARITY)) ||
+		(ipath_debug & __IPATH_VERBDBG))
+		dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
+			 "(cleared)\n", (unsigned long long) hwerrs);
+	dd->ipath_lasthwerror |= hwerrs;
+
+	if (hwerrs & ~dd->ipath_hwe_bitsextant)
+		ipath_dev_err(dd, "hwerror interrupt with unknown errors "
+			      "%llx set\n", (unsigned long long)
+			      (hwerrs & ~dd->ipath_hwe_bitsextant));
+
+	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+	if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
+		/*
+		 * parity errors in send memory are recoverable,
+		 * just cancel the send (if indicated in * sendbuffererror),
+		 * count the occurrence, unfreeze (if no other handled
+		 * hardware error bits are set), and continue. They can
+		 * occur if a processor speculative read is done to the PIO
+		 * buffer while we are sending a packet, for example.
+		 */
+		if (hwerrs & TXE_PIO_PARITY) {
+			ipath_ht_txe_recover(dd);
+			hwerrs &= ~TXE_PIO_PARITY;
+		}
+
+		if (!hwerrs) {
+			ipath_dbg("Clearing freezemode on ignored or "
+				  "recovered hardware error\n");
+			ipath_clear_freeze(dd);
+		}
+	}
+
+	*msg = '\0';
+
+	/*
+	 * may someday want to decode into which bits are which
+	 * functional area for parity errors, etc.
+	 */
+	if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
+		      << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
+		bits = (u32) ((hwerrs >>
+			       INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
+			      INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
+		snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
+			 bits);
+		strlcat(msg, bitsmsg, msgl);
+	}
+
+	ipath_format_hwerrors(hwerrs,
+			      ipath_6110_hwerror_msgs,
+			      ARRAY_SIZE(ipath_6110_hwerror_msgs),
+			      msg, msgl);
+
+	if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
+		hwerr_crcbits(dd, hwerrs, msg, msgl);
+
+	if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
+		strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
+			msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+				 dd->ipath_hwerrmask);
+	}
+#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |	\
+			 INFINIPATH_HWE_COREPLL_RFSLIP |	\
+			 INFINIPATH_HWE_HTBPLL_FBSLIP |		\
+			 INFINIPATH_HWE_HTBPLL_RFSLIP |		\
+			 INFINIPATH_HWE_HTAPLL_FBSLIP |		\
+			 INFINIPATH_HWE_HTAPLL_RFSLIP)
+
+	if (hwerrs & _IPATH_PLL_FAIL) {
+		snprintf(bitsmsg, sizeof bitsmsg,
+			 "[PLL failed (%llx), InfiniPath hardware unusable]",
+			 (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
+		strlcat(msg, bitsmsg, msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+				 dd->ipath_hwerrmask);
+	}
+
+	if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
+		/*
+		 * If it occurs, it is left masked since the eternal
+		 * interface is unused
+		 */
+		dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+				 dd->ipath_hwerrmask);
+	}
+
+	if (hwerrs) {
+		/*
+		 * if any set that we aren't ignoring; only
+		 * make the complaint once, in case it's stuck
+		 * or recurring, and we get here multiple
+		 * times.
+		 * force link down, so switch knows, and
+		 * LEDs are turned off
+		 */
+		if (dd->ipath_flags & IPATH_INITTED) {
+			ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+			ipath_setup_ht_setextled(dd,
+				INFINIPATH_IBCS_L_STATE_DOWN,
+				INFINIPATH_IBCS_LT_STATE_DISABLED);
+			ipath_dev_err(dd, "Fatal Hardware Error (freeze "
+					  "mode), no longer usable, SN %.16s\n",
+					  dd->ipath_serial);
+			isfatal = 1;
+		}
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+		/* mark as having had error */
+		*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+		/*
+		 * mark as not usable, at a minimum until driver
+		 * is reloaded, probably until reboot, since no
+		 * other reset is possible.
+		 */
+		dd->ipath_flags &= ~IPATH_INITTED;
+	}
+	else
+		*msg = 0; /* recovered from all of them */
+	if (*msg)
+		ipath_dev_err(dd, "%s hardware error\n", msg);
+	if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
+		/*
+		 * for status file; if no trailing brace is copied,
+		 * we'll know it was truncated.
+		 */
+		snprintf(dd->ipath_freezemsg,
+			 dd->ipath_freezelen, "{%s}", msg);
+
+bail:;
+}
+
+/**
+ * ipath_ht_boardname - fill in the board name
+ * @dd: the infinipath device
+ * @name: the output buffer
+ * @namelen: the size of the output buffer
+ *
+ * fill in the board name, based on the board revision register
+ */
+static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
+			      size_t namelen)
+{
+	char *n = NULL;
+	u8 boardrev = dd->ipath_boardrev;
+	int ret = 0;
+
+	switch (boardrev) {
+	case 5:
+		/*
+		 * original production board; two production levels, with
+		 * different serial number ranges.   See ipath_ht_early_init() for
+		 * case where we enable IPATH_GPIO_INTR for later serial # range.
+		 * Original 112* serial number is no longer supported.
+		 */
+		n = "InfiniPath_QHT7040";
+		break;
+	case 7:
+		/* small form factor production board */
+		n = "InfiniPath_QHT7140";
+		break;
+	default:		/* don't know, just print the number */
+		ipath_dev_err(dd, "Don't yet know about board "
+			      "with ID %u\n", boardrev);
+		snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
+			 boardrev);
+		break;
+	}
+	if (n)
+		snprintf(name, namelen, "%s", n);
+
+	if (ret) {
+		ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
+		goto bail;
+	}
+	if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
+		dd->ipath_minrev > 4)) {
+		/*
+		 * This version of the driver only supports Rev 3.2 - 3.4
+		 */
+		ipath_dev_err(dd,
+			      "Unsupported InfiniPath hardware revision %u.%u!\n",
+			      dd->ipath_majrev, dd->ipath_minrev);
+		ret = 1;
+		goto bail;
+	}
+	/*
+	 * pkt/word counters are 32 bit, and therefore wrap fast enough
+	 * that we snapshot them from a timer, and maintain 64 bit shadow
+	 * copies
+	 */
+	dd->ipath_flags |= IPATH_32BITCOUNTERS;
+	dd->ipath_flags |= IPATH_GPIO_INTR;
+	if (dd->ipath_lbus_speed != 800)
+		ipath_dev_err(dd,
+			      "Incorrectly configured for HT @ %uMHz\n",
+			      dd->ipath_lbus_speed);
+
+	/*
+	 * set here, not in ipath_init_*_funcs because we have to do
+	 * it after we can read chip registers.
+	 */
+	dd->ipath_ureg_align =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+
+bail:
+	return ret;
+}
+
+static void ipath_check_htlink(struct ipath_devdata *dd)
+{
+	u8 linkerr, link_off, i;
+
+	for (i = 0; i < 2; i++) {
+		link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
+		if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
+			dev_info(&dd->pcidev->dev, "Couldn't read "
+				 "linkerror%d of HT slave/primary block\n",
+				 i);
+		else if (linkerr & 0xf0) {
+			ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+				   "clearing\n", linkerr >> 4, i);
+			/*
+			 * writing the linkerr bits that are set should
+			 * clear them
+			 */
+			if (pci_write_config_byte(dd->pcidev, link_off,
+						  linkerr))
+				ipath_dbg("Failed write to clear HT "
+					  "linkerror%d\n", i);
+			if (pci_read_config_byte(dd->pcidev, link_off,
+						 &linkerr))
+				dev_info(&dd->pcidev->dev,
+					 "Couldn't reread linkerror%d of "
+					 "HT slave/primary block\n", i);
+			else if (linkerr & 0xf0)
+				dev_info(&dd->pcidev->dev,
+					 "HT linkerror%d bits 0x%x "
+					 "couldn't be cleared\n",
+					 i, linkerr >> 4);
+		}
+	}
+}
+
+static int ipath_setup_ht_reset(struct ipath_devdata *dd)
+{
+	ipath_dbg("No reset possible for this InfiniPath hardware\n");
+	return 0;
+}
+
+#define HT_INTR_DISC_CONFIG  0x80	/* HT interrupt and discovery cap */
+#define HT_INTR_REG_INDEX    2	/* intconfig requires indirect accesses */
+
+/*
+ * Bits 13-15 of command==0 is slave/primary block.  Clear any HT CRC
+ * errors.  We only bother to do this at load time, because it's OK if
+ * it happened before we were loaded (first time after boot/reset),
+ * but any time after that, it's fatal anyway.  Also need to not check
+ * for upper byte errors if we are in 8 bit mode, so figure out
+ * our width.  For now, at least, also complain if it's 8 bit.
+ */
+static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
+			     int pos, u8 cap_type)
+{
+	u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
+	u16 linkctrl = 0;
+	int i;
+
+	dd->ipath_ht_slave_off = pos;
+	/* command word, master_host bit */
+	/* master host || slave */
+	if ((cap_type >> 2) & 1)
+		link_a_b_off = 4;
+	else
+		link_a_b_off = 0;
+	ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
+		   link_a_b_off ? 1 : 0,
+		   link_a_b_off ? 'B' : 'A');
+
+	link_a_b_off += pos;
+
+	/*
+	 * check both link control registers; clear both HT CRC sets if
+	 * necessary.
+	 */
+	for (i = 0; i < 2; i++) {
+		link_off = pos + i * 4 + 0x4;
+		if (pci_read_config_word(pdev, link_off, &linkctrl))
+			ipath_dev_err(dd, "Couldn't read HT link control%d "
+				      "register\n", i);
+		else if (linkctrl & (0xf << 8)) {
+			ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
+				   "bits %x\n", i, linkctrl & (0xf << 8));
+			/*
+			 * now write them back to clear the error.
+			 */
+			pci_write_config_word(pdev, link_off,
+					      linkctrl & (0xf << 8));
+		}
+	}
+
+	/*
+	 * As with HT CRC bits, same for protocol errors that might occur
+	 * during boot.
+	 */
+	for (i = 0; i < 2; i++) {
+		link_off = pos + i * 4 + 0xd;
+		if (pci_read_config_byte(pdev, link_off, &linkerr))
+			dev_info(&pdev->dev, "Couldn't read linkerror%d "
+				 "of HT slave/primary block\n", i);
+		else if (linkerr & 0xf0) {
+			ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+				   "clearing\n", linkerr >> 4, i);
+			/*
+			 * writing the linkerr bits that are set will clear
+			 * them
+			 */
+			if (pci_write_config_byte
+			    (pdev, link_off, linkerr))
+				ipath_dbg("Failed write to clear HT "
+					  "linkerror%d\n", i);
+			if (pci_read_config_byte(pdev, link_off, &linkerr))
+				dev_info(&pdev->dev, "Couldn't reread "
+					 "linkerror%d of HT slave/primary "
+					 "block\n", i);
+			else if (linkerr & 0xf0)
+				dev_info(&pdev->dev, "HT linkerror%d bits "
+					 "0x%x couldn't be cleared\n",
+					 i, linkerr >> 4);
+		}
+	}
+
+	/*
+	 * this is just for our link to the host, not devices connected
+	 * through tunnel.
+	 */
+
+	if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
+		ipath_dev_err(dd, "Couldn't read HT link width "
+			      "config register\n");
+	else {
+		u32 width;
+		switch (linkwidth & 7) {
+		case 5:
+			width = 4;
+			break;
+		case 4:
+			width = 2;
+			break;
+		case 3:
+			width = 32;
+			break;
+		case 1:
+			width = 16;
+			break;
+		case 0:
+		default:	/* if wrong, assume 8 bit */
+			width = 8;
+			break;
+		}
+
+		dd->ipath_lbus_width = width;
+
+		if (linkwidth != 0x11) {
+			ipath_dev_err(dd, "Not configured for 16 bit HT "
+				      "(%x)\n", linkwidth);
+			if (!(linkwidth & 0xf)) {
+				ipath_dbg("Will ignore HT lane1 errors\n");
+				dd->ipath_flags |= IPATH_8BIT_IN_HT0;
+			}
+		}
+	}
+
+	/*
+	 * this is just for our link to the host, not devices connected
+	 * through tunnel.
+	 */
+	if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
+		ipath_dev_err(dd, "Couldn't read HT link frequency "
+			      "config register\n");
+	else {
+		u32 speed;
+		switch (linkwidth & 0xf) {
+		case 6:
+			speed = 1000;
+			break;
+		case 5:
+			speed = 800;
+			break;
+		case 4:
+			speed = 600;
+			break;
+		case 3:
+			speed = 500;
+			break;
+		case 2:
+			speed = 400;
+			break;
+		case 1:
+			speed = 300;
+			break;
+		default:
+			/*
+			 * assume reserved and vendor-specific are 200...
+			 */
+		case 0:
+			speed = 200;
+			break;
+		}
+		dd->ipath_lbus_speed = speed;
+	}
+
+	snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
+		"HyperTransport,%uMHz,x%u\n",
+		dd->ipath_lbus_speed,
+		dd->ipath_lbus_width);
+}
+
+static int ipath_ht_intconfig(struct ipath_devdata *dd)
+{
+	int ret;
+
+	if (dd->ipath_intconfig) {
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
+				 dd->ipath_intconfig);	/* interrupt address */
+		ret = 0;
+	} else {
+		ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
+			      "interrupt address\n");
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static void ipath_ht_irq_update(struct pci_dev *dev, int irq,
+				struct ht_irq_msg *msg)
+{
+	struct ipath_devdata *dd = pci_get_drvdata(dev);
+	u64 prev_intconfig = dd->ipath_intconfig;
+
+	dd->ipath_intconfig = msg->address_lo;
+	dd->ipath_intconfig |= ((u64) msg->address_hi) << 32;
+
+	/*
+	 * If the previous value of dd->ipath_intconfig is zero, we're
+	 * getting configured for the first time, and must not program the
+	 * intconfig register here (it will be programmed later, when the
+	 * hardware is ready).  Otherwise, we should.
+	 */
+	if (prev_intconfig)
+		ipath_ht_intconfig(dd);
+}
+
+/**
+ * ipath_setup_ht_config - setup the interruptconfig register
+ * @dd: the infinipath device
+ * @pdev: the PCI device
+ *
+ * setup the interruptconfig register from the HT config info.
+ * Also clear CRC errors in HT linkcontrol, if necessary.
+ * This is done only for the real hardware.  It is done before
+ * chip address space is initted, so can't touch infinipath registers
+ */
+static int ipath_setup_ht_config(struct ipath_devdata *dd,
+				 struct pci_dev *pdev)
+{
+	int pos, ret;
+
+	ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update);
+	if (ret < 0) {
+		ipath_dev_err(dd, "Couldn't create interrupt handler: "
+			      "err %d\n", ret);
+		goto bail;
+	}
+	dd->ipath_irq = ret;
+	ret = 0;
+
+	/*
+	 * Handle clearing CRC errors in linkctrl register if necessary.  We
+	 * do this early, before we ever enable errors or hardware errors,
+	 * mostly to avoid causing the chip to enter freeze mode.
+	 */
+	pos = pci_find_capability(pdev, PCI_CAP_ID_HT);
+	if (!pos) {
+		ipath_dev_err(dd, "Couldn't find HyperTransport "
+			      "capability; no interrupts\n");
+		ret = -ENODEV;
+		goto bail;
+	}
+	do {
+		u8 cap_type;
+
+		/*
+		 * The HT capability type byte is 3 bytes after the
+		 * capability byte.
+		 */
+		if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
+			dev_info(&pdev->dev, "Couldn't read config "
+				 "command @ %d\n", pos);
+			continue;
+		}
+		if (!(cap_type & 0xE0))
+			slave_or_pri_blk(dd, pdev, pos, cap_type);
+	} while ((pos = pci_find_next_capability(pdev, pos,
+						 PCI_CAP_ID_HT)));
+
+	dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff
+ * @dd: the infinipath device
+ *
+ * Called during driver unload.
+ * This is currently a nop for the HT chip, not for all chips
+ */
+static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
+{
+}
+
+/**
+ * ipath_setup_ht_setextled - set the state of the two external LEDs
+ * @dd: the infinipath device
+ * @lst: the L state
+ * @ltst: the LT state
+ *
+ * Set the state of the two external LEDs, to indicate physical and
+ * logical state of IB link.   For this chip (at least with recommended
+ * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
+ * (logical state)
+ *
+ * Note:  We try to match the Mellanox HCA LED behavior as best
+ * we can.  Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate.  That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
+				     u64 lst, u64 ltst)
+{
+	u64 extctl;
+	unsigned long flags = 0;
+
+	/* the diags use the LED to indicate diag info, so we leave
+	 * the external LED alone when the diags are running */
+	if (ipath_diag_inuse)
+		return;
+
+	/* Allow override of LED display for, e.g. Locating system in rack */
+	if (dd->ipath_led_override) {
+		ltst = (dd->ipath_led_override & IPATH_LED_PHYS)
+			? INFINIPATH_IBCS_LT_STATE_LINKUP
+			: INFINIPATH_IBCS_LT_STATE_DISABLED;
+		lst = (dd->ipath_led_override & IPATH_LED_LOG)
+			? INFINIPATH_IBCS_L_STATE_ACTIVE
+			: INFINIPATH_IBCS_L_STATE_DOWN;
+	}
+
+	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+	/*
+	 * start by setting both LED control bits to off, then turn
+	 * on the appropriate bit(s).
+	 */
+	if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
+		/*
+		 * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
+		 * is inverted,  because it is normally used to indicate
+		 * a hardware fault at reset, if there were errors
+		 */
+		extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
+			| INFINIPATH_EXTC_LEDGBLERR_OFF;
+		if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+			extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
+		if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+			extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
+	}
+	else {
+		extctl = dd->ipath_extctrl &
+			~(INFINIPATH_EXTC_LED1PRIPORT_ON |
+			  INFINIPATH_EXTC_LED2PRIPORT_ON);
+		if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+			extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
+		if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+			extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
+	}
+	dd->ipath_extctrl = extctl;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
+	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+}
+
+static void ipath_init_ht_variables(struct ipath_devdata *dd)
+{
+	/*
+	 * setup the register offsets, since they are different for each
+	 * chip
+	 */
+	dd->ipath_kregs = &ipath_ht_kregs;
+	dd->ipath_cregs = &ipath_ht_cregs;
+
+	dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+	dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+	dd->ipath_gpio_sda = IPATH_GPIO_SDA;
+	dd->ipath_gpio_scl = IPATH_GPIO_SCL;
+
+	/*
+	 * Fill in data for field-values that change in newer chips.
+	 * We dynamically specify only the mask for LINKTRAININGSTATE
+	 * and only the shift for LINKSTATE, as they are the only ones
+	 * that change.  Also precalculate the 3 link states of interest
+	 * and the combined mask.
+	 */
+	dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
+	dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
+	dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
+		dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
+	dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
+	dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
+	dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
+
+	/*
+	 * Fill in data for ibcc field-values that change in newer chips.
+	 * We dynamically specify only the mask for LINKINITCMD
+	 * and only the shift for LINKCMD and MAXPKTLEN, as they are
+	 * the only ones that change.
+	 */
+	dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
+	dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
+	dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+
+	/* Fill in shifts for RcvCtrl. */
+	dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
+	dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
+	dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
+	dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
+
+	dd->ipath_i_bitsextant =
+		(INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
+		(INFINIPATH_I_RCVAVAIL_MASK <<
+		 INFINIPATH_I_RCVAVAIL_SHIFT) |
+		INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
+		INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
+
+	dd->ipath_e_bitsextant =
+		INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
+		INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
+		INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
+		INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
+		INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
+		INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
+		INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+		INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
+		INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
+		INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
+		INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
+		INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
+		INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
+		INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
+		INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
+		INFINIPATH_E_HARDWARE;
+
+	dd->ipath_hwe_bitsextant =
+		(INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
+		 INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
+		(INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+		 INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
+		(INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+		 INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+		INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
+		INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
+		INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
+		INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
+		INFINIPATH_HWE_HTCMISCERR4 |
+		INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
+		INFINIPATH_HWE_HTCMISCERR7 |
+		INFINIPATH_HWE_HTCBUSTREQPARITYERR |
+		INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
+		INFINIPATH_HWE_HTCBUSIREQPARITYERR |
+		INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
+		INFINIPATH_HWE_MEMBISTFAILED |
+		INFINIPATH_HWE_COREPLL_FBSLIP |
+		INFINIPATH_HWE_COREPLL_RFSLIP |
+		INFINIPATH_HWE_HTBPLL_FBSLIP |
+		INFINIPATH_HWE_HTBPLL_RFSLIP |
+		INFINIPATH_HWE_HTAPLL_FBSLIP |
+		INFINIPATH_HWE_HTAPLL_RFSLIP |
+		INFINIPATH_HWE_SERDESPLLFAILED |
+		INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
+		INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
+
+	dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+	dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+	dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
+	dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
+
+	/*
+	 * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+	 * 2 is Some Misc, 3 is reserved for future.
+	 */
+	dd->ipath_eep_st_masks[0].hwerrs_to_log =
+		INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+		INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
+
+	dd->ipath_eep_st_masks[1].hwerrs_to_log =
+		INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+		INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
+
+	dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
+
+	dd->delay_mult = 2; /* SDR, 4X, can't change */
+
+	dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+	dd->ipath_link_speed_supported = IPATH_IB_SDR;
+	dd->ipath_link_width_enabled = IB_WIDTH_4X;
+	dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
+	/* these can't change for this chip, so set once */
+	dd->ipath_link_width_active = dd->ipath_link_width_enabled;
+	dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
+}
+
+/**
+ * ipath_ht_init_hwerrors - enable hardware errors
+ * @dd: the infinipath device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those errors bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
+{
+	ipath_err_t val;
+	u64 extsval;
+
+	extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+
+	if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
+		ipath_dev_err(dd, "MemBIST did not complete!\n");
+	if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
+		ipath_dbg("MemBIST corrected\n");
+
+	ipath_check_htlink(dd);
+
+	/* barring bugs, all hwerrors become interrupts, which can */
+	val = -1LL;
+	/* don't look at crc lane1 if 8 bit */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+		val &= ~infinipath_hwe_htclnkabyte1crcerr;
+	/* don't look at crc lane1 if 8 bit */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+		val &= ~infinipath_hwe_htclnkbbyte1crcerr;
+
+	/*
+	 * disable RXDSYNCMEMPARITY because external serdes is unused,
+	 * and therefore the logic will never be used or initialized,
+	 * and uninitialized state will normally result in this error
+	 * being asserted.  Similarly for the external serdess pll
+	 * lock signal.
+	 */
+	val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
+		 INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
+
+	/*
+	 * Disable MISCERR4 because of an inversion in the HT core
+	 * logic checking for errors that cause this bit to be set.
+	 * The errata can also cause the protocol error bit to be set
+	 * in the HT config space linkerror register(s).
+	 */
+	val &= ~INFINIPATH_HWE_HTCMISCERR4;
+
+	/*
+	 * PLL ignored because unused MDIO interface has a logic problem
+	 */
+	if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
+		val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+	dd->ipath_hwerrmask = val;
+}
+
+
+
+
+/**
+ * ipath_ht_bringup_serdes - bring up the serdes
+ * @dd: the infinipath device
+ */
+static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
+{
+	u64 val, config1;
+	int ret = 0, change = 0;
+
+	ipath_dbg("Trying to bringup serdes\n");
+
+	if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
+	    INFINIPATH_HWE_SERDESPLLFAILED)
+	{
+		ipath_dbg("At start, serdes PLL failed bit set in "
+			  "hwerrstatus, clearing and continuing\n");
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+				 INFINIPATH_HWE_SERDESPLLFAILED);
+	}
+
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+	config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
+
+	ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
+		   "config1=%llx, sstatus=%llx xgxs %llx\n",
+		   (unsigned long long) val, (unsigned long long) config1,
+		   (unsigned long long)
+		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+		   (unsigned long long)
+		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+	/* force reset on */
+	val |= INFINIPATH_SERDC0_RESET_PLL
+		/* | INFINIPATH_SERDC0_RESET_MASK */
+		;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+	udelay(15);		/* need pll reset set at least for a bit */
+
+	if (val & INFINIPATH_SERDC0_RESET_PLL) {
+		u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
+		/* set lane resets, and tx idle, during pll reset */
+		val2 |= INFINIPATH_SERDC0_RESET_MASK |
+			INFINIPATH_SERDC0_TXIDLE;
+		ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
+			   "%llx)\n", (unsigned long long) val2);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+				 val2);
+		/*
+		 * be sure chip saw it
+		 */
+		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		/*
+		 * need pll reset clear at least 11 usec before lane
+		 * resets cleared; give it a few more
+		 */
+		udelay(15);
+		val = val2;	/* for check below */
+	}
+
+	if (val & (INFINIPATH_SERDC0_RESET_PLL |
+		   INFINIPATH_SERDC0_RESET_MASK |
+		   INFINIPATH_SERDC0_TXIDLE)) {
+		val &= ~(INFINIPATH_SERDC0_RESET_PLL |
+			 INFINIPATH_SERDC0_RESET_MASK |
+			 INFINIPATH_SERDC0_TXIDLE);
+		/* clear them */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+				 val);
+	}
+
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+	if (val & INFINIPATH_XGXS_RESET) {
+		/* normally true after boot */
+		val &= ~INFINIPATH_XGXS_RESET;
+		change = 1;
+	}
+	if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
+	     INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) {
+		/* need to compensate for Tx inversion in partner */
+		val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+		         INFINIPATH_XGXS_RX_POL_SHIFT);
+		val |= dd->ipath_rx_pol_inv <<
+			INFINIPATH_XGXS_RX_POL_SHIFT;
+		change = 1;
+	}
+	if (change)
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+	/* clear current and de-emphasis bits */
+	config1 &= ~0x0ffffffff00ULL;
+	/* set current to 20ma */
+	config1 |= 0x00000000000ULL;
+	/* set de-emphasis to -5.68dB */
+	config1 |= 0x0cccc000000ULL;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
+
+	ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
+		   "config1=%llx, sstatus=%llx xgxs %llx\n",
+		   (unsigned long long) val, (unsigned long long) config1,
+		   (unsigned long long)
+		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+		   (unsigned long long)
+		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+	return ret;		/* for now, say we always succeeded */
+}
+
+/**
+ * ipath_ht_quiet_serdes - set serdes to txidle
+ * @dd: the infinipath device
+ * driver is being unloaded
+ */
+static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
+{
+	u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+	val |= INFINIPATH_SERDC0_TXIDLE;
+	ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
+		  (unsigned long long) val);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+}
+
+/**
+ * ipath_pe_put_tid - write a TID in chip
+ * @dd: the infinipath device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected
+ * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used for both the full cleanup on exit, as well as the normal
+ * setup and teardown.
+ */
+static void ipath_ht_put_tid(struct ipath_devdata *dd,
+			     u64 __iomem *tidptr, u32 type,
+			     unsigned long pa)
+{
+	if (!dd->ipath_kregbase)
+		return;
+
+	if (pa != dd->ipath_tidinvalid) {
+		if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
+			dev_info(&dd->pcidev->dev,
+				 "physaddr %lx has more than "
+				 "40 bits, using only 40!!!\n", pa);
+			pa &= INFINIPATH_RT_ADDR_MASK;
+		}
+		if (type == RCVHQ_RCV_TYPE_EAGER)
+			pa |= dd->ipath_tidtemplate;
+		else {
+			/* in words (fixed, full page).  */
+			u64 lenvalid = PAGE_SIZE >> 2;
+			lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+			pa |= lenvalid | INFINIPATH_RT_VALID;
+		}
+	}
+
+	writeq(pa, tidptr);
+}
+
+
+/**
+ * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager
+ * @dd: the infinipath device
+ * @port: the port
+ *
+ * Used from ipath_close(), and at chip initialization.
+ */
+static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
+{
+	u64 __iomem *tidbase;
+	int i;
+
+	if (!dd->ipath_kregbase)
+		return;
+
+	ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
+
+	/*
+	 * need to invalidate all of the expected TID entries for this
+	 * port, so we don't have valid entries that might somehow get
+	 * used (early in next use of this port, or through some bug)
+	 */
+	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+				   dd->ipath_rcvtidbase +
+				   port * dd->ipath_rcvtidcnt *
+				   sizeof(*tidbase));
+	for (i = 0; i < dd->ipath_rcvtidcnt; i++)
+		ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+				 dd->ipath_tidinvalid);
+
+	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+				   dd->ipath_rcvegrbase +
+				   port * dd->ipath_rcvegrcnt *
+				   sizeof(*tidbase));
+
+	for (i = 0; i < dd->ipath_rcvegrcnt; i++)
+		ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+				 dd->ipath_tidinvalid);
+}
+
+/**
+ * ipath_ht_tidtemplate - setup constants for TID updates
+ * @dd: the infinipath device
+ *
+ * We setup stuff that we use a lot, to avoid calculating each time
+ */
+static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
+{
+	dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
+	dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+	dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
+
+	/*
+	 * work around chip errata bug 7358, by marking invalid tids
+	 * as having max length
+	 */
+	dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
+		INFINIPATH_RT_BUFSIZE_SHIFT;
+}
+
+static int ipath_ht_early_init(struct ipath_devdata *dd)
+{
+	u32 __iomem *piobuf;
+	u32 pioincr, val32;
+	int i;
+
+	/*
+	 * one cache line; long IB headers will spill over into received
+	 * buffer
+	 */
+	dd->ipath_rcvhdrentsize = 16;
+	dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+
+	/*
+	 * For HT, we allocate a somewhat overly large eager buffer,
+	 * such that we can guarantee that we can receive the largest
+	 * packet that we can send out.  To truly support a 4KB MTU,
+	 * we need to bump this to a large value.  To date, other than
+	 * testing, we have never encountered an HCA that can really
+	 * send 4KB MTU packets, so we do not handle that (we'll get
+	 * errors interrupts if we ever see one).
+	 */
+	dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
+
+	/*
+	 * the min() check here is currently a nop, but it may not
+	 * always be, depending on just how we do ipath_rcvegrbufsize
+	 */
+	dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
+				 dd->ipath_rcvegrbufsize);
+	dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+	ipath_ht_tidtemplate(dd);
+
+	/*
+	 * zero all the TID entries at startup.  We do this for sanity,
+	 * in case of a previous driver crash of some kind, and also
+	 * because the chip powers up with these memories in an unknown
+	 * state.  Use portcnt, not cfgports, since this is for the
+	 * full chip, not for current (possibly different) configuration
+	 * value.
+	 * Chip Errata bug 6447
+	 */
+	for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
+		ipath_ht_clear_tids(dd, val32);
+
+	/*
+	 * write the pbc of each buffer, to be sure it's initialized, then
+	 * cancel all the buffers, and also abort any packets that might
+	 * have been in flight for some reason (the latter is for driver
+	 * unload/reload, but isn't a bad idea at first init).	PIO send
+	 * isn't enabled at this point, so there is no danger of sending
+	 * these out on the wire.
+	 * Chip Errata bug 6610
+	 */
+	piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
+				  dd->ipath_piobufbase);
+	pioincr = dd->ipath_palign / sizeof(*piobuf);
+	for (i = 0; i < dd->ipath_piobcnt2k; i++) {
+		/*
+		 * reasonable word count, just to init pbc
+		 */
+		writel(16, piobuf);
+		piobuf += pioincr;
+	}
+
+	ipath_get_eeprom_info(dd);
+	if (dd->ipath_boardrev == 5) {
+		/*
+		 * Later production QHT7040 has same changes as QHT7140, so
+		 * can use GPIO interrupts.  They have serial #'s starting
+		 * with 128, rather than 112.
+		 */
+		if (dd->ipath_serial[0] == '1' &&
+		    dd->ipath_serial[1] == '2' &&
+		    dd->ipath_serial[2] == '8')
+			dd->ipath_flags |= IPATH_GPIO_INTR;
+		else {
+			ipath_dev_err(dd, "Unsupported InfiniPath board "
+				"(serial number %.16s)!\n",
+				dd->ipath_serial);
+			return 1;
+		}
+	}
+
+	if (dd->ipath_minrev >= 4) {
+		/* Rev4+ reports extra errors via internal GPIO pins */
+		dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
+		dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+				 dd->ipath_gpio_mask);
+	}
+
+	return 0;
+}
+
+
+/**
+ * ipath_init_ht_get_base_info - set chip-specific flags for user code
+ * @dd: the infinipath device
+ * @kbase: ipath_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
+{
+	struct ipath_base_info *kinfo = kbase;
+
+	kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
+		IPATH_RUNTIME_PIO_REGSWAPPED;
+
+	if (pd->port_dd->ipath_minrev < 4)
+		kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
+
+	return 0;
+}
+
+static void ipath_ht_free_irq(struct ipath_devdata *dd)
+{
+	free_irq(dd->ipath_irq, dd);
+	ht_destroy_irq(dd->ipath_irq);
+	dd->ipath_irq = 0;
+	dd->ipath_intconfig = 0;
+}
+
+static struct ipath_message_header *
+ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
+{
+	return (struct ipath_message_header *)
+		&rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
+{
+	dd->ipath_portcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+	dd->ipath_p0_rcvegrcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+}
+
+static void ipath_ht_read_counters(struct ipath_devdata *dd,
+				   struct infinipath_counters *cntrs)
+{
+	cntrs->LBIntCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
+	cntrs->LBFlowStallCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
+	cntrs->TxSDmaDescCnt = 0;
+	cntrs->TxUnsupVLErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
+	cntrs->TxDataPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
+	cntrs->TxFlowPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
+	cntrs->TxDwordCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
+	cntrs->TxLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
+	cntrs->TxMaxMinLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
+	cntrs->TxUnderrunCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
+	cntrs->TxFlowStallCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
+	cntrs->TxDroppedPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
+	cntrs->RxDroppedPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
+	cntrs->RxDataPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
+	cntrs->RxFlowPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
+	cntrs->RxDwordCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
+	cntrs->RxLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
+	cntrs->RxMaxMinLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
+	cntrs->RxICRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
+	cntrs->RxVCRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
+	cntrs->RxFlowCtrlErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
+	cntrs->RxBadFormatCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
+	cntrs->RxLinkProblemCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
+	cntrs->RxEBPCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
+	cntrs->RxLPCRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
+	cntrs->RxBufOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
+	cntrs->RxTIDFullErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
+	cntrs->RxTIDValidErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
+	cntrs->RxPKeyMismatchCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
+	cntrs->RxP0HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
+	cntrs->RxP1HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
+	cntrs->RxP2HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
+	cntrs->RxP3HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
+	cntrs->RxP4HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
+	cntrs->RxP5HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
+	cntrs->RxP6HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
+	cntrs->RxP7HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
+	cntrs->RxP8HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
+	cntrs->RxP9HdrEgrOvflCnt = 0;
+	cntrs->RxP10HdrEgrOvflCnt = 0;
+	cntrs->RxP11HdrEgrOvflCnt = 0;
+	cntrs->RxP12HdrEgrOvflCnt = 0;
+	cntrs->RxP13HdrEgrOvflCnt = 0;
+	cntrs->RxP14HdrEgrOvflCnt = 0;
+	cntrs->RxP15HdrEgrOvflCnt = 0;
+	cntrs->RxP16HdrEgrOvflCnt = 0;
+	cntrs->IBStatusChangeCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
+	cntrs->IBLinkErrRecoveryCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
+	cntrs->IBLinkDownedCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
+	cntrs->IBSymbolErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
+	cntrs->RxVL15DroppedPktCnt = 0;
+	cntrs->RxOtherLocalPhyErrCnt = 0;
+	cntrs->PcieRetryBufDiagQwordCnt = 0;
+	cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
+	cntrs->LocalLinkIntegrityErrCnt =
+		(dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+		dd->ipath_lli_errs : dd->ipath_lli_errors;
+	cntrs->RxVlErrCnt = 0;
+	cntrs->RxDlidFltrCnt = 0;
+}
+
+
+/* no interrupt fallback for these chips */
+static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
+{
+	return 0;
+}
+
+
+/*
+ * reset the XGXS (between serdes and IBC).  Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining.  To do this right, we reset IBC
+ * as well.
+ */
+static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
+{
+	u64 val, prev_val;
+
+	prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+	val = prev_val | INFINIPATH_XGXS_RESET;
+	prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+	ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control);
+}
+
+
+static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
+{
+	int ret;
+
+	switch (which) {
+	case IPATH_IB_CFG_LWID:
+		ret = dd->ipath_link_width_active;
+		break;
+	case IPATH_IB_CFG_SPD:
+		ret = dd->ipath_link_speed_active;
+		break;
+	case IPATH_IB_CFG_LWID_ENB:
+		ret = dd->ipath_link_width_enabled;
+		break;
+	case IPATH_IB_CFG_SPD_ENB:
+		ret = dd->ipath_link_speed_enabled;
+		break;
+	default:
+		ret =  -ENOTSUPP;
+		break;
+	}
+	return ret;
+}
+
+
+/* we assume range checking is already done, if needed */
+static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
+{
+	int ret = 0;
+
+	if (which == IPATH_IB_CFG_LWID_ENB)
+		dd->ipath_link_width_enabled = val;
+	else if (which == IPATH_IB_CFG_SPD_ENB)
+		dd->ipath_link_speed_enabled = val;
+	else
+		ret = -ENOTSUPP;
+	return ret;
+}
+
+
+static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
+{
+}
+
+
+static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
+{
+	ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
+		ipath_ib_linktrstate(dd, ibcs));
+	return 0;
+}
+
+
+/**
+ * ipath_init_iba6110_funcs - set up the chip-specific function pointers
+ * @dd: the infinipath device
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
+{
+	dd->ipath_f_intrsetup = ipath_ht_intconfig;
+	dd->ipath_f_bus = ipath_setup_ht_config;
+	dd->ipath_f_reset = ipath_setup_ht_reset;
+	dd->ipath_f_get_boardname = ipath_ht_boardname;
+	dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
+	dd->ipath_f_early_init = ipath_ht_early_init;
+	dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
+	dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
+	dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
+	dd->ipath_f_clear_tids = ipath_ht_clear_tids;
+	dd->ipath_f_put_tid = ipath_ht_put_tid;
+	dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
+	dd->ipath_f_setextled = ipath_setup_ht_setextled;
+	dd->ipath_f_get_base_info = ipath_ht_get_base_info;
+	dd->ipath_f_free_irq = ipath_ht_free_irq;
+	dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
+	dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
+	dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
+	dd->ipath_f_config_ports = ipath_ht_config_ports;
+	dd->ipath_f_read_counters = ipath_ht_read_counters;
+	dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
+	dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
+	dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
+	dd->ipath_f_config_jint = ipath_ht_config_jint;
+	dd->ipath_f_ib_updown = ipath_ht_ib_updown;
+
+	/*
+	 * initialize chip-specific variables
+	 */
+	ipath_init_ht_variables(dd);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_init_chip.c b/drivers/staging/rdma/ipath/ipath_init_chip.c
new file mode 100644
index 0000000..be2a60e
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_init_chip.c
@@ -0,0 +1,1066 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+/*
+ * min buffers we want to have per port, after driver
+ */
+#define IPATH_MIN_USER_PORT_BUFCNT 7
+
+/*
+ * Number of ports we are configured to use (to allow for more pio
+ * buffers per port, etc.)  Zero means use chip value.
+ */
+static ushort ipath_cfgports;
+
+module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
+
+/*
+ * Number of buffers reserved for driver (verbs and layered drivers.)
+ * Initialized based on number of PIO buffers if not set via module interface.
+ * The problem with this is that it's global, but we'll use different
+ * numbers for different chip types.
+ */
+static ushort ipath_kpiobufs;
+
+static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
+
+module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort,
+		  &ipath_kpiobufs, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
+
+/**
+ * create_port0_egr - allocate the eager TID buffers
+ * @dd: the infinipath device
+ *
+ * This code is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance.
+ * This is the kernel (port0) version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * We use the network layer alloc_skb() allocator to allocate the
+ * memory, and either use the buffers as is for things like verbs
+ * packets, or pass the buffers up to the ipath layered driver and
+ * thence the network layer, replacing them as we do so (see
+ * ipath_rcv_layer()).
+ */
+static int create_port0_egr(struct ipath_devdata *dd)
+{
+	unsigned e, egrcnt;
+	struct ipath_skbinfo *skbinfo;
+	int ret;
+
+	egrcnt = dd->ipath_p0_rcvegrcnt;
+
+	skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
+	if (skbinfo == NULL) {
+		ipath_dev_err(dd, "allocation error for eager TID "
+			      "skb array\n");
+		ret = -ENOMEM;
+		goto bail;
+	}
+	for (e = 0; e < egrcnt; e++) {
+		/*
+		 * This is a bit tricky in that we allocate extra
+		 * space for 2 bytes of the 14 byte ethernet header.
+		 * These two bytes are passed in the ipath header so
+		 * the rest of the data is word aligned.  We allocate
+		 * 4 bytes so that the data buffer stays word aligned.
+		 * See ipath_kreceive() for more details.
+		 */
+		skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
+		if (!skbinfo[e].skb) {
+			ipath_dev_err(dd, "SKB allocation error for "
+				      "eager TID %u\n", e);
+			while (e != 0)
+				dev_kfree_skb(skbinfo[--e].skb);
+			vfree(skbinfo);
+			ret = -ENOMEM;
+			goto bail;
+		}
+	}
+	/*
+	 * After loop above, so we can test non-NULL to see if ready
+	 * to use at receive, etc.
+	 */
+	dd->ipath_port0_skbinfo = skbinfo;
+
+	for (e = 0; e < egrcnt; e++) {
+		dd->ipath_port0_skbinfo[e].phys =
+		  ipath_map_single(dd->pcidev,
+				   dd->ipath_port0_skbinfo[e].skb->data,
+				   dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
+		dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
+				    ((char __iomem *) dd->ipath_kregbase +
+				     dd->ipath_rcvegrbase),
+				    RCVHQ_RCV_TYPE_EAGER,
+				    dd->ipath_port0_skbinfo[e].phys);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int bringup_link(struct ipath_devdata *dd)
+{
+	u64 val, ibc;
+	int ret = 0;
+
+	/* hold IBC in reset */
+	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control);
+
+	/*
+	 * set initial max size pkt IBC will send, including ICRC; it's the
+	 * PIO buffer size in dwords, less 1; also see ipath_set_mtu()
+	 */
+	val = (dd->ipath_ibmaxlen >> 2) + 1;
+	ibc = val << dd->ibcc_mpl_shift;
+
+	/* flowcontrolwatermark is in units of KBytes */
+	ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
+	/*
+	 * How often flowctrl sent.  More or less in usecs; balance against
+	 * watermark value, so that in theory senders always get a flow
+	 * control update in time to not let the IB link go idle.
+	 */
+	ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
+	/* max error tolerance */
+	ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+	/* use "real" buffer space for */
+	ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
+	/* IB credit flow control. */
+	ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+	/* initially come up waiting for TS1, without sending anything. */
+	dd->ipath_ibcctrl = ibc;
+	/*
+	 * Want to start out with both LINKCMD and LINKINITCMD in NOP
+	 * (0 and 0).  Don't put linkinitcmd in ipath_ibcctrl, want that
+	 * to stay a NOP. Flag that we are disabled, for the (unlikely)
+	 * case that some recovery path is trying to bring the link up
+	 * before we are ready.
+	 */
+	ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
+		INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+	dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+	ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
+		   (unsigned long long) ibc);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
+
+	// be sure chip saw it
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+	ret = dd->ipath_f_bringup_serdes(dd);
+
+	if (ret)
+		dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
+			 "not usable\n");
+	else {
+		/* enable IBC */
+		dd->ipath_control |= INFINIPATH_C_LINKENABLE;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+				 dd->ipath_control);
+	}
+
+	return ret;
+}
+
+static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
+{
+	struct ipath_portdata *pd = NULL;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (pd) {
+		pd->port_dd = dd;
+		pd->port_cnt = 1;
+		/* The port 0 pkey table is used by the layer interface. */
+		pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY;
+		pd->port_seq_cnt = 1;
+	}
+	return pd;
+}
+
+static int init_chip_first(struct ipath_devdata *dd)
+{
+	struct ipath_portdata *pd;
+	int ret = 0;
+	u64 val;
+
+	spin_lock_init(&dd->ipath_kernel_tid_lock);
+	spin_lock_init(&dd->ipath_user_tid_lock);
+	spin_lock_init(&dd->ipath_sendctrl_lock);
+	spin_lock_init(&dd->ipath_uctxt_lock);
+	spin_lock_init(&dd->ipath_sdma_lock);
+	spin_lock_init(&dd->ipath_gpio_lock);
+	spin_lock_init(&dd->ipath_eep_st_lock);
+	spin_lock_init(&dd->ipath_sdepb_lock);
+	mutex_init(&dd->ipath_eep_lock);
+
+	/*
+	 * skip cfgports stuff because we are not allocating memory,
+	 * and we don't want problems if the portcnt changed due to
+	 * cfgports.  We do still check and report a difference, if
+	 * not same (should be impossible).
+	 */
+	dd->ipath_f_config_ports(dd, ipath_cfgports);
+	if (!ipath_cfgports)
+		dd->ipath_cfgports = dd->ipath_portcnt;
+	else if (ipath_cfgports <= dd->ipath_portcnt) {
+		dd->ipath_cfgports = ipath_cfgports;
+		ipath_dbg("Configured to use %u ports out of %u in chip\n",
+			  dd->ipath_cfgports, ipath_read_kreg32(dd,
+			  dd->ipath_kregs->kr_portcnt));
+	} else {
+		dd->ipath_cfgports = dd->ipath_portcnt;
+		ipath_dbg("Tried to configured to use %u ports; chip "
+			  "only supports %u\n", ipath_cfgports,
+			  ipath_read_kreg32(dd,
+				  dd->ipath_kregs->kr_portcnt));
+	}
+	/*
+	 * Allocate full portcnt array, rather than just cfgports, because
+	 * cleanup iterates across all possible ports.
+	 */
+	dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt,
+			       GFP_KERNEL);
+
+	if (!dd->ipath_pd) {
+		ipath_dev_err(dd, "Unable to allocate portdata array, "
+			      "failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	pd = create_portdata0(dd);
+	if (!pd) {
+		ipath_dev_err(dd, "Unable to allocate portdata for port "
+			      "0, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	dd->ipath_pd[0] = pd;
+
+	dd->ipath_rcvtidcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+	dd->ipath_rcvtidbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+	dd->ipath_rcvegrcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+	dd->ipath_rcvegrbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+	dd->ipath_palign =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+	dd->ipath_piobufbase =
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
+	dd->ipath_piosize2k = val & ~0U;
+	dd->ipath_piosize4k = val >> 32;
+	if (dd->ipath_piosize4k == 0 && ipath_mtu4096)
+		ipath_mtu4096 = 0; /* 4KB not supported by this chip */
+	dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048;
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
+	dd->ipath_piobcnt2k = val & ~0U;
+	dd->ipath_piobcnt4k = val >> 32;
+	dd->ipath_pio2kbase =
+		(u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+				 (dd->ipath_piobufbase & 0xffffffff));
+	if (dd->ipath_piobcnt4k) {
+		dd->ipath_pio4kbase = (u32 __iomem *)
+			(((char __iomem *) dd->ipath_kregbase) +
+			 (dd->ipath_piobufbase >> 32));
+		/*
+		 * 4K buffers take 2 pages; we use roundup just to be
+		 * paranoid; we calculate it once here, rather than on
+		 * ever buf allocate
+		 */
+		dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
+					  dd->ipath_palign);
+		ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
+			  "(%x aligned)\n",
+			  dd->ipath_piobcnt2k, dd->ipath_piosize2k,
+			  dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
+			  dd->ipath_piosize4k, dd->ipath_pio4kbase,
+			  dd->ipath_4kalign);
+	}
+	else ipath_dbg("%u 2k piobufs @ %p\n",
+		       dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
+
+done:
+	return ret;
+}
+
+/**
+ * init_chip_reset - re-initialize after a reset, or enable
+ * @dd: the infinipath device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed
+ */
+static int init_chip_reset(struct ipath_devdata *dd)
+{
+	u32 rtmp;
+	int i;
+	unsigned long flags;
+
+	/*
+	 * ensure chip does no sends or receives, tail updates, or
+	 * pioavail updates while we re-initialize
+	 */
+	dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift);
+	for (i = 0; i < dd->ipath_portcnt; i++) {
+		clear_bit(dd->ipath_r_portenable_shift + i,
+			  &dd->ipath_rcvctrl);
+		clear_bit(dd->ipath_r_intravail_shift + i,
+			  &dd->ipath_rcvctrl);
+	}
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+		dd->ipath_rcvctrl);
+
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl = 0U; /* no sdma, etc */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+	if (rtmp != dd->ipath_rcvtidcnt)
+		dev_info(&dd->pcidev->dev, "tidcnt was %u before "
+			 "reset, now %u, using original\n",
+			 dd->ipath_rcvtidcnt, rtmp);
+	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+	if (rtmp != dd->ipath_rcvtidbase)
+		dev_info(&dd->pcidev->dev, "tidbase was %u before "
+			 "reset, now %u, using original\n",
+			 dd->ipath_rcvtidbase, rtmp);
+	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+	if (rtmp != dd->ipath_rcvegrcnt)
+		dev_info(&dd->pcidev->dev, "egrcnt was %u before "
+			 "reset, now %u, using original\n",
+			 dd->ipath_rcvegrcnt, rtmp);
+	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+	if (rtmp != dd->ipath_rcvegrbase)
+		dev_info(&dd->pcidev->dev, "egrbase was %u before "
+			 "reset, now %u, using original\n",
+			 dd->ipath_rcvegrbase, rtmp);
+
+	return 0;
+}
+
+static int init_pioavailregs(struct ipath_devdata *dd)
+{
+	int ret;
+
+	dd->ipath_pioavailregs_dma = dma_alloc_coherent(
+		&dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
+		GFP_KERNEL);
+	if (!dd->ipath_pioavailregs_dma) {
+		ipath_dev_err(dd, "failed to allocate PIOavail reg area "
+			      "in memory\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * we really want L2 cache aligned, but for current CPUs of
+	 * interest, they are the same.
+	 */
+	dd->ipath_statusp = (u64 *)
+		((char *)dd->ipath_pioavailregs_dma +
+		 ((2 * L1_CACHE_BYTES +
+		   dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
+	/* copy the current value now that it's really allocated */
+	*dd->ipath_statusp = dd->_ipath_status;
+	/*
+	 * setup buffer to hold freeze msg, accessible to apps,
+	 * following statusp
+	 */
+	dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
+	/* and its length */
+	dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
+
+	ret = 0;
+
+done:
+	return ret;
+}
+
+/**
+ * init_shadow_tids - allocate the shadow TID array
+ * @dd: the infinipath device
+ *
+ * allocate the shadow TID array, so we can ipath_munlock previous
+ * entries.  It may make more sense to move the pageshadow to the
+ * port data structure, so we only allocate memory for ports actually
+ * in use, since we at 8k per port, now.
+ */
+static void init_shadow_tids(struct ipath_devdata *dd)
+{
+	struct page **pages;
+	dma_addr_t *addrs;
+
+	pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+			sizeof(struct page *));
+	if (!pages) {
+		ipath_dev_err(dd, "failed to allocate shadow page * "
+			      "array, no expected sends!\n");
+		dd->ipath_pageshadow = NULL;
+		return;
+	}
+
+	addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+			sizeof(dma_addr_t));
+	if (!addrs) {
+		ipath_dev_err(dd, "failed to allocate shadow dma handle "
+			      "array, no expected sends!\n");
+		vfree(pages);
+		dd->ipath_pageshadow = NULL;
+		return;
+	}
+
+	dd->ipath_pageshadow = pages;
+	dd->ipath_physshadow = addrs;
+}
+
+static void enable_chip(struct ipath_devdata *dd, int reinit)
+{
+	u32 val;
+	u64 rcvmask;
+	unsigned long flags;
+	int i;
+
+	if (!reinit)
+		init_waitqueue_head(&ipath_state_wait);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	/* Enable PIO send, and update of PIOavail regs to memory. */
+	dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
+		INFINIPATH_S_PIOBUFAVAILUPD;
+
+	/*
+	 * Set the PIO avail update threshold to host memory
+	 * on chips that support it.
+	 */
+	if (dd->ipath_pioupd_thresh)
+		dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+			<< INFINIPATH_S_UPDTHRESH_SHIFT;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	/*
+	 * Enable kernel ports' receive and receive interrupt.
+	 * Other ports done as user opens and inits them.
+	 */
+	rcvmask = 1ULL;
+	dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) |
+		(rcvmask << dd->ipath_r_intravail_shift);
+	if (!(dd->ipath_flags & IPATH_NODMA_RTAIL))
+		dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+
+	/*
+	 * now ready for use.  this should be cleared whenever we
+	 * detect a reset, or initiate one.
+	 */
+	dd->ipath_flags |= IPATH_INITTED;
+
+	/*
+	 * Init our shadow copies of head from tail values,
+	 * and write head values to match.
+	 */
+	val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
+	ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
+
+	/* Initialize so we interrupt on next packet received */
+	ipath_write_ureg(dd, ur_rcvhdrhead,
+			 dd->ipath_rhdrhead_intr_off |
+			 dd->ipath_pd[0]->port_head, 0);
+
+	/*
+	 * by now pioavail updates to memory should have occurred, so
+	 * copy them into our working/shadow registers; this is in
+	 * case something went wrong with abort, but mostly to get the
+	 * initial values of the generation bit correct.
+	 */
+	for (i = 0; i < dd->ipath_pioavregs; i++) {
+		__le64 pioavail;
+
+		/*
+		 * Chip Errata bug 6641; even and odd qwords>3 are swapped.
+		 */
+		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+			pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
+		else
+			pioavail = dd->ipath_pioavailregs_dma[i];
+		/*
+		 * don't need to worry about ipath_pioavailkernel here
+		 * because we will call ipath_chg_pioavailkernel() later
+		 * in initialization, to busy out buffers as needed
+		 */
+		dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
+	}
+	/* can get counters, stats, etc. */
+	dd->ipath_flags |= IPATH_PRESENT;
+}
+
+static int init_housekeeping(struct ipath_devdata *dd, int reinit)
+{
+	char boardn[40];
+	int ret = 0;
+
+	/*
+	 * have to clear shadow copies of registers at init that are
+	 * not otherwise set here, or all kinds of bizarre things
+	 * happen with driver on chip reset
+	 */
+	dd->ipath_rcvhdrsize = 0;
+
+	/*
+	 * Don't clear ipath_flags as 8bit mode was set before
+	 * entering this func. However, we do set the linkstate to
+	 * unknown, so we can watch for a transition.
+	 * PRESENT is set because we want register reads to work,
+	 * and the kernel infrastructure saw it in config space;
+	 * We clear it if we have failures.
+	 */
+	dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT;
+	dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
+			     IPATH_LINKDOWN | IPATH_LINKINIT);
+
+	ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
+	dd->ipath_revision =
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
+
+	/*
+	 * set up fundamental info we need to use the chip; we assume
+	 * if the revision reg and these regs are OK, we don't need to
+	 * special case the rest
+	 */
+	dd->ipath_sregbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
+	dd->ipath_cregbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
+	dd->ipath_uregbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
+	ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
+		   "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
+		   dd->ipath_uregbase, dd->ipath_cregbase);
+	if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
+	    || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
+	    || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
+	    || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
+		ipath_dev_err(dd, "Register read failures from chip, "
+			      "giving up initialization\n");
+		dd->ipath_flags &= ~IPATH_PRESENT;
+		ret = -ENODEV;
+		goto done;
+	}
+
+
+	/* clear diagctrl register, in case diags were running and crashed */
+	ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
+
+	/* clear the initial reset flag, in case first driver load */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+			 INFINIPATH_E_RESET);
+
+	ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n",
+		   (unsigned long long) dd->ipath_revision,
+		   dd->ipath_pcirev);
+
+	if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
+	     INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
+		ipath_dev_err(dd, "Driver only handles version %d, "
+			      "chip swversion is %d (%llx), failng\n",
+			      IPATH_CHIP_SWVERSION,
+			      (int)(dd->ipath_revision >>
+				    INFINIPATH_R_SOFTWARE_SHIFT) &
+			      INFINIPATH_R_SOFTWARE_MASK,
+			      (unsigned long long) dd->ipath_revision);
+		ret = -ENOSYS;
+		goto done;
+	}
+	dd->ipath_majrev = (u8) ((dd->ipath_revision >>
+				  INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+				 INFINIPATH_R_CHIPREVMAJOR_MASK);
+	dd->ipath_minrev = (u8) ((dd->ipath_revision >>
+				  INFINIPATH_R_CHIPREVMINOR_SHIFT) &
+				 INFINIPATH_R_CHIPREVMINOR_MASK);
+	dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
+				    INFINIPATH_R_BOARDID_SHIFT) &
+				   INFINIPATH_R_BOARDID_MASK);
+
+	ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
+
+	snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
+		 "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
+		 "SW Compat %u\n",
+		 IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
+		 (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
+		 INFINIPATH_R_ARCH_MASK,
+		 dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
+		 (unsigned)(dd->ipath_revision >>
+			    INFINIPATH_R_SOFTWARE_SHIFT) &
+		 INFINIPATH_R_SOFTWARE_MASK);
+
+	ipath_dbg("%s", dd->ipath_boardversion);
+
+	if (ret)
+		goto done;
+
+	if (reinit)
+		ret = init_chip_reset(dd);
+	else
+		ret = init_chip_first(dd);
+
+done:
+	return ret;
+}
+
+static void verify_interrupt(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+
+	if (!dd)
+		return; /* being torn down */
+
+	/*
+	 * If we don't have any interrupts, let the user know and
+	 * don't bother checking again.
+	 */
+	if (dd->ipath_int_counter == 0) {
+		if (!dd->ipath_f_intr_fallback(dd))
+			dev_err(&dd->pcidev->dev, "No interrupts detected, "
+				"not usable.\n");
+		else /* re-arm the timer to see if fallback works */
+			mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2);
+	} else
+		ipath_cdbg(VERBOSE, "%u interrupts at timer check\n",
+			dd->ipath_int_counter);
+}
+
+/**
+ * ipath_init_chip - do the actual initialization sequence on the chip
+ * @dd: the infinipath device
+ * @reinit: reinitializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int ipath_init_chip(struct ipath_devdata *dd, int reinit)
+{
+	int ret = 0;
+	u32 kpiobufs, defkbufs;
+	u32 piobufs, uports;
+	u64 val;
+	struct ipath_portdata *pd;
+	gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+
+	ret = init_housekeeping(dd, reinit);
+	if (ret)
+		goto done;
+
+	/*
+	 * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
+	 * but then it no longer nicely fits power of two, and since
+	 * we now use routines that backend onto __get_free_pages, the
+	 * rest would be wasted.
+	 */
+	dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
+			 dd->ipath_rcvhdrcnt);
+
+	/*
+	 * Set up the shadow copies of the piobufavail registers,
+	 * which we compare against the chip registers for now, and
+	 * the in memory DMA'ed copies of the registers.  This has to
+	 * be done early, before we calculate lastport, etc.
+	 */
+	piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+	/*
+	 * calc number of pioavail registers, and save it; we have 2
+	 * bits per buffer.
+	 */
+	dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
+		/ (sizeof(u64) * BITS_PER_BYTE / 2);
+	uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
+	if (piobufs > 144)
+		defkbufs = 32 + dd->ipath_pioreserved;
+	else
+		defkbufs = 16 + dd->ipath_pioreserved;
+
+	if (ipath_kpiobufs && (ipath_kpiobufs +
+		(uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
+		int i = (int) piobufs -
+			(int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
+		if (i < 1)
+			i = 1;
+		dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
+			 "%d for kernel leaves too few for %d user ports "
+			 "(%d each); using %u\n", ipath_kpiobufs,
+			 piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
+		/*
+		 * shouldn't change ipath_kpiobufs, because could be
+		 * different for different devices...
+		 */
+		kpiobufs = i;
+	} else if (ipath_kpiobufs)
+		kpiobufs = ipath_kpiobufs;
+	else
+		kpiobufs = defkbufs;
+	dd->ipath_lastport_piobuf = piobufs - kpiobufs;
+	dd->ipath_pbufsport =
+		uports ? dd->ipath_lastport_piobuf / uports : 0;
+	/* if not an even divisor, some user ports get extra buffers */
+	dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
+		(dd->ipath_pbufsport * uports);
+	if (dd->ipath_ports_extrabuf)
+		ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to "
+			"ports <= %u\n", dd->ipath_pbufsport,
+			dd->ipath_ports_extrabuf);
+	dd->ipath_lastpioindex = 0;
+	dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
+	/* ipath_pioavailshadow initialized earlier */
+	ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
+		   "each for %u user ports\n", kpiobufs,
+		   piobufs, dd->ipath_pbufsport, uports);
+	ret = dd->ipath_f_early_init(dd);
+	if (ret) {
+		ipath_dev_err(dd, "Early initialization failure\n");
+		goto done;
+	}
+
+	/*
+	 * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
+	 * done after early_init.
+	 */
+	dd->ipath_hdrqlast =
+		dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
+			 dd->ipath_rcvhdrentsize);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+			 dd->ipath_rcvhdrsize);
+
+	if (!reinit) {
+		ret = init_pioavailregs(dd);
+		init_shadow_tids(dd);
+		if (ret)
+			goto done;
+	}
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
+			 dd->ipath_pioavailregs_phys);
+
+	/*
+	 * this is to detect s/w errors, which the h/w works around by
+	 * ignoring the low 6 bits of address, if it wasn't aligned.
+	 */
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
+	if (val != dd->ipath_pioavailregs_phys) {
+		ipath_dev_err(dd, "Catastrophic software error, "
+			      "SendPIOAvailAddr written as %lx, "
+			      "read back as %llx\n",
+			      (unsigned long) dd->ipath_pioavailregs_phys,
+			      (unsigned long long) val);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
+
+	/*
+	 * make sure we are not in freeze, and PIO send enabled, so
+	 * writes to pbc happen
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+			 ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+	/*
+	 * before error clears, since we expect serdes pll errors during
+	 * this, the first time after reset
+	 */
+	if (bringup_link(dd)) {
+		dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n");
+		ret = -ENETDOWN;
+		goto done;
+	}
+
+	/*
+	 * clear any "expected" hwerrs from reset and/or initialization
+	 * clear any that aren't enabled (at least this once), and then
+	 * set the enable mask
+	 */
+	dd->ipath_f_init_hwerrors(dd);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+			 ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+			 dd->ipath_hwerrmask);
+
+	/* clear all */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+	/* enable errors that are masked, at least this first time. */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+			 ~dd->ipath_maskederrs);
+	dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */
+	dd->ipath_errormask =
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+	/* clear any interrupts up to this point (ints still not enabled) */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+	dd->ipath_f_tidtemplate(dd);
+
+	/*
+	 * Set up the port 0 (kernel) rcvhdr q and egr TIDs.  If doing
+	 * re-init, the simplest way to handle this is to free
+	 * existing, and re-allocate.
+	 * Need to re-create rest of port 0 portdata as well.
+	 */
+	pd = dd->ipath_pd[0];
+	if (reinit) {
+		struct ipath_portdata *npd;
+
+		/*
+		 * Alloc and init new ipath_portdata for port0,
+		 * Then free old pd. Could lead to fragmentation, but also
+		 * makes later support for hot-swap easier.
+		 */
+		npd = create_portdata0(dd);
+		if (npd) {
+			ipath_free_pddata(dd, pd);
+			dd->ipath_pd[0] = npd;
+			pd = npd;
+		} else {
+			ipath_dev_err(dd, "Unable to allocate portdata"
+				      " for port 0, failing\n");
+			ret = -ENOMEM;
+			goto done;
+		}
+	}
+	ret = ipath_create_rcvhdrq(dd, pd);
+	if (!ret)
+		ret = create_port0_egr(dd);
+	if (ret) {
+		ipath_dev_err(dd, "failed to allocate kernel port's "
+			      "rcvhdrq and/or egr bufs\n");
+		goto done;
+	}
+	else
+		enable_chip(dd, reinit);
+
+	/* after enable_chip, so pioavailshadow setup */
+	ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
+
+	/*
+	 * Cancel any possible active sends from early driver load.
+	 * Follows early_init because some chips have to initialize
+	 * PIO buffers in early_init to avoid false parity errors.
+	 * After enable and ipath_chg_pioavailkernel so we can safely
+	 * enable pioavail updates and PIOENABLE; packets are now
+	 * ready to go out.
+	 */
+	ipath_cancel_sends(dd, 1);
+
+	if (!reinit) {
+		/*
+		 * Used when we close a port, for DMA already in flight
+		 * at close.
+		 */
+		dd->ipath_dummy_hdrq = dma_alloc_coherent(
+			&dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size,
+			&dd->ipath_dummy_hdrq_phys,
+			gfp_flags);
+		if (!dd->ipath_dummy_hdrq) {
+			dev_info(&dd->pcidev->dev,
+				"Couldn't allocate 0x%lx bytes for dummy hdrq\n",
+				dd->ipath_pd[0]->port_rcvhdrq_size);
+			/* fallback to just 0'ing */
+			dd->ipath_dummy_hdrq_phys = 0UL;
+		}
+	}
+
+	/*
+	 * cause retrigger of pending interrupts ignored during init,
+	 * even if we had errors
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+
+	if (!dd->ipath_stats_timer_active) {
+		/*
+		 * first init, or after an admin disable/enable
+		 * set up stats retrieval timer, even if we had errors
+		 * in last portion of setup
+		 */
+		init_timer(&dd->ipath_stats_timer);
+		dd->ipath_stats_timer.function = ipath_get_faststats;
+		dd->ipath_stats_timer.data = (unsigned long) dd;
+		/* every 5 seconds; */
+		dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
+		/* takes ~16 seconds to overflow at full IB 4x bandwdith */
+		add_timer(&dd->ipath_stats_timer);
+		dd->ipath_stats_timer_active = 1;
+	}
+
+	/* Set up SendDMA if chip supports it */
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+		ret = setup_sdma(dd);
+
+	/* Set up HoL state */
+	init_timer(&dd->ipath_hol_timer);
+	dd->ipath_hol_timer.function = ipath_hol_event;
+	dd->ipath_hol_timer.data = (unsigned long)dd;
+	dd->ipath_hol_state = IPATH_HOL_UP;
+
+done:
+	if (!ret) {
+		*dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
+		if (!dd->ipath_f_intrsetup(dd)) {
+			/* now we can enable all interrupts from the chip */
+			ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+					 -1LL);
+			/* force re-interrupt of any pending interrupts. */
+			ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
+					 0ULL);
+			/* chip is usable; mark it as initialized */
+			*dd->ipath_statusp |= IPATH_STATUS_INITTED;
+
+			/*
+			 * setup to verify we get an interrupt, and fallback
+			 * to an alternate if necessary and possible
+			 */
+			if (!reinit) {
+				init_timer(&dd->ipath_intrchk_timer);
+				dd->ipath_intrchk_timer.function =
+					verify_interrupt;
+				dd->ipath_intrchk_timer.data =
+					(unsigned long) dd;
+			}
+			dd->ipath_intrchk_timer.expires = jiffies + HZ/2;
+			add_timer(&dd->ipath_intrchk_timer);
+		} else
+			ipath_dev_err(dd, "No interrupts enabled, couldn't "
+				      "setup interrupt address\n");
+
+		if (dd->ipath_cfgports > ipath_stats.sps_nports)
+			/*
+			 * sps_nports is a global, so, we set it to
+			 * the highest number of ports of any of the
+			 * chips we find; we never decrement it, at
+			 * least for now.  Since this might have changed
+			 * over disable/enable or prior to reset, always
+			 * do the check and potentially adjust.
+			 */
+			ipath_stats.sps_nports = dd->ipath_cfgports;
+	} else
+		ipath_dbg("Failed (%d) to initialize chip\n", ret);
+
+	/* if ret is non-zero, we probably should do some cleanup
+	   here... */
+	return ret;
+}
+
+static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
+{
+	struct ipath_devdata *dd;
+	unsigned long flags;
+	unsigned short val;
+	int ret;
+
+	ret = ipath_parse_ushort(str, &val);
+
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+
+	if (ret < 0)
+		goto bail;
+
+	if (val == 0) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+		if (dd->ipath_kregbase)
+			continue;
+		if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
+			   (dd->ipath_cfgports *
+			    IPATH_MIN_USER_PORT_BUFCNT)))
+		{
+			ipath_dev_err(
+				dd,
+				"Allocating %d PIO bufs for kernel leaves "
+				"too few for %d user ports (%d each)\n",
+				val, dd->ipath_cfgports - 1,
+				IPATH_MIN_USER_PORT_BUFCNT);
+			ret = -EINVAL;
+			goto bail;
+		}
+		dd->ipath_lastport_piobuf =
+			dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
+	}
+
+	ipath_kpiobufs = val;
+	ret = 0;
+bail:
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_intr.c b/drivers/staging/rdma/ipath/ipath_intr.c
new file mode 100644
index 0000000..01ba792
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_intr.c
@@ -0,0 +1,1273 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
+{
+	u32 piobcnt;
+	unsigned long sbuf[4];
+	/*
+	 * it's possible that sendbuffererror could have bits set; might
+	 * have already done this as a result of hardware error handling
+	 */
+	piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+	/* read these before writing errorclear */
+	sbuf[0] = ipath_read_kreg64(
+		dd, dd->ipath_kregs->kr_sendbuffererror);
+	sbuf[1] = ipath_read_kreg64(
+		dd, dd->ipath_kregs->kr_sendbuffererror + 1);
+	if (piobcnt > 128)
+		sbuf[2] = ipath_read_kreg64(
+			dd, dd->ipath_kregs->kr_sendbuffererror + 2);
+	if (piobcnt > 192)
+		sbuf[3] = ipath_read_kreg64(
+			dd, dd->ipath_kregs->kr_sendbuffererror + 3);
+	else
+		sbuf[3] = 0;
+
+	if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+		int i;
+		if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) &&
+			time_after(dd->ipath_lastcancel, jiffies)) {
+			__IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
+					  "SendbufErrs %lx %lx", sbuf[0],
+					  sbuf[1]);
+			if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+				printk(" %lx %lx ", sbuf[2], sbuf[3]);
+			printk("\n");
+		}
+
+		for (i = 0; i < piobcnt; i++)
+			if (test_bit(i, sbuf))
+				ipath_disarm_piobufs(dd, i, 1);
+		/* ignore armlaunch errs for a bit */
+		dd->ipath_lastcancel = jiffies+3;
+	}
+}
+
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS \
+	(INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
+	 INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
+	 INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
+	 INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+	 INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
+	 INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS \
+	(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
+	 INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+	 INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
+	 INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+	 INFINIPATH_E_INVALIDADDR)
+
+/*
+ * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore
+ * errors not related to freeze and cancelling buffers.  Can't ignore
+ * armlaunch because could get more while still cleaning up, and need
+ * to cancel those as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+	 (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+	 INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \
+	 INFINIPATH_E_SPKTLEN)
+
+/*
+ * these are errors that can occur when the link changes state while
+ * a packet is being sent or received.  This doesn't cover things
+ * like EBP or VCRC that can be the result of a sending having the
+ * link change state, so we receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS \
+	(INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+	 INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+	 INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+	 INFINIPATH_E_RUNEXPCHAR)
+
+static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
+{
+	u64 ignore_this_time = 0;
+
+	ipath_disarm_senderrbufs(dd);
+	if ((errs & E_SUM_LINK_PKTERRS) &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		/*
+		 * This can happen when SMA is trying to bring the link
+		 * up, but the IB link changes state at the "wrong" time.
+		 * The IB logic then complains that the packet isn't
+		 * valid.  We don't want to confuse people, so we just
+		 * don't print them, except at debug
+		 */
+		ipath_dbg("Ignoring packet errors %llx, because link not "
+			  "ACTIVE\n", (unsigned long long) errs);
+		ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+	}
+
+	return ignore_this_time;
+}
+
+/* generic hw error messages... */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
+	{ \
+		.mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a <<    \
+			  INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ),   \
+		.msg = "TXE " #a " Memory Parity"	     \
+	}
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
+	{ \
+		.mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a <<    \
+			  INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ),   \
+		.msg = "RXE " #a " Memory Parity"	     \
+	}
+
+static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
+	INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
+	INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
+
+	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
+	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
+	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
+
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
+};
+
+/**
+ * ipath_format_hwmsg - format a single hwerror message
+ * @msg message buffer
+ * @msgl length of message buffer
+ * @hwmsg message to add to message buffer
+ */
+static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+	strlcat(msg, "[", msgl);
+	strlcat(msg, hwmsg, msgl);
+	strlcat(msg, "]", msgl);
+}
+
+/**
+ * ipath_format_hwerrors - format hardware error messages for display
+ * @hwerrs hardware errors bit vector
+ * @hwerrmsgs hardware error descriptions
+ * @nhwerrmsgs number of hwerrmsgs
+ * @msg message buffer
+ * @msgl message buffer length
+ */
+void ipath_format_hwerrors(u64 hwerrs,
+			   const struct ipath_hwerror_msgs *hwerrmsgs,
+			   size_t nhwerrmsgs,
+			   char *msg, size_t msgl)
+{
+	int i;
+	const int glen =
+	    ARRAY_SIZE(ipath_generic_hwerror_msgs);
+
+	for (i=0; i<glen; i++) {
+		if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
+			ipath_format_hwmsg(msg, msgl,
+					   ipath_generic_hwerror_msgs[i].msg);
+		}
+	}
+
+	for (i=0; i<nhwerrmsgs; i++) {
+		if (hwerrs & hwerrmsgs[i].mask) {
+			ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+		}
+	}
+}
+
+/* return the strings for the most common link states */
+static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+	char *ret;
+	u32 state;
+
+	state = ipath_ib_state(dd, ibcs);
+	if (state == dd->ib_init)
+		ret = "Init";
+	else if (state == dd->ib_arm)
+		ret = "Arm";
+	else if (state == dd->ib_active)
+		ret = "Active";
+	else
+		ret = "Down";
+	return ret;
+}
+
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
+{
+	struct ib_event event;
+
+	event.device = &dd->verbs_dev->ibdev;
+	event.element.port_num = 1;
+	event.event = ev;
+	ib_dispatch_event(&event);
+}
+
+static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
+				     ipath_err_t errs)
+{
+	u32 ltstate, lstate, ibstate, lastlstate;
+	u32 init = dd->ib_init;
+	u32 arm = dd->ib_arm;
+	u32 active = dd->ib_active;
+	const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+
+	lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */
+	ibstate = ipath_ib_state(dd, ibcs);
+	/* linkstate at last interrupt */
+	lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+	ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */
+
+	/*
+	 * Since going into a recovery state causes the link state to go
+	 * down and since recovery is transitory, it is better if we "miss"
+	 * ever seeing the link training state go into recovery (i.e.,
+	 * ignore this transition for link state special handling purposes)
+	 * without even updating ipath_lastibcstat.
+	 */
+	if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) ||
+	    (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) ||
+	    (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE))
+		goto done;
+
+	/*
+	 * if linkstate transitions into INIT from any of the various down
+	 * states, or if it transitions from any of the up (INIT or better)
+	 * states into any of the down states (except link recovery), then
+	 * call the chip-specific code to take appropriate actions.
+	 */
+	if (lstate >= INFINIPATH_IBCS_L_STATE_INIT &&
+		lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) {
+		/* transitioned to UP */
+		if (dd->ipath_f_ib_updown(dd, 1, ibcs)) {
+			/* link came up, so we must no longer be disabled */
+			dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+			ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n");
+			goto skip_ibchange; /* chip-code handled */
+		}
+	} else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT ||
+		(dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) &&
+		ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT &&
+		ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
+		int handled;
+		handled = dd->ipath_f_ib_updown(dd, 0, ibcs);
+		dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY;
+		if (handled) {
+			ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n");
+			goto skip_ibchange; /* chip-code handled */
+		}
+	}
+
+	/*
+	 * Significant enough to always print and get into logs, if it was
+	 * unexpected.  If it was a requested state change, we'll have
+	 * already cleared the flags, so we won't print this warning
+	 */
+	if ((ibstate != arm && ibstate != active) &&
+	    (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) {
+		dev_info(&dd->pcidev->dev, "Link state changed from %s "
+			 "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ?
+			 "ARM" : "ACTIVE", ib_linkstate(dd, ibcs));
+	}
+
+	if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+	    ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+		u32 lastlts;
+		lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+		/*
+		 * Ignore cycling back and forth from Polling.Active to
+		 * Polling.Quiet while waiting for the other end of the link
+		 * to come up, except to try and decide if we are connected
+		 * to a live IB device or not.  We will cycle back and
+		 * forth between them if no cable is plugged in, the other
+		 * device is powered off or disabled, etc.
+		 */
+		if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+		    lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+			if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
+			     (++dd->ipath_ibpollcnt == 40)) {
+				dd->ipath_flags |= IPATH_NOCABLE;
+				*dd->ipath_statusp |=
+					IPATH_STATUS_IB_NOCABLE;
+				ipath_cdbg(LINKVERB, "Set NOCABLE\n");
+			}
+			ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n",
+				ipath_ibcstatus_str[ltstate], ibstate);
+			goto skip_ibchange;
+		}
+	}
+
+	dd->ipath_ibpollcnt = 0; /* not poll*, now */
+	ipath_stats.sps_iblink++;
+
+	if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
+		u64 linkrecov;
+		linkrecov = ipath_snap_cntr(dd,
+			dd->ipath_cregs->cr_iblinkerrrecovcnt);
+		if (linkrecov != dd->ipath_lastlinkrecov) {
+			ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
+				(unsigned long long) ibcs,
+				ib_linkstate(dd, ibcs),
+				ipath_ibcstatus_str[ltstate],
+				(unsigned long long) linkrecov);
+			/* and no more until active again */
+			dd->ipath_lastlinkrecov = 0;
+			ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+			goto skip_ibchange;
+		}
+	}
+
+	if (ibstate == init || ibstate == arm || ibstate == active) {
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
+		if (ibstate == init || ibstate == arm) {
+			*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+			if (dd->ipath_flags & IPATH_LINKACTIVE)
+				signal_ib_event(dd, IB_EVENT_PORT_ERR);
+		}
+		if (ibstate == arm) {
+			dd->ipath_flags |= IPATH_LINKARMED;
+			dd->ipath_flags &= ~(IPATH_LINKUNK |
+				IPATH_LINKINIT | IPATH_LINKDOWN |
+				IPATH_LINKACTIVE | IPATH_NOCABLE);
+			ipath_hol_down(dd);
+		} else  if (ibstate == init) {
+			/*
+			 * set INIT and DOWN.  Down is checked by
+			 * most of the other code, but INIT is
+			 * useful to know in a few places.
+			 */
+			dd->ipath_flags |= IPATH_LINKINIT |
+				IPATH_LINKDOWN;
+			dd->ipath_flags &= ~(IPATH_LINKUNK |
+				IPATH_LINKARMED | IPATH_LINKACTIVE |
+				IPATH_NOCABLE);
+			ipath_hol_down(dd);
+		} else {  /* active */
+			dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
+				dd->ipath_cregs->cr_iblinkerrrecovcnt);
+			*dd->ipath_statusp |=
+				IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
+			dd->ipath_flags |= IPATH_LINKACTIVE;
+			dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+				| IPATH_LINKDOWN | IPATH_LINKARMED |
+				IPATH_NOCABLE);
+			if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+				ipath_restart_sdma(dd);
+			signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
+			/* LED active not handled in chip _f_updown */
+			dd->ipath_f_setextled(dd, lstate, ltstate);
+			ipath_hol_up(dd);
+		}
+
+		/*
+		 * print after we've already done the work, so as not to
+		 * delay the state changes and notifications, for debugging
+		 */
+		if (lstate == lastlstate)
+			ipath_cdbg(LINKVERB, "Unchanged from last: %s "
+				"(%x)\n", ib_linkstate(dd, ibcs), ibstate);
+		else
+			ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n",
+				  dd->ipath_unit, ib_linkstate(dd, ibcs),
+				  ipath_ibcstatus_str[ltstate],  ibstate);
+	} else { /* down */
+		if (dd->ipath_flags & IPATH_LINKACTIVE)
+			signal_ib_event(dd, IB_EVENT_PORT_ERR);
+		dd->ipath_flags |= IPATH_LINKDOWN;
+		dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+				     | IPATH_LINKACTIVE |
+				     IPATH_LINKARMED);
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+		dd->ipath_lli_counter = 0;
+
+		if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN)
+			ipath_cdbg(VERBOSE, "Unit %u link state down "
+				   "(state 0x%x), from %s\n",
+				   dd->ipath_unit, lstate,
+				   ib_linkstate(dd, dd->ipath_lastibcstat));
+		else
+			ipath_cdbg(LINKVERB, "Unit %u link state changed "
+				   "to %s (0x%x) from down (%x)\n",
+				   dd->ipath_unit,
+				   ipath_ibcstatus_str[ltstate],
+				   ibstate, lastlstate);
+	}
+
+skip_ibchange:
+	dd->ipath_lastibcstat = ibcs;
+done:
+	return;
+}
+
+static void handle_supp_msgs(struct ipath_devdata *dd,
+			     unsigned supp_msgs, char *msg, u32 msgsz)
+{
+	/*
+	 * Print the message unless it's ibc status change only, which
+	 * happens so often we never want to count it.
+	 */
+	if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
+		int iserr;
+		ipath_err_t mask;
+		iserr = ipath_decode_err(dd, msg, msgsz,
+					 dd->ipath_lasterror &
+					 ~INFINIPATH_E_IBSTATUSCHANGED);
+
+		mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+			INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
+
+		/* if we're in debug, then don't mask SDMADISABLED msgs */
+		if (ipath_debug & __IPATH_DBG)
+			mask &= ~INFINIPATH_E_SDMADISABLED;
+
+		if (dd->ipath_lasterror & ~mask)
+			ipath_dev_err(dd, "Suppressed %u messages for "
+				      "fast-repeating errors (%s) (%llx)\n",
+				      supp_msgs, msg,
+				      (unsigned long long)
+				      dd->ipath_lasterror);
+		else {
+			/*
+			 * rcvegrfull and rcvhdrqfull are "normal", for some
+			 * types of processes (mostly benchmarks) that send
+			 * huge numbers of messages, while not processing
+			 * them. So only complain about these at debug
+			 * level.
+			 */
+			if (iserr)
+				ipath_dbg("Suppressed %u messages for %s\n",
+					  supp_msgs, msg);
+			else
+				ipath_cdbg(ERRPKT,
+					"Suppressed %u messages for %s\n",
+					  supp_msgs, msg);
+		}
+	}
+}
+
+static unsigned handle_frequent_errors(struct ipath_devdata *dd,
+				       ipath_err_t errs, char *msg,
+				       u32 msgsz, int *noprint)
+{
+	unsigned long nc;
+	static unsigned long nextmsg_time;
+	static unsigned nmsgs, supp_msgs;
+
+	/*
+	 * Throttle back "fast" messages to no more than 10 per 5 seconds.
+	 * This isn't perfect, but it's a reasonable heuristic. If we get
+	 * more than 10, give a 6x longer delay.
+	 */
+	nc = jiffies;
+	if (nmsgs > 10) {
+		if (time_before(nc, nextmsg_time)) {
+			*noprint = 1;
+			if (!supp_msgs++)
+				nextmsg_time = nc + HZ * 3;
+		}
+		else if (supp_msgs) {
+			handle_supp_msgs(dd, supp_msgs, msg, msgsz);
+			supp_msgs = 0;
+			nmsgs = 0;
+		}
+	}
+	else if (!nmsgs++ || time_after(nc, nextmsg_time))
+		nextmsg_time = nc + HZ / 2;
+
+	return supp_msgs;
+}
+
+static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+	unsigned long flags;
+	int expected;
+
+	if (ipath_debug & __IPATH_DBG) {
+		char msg[128];
+		ipath_decode_err(dd, msg, sizeof msg, errs &
+			INFINIPATH_E_SDMAERRS);
+		ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg);
+	}
+	if (ipath_debug & __IPATH_VERBDBG) {
+		unsigned long tl, hd, status, lengen;
+		tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+		hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+		status = ipath_read_kreg64(dd
+			, dd->ipath_kregs->kr_senddmastatus);
+		lengen = ipath_read_kreg64(dd,
+			dd->ipath_kregs->kr_senddmalengen);
+		ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
+			"lengen 0x%lx\n", tl, hd, status, lengen);
+	}
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+	expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+	if (!expected)
+		ipath_cancel_sends(dd, 1);
+}
+
+static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
+{
+	unsigned long flags;
+	int expected;
+
+	if ((istat & INFINIPATH_I_SDMAINT) &&
+	    !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+		ipath_sdma_intr(dd);
+
+	if (istat & INFINIPATH_I_SDMADISABLED) {
+		expected = test_bit(IPATH_SDMA_ABORTING,
+			&dd->ipath_sdma_status);
+		ipath_dbg("%s SDmaDisabled intr\n",
+			expected ? "expected" : "unexpected");
+		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+		__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+		if (!expected)
+			ipath_cancel_sends(dd, 1);
+		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+			tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+	}
+}
+
+static int handle_hdrq_full(struct ipath_devdata *dd)
+{
+	int chkerrpkts = 0;
+	u32 hd, tl;
+	u32 i;
+
+	ipath_stats.sps_hdrqfull++;
+	for (i = 0; i < dd->ipath_cfgports; i++) {
+		struct ipath_portdata *pd = dd->ipath_pd[i];
+
+		if (i == 0) {
+			/*
+			 * For kernel receive queues, we just want to know
+			 * if there are packets in the queue that we can
+			 * process.
+			 */
+			if (pd->port_head != ipath_get_hdrqtail(pd))
+				chkerrpkts |= 1 << i;
+			continue;
+		}
+
+		/* Skip if user context is not open */
+		if (!pd || !pd->port_cnt)
+			continue;
+
+		/* Don't report the same point multiple times. */
+		if (dd->ipath_flags & IPATH_NODMA_RTAIL)
+			tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i);
+		else
+			tl = ipath_get_rcvhdrtail(pd);
+		if (tl == pd->port_lastrcvhdrqtail)
+			continue;
+
+		hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i);
+		if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
+			pd->port_lastrcvhdrqtail = tl;
+			pd->port_hdrqfull++;
+			/* flush hdrqfull so that poll() sees it */
+			wmb();
+			wake_up_interruptible(&pd->port_wait);
+		}
+	}
+
+	return chkerrpkts;
+}
+
+static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+	char msg[128];
+	u64 ignore_this_time = 0;
+	u64 iserr = 0;
+	int chkerrpkts = 0, noprint = 0;
+	unsigned supp_msgs;
+	int log_idx;
+
+	/*
+	 * don't report errors that are masked, either at init
+	 * (not set in ipath_errormask), or temporarily (set in
+	 * ipath_maskederrs)
+	 */
+	errs &= dd->ipath_errormask & ~dd->ipath_maskederrs;
+
+	supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg,
+		&noprint);
+
+	/* do these first, they are most important */
+	if (errs & INFINIPATH_E_HARDWARE) {
+		/* reuse same msg buf */
+		dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
+	} else {
+		u64 mask;
+		for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
+			mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
+			if (errs & mask)
+				ipath_inc_eeprom_err(dd, log_idx, 1);
+		}
+	}
+
+	if (errs & INFINIPATH_E_SDMAERRS)
+		handle_sdma_errors(dd, errs);
+
+	if (!noprint && (errs & ~dd->ipath_e_bitsextant))
+		ipath_dev_err(dd, "error interrupt with unknown errors "
+			      "%llx set\n", (unsigned long long)
+			      (errs & ~dd->ipath_e_bitsextant));
+
+	if (errs & E_SUM_ERRS)
+		ignore_this_time = handle_e_sum_errs(dd, errs);
+	else if ((errs & E_SUM_LINK_PKTERRS) &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		/*
+		 * This can happen when SMA is trying to bring the link
+		 * up, but the IB link changes state at the "wrong" time.
+		 * The IB logic then complains that the packet isn't
+		 * valid.  We don't want to confuse people, so we just
+		 * don't print them, except at debug
+		 */
+		ipath_dbg("Ignoring packet errors %llx, because link not "
+			  "ACTIVE\n", (unsigned long long) errs);
+		ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+	}
+
+	if (supp_msgs == 250000) {
+		int s_iserr;
+		/*
+		 * It's not entirely reasonable assuming that the errors set
+		 * in the last clear period are all responsible for the
+		 * problem, but the alternative is to assume it's the only
+		 * ones on this particular interrupt, which also isn't great
+		 */
+		dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+
+		dd->ipath_errormask &= ~dd->ipath_maskederrs;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+				 dd->ipath_errormask);
+		s_iserr = ipath_decode_err(dd, msg, sizeof msg,
+					   dd->ipath_maskederrs);
+
+		if (dd->ipath_maskederrs &
+		    ~(INFINIPATH_E_RRCVEGRFULL |
+		      INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
+			ipath_dev_err(dd, "Temporarily disabling "
+			    "error(s) %llx reporting; too frequent (%s)\n",
+				(unsigned long long) dd->ipath_maskederrs,
+				msg);
+		else {
+			/*
+			 * rcvegrfull and rcvhdrqfull are "normal",
+			 * for some types of processes (mostly benchmarks)
+			 * that send huge numbers of messages, while not
+			 * processing them.  So only complain about
+			 * these at debug level.
+			 */
+			if (s_iserr)
+				ipath_dbg("Temporarily disabling reporting "
+				    "too frequent queue full errors (%s)\n",
+				    msg);
+			else
+				ipath_cdbg(ERRPKT,
+				    "Temporarily disabling reporting too"
+				    " frequent packet errors (%s)\n",
+				    msg);
+		}
+
+		/*
+		 * Re-enable the masked errors after around 3 minutes.  in
+		 * ipath_get_faststats().  If we have a series of fast
+		 * repeating but different errors, the interval will keep
+		 * stretching out, but that's OK, as that's pretty
+		 * catastrophic.
+		 */
+		dd->ipath_unmasktime = jiffies + HZ * 180;
+	}
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
+	if (ignore_this_time)
+		errs &= ~ignore_this_time;
+	if (errs & ~dd->ipath_lasterror) {
+		errs &= ~dd->ipath_lasterror;
+		/* never suppress duplicate hwerrors or ibstatuschange */
+		dd->ipath_lasterror |= errs &
+			~(INFINIPATH_E_HARDWARE |
+			  INFINIPATH_E_IBSTATUSCHANGED);
+	}
+
+	if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) {
+		dd->ipath_spectriggerhit++;
+		ipath_dbg("%lu special trigger hits\n",
+			dd->ipath_spectriggerhit);
+	}
+
+	/* likely due to cancel; so suppress message unless verbose */
+	if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
+		time_after(dd->ipath_lastcancel, jiffies)) {
+		/* armlaunch takes precedence; it often causes both. */
+		ipath_cdbg(VERBOSE,
+			"Suppressed %s error (%llx) after sendbuf cancel\n",
+			(errs &  INFINIPATH_E_SPIOARMLAUNCH) ?
+			"armlaunch" : "sendpktlen", (unsigned long long)errs);
+		errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
+	}
+
+	if (!errs)
+		return 0;
+
+	if (!noprint) {
+		ipath_err_t mask;
+		/*
+		 * The ones we mask off are handled specially below
+		 * or above.  Also mask SDMADISABLED by default as it
+		 * is too chatty.
+		 */
+		mask = INFINIPATH_E_IBSTATUSCHANGED |
+			INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+			INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED;
+
+		/* if we're in debug, then don't mask SDMADISABLED msgs */
+		if (ipath_debug & __IPATH_DBG)
+			mask &= ~INFINIPATH_E_SDMADISABLED;
+
+		ipath_decode_err(dd, msg, sizeof msg, errs & ~mask);
+	} else
+		/* so we don't need if (!noprint) at strlcat's below */
+		*msg = 0;
+
+	if (errs & E_SUM_PKTERRS) {
+		ipath_stats.sps_pkterrs++;
+		chkerrpkts = 1;
+	}
+	if (errs & E_SUM_ERRS)
+		ipath_stats.sps_errs++;
+
+	if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
+		ipath_stats.sps_crcerrs++;
+		chkerrpkts = 1;
+	}
+	iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS);
+
+
+	/*
+	 * We don't want to print these two as they happen, or we can make
+	 * the situation even worse, because it takes so long to print
+	 * messages to serial consoles.  Kernel ports get printed from
+	 * fast_stats, no more than every 5 seconds, user ports get printed
+	 * on close
+	 */
+	if (errs & INFINIPATH_E_RRCVHDRFULL)
+		chkerrpkts |= handle_hdrq_full(dd);
+	if (errs & INFINIPATH_E_RRCVEGRFULL) {
+		struct ipath_portdata *pd = dd->ipath_pd[0];
+
+		/*
+		 * since this is of less importance and not likely to
+		 * happen without also getting hdrfull, only count
+		 * occurrences; don't check each port (or even the kernel
+		 * vs user)
+		 */
+		ipath_stats.sps_etidfull++;
+		if (pd->port_head != ipath_get_hdrqtail(pd))
+			chkerrpkts |= 1;
+	}
+
+	/*
+	 * do this before IBSTATUSCHANGED, in case both bits set in a single
+	 * interrupt; we want the STATUSCHANGE to "win", so we do our
+	 * internal copy of state machine correctly
+	 */
+	if (errs & INFINIPATH_E_RIBLOSTLINK) {
+		/*
+		 * force through block below
+		 */
+		errs |= INFINIPATH_E_IBSTATUSCHANGED;
+		ipath_stats.sps_iblink++;
+		dd->ipath_flags |= IPATH_LINKDOWN;
+		dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+				     | IPATH_LINKARMED | IPATH_LINKACTIVE);
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+
+		ipath_dbg("Lost link, link now down (%s)\n",
+			ipath_ibcstatus_str[ipath_read_kreg64(dd,
+			dd->ipath_kregs->kr_ibcstatus) & 0xf]);
+	}
+	if (errs & INFINIPATH_E_IBSTATUSCHANGED)
+		handle_e_ibstatuschanged(dd, errs);
+
+	if (errs & INFINIPATH_E_RESET) {
+		if (!noprint)
+			ipath_dev_err(dd, "Got reset, requires re-init "
+				      "(unload and reload driver)\n");
+		dd->ipath_flags &= ~IPATH_INITTED;	/* needs re-init */
+		/* mark as having had error */
+		*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
+	}
+
+	if (!noprint && *msg) {
+		if (iserr)
+			ipath_dev_err(dd, "%s error\n", msg);
+	}
+	if (dd->ipath_state_wanted & dd->ipath_flags) {
+		ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, "
+			   "waking\n", dd->ipath_state_wanted,
+			   dd->ipath_flags);
+		wake_up_interruptible(&ipath_state_wait);
+	}
+
+	return chkerrpkts;
+}
+
+/*
+ * try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ */
+void ipath_clear_freeze(struct ipath_devdata *dd)
+{
+	/* disable error interrupts, to avoid confusion */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
+
+	/* also disable interrupts; errormask is sometimes overwriten */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+	ipath_cancel_sends(dd, 1);
+
+	/* clear the freeze, and be sure chip saw it */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+	/* force in-memory update now we are out of freeze */
+	ipath_force_pio_avail_update(dd);
+
+	/*
+	 * force new interrupt if any hwerr, error or interrupt bits are
+	 * still set, and clear "safe" send packet errors related to freeze
+	 * and cancelling sends.  Re-enable error interrupts before possible
+	 * force of re-interrupt on pending interrupts.
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+		E_SPKT_ERRS_IGNORE);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+}
+
+
+/* this is separate to allow for better optimization of ipath_intr() */
+
+static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
+{
+	/*
+	 * sometimes happen during driver init and unload, don't want
+	 * to process any interrupts at that point
+	 */
+
+	/* this is just a bandaid, not a fix, if something goes badly
+	 * wrong */
+	if (++*unexpectp > 100) {
+		if (++*unexpectp > 105) {
+			/*
+			 * ok, we must be taking somebody else's interrupts,
+			 * due to a messed up mptable and/or PIRQ table, so
+			 * unregister the interrupt.  We've seen this during
+			 * linuxbios development work, and it may happen in
+			 * the future again.
+			 */
+			if (dd->pcidev && dd->ipath_irq) {
+				ipath_dev_err(dd, "Now %u unexpected "
+					      "interrupts, unregistering "
+					      "interrupt handler\n",
+					      *unexpectp);
+				ipath_dbg("free_irq of irq %d\n",
+					  dd->ipath_irq);
+				dd->ipath_f_free_irq(dd);
+			}
+		}
+		if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
+			ipath_dev_err(dd, "%u unexpected interrupts, "
+				      "disabling interrupts completely\n",
+				      *unexpectp);
+			/*
+			 * disable all interrupts, something is very wrong
+			 */
+			ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+					 0ULL);
+		}
+	} else if (*unexpectp > 1)
+		ipath_dbg("Interrupt when not ready, should not happen, "
+			  "ignoring\n");
+}
+
+static noinline void ipath_bad_regread(struct ipath_devdata *dd)
+{
+	static int allbits;
+
+	/* separate routine, for better optimization of ipath_intr() */
+
+	/*
+	 * We print the message and disable interrupts, in hope of
+	 * having a better chance of debugging the problem.
+	 */
+	ipath_dev_err(dd,
+		      "Read of interrupt status failed (all bits set)\n");
+	if (allbits++) {
+		/* disable all interrupts, something is very wrong */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+		if (allbits == 2) {
+			ipath_dev_err(dd, "Still bad interrupt status, "
+				      "unregistering interrupt\n");
+			dd->ipath_f_free_irq(dd);
+		} else if (allbits > 2) {
+			if ((allbits % 10000) == 0)
+				printk(".");
+		} else
+			ipath_dev_err(dd, "Disabling interrupts, "
+				      "multiple errors\n");
+	}
+}
+
+static void handle_layer_pioavail(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+	int ret;
+
+	ret = ipath_ib_piobufavail(dd->verbs_dev);
+	if (ret > 0)
+		goto set;
+
+	return;
+set:
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			 dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+/*
+ * Handle receive interrupts for user ports; this means a user
+ * process was waiting for a packet to arrive, and didn't want
+ * to poll
+ */
+static void handle_urcv(struct ipath_devdata *dd, u64 istat)
+{
+	u64 portr;
+	int i;
+	int rcvdint = 0;
+
+	/*
+	 * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
+	 * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
+	 * would both like timely updates of the bits so that
+	 * we don't pass them by unnecessarily.  the rmb()
+	 * here ensures that we see them promptly -- the
+	 * corresponding wmb()'s are in ipath_poll_urgent()
+	 * and ipath_poll_next()...
+	 */
+	rmb();
+	portr = ((istat >> dd->ipath_i_rcvavail_shift) &
+		 dd->ipath_i_rcvavail_mask) |
+		((istat >> dd->ipath_i_rcvurg_shift) &
+		 dd->ipath_i_rcvurg_mask);
+	for (i = 1; i < dd->ipath_cfgports; i++) {
+		struct ipath_portdata *pd = dd->ipath_pd[i];
+
+		if (portr & (1 << i) && pd && pd->port_cnt) {
+			if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
+					       &pd->port_flag)) {
+				clear_bit(i + dd->ipath_r_intravail_shift,
+					  &dd->ipath_rcvctrl);
+				wake_up_interruptible(&pd->port_wait);
+				rcvdint = 1;
+			} else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
+						      &pd->port_flag)) {
+				pd->port_urgent++;
+				wake_up_interruptible(&pd->port_wait);
+			}
+		}
+	}
+	if (rcvdint) {
+		/* only want to take one interrupt, so turn off the rcv
+		 * interrupt for all the ports that we set the rcv_waiting
+		 * (but never for kernel port)
+		 */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+				 dd->ipath_rcvctrl);
+	}
+}
+
+irqreturn_t ipath_intr(int irq, void *data)
+{
+	struct ipath_devdata *dd = data;
+	u64 istat, chk0rcv = 0;
+	ipath_err_t estat = 0;
+	irqreturn_t ret;
+	static unsigned unexpected = 0;
+	u64 kportrbits;
+
+	ipath_stats.sps_ints++;
+
+	if (dd->ipath_int_counter != (u32) -1)
+		dd->ipath_int_counter++;
+
+	if (!(dd->ipath_flags & IPATH_PRESENT)) {
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		return IRQ_HANDLED;
+	}
+
+	/*
+	 * this needs to be flags&initted, not statusp, so we keep
+	 * taking interrupts even after link goes down, etc.
+	 * Also, we *must* clear the interrupt at some point, or we won't
+	 * take it again, which can be real bad for errors, etc...
+	 */
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		ipath_bad_intr(dd, &unexpected);
+		ret = IRQ_NONE;
+		goto bail;
+	}
+
+	istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
+
+	if (unlikely(!istat)) {
+		ipath_stats.sps_nullintr++;
+		ret = IRQ_NONE; /* not our interrupt, or already handled */
+		goto bail;
+	}
+	if (unlikely(istat == -1)) {
+		ipath_bad_regread(dd);
+		/* don't know if it was our interrupt or not */
+		ret = IRQ_NONE;
+		goto bail;
+	}
+
+	if (unexpected)
+		unexpected = 0;
+
+	if (unlikely(istat & ~dd->ipath_i_bitsextant))
+		ipath_dev_err(dd,
+			      "interrupt with unknown interrupts %Lx set\n",
+			      (unsigned long long)
+			      istat & ~dd->ipath_i_bitsextant);
+	else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
+		ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
+			(unsigned long long) istat);
+
+	if (istat & INFINIPATH_I_ERROR) {
+		ipath_stats.sps_errints++;
+		estat = ipath_read_kreg64(dd,
+					  dd->ipath_kregs->kr_errorstatus);
+		if (!estat)
+			dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
+				 "but no error bits set!\n",
+				 (unsigned long long) istat);
+		else if (estat == -1LL)
+			/*
+			 * should we try clearing all, or hope next read
+			 * works?
+			 */
+			ipath_dev_err(dd, "Read of error status failed "
+				      "(all bits set); ignoring\n");
+		else
+			chk0rcv |= handle_errors(dd, estat);
+	}
+
+	if (istat & INFINIPATH_I_GPIO) {
+		/*
+		 * GPIO interrupts fall in two broad classes:
+		 * GPIO_2 indicates (on some HT4xx boards) that a packet
+		 *        has arrived for Port 0. Checking for this
+		 *        is controlled by flag IPATH_GPIO_INTR.
+		 * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
+		 *        errors that we need to count. Checking for this
+		 *        is controlled by flag IPATH_GPIO_ERRINTRS.
+		 */
+		u32 gpiostatus;
+		u32 to_clear = 0;
+
+		gpiostatus = ipath_read_kreg32(
+			dd, dd->ipath_kregs->kr_gpio_status);
+		/* First the error-counter case. */
+		if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
+		    (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
+			/* want to clear the bits we see asserted. */
+			to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
+
+			/*
+			 * Count appropriately, clear bits out of our copy,
+			 * as they have been "handled".
+			 */
+			if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
+				ipath_dbg("FlowCtl on UnsupVL\n");
+				dd->ipath_rxfc_unsupvl_errs++;
+			}
+			if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
+				ipath_dbg("Overrun Threshold exceeded\n");
+				dd->ipath_overrun_thresh_errs++;
+			}
+			if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
+				ipath_dbg("Local Link Integrity error\n");
+				dd->ipath_lli_errs++;
+			}
+			gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
+		}
+		/* Now the Port0 Receive case */
+		if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
+		    (dd->ipath_flags & IPATH_GPIO_INTR)) {
+			/*
+			 * GPIO status bit 2 is set, and we expected it.
+			 * clear it and indicate in p0bits.
+			 * This probably only happens if a Port0 pkt
+			 * arrives at _just_ the wrong time, and we
+			 * handle that by seting chk0rcv;
+			 */
+			to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
+			gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
+			chk0rcv = 1;
+		}
+		if (gpiostatus) {
+			/*
+			 * Some unexpected bits remain. If they could have
+			 * caused the interrupt, complain and clear.
+			 * To avoid repetition of this condition, also clear
+			 * the mask. It is almost certainly due to error.
+			 */
+			const u32 mask = (u32) dd->ipath_gpio_mask;
+
+			if (mask & gpiostatus) {
+				ipath_dbg("Unexpected GPIO IRQ bits %x\n",
+				  gpiostatus & mask);
+				to_clear |= (gpiostatus & mask);
+				dd->ipath_gpio_mask &= ~(gpiostatus & mask);
+				ipath_write_kreg(dd,
+					dd->ipath_kregs->kr_gpio_mask,
+					dd->ipath_gpio_mask);
+			}
+		}
+		if (to_clear) {
+			ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
+					(u64) to_clear);
+		}
+	}
+
+	/*
+	 * Clear the interrupt bits we found set, unless they are receive
+	 * related, in which case we already cleared them above, and don't
+	 * want to clear them again, because we might lose an interrupt.
+	 * Clear it early, so we "know" know the chip will have seen this by
+	 * the time we process the queue, and will re-interrupt if necessary.
+	 * The processor itself won't take the interrupt again until we return.
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
+
+	/*
+	 * Handle kernel receive queues before checking for pio buffers
+	 * available since receives can overflow; piobuf waiters can afford
+	 * a few extra cycles, since they were waiting anyway, and user's
+	 * waiting for receive are at the bottom.
+	 */
+	kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
+		(1ULL << dd->ipath_i_rcvurg_shift);
+	if (chk0rcv || (istat & kportrbits)) {
+		istat &= ~kportrbits;
+		ipath_kreceive(dd->ipath_pd[0]);
+	}
+
+	if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
+		     (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
+		handle_urcv(dd, istat);
+
+	if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
+		handle_sdma_intr(dd, istat);
+
+	if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+				 dd->ipath_sendctrl);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+		/* always process; sdma verbs uses PIO for acks and VL15  */
+		handle_layer_pioavail(dd);
+	}
+
+	ret = IRQ_HANDLED;
+
+bail:
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_kernel.h b/drivers/staging/rdma/ipath/ipath_kernel.h
new file mode 100644
index 0000000..f0f9471
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_kernel.h
@@ -0,0 +1,1373 @@
+#ifndef _IPATH_KERNEL_H
+#define _IPATH_KERNEL_H
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This header file is the base header file for infinipath kernel code
+ * ipath_user.h serves a similar purpose for user code.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_common.h"
+#include "ipath_debug.h"
+#include "ipath_registers.h"
+
+/* only s/w major version of InfiniPath we can handle */
+#define IPATH_CHIP_VERS_MAJ 2U
+
+/* don't care about this except printing */
+#define IPATH_CHIP_VERS_MIN 0U
+
+/* temporary, maybe always */
+extern struct infinipath_stats ipath_stats;
+
+#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
+/*
+ * First-cut critierion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
+/*
+ * Struct used to indicate which errors are logged in each of the
+ * error-counters that are logged to EEPROM. A counter is incremented
+ * _once_ (saturating at 255) for each event with any bits set in
+ * the error or hwerror register masks below.
+ */
+#define IPATH_EEP_LOG_CNT (4)
+struct ipath_eep_log_mask {
+	u64 errs_to_log;
+	u64 hwerrs_to_log;
+};
+
+struct ipath_portdata {
+	void **port_rcvegrbuf;
+	dma_addr_t *port_rcvegrbuf_phys;
+	/* rcvhdrq base, needs mmap before useful */
+	void *port_rcvhdrq;
+	/* kernel virtual address where hdrqtail is updated */
+	void *port_rcvhdrtail_kvaddr;
+	/*
+	 * temp buffer for expected send setup, allocated at open, instead
+	 * of each setup call
+	 */
+	void *port_tid_pg_list;
+	/* when waiting for rcv or pioavail */
+	wait_queue_head_t port_wait;
+	/*
+	 * rcvegr bufs base, physical, must fit
+	 * in 44 bits so 32 bit programs mmap64 44 bit works)
+	 */
+	dma_addr_t port_rcvegr_phys;
+	/* mmap of hdrq, must fit in 44 bits */
+	dma_addr_t port_rcvhdrq_phys;
+	dma_addr_t port_rcvhdrqtailaddr_phys;
+	/*
+	 * number of opens (including slave subports) on this instance
+	 * (ignoring forks, dup, etc. for now)
+	 */
+	int port_cnt;
+	/*
+	 * how much space to leave at start of eager TID entries for
+	 * protocol use, on each TID
+	 */
+	/* instead of calculating it */
+	unsigned port_port;
+	/* non-zero if port is being shared. */
+	u16 port_subport_cnt;
+	/* non-zero if port is being shared. */
+	u16 port_subport_id;
+	/* number of pio bufs for this port (all procs, if shared) */
+	u32 port_piocnt;
+	/* first pio buffer for this port */
+	u32 port_pio_base;
+	/* chip offset of PIO buffers for this port */
+	u32 port_piobufs;
+	/* how many alloc_pages() chunks in port_rcvegrbuf_pages */
+	u32 port_rcvegrbuf_chunks;
+	/* how many egrbufs per chunk */
+	u32 port_rcvegrbufs_perchunk;
+	/* order for port_rcvegrbuf_pages */
+	size_t port_rcvegrbuf_size;
+	/* rcvhdrq size (for freeing) */
+	size_t port_rcvhdrq_size;
+	/* next expected TID to check when looking for free */
+	u32 port_tidcursor;
+	/* next expected TID to check */
+	unsigned long port_flag;
+	/* what happened */
+	unsigned long int_flag;
+	/* WAIT_RCV that timed out, no interrupt */
+	u32 port_rcvwait_to;
+	/* WAIT_PIO that timed out, no interrupt */
+	u32 port_piowait_to;
+	/* WAIT_RCV already happened, no wait */
+	u32 port_rcvnowait;
+	/* WAIT_PIO already happened, no wait */
+	u32 port_pionowait;
+	/* total number of rcvhdrqfull errors */
+	u32 port_hdrqfull;
+	/*
+	 * Used to suppress multiple instances of same
+	 * port staying stuck at same point.
+	 */
+	u32 port_lastrcvhdrqtail;
+	/* saved total number of rcvhdrqfull errors for poll edge trigger */
+	u32 port_hdrqfull_poll;
+	/* total number of polled urgent packets */
+	u32 port_urgent;
+	/* saved total number of polled urgent packets for poll edge trigger */
+	u32 port_urgent_poll;
+	/* pid of process using this port */
+	struct pid *port_pid;
+	struct pid *port_subpid[INFINIPATH_MAX_SUBPORT];
+	/* same size as task_struct .comm[] */
+	char port_comm[16];
+	/* pkeys set by this use of this port */
+	u16 port_pkeys[4];
+	/* so file ops can get at unit */
+	struct ipath_devdata *port_dd;
+	/* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+	void *subport_uregbase;
+	/* An array of pages for the eager receive buffers * N */
+	void *subport_rcvegrbuf;
+	/* An array of pages for the eager header queue entries * N */
+	void *subport_rcvhdr_base;
+	/* The version of the library which opened this port */
+	u32 userversion;
+	/* Bitmask of active slaves */
+	u32 active_slaves;
+	/* Type of packets or conditions we want to poll for */
+	u16 poll_type;
+	/* port rcvhdrq head offset */
+	u32 port_head;
+	/* receive packet sequence counter */
+	u32 port_seq_cnt;
+};
+
+struct sk_buff;
+struct ipath_sge_state;
+struct ipath_verbs_txreq;
+
+/*
+ * control information for layered drivers
+ */
+struct _ipath_layer {
+	void *l_arg;
+};
+
+struct ipath_skbinfo {
+	struct sk_buff *skb;
+	dma_addr_t phys;
+};
+
+struct ipath_sdma_txreq {
+	int                 flags;
+	int                 sg_count;
+	union {
+		struct scatterlist *sg;
+		void *map_addr;
+	};
+	void              (*callback)(void *, int);
+	void               *callback_cookie;
+	int                 callback_status;
+	u16                 start_idx;  /* sdma private */
+	u16                 next_descq_idx;  /* sdma private */
+	struct list_head    list;       /* sdma private */
+};
+
+struct ipath_sdma_desc {
+	__le64 qw[2];
+};
+
+#define IPATH_SDMA_TXREQ_F_USELARGEBUF  0x1
+#define IPATH_SDMA_TXREQ_F_HEADTOHOST   0x2
+#define IPATH_SDMA_TXREQ_F_INTREQ       0x4
+#define IPATH_SDMA_TXREQ_F_FREEBUF      0x8
+#define IPATH_SDMA_TXREQ_F_FREEDESC     0x10
+#define IPATH_SDMA_TXREQ_F_VL15         0x20
+
+#define IPATH_SDMA_TXREQ_S_OK        0
+#define IPATH_SDMA_TXREQ_S_SENDERROR 1
+#define IPATH_SDMA_TXREQ_S_ABORTED   2
+#define IPATH_SDMA_TXREQ_S_SHUTDOWN  3
+
+#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG	(1ull << 63)
+#define IPATH_SDMA_STATUS_ABORT_IN_PROG			(1ull << 62)
+#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE		(1ull << 61)
+#define IPATH_SDMA_STATUS_SCB_EMPTY			(1ull << 30)
+
+/* max dwords in small buffer packet */
+#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
+
+/*
+ * Possible IB config parameters for ipath_f_get/set_ib_cfg()
+ */
+#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
+#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
+#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
+#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
+#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
+#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
+#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
+#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
+#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
+#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
+#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */
+
+
+struct ipath_devdata {
+	struct list_head ipath_list;
+
+	struct ipath_kregs const *ipath_kregs;
+	struct ipath_cregs const *ipath_cregs;
+
+	/* mem-mapped pointer to base of chip regs */
+	u64 __iomem *ipath_kregbase;
+	/* end of mem-mapped chip space; range checking */
+	u64 __iomem *ipath_kregend;
+	/* physical address of chip for io_remap, etc. */
+	unsigned long ipath_physaddr;
+	/* base of memory alloced for ipath_kregbase, for free */
+	u64 *ipath_kregalloc;
+	/* ipath_cfgports pointers */
+	struct ipath_portdata **ipath_pd;
+	/* sk_buffs used by port 0 eager receive queue */
+	struct ipath_skbinfo *ipath_port0_skbinfo;
+	/* kvirt address of 1st 2k pio buffer */
+	void __iomem *ipath_pio2kbase;
+	/* kvirt address of 1st 4k pio buffer */
+	void __iomem *ipath_pio4kbase;
+	/*
+	 * points to area where PIOavail registers will be DMA'ed.
+	 * Has to be on a page of it's own, because the page will be
+	 * mapped into user program space.  This copy is *ONLY* ever
+	 * written by DMA, not by the driver!  Need a copy per device
+	 * when we get to multiple devices
+	 */
+	volatile __le64 *ipath_pioavailregs_dma;
+	/* physical address where updates occur */
+	dma_addr_t ipath_pioavailregs_phys;
+	struct _ipath_layer ipath_layer;
+	/* setup intr */
+	int (*ipath_f_intrsetup)(struct ipath_devdata *);
+	/* fallback to alternate interrupt type if possible */
+	int (*ipath_f_intr_fallback)(struct ipath_devdata *);
+	/* setup on-chip bus config */
+	int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
+	/* hard reset chip */
+	int (*ipath_f_reset)(struct ipath_devdata *);
+	int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
+				     size_t);
+	void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
+	void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
+					size_t);
+	void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
+	int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
+	int (*ipath_f_early_init)(struct ipath_devdata *);
+	void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
+	void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
+				u32, unsigned long);
+	void (*ipath_f_tidtemplate)(struct ipath_devdata *);
+	void (*ipath_f_cleanup)(struct ipath_devdata *);
+	void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
+	/* fill out chip-specific fields */
+	int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
+	/* free irq */
+	void (*ipath_f_free_irq)(struct ipath_devdata *);
+	struct ipath_message_header *(*ipath_f_get_msgheader)
+					(struct ipath_devdata *, __le32 *);
+	void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
+	int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
+	int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
+	void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16);
+	void (*ipath_f_read_counters)(struct ipath_devdata *,
+					struct infinipath_counters *);
+	void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
+	/* per chip actions needed for IB Link up/down changes */
+	int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
+
+	unsigned ipath_lastegr_idx;
+	struct ipath_ibdev *verbs_dev;
+	struct timer_list verbs_timer;
+	/* total dwords sent (summed from counter) */
+	u64 ipath_sword;
+	/* total dwords rcvd (summed from counter) */
+	u64 ipath_rword;
+	/* total packets sent (summed from counter) */
+	u64 ipath_spkts;
+	/* total packets rcvd (summed from counter) */
+	u64 ipath_rpkts;
+	/* ipath_statusp initially points to this. */
+	u64 _ipath_status;
+	/* GUID for this interface, in network order */
+	__be64 ipath_guid;
+	/*
+	 * aggregrate of error bits reported since last cleared, for
+	 * limiting of error reporting
+	 */
+	ipath_err_t ipath_lasterror;
+	/*
+	 * aggregrate of error bits reported since last cleared, for
+	 * limiting of hwerror reporting
+	 */
+	ipath_err_t ipath_lasthwerror;
+	/* errors masked because they occur too fast */
+	ipath_err_t ipath_maskederrs;
+	u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
+	/* these 5 fields are used to establish deltas for IB Symbol
+	 * errors and linkrecovery errors. They can be reported on
+	 * some chips during link negotiation prior to INIT, and with
+	 * DDR when faking DDR negotiations with non-IBTA switches.
+	 * The chip counters are adjusted at driver unload if there is
+	 * a non-zero delta.
+	 */
+	u64 ibdeltainprog;
+	u64 ibsymdelta;
+	u64 ibsymsnap;
+	u64 iblnkerrdelta;
+	u64 iblnkerrsnap;
+
+	/* time in jiffies at which to re-enable maskederrs */
+	unsigned long ipath_unmasktime;
+	/* count of egrfull errors, combined for all ports */
+	u64 ipath_last_tidfull;
+	/* for ipath_qcheck() */
+	u64 ipath_lastport0rcv_cnt;
+	/* template for writing TIDs  */
+	u64 ipath_tidtemplate;
+	/* value to write to free TIDs */
+	u64 ipath_tidinvalid;
+	/* IBA6120 rcv interrupt setup */
+	u64 ipath_rhdrhead_intr_off;
+
+	/* size of memory at ipath_kregbase */
+	u32 ipath_kregsize;
+	/* number of registers used for pioavail */
+	u32 ipath_pioavregs;
+	/* IPATH_POLL, etc. */
+	u32 ipath_flags;
+	/* ipath_flags driver is waiting for */
+	u32 ipath_state_wanted;
+	/* last buffer for user use, first buf for kernel use is this
+	 * index. */
+	u32 ipath_lastport_piobuf;
+	/* is a stats timer active */
+	u32 ipath_stats_timer_active;
+	/* number of interrupts for this device -- saturates... */
+	u32 ipath_int_counter;
+	/* dwords sent read from counter */
+	u32 ipath_lastsword;
+	/* dwords received read from counter */
+	u32 ipath_lastrword;
+	/* sent packets read from counter */
+	u32 ipath_lastspkts;
+	/* received packets read from counter */
+	u32 ipath_lastrpkts;
+	/* pio bufs allocated per port */
+	u32 ipath_pbufsport;
+	/* if remainder on bufs/port, ports < extrabuf get 1 extra */
+	u32 ipath_ports_extrabuf;
+	u32 ipath_pioupd_thresh; /* update threshold, some chips */
+	/*
+	 * number of ports configured as max; zero is set to number chip
+	 * supports, less gives more pio bufs/port, etc.
+	 */
+	u32 ipath_cfgports;
+	/* count of port 0 hdrqfull errors */
+	u32 ipath_p0_hdrqfull;
+	/* port 0 number of receive eager buffers */
+	u32 ipath_p0_rcvegrcnt;
+
+	/*
+	 * index of last piobuffer we used.  Speeds up searching, by
+	 * starting at this point.  Doesn't matter if multiple cpu's use and
+	 * update, last updater is only write that matters.  Whenever it
+	 * wraps, we update shadow copies.  Need a copy per device when we
+	 * get to multiple devices
+	 */
+	u32 ipath_lastpioindex;
+	u32 ipath_lastpioindexl;
+	/* max length of freezemsg */
+	u32 ipath_freezelen;
+	/*
+	 * consecutive times we wanted a PIO buffer but were unable to
+	 * get one
+	 */
+	u32 ipath_consec_nopiobuf;
+	/*
+	 * hint that we should update ipath_pioavailshadow before
+	 * looking for a PIO buffer
+	 */
+	u32 ipath_upd_pio_shadow;
+	/* so we can rewrite it after a chip reset */
+	u32 ipath_pcibar0;
+	/* so we can rewrite it after a chip reset */
+	u32 ipath_pcibar1;
+	u32 ipath_x1_fix_tries;
+	u32 ipath_autoneg_tries;
+	u32 serdes_first_init_done;
+
+	struct ipath_relock {
+		atomic_t ipath_relock_timer_active;
+		struct timer_list ipath_relock_timer;
+		unsigned int ipath_relock_interval; /* in jiffies */
+	} ipath_relock_singleton;
+
+	/* interrupt number */
+	int ipath_irq;
+	/* HT/PCI Vendor ID (here for NodeInfo) */
+	u16 ipath_vendorid;
+	/* HT/PCI Device ID (here for NodeInfo) */
+	u16 ipath_deviceid;
+	/* offset in HT config space of slave/primary interface block */
+	u8 ipath_ht_slave_off;
+	/* for write combining settings */
+	int wc_cookie;
+	/* ref count for each pkey */
+	atomic_t ipath_pkeyrefs[4];
+	/* shadow copy of struct page *'s for exp tid pages */
+	struct page **ipath_pageshadow;
+	/* shadow copy of dma handles for exp tid pages */
+	dma_addr_t *ipath_physshadow;
+	u64 __iomem *ipath_egrtidbase;
+	/* lock to workaround chip bug 9437 and others */
+	spinlock_t ipath_kernel_tid_lock;
+	spinlock_t ipath_user_tid_lock;
+	spinlock_t ipath_sendctrl_lock;
+	/* around ipath_pd and (user ports) port_cnt use (intr vs free) */
+	spinlock_t ipath_uctxt_lock;
+
+	/*
+	 * IPATH_STATUS_*,
+	 * this address is mapped readonly into user processes so they can
+	 * get status cheaply, whenever they want.
+	 */
+	u64 *ipath_statusp;
+	/* freeze msg if hw error put chip in freeze */
+	char *ipath_freezemsg;
+	/* pci access data structure */
+	struct pci_dev *pcidev;
+	struct cdev *user_cdev;
+	struct cdev *diag_cdev;
+	struct device *user_dev;
+	struct device *diag_dev;
+	/* timer used to prevent stats overflow, error throttling, etc. */
+	struct timer_list ipath_stats_timer;
+	/* timer to verify interrupts work, and fallback if possible */
+	struct timer_list ipath_intrchk_timer;
+	void *ipath_dummy_hdrq;	/* used after port close */
+	dma_addr_t ipath_dummy_hdrq_phys;
+
+	/* SendDMA related entries */
+	spinlock_t            ipath_sdma_lock;
+	unsigned long         ipath_sdma_status;
+	unsigned long         ipath_sdma_abort_jiffies;
+	unsigned long         ipath_sdma_abort_intr_timeout;
+	unsigned long         ipath_sdma_buf_jiffies;
+	struct ipath_sdma_desc *ipath_sdma_descq;
+	u64		      ipath_sdma_descq_added;
+	u64		      ipath_sdma_descq_removed;
+	int		      ipath_sdma_desc_nreserved;
+	u16                   ipath_sdma_descq_cnt;
+	u16                   ipath_sdma_descq_tail;
+	u16                   ipath_sdma_descq_head;
+	u16                   ipath_sdma_next_intr;
+	u16                   ipath_sdma_reset_wait;
+	u8                    ipath_sdma_generation;
+	struct tasklet_struct ipath_sdma_abort_task;
+	struct tasklet_struct ipath_sdma_notify_task;
+	struct list_head      ipath_sdma_activelist;
+	struct list_head      ipath_sdma_notifylist;
+	atomic_t              ipath_sdma_vl15_count;
+	struct timer_list     ipath_sdma_vl15_timer;
+
+	dma_addr_t       ipath_sdma_descq_phys;
+	volatile __le64 *ipath_sdma_head_dma;
+	dma_addr_t       ipath_sdma_head_phys;
+
+	unsigned long ipath_ureg_align; /* user register alignment */
+
+	struct delayed_work ipath_autoneg_work;
+	wait_queue_head_t ipath_autoneg_wait;
+
+	/* HoL blocking / user app forward-progress state */
+	unsigned          ipath_hol_state;
+	unsigned          ipath_hol_next;
+	struct timer_list ipath_hol_timer;
+
+	/*
+	 * Shadow copies of registers; size indicates read access size.
+	 * Most of them are readonly, but some are write-only register,
+	 * where we manipulate the bits in the shadow copy, and then write
+	 * the shadow copy to infinipath.
+	 *
+	 * We deliberately make most of these 32 bits, since they have
+	 * restricted range.  For any that we read, we won't to generate 32
+	 * bit accesses, since Opteron will generate 2 separate 32 bit HT
+	 * transactions for a 64 bit read, and we want to avoid unnecessary
+	 * HT transactions.
+	 */
+
+	/* This is the 64 bit group */
+
+	/*
+	 * shadow of pioavail, check to be sure it's large enough at
+	 * init time.
+	 */
+	unsigned long ipath_pioavailshadow[8];
+	/* bitmap of send buffers available for the kernel to use with PIO. */
+	unsigned long ipath_pioavailkernel[8];
+	/* shadow of kr_gpio_out, for rmw ops */
+	u64 ipath_gpio_out;
+	/* shadow the gpio mask register */
+	u64 ipath_gpio_mask;
+	/* shadow the gpio output enable, etc... */
+	u64 ipath_extctrl;
+	/* kr_revision shadow */
+	u64 ipath_revision;
+	/*
+	 * shadow of ibcctrl, for interrupt handling of link changes,
+	 * etc.
+	 */
+	u64 ipath_ibcctrl;
+	/*
+	 * last ibcstatus, to suppress "duplicate" status change messages,
+	 * mostly from 2 to 3
+	 */
+	u64 ipath_lastibcstat;
+	/* hwerrmask shadow */
+	ipath_err_t ipath_hwerrmask;
+	ipath_err_t ipath_errormask; /* errormask shadow */
+	/* interrupt config reg shadow */
+	u64 ipath_intconfig;
+	/* kr_sendpiobufbase value */
+	u64 ipath_piobufbase;
+	/* kr_ibcddrctrl shadow */
+	u64 ipath_ibcddrctrl;
+
+	/* these are the "32 bit" regs */
+
+	/*
+	 * number of GUIDs in the flash for this interface; may need some
+	 * rethinking for setting on other ifaces
+	 */
+	u32 ipath_nguid;
+	/*
+	 * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+	 * all expect bit fields to be "unsigned long"
+	 */
+	/* shadow kr_rcvctrl */
+	unsigned long ipath_rcvctrl;
+	/* shadow kr_sendctrl */
+	unsigned long ipath_sendctrl;
+	/* to not count armlaunch after cancel */
+	unsigned long ipath_lastcancel;
+	/* count cases where special trigger was needed (double write) */
+	unsigned long ipath_spectriggerhit;
+
+	/* value we put in kr_rcvhdrcnt */
+	u32 ipath_rcvhdrcnt;
+	/* value we put in kr_rcvhdrsize */
+	u32 ipath_rcvhdrsize;
+	/* value we put in kr_rcvhdrentsize */
+	u32 ipath_rcvhdrentsize;
+	/* offset of last entry in rcvhdrq */
+	u32 ipath_hdrqlast;
+	/* kr_portcnt value */
+	u32 ipath_portcnt;
+	/* kr_pagealign value */
+	u32 ipath_palign;
+	/* number of "2KB" PIO buffers */
+	u32 ipath_piobcnt2k;
+	/* size in bytes of "2KB" PIO buffers */
+	u32 ipath_piosize2k;
+	/* number of "4KB" PIO buffers */
+	u32 ipath_piobcnt4k;
+	/* size in bytes of "4KB" PIO buffers */
+	u32 ipath_piosize4k;
+	u32 ipath_pioreserved; /* reserved special-inkernel; */
+	/* kr_rcvegrbase value */
+	u32 ipath_rcvegrbase;
+	/* kr_rcvegrcnt value */
+	u32 ipath_rcvegrcnt;
+	/* kr_rcvtidbase value */
+	u32 ipath_rcvtidbase;
+	/* kr_rcvtidcnt value */
+	u32 ipath_rcvtidcnt;
+	/* kr_sendregbase */
+	u32 ipath_sregbase;
+	/* kr_userregbase */
+	u32 ipath_uregbase;
+	/* kr_counterregbase */
+	u32 ipath_cregbase;
+	/* shadow the control register contents */
+	u32 ipath_control;
+	/* PCI revision register (HTC rev on FPGA) */
+	u32 ipath_pcirev;
+
+	/* chip address space used by 4k pio buffers */
+	u32 ipath_4kalign;
+	/* The MTU programmed for this unit */
+	u32 ipath_ibmtu;
+	/*
+	 * The max size IB packet, included IB headers that we can send.
+	 * Starts same as ipath_piosize, but is affected when ibmtu is
+	 * changed, or by size of eager buffers
+	 */
+	u32 ipath_ibmaxlen;
+	/*
+	 * ibmaxlen at init time, limited by chip and by receive buffer
+	 * size.  Not changed after init.
+	 */
+	u32 ipath_init_ibmaxlen;
+	/* size of each rcvegrbuffer */
+	u32 ipath_rcvegrbufsize;
+	/* localbus width (1, 2,4,8,16,32) from config space  */
+	u32 ipath_lbus_width;
+	/* localbus speed (HT: 200,400,800,1000; PCIe 2500) */
+	u32 ipath_lbus_speed;
+	/*
+	 * number of sequential ibcstatus change for polling active/quiet
+	 * (i.e., link not coming up).
+	 */
+	u32 ipath_ibpollcnt;
+	/* low and high portions of MSI capability/vector */
+	u32 ipath_msi_lo;
+	/* saved after PCIe init for restore after reset */
+	u32 ipath_msi_hi;
+	/* MSI data (vector) saved for restore */
+	u16 ipath_msi_data;
+	/* MLID programmed for this instance */
+	u16 ipath_mlid;
+	/* LID programmed for this instance */
+	u16 ipath_lid;
+	/* list of pkeys programmed; 0 if not set */
+	u16 ipath_pkeys[4];
+	/*
+	 * ASCII serial number, from flash, large enough for original
+	 * all digit strings, and longer QLogic serial number format
+	 */
+	u8 ipath_serial[16];
+	/* human readable board version */
+	u8 ipath_boardversion[96];
+	u8 ipath_lbus_info[32]; /* human readable localbus info */
+	/* chip major rev, from ipath_revision */
+	u8 ipath_majrev;
+	/* chip minor rev, from ipath_revision */
+	u8 ipath_minrev;
+	/* board rev, from ipath_revision */
+	u8 ipath_boardrev;
+	/* saved for restore after reset */
+	u8 ipath_pci_cacheline;
+	/* LID mask control */
+	u8 ipath_lmc;
+	/* link width supported */
+	u8 ipath_link_width_supported;
+	/* link speed supported */
+	u8 ipath_link_speed_supported;
+	u8 ipath_link_width_enabled;
+	u8 ipath_link_speed_enabled;
+	u8 ipath_link_width_active;
+	u8 ipath_link_speed_active;
+	/* Rx Polarity inversion (compensate for ~tx on partner) */
+	u8 ipath_rx_pol_inv;
+
+	u8 ipath_r_portenable_shift;
+	u8 ipath_r_intravail_shift;
+	u8 ipath_r_tailupd_shift;
+	u8 ipath_r_portcfg_shift;
+
+	/* unit # of this chip, if present */
+	int ipath_unit;
+
+	/* local link integrity counter */
+	u32 ipath_lli_counter;
+	/* local link integrity errors */
+	u32 ipath_lli_errors;
+	/*
+	 * Above counts only cases where _successive_ LocalLinkIntegrity
+	 * errors were seen in the receive headers of kern-packets.
+	 * Below are the three (monotonically increasing) counters
+	 * maintained via GPIO interrupts on iba6120-rev2.
+	 */
+	u32 ipath_rxfc_unsupvl_errs;
+	u32 ipath_overrun_thresh_errs;
+	u32 ipath_lli_errs;
+
+	/*
+	 * Not all devices managed by a driver instance are the same
+	 * type, so these fields must be per-device.
+	 */
+	u64 ipath_i_bitsextant;
+	ipath_err_t ipath_e_bitsextant;
+	ipath_err_t ipath_hwe_bitsextant;
+
+	/*
+	 * Below should be computable from number of ports,
+	 * since they are never modified.
+	 */
+	u64 ipath_i_rcvavail_mask;
+	u64 ipath_i_rcvurg_mask;
+	u16 ipath_i_rcvurg_shift;
+	u16 ipath_i_rcvavail_shift;
+
+	/*
+	 * Register bits for selecting i2c direction and values, used for
+	 * I2C serial flash.
+	 */
+	u8 ipath_gpio_sda_num;
+	u8 ipath_gpio_scl_num;
+	u8 ipath_i2c_chain_type;
+	u64 ipath_gpio_sda;
+	u64 ipath_gpio_scl;
+
+	/* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
+	spinlock_t ipath_gpio_lock;
+
+	/*
+	 * IB link and linktraining states and masks that vary per chip in
+	 * some way.  Set at init, to avoid each IB status change interrupt
+	 */
+	u8 ibcs_ls_shift;
+	u8 ibcs_lts_mask;
+	u32 ibcs_mask;
+	u32 ib_init;
+	u32 ib_arm;
+	u32 ib_active;
+
+	u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
+
+	/*
+	 * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol
+	 * reg. Changes for IBA7220
+	 */
+	u8 ibcc_lic_mask; /* LinkInitCmd */
+	u8 ibcc_lc_shift; /* LinkCmd */
+	u8 ibcc_mpl_shift; /* Maxpktlen */
+
+	u8 delay_mult;
+
+	/* used to override LED behavior */
+	u8 ipath_led_override;  /* Substituted for normal value, if non-zero */
+	u16 ipath_led_override_timeoff; /* delta to next timer event */
+	u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */
+	u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */
+	atomic_t ipath_led_override_timer_active;
+	/* Used to flash LEDs in override mode */
+	struct timer_list ipath_led_override_timer;
+
+	/* Support (including locks) for EEPROM logging of errors and time */
+	/* control access to actual counters, timer */
+	spinlock_t ipath_eep_st_lock;
+	/* control high-level access to EEPROM */
+	struct mutex ipath_eep_lock;
+	/* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
+	uint64_t ipath_traffic_wds;
+	/* active time is kept in seconds, but logged in hours */
+	atomic_t ipath_active_time;
+	/* Below are nominal shadow of EEPROM, new since last EEPROM update */
+	uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
+	uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
+	uint16_t ipath_eep_hrs;
+	/*
+	 * masks for which bits of errs, hwerrs that cause
+	 * each of the counters to increment.
+	 */
+	struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
+
+	/* interrupt mitigation reload register info */
+	u16 ipath_jint_idle_ticks;	/* idle clock ticks */
+	u16 ipath_jint_max_packets;	/* max packets across all ports */
+
+	/*
+	 * lock for access to SerDes, and flags to sequence preset
+	 * versus steady-state. 7220-only at the moment.
+	 */
+	spinlock_t ipath_sdepb_lock;
+	u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
+};
+
+/* ipath_hol_state values (stopping/starting user proc, send flushing) */
+#define IPATH_HOL_UP       0
+#define IPATH_HOL_DOWN     1
+/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */
+#define IPATH_HOL_DOWNSTOP 0
+#define IPATH_HOL_DOWNCONT 1
+
+/* bit positions for sdma_status */
+#define IPATH_SDMA_ABORTING  0
+#define IPATH_SDMA_DISARMED  1
+#define IPATH_SDMA_DISABLED  2
+#define IPATH_SDMA_LAYERBUF  3
+#define IPATH_SDMA_RUNNING  30
+#define IPATH_SDMA_SHUTDOWN 31
+
+/* bit combinations that correspond to abort states */
+#define IPATH_SDMA_ABORT_NONE 0
+#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING)
+#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \
+	(1UL << IPATH_SDMA_DISARMED))
+#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \
+	(1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \
+	(1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \
+	(1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+
+#define IPATH_SDMA_BUF_NONE 0
+#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF)
+
+/* Private data for file operations */
+struct ipath_filedata {
+	struct ipath_portdata *pd;
+	unsigned subport;
+	unsigned tidcursor;
+	struct ipath_user_sdma_queue *pq;
+};
+extern struct list_head ipath_dev_list;
+extern spinlock_t ipath_devs_lock;
+extern struct ipath_devdata *ipath_lookup(int unit);
+
+int ipath_init_chip(struct ipath_devdata *, int);
+int ipath_enable_wc(struct ipath_devdata *dd);
+void ipath_disable_wc(struct ipath_devdata *dd);
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
+void ipath_shutdown_device(struct ipath_devdata *);
+void ipath_clear_freeze(struct ipath_devdata *);
+
+struct file_operations;
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+		    struct cdev **cdevp, struct device **devp);
+void ipath_cdev_cleanup(struct cdev **cdevp,
+			struct device **devp);
+
+int ipath_diag_add(struct ipath_devdata *);
+void ipath_diag_remove(struct ipath_devdata *);
+
+extern wait_queue_head_t ipath_state_wait;
+
+int ipath_user_add(struct ipath_devdata *dd);
+void ipath_user_remove(struct ipath_devdata *dd);
+
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
+
+extern int ipath_diag_inuse;
+
+irqreturn_t ipath_intr(int irq, void *devid);
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+		     ipath_err_t err);
+#if __IPATH_INFO || __IPATH_DBG
+extern const char *ipath_ibcstatus_str[];
+#endif
+
+/* clean up any per-chip chip-specific stuff */
+void ipath_chip_cleanup(struct ipath_devdata *);
+/* clean up any chip type-specific stuff */
+void ipath_chip_done(void);
+
+void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
+			  unsigned cnt);
+void ipath_cancel_sends(struct ipath_devdata *, int);
+
+int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
+void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
+
+int ipath_parse_ushort(const char *str, unsigned short *valp);
+
+void ipath_kreceive(struct ipath_portdata *);
+int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
+int ipath_reset_device(int);
+void ipath_get_faststats(unsigned long);
+int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
+int ipath_set_linkstate(struct ipath_devdata *, u8);
+int ipath_set_mtu(struct ipath_devdata *, u16);
+int ipath_set_lid(struct ipath_devdata *, u32, u8);
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
+void ipath_enable_armlaunch(struct ipath_devdata *);
+void ipath_disable_armlaunch(struct ipath_devdata *);
+void ipath_hol_down(struct ipath_devdata *);
+void ipath_hol_up(struct ipath_devdata *);
+void ipath_hol_event(unsigned long);
+void ipath_toggle_rclkrls(struct ipath_devdata *);
+void ipath_sd7220_clr_ibpar(struct ipath_devdata *);
+void ipath_set_relock_poll(struct ipath_devdata *, int);
+void ipath_shutdown_relock_poll(struct ipath_devdata *);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
+#define subport_fp(fp) \
+	((struct ipath_filedata *)(fp)->private_data)->subport
+#define tidcursor_fp(fp) \
+	((struct ipath_filedata *)(fp)->private_data)->tidcursor
+#define user_sdma_queue_fp(fp) \
+	((struct ipath_filedata *)(fp)->private_data)->pq
+
+/*
+ * values for ipath_flags
+ */
+		/* chip can report link latency (IB 1.2) */
+#define IPATH_HAS_LINK_LATENCY 0x1
+		/* The chip is up and initted */
+#define IPATH_INITTED       0x2
+		/* set if any user code has set kr_rcvhdrsize */
+#define IPATH_RCVHDRSZ_SET  0x4
+		/* The chip is present and valid for accesses */
+#define IPATH_PRESENT       0x8
+		/* HT link0 is only 8 bits wide, ignore upper byte crc
+		 * errors, etc. */
+#define IPATH_8BIT_IN_HT0   0x10
+		/* HT link1 is only 8 bits wide, ignore upper byte crc
+		 * errors, etc. */
+#define IPATH_8BIT_IN_HT1   0x20
+		/* The link is down */
+#define IPATH_LINKDOWN      0x40
+		/* The link level is up (0x11) */
+#define IPATH_LINKINIT      0x80
+		/* The link is in the armed (0x21) state */
+#define IPATH_LINKARMED     0x100
+		/* The link is in the active (0x31) state */
+#define IPATH_LINKACTIVE    0x200
+		/* link current state is unknown */
+#define IPATH_LINKUNK       0x400
+		/* Write combining flush needed for PIO */
+#define IPATH_PIO_FLUSH_WC  0x1000
+		/* DMA Receive tail pointer */
+#define IPATH_NODMA_RTAIL   0x2000
+		/* no IB cable, or no device on IB cable */
+#define IPATH_NOCABLE       0x4000
+		/* Supports port zero per packet receive interrupts via
+		 * GPIO */
+#define IPATH_GPIO_INTR     0x8000
+		/* uses the coded 4byte TID, not 8 byte */
+#define IPATH_4BYTE_TID     0x10000
+		/* packet/word counters are 32 bit, else those 4 counters
+		 * are 64bit */
+#define IPATH_32BITCOUNTERS 0x20000
+		/* Interrupt register is 64 bits */
+#define IPATH_INTREG_64     0x40000
+		/* can miss port0 rx interrupts */
+#define IPATH_DISABLED      0x80000 /* administratively disabled */
+		/* Use GPIO interrupts for new counters */
+#define IPATH_GPIO_ERRINTRS 0x100000
+#define IPATH_SWAP_PIOBUFS  0x200000
+		/* Supports Send DMA */
+#define IPATH_HAS_SEND_DMA  0x400000
+		/* Supports Send Count (not just word count) in PBC */
+#define IPATH_HAS_PBC_CNT   0x800000
+		/* Suppress heartbeat, even if turning off loopback */
+#define IPATH_NO_HRTBT      0x1000000
+#define IPATH_HAS_THRESH_UPDATE 0x4000000
+#define IPATH_HAS_MULT_IB_SPEED 0x8000000
+#define IPATH_IB_AUTONEG_INPROG 0x10000000
+#define IPATH_IB_AUTONEG_FAILED 0x20000000
+		/* Linkdown-disable intentionally, Do not attempt to bring up */
+#define IPATH_IB_LINK_DISABLED 0x40000000
+#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */
+
+/* Bits in GPIO for the added interrupts */
+#define IPATH_GPIO_PORT0_BIT 2
+#define IPATH_GPIO_RXUVL_BIT 3
+#define IPATH_GPIO_OVRUN_BIT 4
+#define IPATH_GPIO_LLI_BIT 5
+#define IPATH_GPIO_ERRINTR_MASK 0x38
+
+/* portdata flag bit offsets */
+		/* waiting for a packet to arrive */
+#define IPATH_PORT_WAITING_RCV   2
+		/* master has not finished initializing */
+#define IPATH_PORT_MASTER_UNINIT 4
+		/* waiting for an urgent packet to arrive */
+#define IPATH_PORT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+void ipath_free_data(struct ipath_portdata *dd);
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *);
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+				unsigned len, int avail);
+void ipath_init_iba6110_funcs(struct ipath_devdata *);
+void ipath_get_eeprom_info(struct ipath_devdata *);
+int ipath_update_eeprom_log(struct ipath_devdata *dd);
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
+u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
+void ipath_disarm_senderrbufs(struct ipath_devdata *);
+void ipath_force_pio_avail_update(struct ipath_devdata *);
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define IPATH_LED_LOG 2  /* Logical (link) YELLOW LED */
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
+
+/* send dma routines */
+int setup_sdma(struct ipath_devdata *);
+void teardown_sdma(struct ipath_devdata *);
+void ipath_restart_sdma(struct ipath_devdata *);
+void ipath_sdma_intr(struct ipath_devdata *);
+int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *,
+			  u32, struct ipath_verbs_txreq *);
+/* ipath_sdma_lock should be locked before calling this. */
+int ipath_sdma_make_progress(struct ipath_devdata *dd);
+
+/* must be called under ipath_sdma_lock */
+static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd)
+{
+	return dd->ipath_sdma_descq_cnt -
+		(dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) -
+		1 - dd->ipath_sdma_desc_nreserved;
+}
+
+static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt)
+{
+	dd->ipath_sdma_desc_nreserved += cnt;
+}
+
+static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt)
+{
+	dd->ipath_sdma_desc_nreserved -= cnt;
+}
+
+/*
+ * number of words used for protocol header if not set by ipath_userinit();
+ */
+#define IPATH_DFLT_RCVHDRSIZE 9
+
+int ipath_get_user_pages(unsigned long, size_t, struct page **);
+void ipath_release_user_pages(struct page **, size_t);
+void ipath_release_user_pages_on_close(struct page **, size_t);
+int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
+int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
+int ipath_tempsense_read(struct ipath_devdata *, u8 regnum);
+int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data);
+
+/* these are used for the registers that vary with port */
+void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
+			   unsigned, u64);
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner.  It also gives us some error
+ * checking.  64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible.  User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
+
+/*
+ * At the moment, none of the s-registers are writable, so no
+ * ipath_write_sreg().
+ */
+
+/**
+ * ipath_read_ureg32 - read 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @port: port number
+ *
+ * Return the contents of a register that is virtualized to be per port.
+ * Returns -1 on errors (not distinguishable from valid contents at
+ * runtime; we may add a separate error variable at some point).
+ */
+static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
+				    ipath_ureg regno, int port)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return 0;
+
+	return readl(regno + (u64 __iomem *)
+		     (dd->ipath_uregbase +
+		      (char __iomem *)dd->ipath_kregbase +
+		      dd->ipath_ureg_align * port));
+}
+
+/**
+ * ipath_write_ureg - write 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @port: port
+ *
+ * Write the contents of a register that is virtualized to be per port.
+ */
+static inline void ipath_write_ureg(const struct ipath_devdata *dd,
+				    ipath_ureg regno, u64 value, int port)
+{
+	u64 __iomem *ubase = (u64 __iomem *)
+		(dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
+		 dd->ipath_ureg_align * port);
+	if (dd->ipath_kregbase)
+		writeq(value, &ubase[regno]);
+}
+
+static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
+				    ipath_kreg regno)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return -1;
+	return readl((u32 __iomem *) & dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
+				    ipath_kreg regno)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return -1;
+
+	return readq(&dd->ipath_kregbase[regno]);
+}
+
+static inline void ipath_write_kreg(const struct ipath_devdata *dd,
+				    ipath_kreg regno, u64 value)
+{
+	if (dd->ipath_kregbase)
+		writeq(value, &dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
+				  ipath_sreg regno)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return 0;
+
+	return readq(regno + (u64 __iomem *)
+		     (dd->ipath_cregbase +
+		      (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
+					 ipath_sreg regno)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return 0;
+	return readl(regno + (u64 __iomem *)
+		     (dd->ipath_cregbase +
+		      (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_write_creg(const struct ipath_devdata *dd,
+				    ipath_creg regno, u64 value)
+{
+	if (dd->ipath_kregbase)
+		writeq(value, regno + (u64 __iomem *)
+		       (dd->ipath_cregbase +
+			(char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
+{
+	*((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
+{
+	return (u32) le64_to_cpu(*((volatile __le64 *)
+				pd->port_rcvhdrtail_kvaddr));
+}
+
+static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd)
+{
+	const struct ipath_devdata *dd = pd->port_dd;
+	u32 hdrqtail;
+
+	if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+		__le32 *rhf_addr;
+		u32 seq;
+
+		rhf_addr = (__le32 *) pd->port_rcvhdrq +
+			pd->port_head + dd->ipath_rhf_offset;
+		seq = ipath_hdrget_seq(rhf_addr);
+		hdrqtail = pd->port_head;
+		if (seq == pd->port_seq_cnt)
+			hdrqtail++;
+	} else
+		hdrqtail = ipath_get_rcvhdrtail(pd);
+
+	return hdrqtail;
+}
+
+static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
+{
+	return (dd->ipath_flags & IPATH_INTREG_64) ?
+		ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return linkstate
+ * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
+ * everywhere, anyway (and should be, for almost all purposes).
+ */
+static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+	u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
+		INFINIPATH_IBCS_LINKSTATE_MASK;
+	if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
+		state = INFINIPATH_IBCS_L_STATE_ACTIVE;
+	return state;
+}
+
+/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
+static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
+{
+	return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+		dd->ibcs_lts_mask;
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return logical link state
+ * combination of link state and linktraining state (down, active, init,
+ * arm, etc.
+ */
+static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs)
+{
+	u32 ibs;
+	ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+		dd->ibcs_lts_mask;
+	ibs |= (u32)(ibcs &
+		(INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift));
+	return ibs;
+}
+
+/*
+ * sysfs interface.
+ */
+
+struct device_driver;
+
+extern const char ib_ipath_version[];
+
+extern const struct attribute_group *ipath_driver_attr_groups[];
+
+int ipath_device_create_group(struct device *, struct ipath_devdata *);
+void ipath_device_remove_group(struct device *, struct ipath_devdata *);
+int ipath_expose_reset(struct device *);
+
+int ipath_init_ipathfs(void);
+void ipath_exit_ipathfs(void);
+int ipathfs_add_device(struct ipath_devdata *);
+int ipathfs_remove_device(struct ipath_devdata *);
+
+/*
+ * dma_addr wrappers - all 0's invalid for hw
+ */
+dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
+			  size_t, int);
+dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
+const char *ipath_get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+#if defined(CONFIG_X86_64)
+#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
+#else
+#define ipath_flush_wc() wmb()
+#endif
+
+extern unsigned ipath_debug; /* debugging bit mask */
+extern unsigned ipath_linkrecovery;
+extern unsigned ipath_mtu4096;
+extern struct mutex ipath_mutex;
+
+#define IPATH_DRV_NAME		"ib_ipath"
+#define IPATH_MAJOR		233
+#define IPATH_USER_MINOR_BASE	0
+#define IPATH_DIAGPKT_MINOR	127
+#define IPATH_DIAG_MINOR_BASE	129
+#define IPATH_NMINORS		255
+
+#define ipath_dev_err(dd,fmt,...) \
+	do { \
+		const struct ipath_devdata *__dd = (dd); \
+		if (__dd->pcidev) \
+			dev_err(&__dd->pcidev->dev, "%s: " fmt, \
+				ipath_get_unit_name(__dd->ipath_unit), \
+				##__VA_ARGS__); \
+		else \
+			printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
+			       ipath_get_unit_name(__dd->ipath_unit), \
+			       ##__VA_ARGS__); \
+	} while (0)
+
+#if _IPATH_DEBUGGING
+
+# define __IPATH_DBG_WHICH(which,fmt,...) \
+	do { \
+		if (unlikely(ipath_debug & (which))) \
+			printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
+			       __func__,##__VA_ARGS__); \
+	} while(0)
+
+# define ipath_dbg(fmt,...) \
+	__IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
+# define ipath_cdbg(which,fmt,...) \
+	__IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
+
+#else /* ! _IPATH_DEBUGGING */
+
+# define ipath_dbg(fmt,...)
+# define ipath_cdbg(which,fmt,...)
+
+#endif /* _IPATH_DEBUGGING */
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct ipath_hwerror_msgs {
+	u64 mask;
+	const char *msg;
+};
+
+#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
+
+/* in ipath_intr.c... */
+void ipath_format_hwerrors(u64 hwerrs,
+			   const struct ipath_hwerror_msgs *hwerrmsgs,
+			   size_t nhwerrmsgs,
+			   char *msg, size_t lmsg);
+
+#endif				/* _IPATH_KERNEL_H */
diff --git a/drivers/staging/rdma/ipath/ipath_keys.c b/drivers/staging/rdma/ipath/ipath_keys.c
new file mode 100644
index 0000000..c0e933f
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_keys.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_alloc_lkey - allocate an lkey
+ * @rkt: lkey table in which to allocate the lkey
+ * @mr: memory region that this lkey protects
+ *
+ * Returns 1 if successful, otherwise returns 0.
+ */
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
+{
+	unsigned long flags;
+	u32 r;
+	u32 n;
+	int ret;
+
+	spin_lock_irqsave(&rkt->lock, flags);
+
+	/* Find the next available LKEY */
+	r = n = rkt->next;
+	for (;;) {
+		if (rkt->table[r] == NULL)
+			break;
+		r = (r + 1) & (rkt->max - 1);
+		if (r == n) {
+			spin_unlock_irqrestore(&rkt->lock, flags);
+			ipath_dbg("LKEY table full\n");
+			ret = 0;
+			goto bail;
+		}
+	}
+	rkt->next = (r + 1) & (rkt->max - 1);
+	/*
+	 * Make sure lkey is never zero which is reserved to indicate an
+	 * unrestricted LKEY.
+	 */
+	rkt->gen++;
+	mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
+		((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
+		 << 8);
+	if (mr->lkey == 0) {
+		mr->lkey |= 1 << 8;
+		rkt->gen++;
+	}
+	rkt->table[r] = mr;
+	spin_unlock_irqrestore(&rkt->lock, flags);
+
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_free_lkey - free an lkey
+ * @rkt: table from which to free the lkey
+ * @lkey: lkey id to free
+ */
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
+{
+	unsigned long flags;
+	u32 r;
+
+	if (lkey == 0)
+		return;
+	r = lkey >> (32 - ib_ipath_lkey_table_size);
+	spin_lock_irqsave(&rkt->lock, flags);
+	rkt->table[r] = NULL;
+	spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/**
+ * ipath_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+		  struct ib_sge *sge, int acc)
+{
+	struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+	struct ipath_mregion *mr;
+	unsigned n, m;
+	size_t off;
+	int ret;
+
+	/*
+	 * We use LKEY == zero for kernel virtual addresses
+	 * (see ipath_get_dma_mr and ipath_dma.c).
+	 */
+	if (sge->lkey == 0) {
+		/* always a kernel port, no locking needed */
+		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+		if (pd->user) {
+			ret = 0;
+			goto bail;
+		}
+		isge->mr = NULL;
+		isge->vaddr = (void *) sge->addr;
+		isge->length = sge->length;
+		isge->sge_length = sge->length;
+		ret = 1;
+		goto bail;
+	}
+	mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
+	if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
+		     qp->ibqp.pd != mr->pd)) {
+		ret = 0;
+		goto bail;
+	}
+
+	off = sge->addr - mr->user_base;
+	if (unlikely(sge->addr < mr->user_base ||
+		     off + sge->length > mr->length ||
+		     (mr->access_flags & acc) != acc)) {
+		ret = 0;
+		goto bail;
+	}
+
+	off += mr->offset;
+	m = 0;
+	n = 0;
+	while (off >= mr->map[m]->segs[n].length) {
+		off -= mr->map[m]->segs[n].length;
+		n++;
+		if (n >= IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	isge->mr = mr;
+	isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+	isge->length = mr->map[m]->segs[n].length - off;
+	isge->sge_length = sge->length;
+	isge->m = m;
+	isge->n = n;
+
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_rkey_ok - check the IB virtual address, length, and RKEY
+ * @dev: infiniband device
+ * @ss: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ */
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+		  u32 len, u64 vaddr, u32 rkey, int acc)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_lkey_table *rkt = &dev->lk_table;
+	struct ipath_sge *sge = &ss->sge;
+	struct ipath_mregion *mr;
+	unsigned n, m;
+	size_t off;
+	int ret;
+
+	/*
+	 * We use RKEY == zero for kernel virtual addresses
+	 * (see ipath_get_dma_mr and ipath_dma.c).
+	 */
+	if (rkey == 0) {
+		/* always a kernel port, no locking needed */
+		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+		if (pd->user) {
+			ret = 0;
+			goto bail;
+		}
+		sge->mr = NULL;
+		sge->vaddr = (void *) vaddr;
+		sge->length = len;
+		sge->sge_length = len;
+		ss->sg_list = NULL;
+		ss->num_sge = 1;
+		ret = 1;
+		goto bail;
+	}
+
+	mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
+	if (unlikely(mr == NULL || mr->lkey != rkey ||
+		     qp->ibqp.pd != mr->pd)) {
+		ret = 0;
+		goto bail;
+	}
+
+	off = vaddr - mr->iova;
+	if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+		     (mr->access_flags & acc) == 0)) {
+		ret = 0;
+		goto bail;
+	}
+
+	off += mr->offset;
+	m = 0;
+	n = 0;
+	while (off >= mr->map[m]->segs[n].length) {
+		off -= mr->map[m]->segs[n].length;
+		n++;
+		if (n >= IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	sge->mr = mr;
+	sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+	sge->length = mr->map[m]->segs[n].length - off;
+	sge->sge_length = len;
+	sge->m = m;
+	sge->n = n;
+	ss->sg_list = NULL;
+	ss->num_sge = 1;
+
+	ret = 1;
+
+bail:
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_mad.c b/drivers/staging/rdma/ipath/ipath_mad.c
new file mode 100644
index 0000000..ad3a926
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_mad.c
@@ -0,0 +1,1521 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_pma.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define IB_SMP_UNSUP_VERSION	cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD	cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR	cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD	cpu_to_be16(0x001C)
+
+static int reply(struct ib_smp *smp)
+{
+	/*
+	 * The verbs framework will handle the directed/LID route
+	 * packet changes.
+	 */
+	smp->method = IB_MGMT_METHOD_GET_RESP;
+	if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		smp->status |= IB_SMP_DIRECTION;
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int recv_subn_get_nodedescription(struct ib_smp *smp,
+					 struct ib_device *ibdev)
+{
+	if (smp->attr_mod)
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
+
+	return reply(smp);
+}
+
+struct nodeinfo {
+	u8 base_version;
+	u8 class_version;
+	u8 node_type;
+	u8 num_ports;
+	__be64 sys_guid;
+	__be64 node_guid;
+	__be64 port_guid;
+	__be16 partition_cap;
+	__be16 device_id;
+	__be32 revision;
+	u8 local_port_num;
+	u8 vendor_id[3];
+} __attribute__ ((packed));
+
+static int recv_subn_get_nodeinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev, u8 port)
+{
+	struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
+	struct ipath_devdata *dd = to_idev(ibdev)->dd;
+	u32 vendor, majrev, minrev;
+
+	/* GUID 0 is illegal */
+	if (smp->attr_mod || (dd->ipath_guid == 0))
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	nip->base_version = 1;
+	nip->class_version = 1;
+	nip->node_type = 1;	/* channel adapter */
+	/*
+	 * XXX The num_ports value will need a layer function to get
+	 * the value if we ever have more than one IB port on a chip.
+	 * We will also need to get the GUID for the port.
+	 */
+	nip->num_ports = ibdev->phys_port_cnt;
+	/* This is already in network order */
+	nip->sys_guid = to_idev(ibdev)->sys_image_guid;
+	nip->node_guid = dd->ipath_guid;
+	nip->port_guid = dd->ipath_guid;
+	nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd));
+	nip->device_id = cpu_to_be16(dd->ipath_deviceid);
+	majrev = dd->ipath_majrev;
+	minrev = dd->ipath_minrev;
+	nip->revision = cpu_to_be32((majrev << 16) | minrev);
+	nip->local_port_num = port;
+	vendor = dd->ipath_vendorid;
+	nip->vendor_id[0] = IPATH_SRC_OUI_1;
+	nip->vendor_id[1] = IPATH_SRC_OUI_2;
+	nip->vendor_id[2] = IPATH_SRC_OUI_3;
+
+	return reply(smp);
+}
+
+static int recv_subn_get_guidinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev)
+{
+	u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+	__be64 *p = (__be64 *) smp->data;
+
+	/* 32 blocks of 8 64-bit GUIDs per block */
+
+	memset(smp->data, 0, sizeof(smp->data));
+
+	/*
+	 * We only support one GUID for now.  If this changes, the
+	 * portinfo.guid_cap field needs to be updated too.
+	 */
+	if (startgx == 0) {
+		__be64 g = to_idev(ibdev)->dd->ipath_guid;
+		if (g == 0)
+			/* GUID 0 is illegal */
+			smp->status |= IB_SMP_INVALID_FIELD;
+		else
+			/* The first is a copy of the read-only HW GUID. */
+			*p = g;
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return reply(smp);
+}
+
+static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
+{
+	(void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s)
+{
+	(void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s);
+}
+
+static int get_overrunthreshold(struct ipath_devdata *dd)
+{
+	return (dd->ipath_ibcctrl >>
+		INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+		INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+}
+
+/**
+ * set_overrunthreshold - set the overrun threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
+{
+	unsigned v;
+
+	v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+		INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+	if (v != n) {
+		dd->ipath_ibcctrl &=
+			~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
+			  INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
+		dd->ipath_ibcctrl |=
+			(u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+	}
+	return 0;
+}
+
+static int get_phyerrthreshold(struct ipath_devdata *dd)
+{
+	return (dd->ipath_ibcctrl >>
+		INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+		INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+}
+
+/**
+ * set_phyerrthreshold - set the physical error threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
+{
+	unsigned v;
+
+	v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+		INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+	if (v != n) {
+		dd->ipath_ibcctrl &=
+			~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
+			  INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
+		dd->ipath_ibcctrl |=
+			(u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+	}
+	return 0;
+}
+
+/**
+ * get_linkdowndefaultstate - get the default linkdown state
+ * @dd: the infinipath device
+ *
+ * Returns zero if the default is POLL, 1 if the default is SLEEP.
+ */
+static int get_linkdowndefaultstate(struct ipath_devdata *dd)
+{
+	return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
+}
+
+static int recv_subn_get_portinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev, u8 port)
+{
+	struct ipath_ibdev *dev;
+	struct ipath_devdata *dd;
+	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+	u16 lid;
+	u8 ibcstat;
+	u8 mtu;
+	int ret;
+
+	if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		ret = reply(smp);
+		goto bail;
+	}
+
+	dev = to_idev(ibdev);
+	dd = dev->dd;
+
+	/* Clear all fields.  Only set the non-zero fields. */
+	memset(smp->data, 0, sizeof(smp->data));
+
+	/* Only return the mkey if the protection field allows it. */
+	if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
+	    dev->mkeyprot == 0)
+		pip->mkey = dev->mkey;
+	pip->gid_prefix = dev->gid_prefix;
+	lid = dd->ipath_lid;
+	pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
+	pip->sm_lid = cpu_to_be16(dev->sm_lid);
+	pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
+	/* pip->diag_code; */
+	pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
+	pip->local_port_num = port;
+	pip->link_width_enabled = dd->ipath_link_width_enabled;
+	pip->link_width_supported = dd->ipath_link_width_supported;
+	pip->link_width_active = dd->ipath_link_width_active;
+	pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4;
+	ibcstat = dd->ipath_lastibcstat;
+	/* map LinkState to IB portinfo values.  */
+	pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1;
+
+	pip->portphysstate_linkdown =
+		(ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) |
+		(get_linkdowndefaultstate(dd) ? 1 : 2);
+	pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc;
+	pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) |
+		dd->ipath_link_speed_enabled;
+	switch (dd->ipath_ibmtu) {
+	case 4096:
+		mtu = IB_MTU_4096;
+		break;
+	case 2048:
+		mtu = IB_MTU_2048;
+		break;
+	case 1024:
+		mtu = IB_MTU_1024;
+		break;
+	case 512:
+		mtu = IB_MTU_512;
+		break;
+	case 256:
+		mtu = IB_MTU_256;
+		break;
+	default:		/* oops, something is wrong */
+		mtu = IB_MTU_2048;
+		break;
+	}
+	pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
+	pip->vlcap_inittype = 0x10;	/* VLCap = VL0, InitType = 0 */
+	pip->vl_high_limit = dev->vl_high_limit;
+	/* pip->vl_arb_high_cap; // only one VL */
+	/* pip->vl_arb_low_cap; // only one VL */
+	/* InitTypeReply = 0 */
+	/* our mtu cap depends on whether 4K MTU enabled or not */
+	pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+	/* HCAs ignore VLStallCount and HOQLife */
+	/* pip->vlstallcnt_hoqlife; */
+	pip->operationalvl_pei_peo_fpi_fpo = 0x10;	/* OVLs = 1 */
+	pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
+	/* P_KeyViolations are counted by hardware. */
+	pip->pkey_violations =
+		cpu_to_be16((ipath_get_cr_errpkey(dd) -
+			     dev->z_pkey_violations) & 0xFFFF);
+	pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
+	/* Only the hardware GUID is supported for now */
+	pip->guid_cap = 1;
+	pip->clientrereg_resv_subnetto = dev->subnet_timeout;
+	/* 32.768 usec. response time (guessing) */
+	pip->resv_resptimevalue = 3;
+	pip->localphyerrors_overrunerrors =
+		(get_phyerrthreshold(dd) << 4) |
+		get_overrunthreshold(dd);
+	/* pip->max_credit_hint; */
+	if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
+		u32 v;
+
+		v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY);
+		pip->link_roundtrip_latency[0] = v >> 16;
+		pip->link_roundtrip_latency[1] = v >> 8;
+		pip->link_roundtrip_latency[2] = v;
+	}
+
+	ret = reply(smp);
+
+bail:
+	return ret;
+}
+
+/**
+ * get_pkeys - return the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys)
+{
+	/* always a kernel port, no locking needed */
+	struct ipath_portdata *pd = dd->ipath_pd[0];
+
+	memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
+
+	return 0;
+}
+
+static int recv_subn_get_pkeytable(struct ib_smp *smp,
+				   struct ib_device *ibdev)
+{
+	u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+	u16 *p = (u16 *) smp->data;
+	__be16 *q = (__be16 *) smp->data;
+
+	/* 64 blocks of 32 16-bit P_Key entries */
+
+	memset(smp->data, 0, sizeof(smp->data));
+	if (startpx == 0) {
+		struct ipath_ibdev *dev = to_idev(ibdev);
+		unsigned i, n = ipath_get_npkeys(dev->dd);
+
+		get_pkeys(dev->dd, p);
+
+		for (i = 0; i < n; i++)
+			q[i] = cpu_to_be16(p[i]);
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return reply(smp);
+}
+
+static int recv_subn_set_guidinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev)
+{
+	/* The only GUID we support is the first read-only entry. */
+	return recv_subn_get_guidinfo(smp, ibdev);
+}
+
+/**
+ * set_linkdowndefaultstate - set the default linkdown state
+ * @dd: the infinipath device
+ * @sleep: the new state
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
+{
+	if (sleep)
+		dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+	else
+		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+			 dd->ipath_ibcctrl);
+	return 0;
+}
+
+/**
+ * recv_subn_set_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ * Set Portinfo (see ch. 14.2.5.6).
+ */
+static int recv_subn_set_portinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev, u8 port)
+{
+	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+	struct ib_event event;
+	struct ipath_ibdev *dev;
+	struct ipath_devdata *dd;
+	char clientrereg = 0;
+	u16 lid, smlid;
+	u8 lwe;
+	u8 lse;
+	u8 state;
+	u16 lstate;
+	u32 mtu;
+	int ret, ore;
+
+	if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
+		goto err;
+
+	dev = to_idev(ibdev);
+	dd = dev->dd;
+	event.device = ibdev;
+	event.element.port_num = port;
+
+	dev->mkey = pip->mkey;
+	dev->gid_prefix = pip->gid_prefix;
+	dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
+
+	lid = be16_to_cpu(pip->lid);
+	if (dd->ipath_lid != lid ||
+	    dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
+		/* Must be a valid unicast LID address. */
+		if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
+			goto err;
+		ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
+		event.event = IB_EVENT_LID_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	smlid = be16_to_cpu(pip->sm_lid);
+	if (smlid != dev->sm_lid) {
+		/* Must be a valid unicast LID address. */
+		if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE)
+			goto err;
+		dev->sm_lid = smlid;
+		event.event = IB_EVENT_SM_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	/* Allow 1x or 4x to be set (see 14.2.6.6). */
+	lwe = pip->link_width_enabled;
+	if (lwe) {
+		if (lwe == 0xFF)
+			lwe = dd->ipath_link_width_supported;
+		else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported))
+			goto err;
+		set_link_width_enabled(dd, lwe);
+	}
+
+	/* Allow 2.5 or 5.0 Gbs. */
+	lse = pip->linkspeedactive_enabled & 0xF;
+	if (lse) {
+		if (lse == 15)
+			lse = dd->ipath_link_speed_supported;
+		else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported))
+			goto err;
+		set_link_speed_enabled(dd, lse);
+	}
+
+	/* Set link down default state. */
+	switch (pip->portphysstate_linkdown & 0xF) {
+	case 0: /* NOP */
+		break;
+	case 1: /* SLEEP */
+		if (set_linkdowndefaultstate(dd, 1))
+			goto err;
+		break;
+	case 2: /* POLL */
+		if (set_linkdowndefaultstate(dd, 0))
+			goto err;
+		break;
+	default:
+		goto err;
+	}
+
+	dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
+	dev->vl_high_limit = pip->vl_high_limit;
+
+	switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
+	case IB_MTU_256:
+		mtu = 256;
+		break;
+	case IB_MTU_512:
+		mtu = 512;
+		break;
+	case IB_MTU_1024:
+		mtu = 1024;
+		break;
+	case IB_MTU_2048:
+		mtu = 2048;
+		break;
+	case IB_MTU_4096:
+		if (!ipath_mtu4096)
+			goto err;
+		mtu = 4096;
+		break;
+	default:
+		/* XXX We have already partially updated our state! */
+		goto err;
+	}
+	ipath_set_mtu(dd, mtu);
+
+	dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
+
+	/* We only support VL0 */
+	if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
+		goto err;
+
+	if (pip->mkey_violations == 0)
+		dev->mkey_violations = 0;
+
+	/*
+	 * Hardware counter can't be reset so snapshot and subtract
+	 * later.
+	 */
+	if (pip->pkey_violations == 0)
+		dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
+
+	if (pip->qkey_violations == 0)
+		dev->qkey_violations = 0;
+
+	ore = pip->localphyerrors_overrunerrors;
+	if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
+		goto err;
+
+	if (set_overrunthreshold(dd, (ore & 0xF)))
+		goto err;
+
+	dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
+
+	if (pip->clientrereg_resv_subnetto & 0x80) {
+		clientrereg = 1;
+		event.event = IB_EVENT_CLIENT_REREGISTER;
+		ib_dispatch_event(&event);
+	}
+
+	/*
+	 * Do the port state change now that the other link parameters
+	 * have been set.
+	 * Changing the port physical state only makes sense if the link
+	 * is down or is being set to down.
+	 */
+	state = pip->linkspeed_portstate & 0xF;
+	lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
+	if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
+		goto err;
+
+	/*
+	 * Only state changes of DOWN, ARM, and ACTIVE are valid
+	 * and must be in the correct state to take effect (see 7.2.6).
+	 */
+	switch (state) {
+	case IB_PORT_NOP:
+		if (lstate == 0)
+			break;
+		/* FALLTHROUGH */
+	case IB_PORT_DOWN:
+		if (lstate == 0)
+			lstate = IPATH_IB_LINKDOWN_ONLY;
+		else if (lstate == 1)
+			lstate = IPATH_IB_LINKDOWN_SLEEP;
+		else if (lstate == 2)
+			lstate = IPATH_IB_LINKDOWN;
+		else if (lstate == 3)
+			lstate = IPATH_IB_LINKDOWN_DISABLE;
+		else
+			goto err;
+		ipath_set_linkstate(dd, lstate);
+		if (lstate == IPATH_IB_LINKDOWN_DISABLE) {
+			ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+			goto done;
+		}
+		ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED |
+				IPATH_LINKACTIVE, 1000);
+		break;
+	case IB_PORT_ARMED:
+		ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+		break;
+	case IB_PORT_ACTIVE:
+		ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+		break;
+	default:
+		/* XXX We have already partially updated our state! */
+		goto err;
+	}
+
+	ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+	if (clientrereg)
+		pip->clientrereg_resv_subnetto |= 0x80;
+
+	goto done;
+
+err:
+	smp->status |= IB_SMP_INVALID_FIELD;
+	ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+done:
+	return ret;
+}
+
+/**
+ * rm_pkey - decrecment the reference count for the given PKEY
+ * @dd: the infinipath device
+ * @key: the PKEY index
+ *
+ * Return true if this was the last reference and the hardware table entry
+ * needs to be changed.
+ */
+static int rm_pkey(struct ipath_devdata *dd, u16 key)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (dd->ipath_pkeys[i] != key)
+			continue;
+		if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
+			dd->ipath_pkeys[i] = 0;
+			ret = 1;
+			goto bail;
+		}
+		break;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * add_pkey - add the given PKEY to the hardware table
+ * @dd: the infinipath device
+ * @key: the PKEY
+ *
+ * Return an error code if unable to add the entry, zero if no change,
+ * or 1 if the hardware PKEY register needs to be updated.
+ */
+static int add_pkey(struct ipath_devdata *dd, u16 key)
+{
+	int i;
+	u16 lkey = key & 0x7FFF;
+	int any = 0;
+	int ret;
+
+	if (lkey == 0x7FFF) {
+		ret = 0;
+		goto bail;
+	}
+
+	/* Look for an empty slot or a matching PKEY. */
+	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i]) {
+			any++;
+			continue;
+		}
+		/* If it matches exactly, try to increment the ref count */
+		if (dd->ipath_pkeys[i] == key) {
+			if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
+				ret = 0;
+				goto bail;
+			}
+			/* Lost the race. Look for an empty slot below. */
+			atomic_dec(&dd->ipath_pkeyrefs[i]);
+			any++;
+		}
+		/*
+		 * It makes no sense to have both the limited and unlimited
+		 * PKEY set at the same time since the unlimited one will
+		 * disable the limited one.
+		 */
+		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (!any) {
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i] &&
+		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+			/* for ipathstats, etc. */
+			ipath_stats.sps_pkeys[i] = lkey;
+			dd->ipath_pkeys[i] = key;
+			ret = 1;
+			goto bail;
+		}
+	}
+	ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+/**
+ * set_pkeys - set the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
+{
+	struct ipath_portdata *pd;
+	int i;
+	int changed = 0;
+
+	/* always a kernel port, no locking needed */
+	pd = dd->ipath_pd[0];
+
+	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+		u16 key = pkeys[i];
+		u16 okey = pd->port_pkeys[i];
+
+		if (key == okey)
+			continue;
+		/*
+		 * The value of this PKEY table entry is changing.
+		 * Remove the old entry in the hardware's array of PKEYs.
+		 */
+		if (okey & 0x7FFF)
+			changed |= rm_pkey(dd, okey);
+		if (key & 0x7FFF) {
+			int ret = add_pkey(dd, key);
+
+			if (ret < 0)
+				key = 0;
+			else
+				changed |= ret;
+		}
+		pd->port_pkeys[i] = key;
+	}
+	if (changed) {
+		u64 pkey;
+		struct ib_event event;
+
+		pkey = (u64) dd->ipath_pkeys[0] |
+			((u64) dd->ipath_pkeys[1] << 16) |
+			((u64) dd->ipath_pkeys[2] << 32) |
+			((u64) dd->ipath_pkeys[3] << 48);
+		ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
+			   (unsigned long long) pkey);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+				 pkey);
+
+		event.event = IB_EVENT_PKEY_CHANGE;
+		event.device = &dd->verbs_dev->ibdev;
+		event.element.port_num = port;
+		ib_dispatch_event(&event);
+	}
+	return 0;
+}
+
+static int recv_subn_set_pkeytable(struct ib_smp *smp,
+				   struct ib_device *ibdev, u8 port)
+{
+	u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+	__be16 *p = (__be16 *) smp->data;
+	u16 *q = (u16 *) smp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	unsigned i, n = ipath_get_npkeys(dev->dd);
+
+	for (i = 0; i < n; i++)
+		q[i] = be16_to_cpu(p[i]);
+
+	if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return recv_subn_get_pkeytable(smp, ibdev);
+}
+
+static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp)
+{
+	struct ib_class_port_info *p =
+		(struct ib_class_port_info *)pmp->data;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	if (pmp->mad_hdr.attr_mod != 0)
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	/* Indicate AllPortSelect is valid (only one port anyway) */
+	p->capability_mask = cpu_to_be16(1 << 8);
+	p->base_version = 1;
+	p->class_version = 1;
+	/*
+	 * Expected response time is 4.096 usec. * 2^18 == 1.073741824
+	 * sec.
+	 */
+	p->resp_time_value = 18;
+
+	return reply((struct ib_smp *) pmp);
+}
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
+				    COUNTER_MASK(1, 1) | \
+				    COUNTER_MASK(1, 2) | \
+				    COUNTER_MASK(1, 3) | \
+				    COUNTER_MASK(1, 4))
+
+static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
+					   struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portsamplescontrol *p =
+		(struct ib_pma_portsamplescontrol *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	unsigned long flags;
+	u8 port_select = p->port_select;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 ||
+	    (port_select != port && port_select != 0xFF))
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+	/*
+	 * Ticks are 10x the link transfer period which for 2.5Gbs is 4
+	 * nsec.  0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec.  Sample
+	 * intervals are counted in ticks.  Since we use Linux timers, that
+	 * count in jiffies, we can't sample for less than 1000 ticks if HZ
+	 * == 1000 (4000 ticks if HZ is 250).  link_speed_active returns 2 for
+	 * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that
+	 * have hardware support for delaying packets.
+	 */
+	if (crp->cr_psstat)
+		p->tick = dev->dd->ipath_link_speed_active - 1;
+	else
+		p->tick = 250;		/* 1 usec. */
+	p->counter_width = 4;	/* 32 bit counters */
+	p->counter_mask0_9 = COUNTER_MASK0_9;
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	if (crp->cr_psstat)
+		p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		p->sample_status = dev->pma_sample_status;
+	p->sample_start = cpu_to_be32(dev->pma_sample_start);
+	p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
+	p->tag = cpu_to_be16(dev->pma_tag);
+	p->counter_select[0] = dev->pma_counter_select[0];
+	p->counter_select[1] = dev->pma_counter_select[1];
+	p->counter_select[2] = dev->pma_counter_select[2];
+	p->counter_select[3] = dev->pma_counter_select[3];
+	p->counter_select[4] = dev->pma_counter_select[4];
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
+					   struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portsamplescontrol *p =
+		(struct ib_pma_portsamplescontrol *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	unsigned long flags;
+	u8 status;
+	int ret;
+
+	if (pmp->mad_hdr.attr_mod != 0 ||
+	    (p->port_select != port && p->port_select != 0xFF)) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		ret = reply((struct ib_smp *) pmp);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+		dev->pma_sample_start = be32_to_cpu(p->sample_start);
+		dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
+		dev->pma_tag = be16_to_cpu(p->tag);
+		dev->pma_counter_select[0] = p->counter_select[0];
+		dev->pma_counter_select[1] = p->counter_select[1];
+		dev->pma_counter_select[2] = p->counter_select[2];
+		dev->pma_counter_select[3] = p->counter_select[3];
+		dev->pma_counter_select[4] = p->counter_select[4];
+		if (crp->cr_psstat) {
+			ipath_write_creg(dev->dd, crp->cr_psinterval,
+					 dev->pma_sample_interval);
+			ipath_write_creg(dev->dd, crp->cr_psstart,
+					 dev->pma_sample_start);
+		} else
+			dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
+
+bail:
+	return ret;
+}
+
+static u64 get_counter(struct ipath_ibdev *dev,
+		       struct ipath_cregs const *crp,
+		       __be16 sel)
+{
+	u64 ret;
+
+	switch (sel) {
+	case IB_PMA_PORT_XMIT_DATA:
+		ret = (crp->cr_psxmitdatacount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
+			dev->ipath_sword;
+		break;
+	case IB_PMA_PORT_RCV_DATA:
+		ret = (crp->cr_psrcvdatacount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
+			dev->ipath_rword;
+		break;
+	case IB_PMA_PORT_XMIT_PKTS:
+		ret = (crp->cr_psxmitpktscount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
+			dev->ipath_spkts;
+		break;
+	case IB_PMA_PORT_RCV_PKTS:
+		ret = (crp->cr_psrcvpktscount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
+			dev->ipath_rpkts;
+		break;
+	case IB_PMA_PORT_XMIT_WAIT:
+		ret = (crp->cr_psxmitwaitcount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
+			dev->ipath_xmit_wait;
+		break;
+	default:
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp,
+					  struct ib_device *ibdev)
+{
+	struct ib_pma_portsamplesresult *p =
+		(struct ib_pma_portsamplesresult *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	u8 status;
+	int i;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+	p->tag = cpu_to_be16(dev->pma_tag);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	p->sample_status = cpu_to_be16(status);
+	for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+		p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+		    cpu_to_be32(
+			get_counter(dev, crp, dev->pma_counter_select[i]));
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
+					      struct ib_device *ibdev)
+{
+	struct ib_pma_portsamplesresult_ext *p =
+		(struct ib_pma_portsamplesresult_ext *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	u8 status;
+	int i;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+	p->tag = cpu_to_be16(dev->pma_tag);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	p->sample_status = cpu_to_be16(status);
+	/* 64 bits */
+	p->extended_width = cpu_to_be32(0x80000000);
+	for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+		p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+		    cpu_to_be64(
+			get_counter(dev, crp, dev->pma_counter_select[i]));
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters(struct ib_pma_mad *pmp,
+				     struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_verbs_counters cntrs;
+	u8 port_select = p->port_select;
+
+	ipath_get_counters(dev->dd, &cntrs);
+
+	/* Adjust counters for any resets done. */
+	cntrs.symbol_error_counter -= dev->z_symbol_error_counter;
+	cntrs.link_error_recovery_counter -=
+		dev->z_link_error_recovery_counter;
+	cntrs.link_downed_counter -= dev->z_link_downed_counter;
+	cntrs.port_rcv_errors += dev->rcv_errors;
+	cntrs.port_rcv_errors -= dev->z_port_rcv_errors;
+	cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors;
+	cntrs.port_xmit_discards -= dev->z_port_xmit_discards;
+	cntrs.port_xmit_data -= dev->z_port_xmit_data;
+	cntrs.port_rcv_data -= dev->z_port_rcv_data;
+	cntrs.port_xmit_packets -= dev->z_port_xmit_packets;
+	cntrs.port_rcv_packets -= dev->z_port_rcv_packets;
+	cntrs.local_link_integrity_errors -=
+		dev->z_local_link_integrity_errors;
+	cntrs.excessive_buffer_overrun_errors -=
+		dev->z_excessive_buffer_overrun_errors;
+	cntrs.vl15_dropped -= dev->z_vl15_dropped;
+	cntrs.vl15_dropped += dev->n_vl15_dropped;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 ||
+	    (port_select != port && port_select != 0xFF))
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	if (cntrs.symbol_error_counter > 0xFFFFUL)
+		p->symbol_error_counter = cpu_to_be16(0xFFFF);
+	else
+		p->symbol_error_counter =
+			cpu_to_be16((u16)cntrs.symbol_error_counter);
+	if (cntrs.link_error_recovery_counter > 0xFFUL)
+		p->link_error_recovery_counter = 0xFF;
+	else
+		p->link_error_recovery_counter =
+			(u8)cntrs.link_error_recovery_counter;
+	if (cntrs.link_downed_counter > 0xFFUL)
+		p->link_downed_counter = 0xFF;
+	else
+		p->link_downed_counter = (u8)cntrs.link_downed_counter;
+	if (cntrs.port_rcv_errors > 0xFFFFUL)
+		p->port_rcv_errors = cpu_to_be16(0xFFFF);
+	else
+		p->port_rcv_errors =
+			cpu_to_be16((u16) cntrs.port_rcv_errors);
+	if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+		p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+	else
+		p->port_rcv_remphys_errors =
+			cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
+	if (cntrs.port_xmit_discards > 0xFFFFUL)
+		p->port_xmit_discards = cpu_to_be16(0xFFFF);
+	else
+		p->port_xmit_discards =
+			cpu_to_be16((u16)cntrs.port_xmit_discards);
+	if (cntrs.local_link_integrity_errors > 0xFUL)
+		cntrs.local_link_integrity_errors = 0xFUL;
+	if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+		cntrs.excessive_buffer_overrun_errors = 0xFUL;
+	p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+		cntrs.excessive_buffer_overrun_errors;
+	if (cntrs.vl15_dropped > 0xFFFFUL)
+		p->vl15_dropped = cpu_to_be16(0xFFFF);
+	else
+		p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+	if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
+		p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
+	if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
+		p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
+	if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
+		p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_xmit_packets =
+			cpu_to_be32((u32)cntrs.port_xmit_packets);
+	if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
+		p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_rcv_packets =
+			cpu_to_be32((u32) cntrs.port_rcv_packets);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp,
+					 struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters_ext *p =
+		(struct ib_pma_portcounters_ext *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	u64 swords, rwords, spkts, rpkts, xwait;
+	u8 port_select = p->port_select;
+
+	ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+				&rpkts, &xwait);
+
+	/* Adjust counters for any resets done. */
+	swords -= dev->z_port_xmit_data;
+	rwords -= dev->z_port_rcv_data;
+	spkts -= dev->z_port_xmit_packets;
+	rpkts -= dev->z_port_rcv_packets;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 ||
+	    (port_select != port && port_select != 0xFF))
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	p->port_xmit_data = cpu_to_be64(swords);
+	p->port_rcv_data = cpu_to_be64(rwords);
+	p->port_xmit_packets = cpu_to_be64(spkts);
+	p->port_rcv_packets = cpu_to_be64(rpkts);
+	p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
+	p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
+	p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
+	p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portcounters(struct ib_pma_mad *pmp,
+				     struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_verbs_counters cntrs;
+
+	/*
+	 * Since the HW doesn't support clearing counters, we save the
+	 * current count and subtract it from future responses.
+	 */
+	ipath_get_counters(dev->dd, &cntrs);
+
+	if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
+		dev->z_symbol_error_counter = cntrs.symbol_error_counter;
+
+	if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
+		dev->z_link_error_recovery_counter =
+			cntrs.link_error_recovery_counter;
+
+	if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
+		dev->z_link_downed_counter = cntrs.link_downed_counter;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
+		dev->z_port_rcv_errors =
+			cntrs.port_rcv_errors + dev->rcv_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
+		dev->z_port_rcv_remphys_errors =
+			cntrs.port_rcv_remphys_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
+		dev->z_port_xmit_discards = cntrs.port_xmit_discards;
+
+	if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
+		dev->z_local_link_integrity_errors =
+			cntrs.local_link_integrity_errors;
+
+	if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
+		dev->z_excessive_buffer_overrun_errors =
+			cntrs.excessive_buffer_overrun_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
+		dev->n_vl15_dropped = 0;
+		dev->z_vl15_dropped = cntrs.vl15_dropped;
+	}
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
+		dev->z_port_xmit_data = cntrs.port_xmit_data;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
+		dev->z_port_rcv_data = cntrs.port_rcv_data;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
+		dev->z_port_xmit_packets = cntrs.port_xmit_packets;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
+		dev->z_port_rcv_packets = cntrs.port_rcv_packets;
+
+	return recv_pma_get_portcounters(pmp, ibdev, port);
+}
+
+static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
+					 struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	u64 swords, rwords, spkts, rpkts, xwait;
+
+	ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+				&rpkts, &xwait);
+
+	if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
+		dev->z_port_xmit_data = swords;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
+		dev->z_port_rcv_data = rwords;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
+		dev->z_port_xmit_packets = spkts;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
+		dev->z_port_rcv_packets = rpkts;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
+		dev->n_unicast_xmit = 0;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
+		dev->n_unicast_rcv = 0;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
+		dev->n_multicast_xmit = 0;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
+		dev->n_multicast_rcv = 0;
+
+	return recv_pma_get_portcounters_ext(pmp, ibdev, port);
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+			u8 port_num, const struct ib_mad *in_mad,
+			struct ib_mad *out_mad)
+{
+	struct ib_smp *smp = (struct ib_smp *)out_mad;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	int ret;
+
+	*out_mad = *in_mad;
+	if (smp->class_version != 1) {
+		smp->status |= IB_SMP_UNSUP_VERSION;
+		ret = reply(smp);
+		goto bail;
+	}
+
+	/* Is the mkey in the process of expiring? */
+	if (dev->mkey_lease_timeout &&
+	    time_after_eq(jiffies, dev->mkey_lease_timeout)) {
+		/* Clear timeout and mkey protection field. */
+		dev->mkey_lease_timeout = 0;
+		dev->mkeyprot = 0;
+	}
+
+	/*
+	 * M_Key checking depends on
+	 * Portinfo:M_Key_protect_bits
+	 */
+	if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
+	    dev->mkey != smp->mkey &&
+	    (smp->method == IB_MGMT_METHOD_SET ||
+	     (smp->method == IB_MGMT_METHOD_GET &&
+	      dev->mkeyprot >= 2))) {
+		if (dev->mkey_violations != 0xFFFF)
+			++dev->mkey_violations;
+		if (dev->mkey_lease_timeout ||
+		    dev->mkey_lease_period == 0) {
+			ret = IB_MAD_RESULT_SUCCESS |
+				IB_MAD_RESULT_CONSUMED;
+			goto bail;
+		}
+		dev->mkey_lease_timeout = jiffies +
+			dev->mkey_lease_period * HZ;
+		/* Future: Generate a trap notice. */
+		ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+		goto bail;
+	} else if (dev->mkey_lease_timeout)
+		dev->mkey_lease_timeout = 0;
+
+	switch (smp->method) {
+	case IB_MGMT_METHOD_GET:
+		switch (smp->attr_id) {
+		case IB_SMP_ATTR_NODE_DESC:
+			ret = recv_subn_get_nodedescription(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_NODE_INFO:
+			ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
+			goto bail;
+		case IB_SMP_ATTR_GUID_INFO:
+			ret = recv_subn_get_guidinfo(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_PORT_INFO:
+			ret = recv_subn_get_portinfo(smp, ibdev, port_num);
+			goto bail;
+		case IB_SMP_ATTR_PKEY_TABLE:
+			ret = recv_subn_get_pkeytable(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_SM_INFO:
+			if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+				ret = IB_MAD_RESULT_SUCCESS |
+					IB_MAD_RESULT_CONSUMED;
+				goto bail;
+			}
+			if (dev->port_cap_flags & IB_PORT_SM) {
+				ret = IB_MAD_RESULT_SUCCESS;
+				goto bail;
+			}
+			/* FALLTHROUGH */
+		default:
+			smp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply(smp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_SET:
+		switch (smp->attr_id) {
+		case IB_SMP_ATTR_GUID_INFO:
+			ret = recv_subn_set_guidinfo(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_PORT_INFO:
+			ret = recv_subn_set_portinfo(smp, ibdev, port_num);
+			goto bail;
+		case IB_SMP_ATTR_PKEY_TABLE:
+			ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
+			goto bail;
+		case IB_SMP_ATTR_SM_INFO:
+			if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+				ret = IB_MAD_RESULT_SUCCESS |
+					IB_MAD_RESULT_CONSUMED;
+				goto bail;
+			}
+			if (dev->port_cap_flags & IB_PORT_SM) {
+				ret = IB_MAD_RESULT_SUCCESS;
+				goto bail;
+			}
+			/* FALLTHROUGH */
+		default:
+			smp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply(smp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_TRAP:
+	case IB_MGMT_METHOD_REPORT:
+	case IB_MGMT_METHOD_REPORT_RESP:
+	case IB_MGMT_METHOD_TRAP_REPRESS:
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+	default:
+		smp->status |= IB_SMP_UNSUP_METHOD;
+		ret = reply(smp);
+	}
+
+bail:
+	return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port_num,
+			const struct ib_mad *in_mad,
+			struct ib_mad *out_mad)
+{
+	struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+	int ret;
+
+	*out_mad = *in_mad;
+	if (pmp->mad_hdr.class_version != 1) {
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+		ret = reply((struct ib_smp *) pmp);
+		goto bail;
+	}
+
+	switch (pmp->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+		switch (pmp->mad_hdr.attr_id) {
+		case IB_PMA_CLASS_PORT_INFO:
+			ret = recv_pma_get_classportinfo(pmp);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_CONTROL:
+			ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
+							      port_num);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_RESULT:
+			ret = recv_pma_get_portsamplesresult(pmp, ibdev);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_RESULT_EXT:
+			ret = recv_pma_get_portsamplesresult_ext(pmp,
+								 ibdev);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS:
+			ret = recv_pma_get_portcounters(pmp, ibdev,
+							port_num);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS_EXT:
+			ret = recv_pma_get_portcounters_ext(pmp, ibdev,
+							    port_num);
+			goto bail;
+		default:
+			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_smp *) pmp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_SET:
+		switch (pmp->mad_hdr.attr_id) {
+		case IB_PMA_PORT_SAMPLES_CONTROL:
+			ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
+							      port_num);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS:
+			ret = recv_pma_set_portcounters(pmp, ibdev,
+							port_num);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS_EXT:
+			ret = recv_pma_set_portcounters_ext(pmp, ibdev,
+							    port_num);
+			goto bail;
+		default:
+			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_smp *) pmp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+	default:
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+		ret = reply((struct ib_smp *) pmp);
+	}
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port_num: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		      const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		      const struct ib_mad_hdr *in, size_t in_mad_size,
+		      struct ib_mad_hdr *out, size_t *out_mad_size,
+		      u16 *out_mad_pkey_index)
+{
+	int ret;
+	const struct ib_mad *in_mad = (const struct ib_mad *)in;
+	struct ib_mad *out_mad = (struct ib_mad *)out;
+
+	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
+			 *out_mad_size != sizeof(*out_mad)))
+		return IB_MAD_RESULT_FAILURE;
+
+	switch (in_mad->mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+		ret = process_subn(ibdev, mad_flags, port_num,
+				   in_mad, out_mad);
+		goto bail;
+	case IB_MGMT_CLASS_PERF_MGMT:
+		ret = process_perf(ibdev, port_num, in_mad, out_mad);
+		goto bail;
+	default:
+		ret = IB_MAD_RESULT_SUCCESS;
+	}
+
+bail:
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_mmap.c b/drivers/staging/rdma/ipath/ipath_mmap.c
new file mode 100644
index 0000000..e732742
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_mmap.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct ipath_mmap_info
+ */
+void ipath_release_mmap_info(struct kref *ref)
+{
+	struct ipath_mmap_info *ip =
+		container_of(ref, struct ipath_mmap_info, ref);
+	struct ipath_ibdev *dev = to_idev(ip->context->device);
+
+	spin_lock_irq(&dev->pending_lock);
+	list_del(&ip->pending_mmaps);
+	spin_unlock_irq(&dev->pending_lock);
+
+	vfree(ip->obj);
+	kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the CQ is mapped,
+ * to avoid releasing it.
+ */
+static void ipath_vma_open(struct vm_area_struct *vma)
+{
+	struct ipath_mmap_info *ip = vma->vm_private_data;
+
+	kref_get(&ip->ref);
+}
+
+static void ipath_vma_close(struct vm_area_struct *vma)
+{
+	struct ipath_mmap_info *ip = vma->vm_private_data;
+
+	kref_put(&ip->ref, ipath_release_mmap_info);
+}
+
+static const struct vm_operations_struct ipath_vm_ops = {
+	.open =     ipath_vma_open,
+	.close =    ipath_vma_close,
+};
+
+/**
+ * ipath_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	struct ipath_ibdev *dev = to_idev(context->device);
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct ipath_mmap_info *ip, *pp;
+	int ret = -EINVAL;
+
+	/*
+	 * Search the device's list of objects waiting for a mmap call.
+	 * Normally, this list is very short since a call to create a
+	 * CQ, QP, or SRQ is soon followed by a call to mmap().
+	 */
+	spin_lock_irq(&dev->pending_lock);
+	list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+				 pending_mmaps) {
+		/* Only the creator is allowed to mmap the object */
+		if (context != ip->context || (__u64) offset != ip->offset)
+			continue;
+		/* Don't allow a mmap larger than the object. */
+		if (size > ip->size)
+			break;
+
+		list_del_init(&ip->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+
+		ret = remap_vmalloc_range(vma, ip->obj, 0);
+		if (ret)
+			goto done;
+		vma->vm_ops = &ipath_vm_ops;
+		vma->vm_private_data = ip;
+		ipath_vma_open(vma);
+		goto done;
+	}
+	spin_unlock_irq(&dev->pending_lock);
+done:
+	return ret;
+}
+
+/*
+ * Allocate information for ipath_mmap
+ */
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+					       u32 size,
+					       struct ib_ucontext *context,
+					       void *obj) {
+	struct ipath_mmap_info *ip;
+
+	ip = kmalloc(sizeof *ip, GFP_KERNEL);
+	if (!ip)
+		goto bail;
+
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irq(&dev->mmap_offset_lock);
+	if (dev->mmap_offset == 0)
+		dev->mmap_offset = PAGE_SIZE;
+	ip->offset = dev->mmap_offset;
+	dev->mmap_offset += size;
+	spin_unlock_irq(&dev->mmap_offset_lock);
+
+	INIT_LIST_HEAD(&ip->pending_mmaps);
+	ip->size = size;
+	ip->context = context;
+	ip->obj = obj;
+	kref_init(&ip->ref);
+
+bail:
+	return ip;
+}
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+			    struct ipath_mmap_info *ip,
+			    u32 size, void *obj) {
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irq(&dev->mmap_offset_lock);
+	if (dev->mmap_offset == 0)
+		dev->mmap_offset = PAGE_SIZE;
+	ip->offset = dev->mmap_offset;
+	dev->mmap_offset += size;
+	spin_unlock_irq(&dev->mmap_offset_lock);
+
+	ip->size = size;
+	ip->obj = obj;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c
new file mode 100644
index 0000000..c7278f6
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_mr.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+
+/* Fast memory region */
+struct ipath_fmr {
+	struct ib_fmr ibfmr;
+	u8 page_shift;
+	struct ipath_mregion mr;        /* must be last */
+};
+
+static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct ipath_fmr, ibfmr);
+}
+
+/**
+ * ipath_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see ipath_dma.c).
+ */
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct ipath_mr *mr;
+	struct ib_mr *ret;
+
+	mr = kzalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	mr->mr.access_flags = acc;
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+static struct ipath_mr *alloc_mr(int count,
+				 struct ipath_lkey_table *lk_table)
+{
+	struct ipath_mr *mr;
+	int m, i = 0;
+
+	/* Allocate struct plus pointers to first level page tables. */
+	m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+	mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+	if (!mr)
+		goto done;
+
+	/* Allocate first level page tables. */
+	for (; i < m; i++) {
+		mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
+		if (!mr->mr.map[i])
+			goto bail;
+	}
+	mr->mr.mapsz = m;
+
+	/*
+	 * ib_reg_phys_mr() will initialize mr->ibmr except for
+	 * lkey and rkey.
+	 */
+	if (!ipath_alloc_lkey(lk_table, &mr->mr))
+		goto bail;
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
+
+	goto done;
+
+bail:
+	while (i) {
+		i--;
+		kfree(mr->mr.map[i]);
+	}
+	kfree(mr);
+	mr = NULL;
+
+done:
+	return mr;
+}
+
+/**
+ * ipath_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+				struct ib_phys_buf *buffer_list,
+				int num_phys_buf, int acc, u64 *iova_start)
+{
+	struct ipath_mr *mr;
+	int n, m, i;
+	struct ib_mr *ret;
+
+	mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
+	if (mr == NULL) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	mr->mr.pd = pd;
+	mr->mr.user_base = *iova_start;
+	mr->mr.iova = *iova_start;
+	mr->mr.length = 0;
+	mr->mr.offset = 0;
+	mr->mr.access_flags = acc;
+	mr->mr.max_segs = num_phys_buf;
+	mr->umem = NULL;
+
+	m = 0;
+	n = 0;
+	for (i = 0; i < num_phys_buf; i++) {
+		mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+		mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+		mr->mr.length += buffer_list[i].size;
+		n++;
+		if (n == IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @virt_addr: virtual address to use (from HCA's point of view)
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				u64 virt_addr, int mr_access_flags,
+				struct ib_udata *udata)
+{
+	struct ipath_mr *mr;
+	struct ib_umem *umem;
+	int n, m, entry;
+	struct scatterlist *sg;
+	struct ib_mr *ret;
+
+	if (length == 0) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	umem = ib_umem_get(pd->uobject->context, start, length,
+			   mr_access_flags, 0);
+	if (IS_ERR(umem))
+		return (void *) umem;
+
+	n = umem->nmap;
+	mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
+	if (!mr) {
+		ret = ERR_PTR(-ENOMEM);
+		ib_umem_release(umem);
+		goto bail;
+	}
+
+	mr->mr.pd = pd;
+	mr->mr.user_base = start;
+	mr->mr.iova = virt_addr;
+	mr->mr.length = length;
+	mr->mr.offset = ib_umem_offset(umem);
+	mr->mr.access_flags = mr_access_flags;
+	mr->mr.max_segs = n;
+	mr->umem = umem;
+
+	m = 0;
+	n = 0;
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+		void *vaddr;
+
+		vaddr = page_address(sg_page(sg));
+		if (!vaddr) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+		mr->mr.map[m]->segs[n].vaddr = vaddr;
+		mr->mr.map[m]->segs[n].length = umem->page_size;
+		n++;
+		if (n == IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by ipath_get_dma_mr()
+ * or ipath_reg_user_mr().
+ */
+int ipath_dereg_mr(struct ib_mr *ibmr)
+{
+	struct ipath_mr *mr = to_imr(ibmr);
+	int i;
+
+	ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
+	i = mr->mr.mapsz;
+	while (i) {
+		i--;
+		kfree(mr->mr.map[i]);
+	}
+
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+
+	kfree(mr);
+	return 0;
+}
+
+/**
+ * ipath_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			       struct ib_fmr_attr *fmr_attr)
+{
+	struct ipath_fmr *fmr;
+	int m, i = 0;
+	struct ib_fmr *ret;
+
+	/* Allocate struct plus pointers to first level page tables. */
+	m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+	fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
+	if (!fmr)
+		goto bail;
+
+	/* Allocate first level page tables. */
+	for (; i < m; i++) {
+		fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
+					 GFP_KERNEL);
+		if (!fmr->mr.map[i])
+			goto bail;
+	}
+	fmr->mr.mapsz = m;
+
+	/*
+	 * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+	 * rkey.
+	 */
+	if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
+		goto bail;
+	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
+	/*
+	 * Resources are allocated but no valid mapping (RKEY can't be
+	 * used).
+	 */
+	fmr->mr.pd = pd;
+	fmr->mr.user_base = 0;
+	fmr->mr.iova = 0;
+	fmr->mr.length = 0;
+	fmr->mr.offset = 0;
+	fmr->mr.access_flags = mr_access_flags;
+	fmr->mr.max_segs = fmr_attr->max_pages;
+	fmr->page_shift = fmr_attr->page_shift;
+
+	ret = &fmr->ibfmr;
+	goto done;
+
+bail:
+	while (i)
+		kfree(fmr->mr.map[--i]);
+	kfree(fmr);
+	ret = ERR_PTR(-ENOMEM);
+
+done:
+	return ret;
+}
+
+/**
+ * ipath_map_phys_fmr - set up a fast memory region
+ * @ibmfr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+		       int list_len, u64 iova)
+{
+	struct ipath_fmr *fmr = to_ifmr(ibfmr);
+	struct ipath_lkey_table *rkt;
+	unsigned long flags;
+	int m, n, i;
+	u32 ps;
+	int ret;
+
+	if (list_len > fmr->mr.max_segs) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	rkt = &to_idev(ibfmr->device)->lk_table;
+	spin_lock_irqsave(&rkt->lock, flags);
+	fmr->mr.user_base = iova;
+	fmr->mr.iova = iova;
+	ps = 1 << fmr->page_shift;
+	fmr->mr.length = list_len * ps;
+	m = 0;
+	n = 0;
+	ps = 1 << fmr->page_shift;
+	for (i = 0; i < list_len; i++) {
+		fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+		fmr->mr.map[m]->segs[n].length = ps;
+		if (++n == IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int ipath_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ipath_fmr *fmr;
+	struct ipath_lkey_table *rkt;
+	unsigned long flags;
+
+	list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+		rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+		spin_lock_irqsave(&rkt->lock, flags);
+		fmr->mr.user_base = 0;
+		fmr->mr.iova = 0;
+		fmr->mr.length = 0;
+		spin_unlock_irqrestore(&rkt->lock, flags);
+	}
+	return 0;
+}
+
+/**
+ * ipath_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+	struct ipath_fmr *fmr = to_ifmr(ibfmr);
+	int i;
+
+	ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
+	i = fmr->mr.mapsz;
+	while (i)
+		kfree(fmr->mr.map[--i]);
+	kfree(fmr);
+	return 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_qp.c b/drivers/staging/rdma/ipath/ipath_qp.c
new file mode 100644
index 0000000..face876
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_qp.c
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+#define BITS_PER_PAGE		(PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
+#define mk_qpn(qpt, map, off)	(((map) - (qpt)->map) * BITS_PER_PAGE + \
+				 (off))
+#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
+						      BITS_PER_PAGE, off)
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+	0,			/* 0 */
+	1,			/* 1 */
+	2,			/* 2 */
+	3,			/* 3 */
+	4,			/* 4 */
+	6,			/* 5 */
+	8,			/* 6 */
+	12,			/* 7 */
+	16,			/* 8 */
+	24,			/* 9 */
+	32,			/* A */
+	48,			/* B */
+	64,			/* C */
+	96,			/* D */
+	128,			/* E */
+	192,			/* F */
+	256,			/* 10 */
+	384,			/* 11 */
+	512,			/* 12 */
+	768,			/* 13 */
+	1024,			/* 14 */
+	1536,			/* 15 */
+	2048,			/* 16 */
+	3072,			/* 17 */
+	4096,			/* 18 */
+	6144,			/* 19 */
+	8192,			/* 1A */
+	12288,			/* 1B */
+	16384,			/* 1C */
+	24576,			/* 1D */
+	32768			/* 1E */
+};
+
+
+static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map)
+{
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	unsigned long flags;
+
+	/*
+	 * Free the page if someone raced with us installing it.
+	 */
+
+	spin_lock_irqsave(&qpt->lock, flags);
+	if (map->page)
+		free_page(page);
+	else
+		map->page = (void *)page;
+	spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+
+static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type)
+{
+	u32 i, offset, max_scan, qpn;
+	struct qpn_map *map;
+	u32 ret = -1;
+
+	if (type == IB_QPT_SMI)
+		ret = 0;
+	else if (type == IB_QPT_GSI)
+		ret = 1;
+
+	if (ret != -1) {
+		map = &qpt->map[0];
+		if (unlikely(!map->page)) {
+			get_map_page(qpt, map);
+			if (unlikely(!map->page)) {
+				ret = -ENOMEM;
+				goto bail;
+			}
+		}
+		if (!test_and_set_bit(ret, map->page))
+			atomic_dec(&map->n_free);
+		else
+			ret = -EBUSY;
+		goto bail;
+	}
+
+	qpn = qpt->last + 1;
+	if (qpn >= QPN_MAX)
+		qpn = 2;
+	offset = qpn & BITS_PER_PAGE_MASK;
+	map = &qpt->map[qpn / BITS_PER_PAGE];
+	max_scan = qpt->nmaps - !offset;
+	for (i = 0;;) {
+		if (unlikely(!map->page)) {
+			get_map_page(qpt, map);
+			if (unlikely(!map->page))
+				break;
+		}
+		if (likely(atomic_read(&map->n_free))) {
+			do {
+				if (!test_and_set_bit(offset, map->page)) {
+					atomic_dec(&map->n_free);
+					qpt->last = qpn;
+					ret = qpn;
+					goto bail;
+				}
+				offset = find_next_offset(map, offset);
+				qpn = mk_qpn(qpt, map, offset);
+				/*
+				 * This test differs from alloc_pidmap().
+				 * If find_next_offset() does find a zero
+				 * bit, we don't need to check for QPN
+				 * wrapping around past our starting QPN.
+				 * We just need to be sure we don't loop
+				 * forever.
+				 */
+			} while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+		}
+		/*
+		 * In order to keep the number of pages allocated to a
+		 * minimum, we scan the all existing pages before increasing
+		 * the size of the bitmap table.
+		 */
+		if (++i > max_scan) {
+			if (qpt->nmaps == QPNMAP_ENTRIES)
+				break;
+			map = &qpt->map[qpt->nmaps++];
+			offset = 0;
+		} else if (map < &qpt->map[qpt->nmaps]) {
+			++map;
+			offset = 0;
+		} else {
+			map = &qpt->map[0];
+			offset = 2;
+		}
+		qpn = mk_qpn(qpt, map, offset);
+	}
+
+	ret = -ENOMEM;
+
+bail:
+	return ret;
+}
+
+static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+	struct qpn_map *map;
+
+	map = qpt->map + qpn / BITS_PER_PAGE;
+	if (map->page)
+		clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+	atomic_inc(&map->n_free);
+}
+
+/**
+ * ipath_alloc_qpn - allocate a QP number
+ * @qpt: the QP table
+ * @qp: the QP
+ * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
+ *
+ * Allocate the next available QPN and put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
+			   enum ib_qp_type type)
+{
+	unsigned long flags;
+	int ret;
+
+	ret = alloc_qpn(qpt, type);
+	if (ret < 0)
+		goto bail;
+	qp->ibqp.qp_num = ret;
+
+	/* Add the QP to the hash table. */
+	spin_lock_irqsave(&qpt->lock, flags);
+
+	ret %= qpt->max;
+	qp->next = qpt->table[ret];
+	qpt->table[ret] = qp;
+	atomic_inc(&qp->refcount);
+
+	spin_unlock_irqrestore(&qpt->lock, flags);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_free_qp - remove a QP from the QP table
+ * @qpt: the QP table
+ * @qp: the QP to remove
+ *
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
+{
+	struct ipath_qp *q, **qpp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qpt->lock, flags);
+
+	/* Remove QP from the hash table. */
+	qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
+	for (; (q = *qpp) != NULL; qpp = &q->next) {
+		if (q == qp) {
+			*qpp = qp->next;
+			qp->next = NULL;
+			atomic_dec(&qp->refcount);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+/**
+ * ipath_free_all_qps - check for QPs still in use
+ * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ */
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
+{
+	unsigned long flags;
+	struct ipath_qp *qp;
+	u32 n, qp_inuse = 0;
+
+	spin_lock_irqsave(&qpt->lock, flags);
+	for (n = 0; n < qpt->max; n++) {
+		qp = qpt->table[n];
+		qpt->table[n] = NULL;
+
+		for (; qp; qp = qp->next)
+			qp_inuse++;
+	}
+	spin_unlock_irqrestore(&qpt->lock, flags);
+
+	for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
+		if (qpt->map[n].page)
+			free_page((unsigned long) qpt->map[n].page);
+	return qp_inuse;
+}
+
+/**
+ * ipath_lookup_qpn - return the QP with the given QPN
+ * @qpt: the QP table
+ * @qpn: the QP number to look up
+ *
+ * The caller is responsible for decrementing the QP reference count
+ * when done.
+ */
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+	unsigned long flags;
+	struct ipath_qp *qp;
+
+	spin_lock_irqsave(&qpt->lock, flags);
+
+	for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
+		if (qp->ibqp.qp_num == qpn) {
+			atomic_inc(&qp->refcount);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&qpt->lock, flags);
+	return qp;
+}
+
+/**
+ * ipath_reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
+{
+	qp->remote_qpn = 0;
+	qp->qkey = 0;
+	qp->qp_access_flags = 0;
+	atomic_set(&qp->s_dma_busy, 0);
+	qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
+	qp->s_hdrwords = 0;
+	qp->s_wqe = NULL;
+	qp->s_pkt_delay = 0;
+	qp->s_draining = 0;
+	qp->s_psn = 0;
+	qp->r_psn = 0;
+	qp->r_msn = 0;
+	if (type == IB_QPT_RC) {
+		qp->s_state = IB_OPCODE_RC_SEND_LAST;
+		qp->r_state = IB_OPCODE_RC_SEND_LAST;
+	} else {
+		qp->s_state = IB_OPCODE_UC_SEND_LAST;
+		qp->r_state = IB_OPCODE_UC_SEND_LAST;
+	}
+	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+	qp->r_nak_state = 0;
+	qp->r_aflags = 0;
+	qp->r_flags = 0;
+	qp->s_rnr_timeout = 0;
+	qp->s_head = 0;
+	qp->s_tail = 0;
+	qp->s_cur = 0;
+	qp->s_last = 0;
+	qp->s_ssn = 1;
+	qp->s_lsn = 0;
+	memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+	qp->r_head_ack_queue = 0;
+	qp->s_tail_ack_queue = 0;
+	qp->s_num_rd_atomic = 0;
+	if (qp->r_rq.wq) {
+		qp->r_rq.wq->head = 0;
+		qp->r_rq.wq->tail = 0;
+	}
+}
+
+/**
+ * ipath_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ib_wc wc;
+	int ret = 0;
+
+	if (qp->state == IB_QPS_ERR)
+		goto bail;
+
+	qp->state = IB_QPS_ERR;
+
+	spin_lock(&dev->pending_lock);
+	if (!list_empty(&qp->timerwait))
+		list_del_init(&qp->timerwait);
+	if (!list_empty(&qp->piowait))
+		list_del_init(&qp->piowait);
+	spin_unlock(&dev->pending_lock);
+
+	/* Schedule the sending tasklet to drain the send work queue. */
+	if (qp->s_last != qp->s_head)
+		ipath_schedule_send(qp);
+
+	memset(&wc, 0, sizeof(wc));
+	wc.qp = &qp->ibqp;
+	wc.opcode = IB_WC_RECV;
+
+	if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
+		wc.wr_id = qp->r_wr_id;
+		wc.status = err;
+		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+	}
+	wc.status = IB_WC_WR_FLUSH_ERR;
+
+	if (qp->r_rq.wq) {
+		struct ipath_rwq *wq;
+		u32 head;
+		u32 tail;
+
+		spin_lock(&qp->r_rq.lock);
+
+		/* sanity check pointers before trusting them */
+		wq = qp->r_rq.wq;
+		head = wq->head;
+		if (head >= qp->r_rq.size)
+			head = 0;
+		tail = wq->tail;
+		if (tail >= qp->r_rq.size)
+			tail = 0;
+		while (tail != head) {
+			wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+			if (++tail >= qp->r_rq.size)
+				tail = 0;
+			ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+		}
+		wq->tail = tail;
+
+		spin_unlock(&qp->r_rq.lock);
+	} else if (qp->ibqp.event_handler)
+		ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for ipathverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		    int attr_mask, struct ib_udata *udata)
+{
+	struct ipath_ibdev *dev = to_idev(ibqp->device);
+	struct ipath_qp *qp = to_iqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int lastwqe = 0;
+	int ret;
+
+	spin_lock_irq(&qp->s_lock);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ?
+		attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+		goto inval;
+
+	if (attr_mask & IB_QP_AV) {
+		if (attr->ah_attr.dlid == 0 ||
+		    attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE)
+			goto inval;
+
+		if ((attr->ah_attr.ah_flags & IB_AH_GRH) &&
+		    (attr->ah_attr.grh.sgid_index > 1))
+			goto inval;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		if (attr->pkey_index >= ipath_get_npkeys(dev->dd))
+			goto inval;
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		if (attr->min_rnr_timer > 31)
+			goto inval;
+
+	if (attr_mask & IB_QP_PORT)
+		if (attr->port_num == 0 ||
+		    attr->port_num > ibqp->device->phys_port_cnt)
+			goto inval;
+
+	/*
+	 * don't allow invalid Path MTU values or greater than 2048
+	 * unless we are configured for a 4KB MTU
+	 */
+	if ((attr_mask & IB_QP_PATH_MTU) &&
+		(ib_mtu_enum_to_int(attr->path_mtu) == -1 ||
+		(attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096)))
+		goto inval;
+
+	if (attr_mask & IB_QP_PATH_MIG_STATE)
+		if (attr->path_mig_state != IB_MIG_MIGRATED &&
+		    attr->path_mig_state != IB_MIG_REARM)
+			goto inval;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC)
+			goto inval;
+
+	switch (new_state) {
+	case IB_QPS_RESET:
+		if (qp->state != IB_QPS_RESET) {
+			qp->state = IB_QPS_RESET;
+			spin_lock(&dev->pending_lock);
+			if (!list_empty(&qp->timerwait))
+				list_del_init(&qp->timerwait);
+			if (!list_empty(&qp->piowait))
+				list_del_init(&qp->piowait);
+			spin_unlock(&dev->pending_lock);
+			qp->s_flags &= ~IPATH_S_ANY_WAIT;
+			spin_unlock_irq(&qp->s_lock);
+			/* Stop the sending tasklet */
+			tasklet_kill(&qp->s_task);
+			wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+			spin_lock_irq(&qp->s_lock);
+		}
+		ipath_reset_qp(qp, ibqp->qp_type);
+		break;
+
+	case IB_QPS_SQD:
+		qp->s_draining = qp->s_last != qp->s_cur;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_SQE:
+		if (qp->ibqp.qp_type == IB_QPT_RC)
+			goto inval;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_ERR:
+		lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		break;
+
+	default:
+		qp->state = new_state;
+		break;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		qp->s_pkey_index = attr->pkey_index;
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		qp->remote_qpn = attr->dest_qp_num;
+
+	if (attr_mask & IB_QP_SQ_PSN) {
+		qp->s_psn = qp->s_next_psn = attr->sq_psn;
+		qp->s_last_psn = qp->s_next_psn - 1;
+	}
+
+	if (attr_mask & IB_QP_RQ_PSN)
+		qp->r_psn = attr->rq_psn;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->qp_access_flags = attr->qp_access_flags;
+
+	if (attr_mask & IB_QP_AV) {
+		qp->remote_ah_attr = attr->ah_attr;
+		qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate);
+	}
+
+	if (attr_mask & IB_QP_PATH_MTU)
+		qp->path_mtu = attr->path_mtu;
+
+	if (attr_mask & IB_QP_RETRY_CNT)
+		qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		qp->s_rnr_retry = attr->rnr_retry;
+		if (qp->s_rnr_retry > 7)
+			qp->s_rnr_retry = 7;
+		qp->s_rnr_retry_cnt = qp->s_rnr_retry;
+	}
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+	if (attr_mask & IB_QP_TIMEOUT)
+		qp->timeout = attr->timeout;
+
+	if (attr_mask & IB_QP_QKEY)
+		qp->qkey = attr->qkey;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+		qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+	spin_unlock_irq(&qp->s_lock);
+
+	if (lastwqe) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+	ret = 0;
+	goto bail;
+
+inval:
+	spin_unlock_irq(&qp->s_lock);
+	ret = -EINVAL;
+
+bail:
+	return ret;
+}
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		   int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+
+	attr->qp_state = qp->state;
+	attr->cur_qp_state = attr->qp_state;
+	attr->path_mtu = qp->path_mtu;
+	attr->path_mig_state = 0;
+	attr->qkey = qp->qkey;
+	attr->rq_psn = qp->r_psn;
+	attr->sq_psn = qp->s_next_psn;
+	attr->dest_qp_num = qp->remote_qpn;
+	attr->qp_access_flags = qp->qp_access_flags;
+	attr->cap.max_send_wr = qp->s_size - 1;
+	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+	attr->cap.max_send_sge = qp->s_max_sge;
+	attr->cap.max_recv_sge = qp->r_rq.max_sge;
+	attr->cap.max_inline_data = 0;
+	attr->ah_attr = qp->remote_ah_attr;
+	memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
+	attr->pkey_index = qp->s_pkey_index;
+	attr->alt_pkey_index = 0;
+	attr->en_sqd_async_notify = 0;
+	attr->sq_draining = qp->s_draining;
+	attr->max_rd_atomic = qp->s_max_rd_atomic;
+	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+	attr->min_rnr_timer = qp->r_min_rnr_timer;
+	attr->port_num = 1;
+	attr->timeout = qp->timeout;
+	attr->retry_cnt = qp->s_retry_cnt;
+	attr->rnr_retry = qp->s_rnr_retry_cnt;
+	attr->alt_port_num = 0;
+	attr->alt_timeout = 0;
+
+	init_attr->event_handler = qp->ibqp.event_handler;
+	init_attr->qp_context = qp->ibqp.qp_context;
+	init_attr->send_cq = qp->ibqp.send_cq;
+	init_attr->recv_cq = qp->ibqp.recv_cq;
+	init_attr->srq = qp->ibqp.srq;
+	init_attr->cap = attr->cap;
+	if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR)
+		init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	else
+		init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr->qp_type = qp->ibqp.qp_type;
+	init_attr->port_num = 1;
+	return 0;
+}
+
+/**
+ * ipath_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 ipath_compute_aeth(struct ipath_qp *qp)
+{
+	u32 aeth = qp->r_msn & IPATH_MSN_MASK;
+
+	if (qp->ibqp.srq) {
+		/*
+		 * Shared receive queues don't generate credits.
+		 * Set the credit field to the invalid value.
+		 */
+		aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT;
+	} else {
+		u32 min, max, x;
+		u32 credits;
+		struct ipath_rwq *wq = qp->r_rq.wq;
+		u32 head;
+		u32 tail;
+
+		/* sanity check pointers before trusting them */
+		head = wq->head;
+		if (head >= qp->r_rq.size)
+			head = 0;
+		tail = wq->tail;
+		if (tail >= qp->r_rq.size)
+			tail = 0;
+		/*
+		 * Compute the number of credits available (RWQEs).
+		 * XXX Not holding the r_rq.lock here so there is a small
+		 * chance that the pair of reads are not atomic.
+		 */
+		credits = head - tail;
+		if ((int)credits < 0)
+			credits += qp->r_rq.size;
+		/*
+		 * Binary search the credit table to find the code to
+		 * use.
+		 */
+		min = 0;
+		max = 31;
+		for (;;) {
+			x = (min + max) / 2;
+			if (credit_table[x] == credits)
+				break;
+			if (credit_table[x] > credits)
+				max = x;
+			else if (min == x)
+				break;
+			else
+				min = x;
+		}
+		aeth |= x << IPATH_AETH_CREDIT_SHIFT;
+	}
+	return cpu_to_be32(aeth);
+}
+
+/**
+ * ipath_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: unused by InfiniPath
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+			      struct ib_qp_init_attr *init_attr,
+			      struct ib_udata *udata)
+{
+	struct ipath_qp *qp;
+	int err;
+	struct ipath_swqe *swq = NULL;
+	struct ipath_ibdev *dev;
+	size_t sz;
+	size_t sg_list_sz;
+	struct ib_qp *ret;
+
+	if (init_attr->create_flags) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
+	    init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	/* Check receive queue parameters if no SRQ is specified. */
+	if (!init_attr->srq) {
+		if (init_attr->cap.max_recv_sge > ib_ipath_max_sges ||
+		    init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+		if (init_attr->cap.max_send_sge +
+		    init_attr->cap.max_send_wr +
+		    init_attr->cap.max_recv_sge +
+		    init_attr->cap.max_recv_wr == 0) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+	}
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_UC:
+	case IB_QPT_RC:
+	case IB_QPT_UD:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		sz = sizeof(struct ipath_sge) *
+			init_attr->cap.max_send_sge +
+			sizeof(struct ipath_swqe);
+		swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+		if (swq == NULL) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail;
+		}
+		sz = sizeof(*qp);
+		sg_list_sz = 0;
+		if (init_attr->srq) {
+			struct ipath_srq *srq = to_isrq(init_attr->srq);
+
+			if (srq->rq.max_sge > 1)
+				sg_list_sz = sizeof(*qp->r_sg_list) *
+					(srq->rq.max_sge - 1);
+		} else if (init_attr->cap.max_recv_sge > 1)
+			sg_list_sz = sizeof(*qp->r_sg_list) *
+				(init_attr->cap.max_recv_sge - 1);
+		qp = kmalloc(sz + sg_list_sz, GFP_KERNEL);
+		if (!qp) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_swq;
+		}
+		if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
+		    init_attr->qp_type == IB_QPT_SMI ||
+		    init_attr->qp_type == IB_QPT_GSI)) {
+			qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
+			if (!qp->r_ud_sg_list) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_qp;
+			}
+		} else
+			qp->r_ud_sg_list = NULL;
+		if (init_attr->srq) {
+			sz = 0;
+			qp->r_rq.size = 0;
+			qp->r_rq.max_sge = 0;
+			qp->r_rq.wq = NULL;
+			init_attr->cap.max_recv_wr = 0;
+			init_attr->cap.max_recv_sge = 0;
+		} else {
+			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+				sizeof(struct ipath_rwqe);
+			qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
+					      qp->r_rq.size * sz);
+			if (!qp->r_rq.wq) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_sg_list;
+			}
+		}
+
+		/*
+		 * ib_create_qp() will initialize qp->ibqp
+		 * except for qp->ibqp.qp_num.
+		 */
+		spin_lock_init(&qp->s_lock);
+		spin_lock_init(&qp->r_rq.lock);
+		atomic_set(&qp->refcount, 0);
+		init_waitqueue_head(&qp->wait);
+		init_waitqueue_head(&qp->wait_dma);
+		tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
+		INIT_LIST_HEAD(&qp->piowait);
+		INIT_LIST_HEAD(&qp->timerwait);
+		qp->state = IB_QPS_RESET;
+		qp->s_wq = swq;
+		qp->s_size = init_attr->cap.max_send_wr + 1;
+		qp->s_max_sge = init_attr->cap.max_send_sge;
+		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+			qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
+		else
+			qp->s_flags = 0;
+		dev = to_idev(ibpd->device);
+		err = ipath_alloc_qpn(&dev->qp_table, qp,
+				      init_attr->qp_type);
+		if (err) {
+			ret = ERR_PTR(err);
+			vfree(qp->r_rq.wq);
+			goto bail_sg_list;
+		}
+		qp->ip = NULL;
+		qp->s_tx = NULL;
+		ipath_reset_qp(qp, init_attr->qp_type);
+		break;
+
+	default:
+		/* Don't support raw QPs */
+		ret = ERR_PTR(-ENOSYS);
+		goto bail;
+	}
+
+	init_attr->cap.max_inline_data = 0;
+
+	/*
+	 * Return the address of the RWQ as the offset to mmap.
+	 * See ipath_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		if (!qp->r_rq.wq) {
+			__u64 offset = 0;
+
+			err = ib_copy_to_udata(udata, &offset,
+					       sizeof(offset));
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_ip;
+			}
+		} else {
+			u32 s = sizeof(struct ipath_rwq) +
+				qp->r_rq.size * sz;
+
+			qp->ip =
+			    ipath_create_mmap_info(dev, s,
+						   ibpd->uobject->context,
+						   qp->r_rq.wq);
+			if (!qp->ip) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_ip;
+			}
+
+			err = ib_copy_to_udata(udata, &(qp->ip->offset),
+					       sizeof(qp->ip->offset));
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_ip;
+			}
+		}
+	}
+
+	spin_lock(&dev->n_qps_lock);
+	if (dev->n_qps_allocated == ib_ipath_max_qps) {
+		spin_unlock(&dev->n_qps_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_qps_allocated++;
+	spin_unlock(&dev->n_qps_lock);
+
+	if (qp->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = &qp->ibqp;
+	goto bail;
+
+bail_ip:
+	if (qp->ip)
+		kref_put(&qp->ip->ref, ipath_release_mmap_info);
+	else
+		vfree(qp->r_rq.wq);
+	ipath_free_qp(&dev->qp_table, qp);
+	free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+bail_sg_list:
+	kfree(qp->r_ud_sg_list);
+bail_qp:
+	kfree(qp);
+bail_swq:
+	vfree(swq);
+bail:
+	return ret;
+}
+
+/**
+ * ipath_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int ipath_destroy_qp(struct ib_qp *ibqp)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+	/* Make sure HW and driver activity is stopped. */
+	spin_lock_irq(&qp->s_lock);
+	if (qp->state != IB_QPS_RESET) {
+		qp->state = IB_QPS_RESET;
+		spin_lock(&dev->pending_lock);
+		if (!list_empty(&qp->timerwait))
+			list_del_init(&qp->timerwait);
+		if (!list_empty(&qp->piowait))
+			list_del_init(&qp->piowait);
+		spin_unlock(&dev->pending_lock);
+		qp->s_flags &= ~IPATH_S_ANY_WAIT;
+		spin_unlock_irq(&qp->s_lock);
+		/* Stop the sending tasklet */
+		tasklet_kill(&qp->s_task);
+		wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+	} else
+		spin_unlock_irq(&qp->s_lock);
+
+	ipath_free_qp(&dev->qp_table, qp);
+
+	if (qp->s_tx) {
+		atomic_dec(&qp->refcount);
+		if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+			kfree(qp->s_tx->txreq.map_addr);
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
+		spin_unlock_irq(&dev->pending_lock);
+		qp->s_tx = NULL;
+	}
+
+	wait_event(qp->wait, !atomic_read(&qp->refcount));
+
+	/* all user's cleaned up, mark it available */
+	free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+	spin_lock(&dev->n_qps_lock);
+	dev->n_qps_allocated--;
+	spin_unlock(&dev->n_qps_lock);
+
+	if (qp->ip)
+		kref_put(&qp->ip->ref, ipath_release_mmap_info);
+	else
+		vfree(qp->r_rq.wq);
+	kfree(qp->r_ud_sg_list);
+	vfree(qp->s_wq);
+	kfree(qp);
+	return 0;
+}
+
+/**
+ * ipath_init_qp_table - initialize the QP table for a device
+ * @idev: the device who's QP table we're initializing
+ * @size: the size of the QP table
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
+{
+	int i;
+	int ret;
+
+	idev->qp_table.last = 1;	/* QPN 0 and 1 are special. */
+	idev->qp_table.max = size;
+	idev->qp_table.nmaps = 1;
+	idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table),
+				       GFP_KERNEL);
+	if (idev->qp_table.table == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
+		atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
+		idev->qp_table.map[i].page = NULL;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
+{
+	u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK;
+
+	/*
+	 * If the credit is invalid, we can send
+	 * as many packets as we like.  Otherwise, we have to
+	 * honor the credit field.
+	 */
+	if (credit == IPATH_AETH_CREDIT_INVAL)
+		qp->s_lsn = (u32) -1;
+	else if (qp->s_lsn != (u32) -1) {
+		/* Compute new LSN (i.e., MSN + credit) */
+		credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK;
+		if (ipath_cmp24(credit, qp->s_lsn) > 0)
+			qp->s_lsn = credit;
+	}
+
+	/* Restart sending if it was blocked due to lack of credits. */
+	if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
+	    qp->s_cur != qp->s_head &&
+	    (qp->s_lsn == (u32) -1 ||
+	     ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
+			 qp->s_lsn + 1) <= 0))
+		ipath_schedule_send(qp);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_rc.c b/drivers/staging/rdma/ipath/ipath_rc.c
new file mode 100644
index 0000000..79b3dbc
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_rc.c
@@ -0,0 +1,1969 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
+		       u32 psn, u32 pmtu)
+{
+	u32 len;
+
+	len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+	ss->sge = wqe->sg_list[0];
+	ss->sg_list = wqe->sg_list + 1;
+	ss->num_sge = wqe->wr.num_sge;
+	ipath_skip_sge(ss, len);
+	return wqe->length - len;
+}
+
+/**
+ * ipath_init_restart- initialize the qp->s_sge after a restart
+ * @qp: the QP who's SGE we're restarting
+ * @wqe: the work queue to initialize the QP's SGE from
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
+{
+	struct ipath_ibdev *dev;
+
+	qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
+				ib_mtu_enum_to_int(qp->path_mtu));
+	dev = to_idev(qp->ibqp.device);
+	spin_lock(&dev->pending_lock);
+	if (list_empty(&qp->timerwait))
+		list_add_tail(&qp->timerwait,
+			      &dev->pending[dev->pending_index]);
+	spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			     struct ipath_other_headers *ohdr, u32 pmtu)
+{
+	struct ipath_ack_entry *e;
+	u32 hwords;
+	u32 len;
+	u32 bth0;
+	u32 bth2;
+
+	/* Don't send an ACK if we aren't supposed to. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+		goto bail;
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+
+	switch (qp->s_ack_state) {
+	case OP(RDMA_READ_RESPONSE_LAST):
+	case OP(RDMA_READ_RESPONSE_ONLY):
+	case OP(ATOMIC_ACKNOWLEDGE):
+		/*
+		 * We can increment the tail pointer now that the last
+		 * response has been sent instead of only being
+		 * constructed.
+		 */
+		if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
+			qp->s_tail_ack_queue = 0;
+		/* FALLTHROUGH */
+	case OP(SEND_ONLY):
+	case OP(ACKNOWLEDGE):
+		/* Check for no next entry in the queue. */
+		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+			if (qp->s_flags & IPATH_S_ACK_PENDING)
+				goto normal;
+			qp->s_ack_state = OP(ACKNOWLEDGE);
+			goto bail;
+		}
+
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST)) {
+			/* Copy SGE state in case we need to resend */
+			qp->s_ack_rdma_sge = e->rdma_sge;
+			qp->s_cur_sge = &qp->s_ack_rdma_sge;
+			len = e->rdma_sge.sge.sge_length;
+			if (len > pmtu) {
+				len = pmtu;
+				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+			} else {
+				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+				e->sent = 1;
+			}
+			ohdr->u.aeth = ipath_compute_aeth(qp);
+			hwords++;
+			qp->s_ack_rdma_psn = e->psn;
+			bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+		} else {
+			/* COMPARE_SWAP or FETCH_ADD */
+			qp->s_cur_sge = NULL;
+			len = 0;
+			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+			ohdr->u.at.aeth = ipath_compute_aeth(qp);
+			ohdr->u.at.atomic_ack_eth[0] =
+				cpu_to_be32(e->atomic_data >> 32);
+			ohdr->u.at.atomic_ack_eth[1] =
+				cpu_to_be32(e->atomic_data);
+			hwords += sizeof(ohdr->u.at) / sizeof(u32);
+			bth2 = e->psn;
+			e->sent = 1;
+		}
+		bth0 = qp->s_ack_state << 24;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		len = qp->s_ack_rdma_sge.sge.sge_length;
+		if (len > pmtu)
+			len = pmtu;
+		else {
+			ohdr->u.aeth = ipath_compute_aeth(qp);
+			hwords++;
+			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+			qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
+		}
+		bth0 = qp->s_ack_state << 24;
+		bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+		break;
+
+	default:
+	normal:
+		/*
+		 * Send a regular ACK.
+		 * Set the s_ack_state so we wait until after sending
+		 * the ACK before setting s_ack_state to ACKNOWLEDGE
+		 * (see above).
+		 */
+		qp->s_ack_state = OP(SEND_ONLY);
+		qp->s_flags &= ~IPATH_S_ACK_PENDING;
+		qp->s_cur_sge = NULL;
+		if (qp->s_nak_state)
+			ohdr->u.aeth =
+				cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+					    (qp->s_nak_state <<
+					     IPATH_AETH_CREDIT_SHIFT));
+		else
+			ohdr->u.aeth = ipath_compute_aeth(qp);
+		hwords++;
+		len = 0;
+		bth0 = OP(ACKNOWLEDGE) << 24;
+		bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
+	}
+	qp->s_hdrwords = hwords;
+	qp->s_cur_size = len;
+	ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
+	return 1;
+
+bail:
+	return 0;
+}
+
+/**
+ * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_rc_req(struct ipath_qp *qp)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_other_headers *ohdr;
+	struct ipath_sge_state *ss;
+	struct ipath_swqe *wqe;
+	u32 hwords;
+	u32 len;
+	u32 bth0;
+	u32 bth2;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	char newreq;
+	unsigned long flags;
+	int ret = 0;
+
+	ohdr = &qp->s_hdr.u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr.u.l.oth;
+
+	/*
+	 * The lock is needed to synchronize between the sending tasklet,
+	 * the receive interrupt handler, and timeout resends.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Sending responses has higher priority over sending requests. */
+	if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+	     (qp->s_flags & IPATH_S_ACK_PENDING) ||
+	     qp->s_ack_state != OP(ACKNOWLEDGE)) &&
+	    ipath_make_rc_ack(dev, qp, ohdr, pmtu))
+		goto done;
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= IPATH_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	/* Leave BUSY set until RNR timeout. */
+	if (qp->s_rnr_timeout) {
+		qp->s_flags |= IPATH_S_WAITING;
+		goto bail;
+	}
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+	bth0 = 1 << 22; /* Set M bit */
+
+	/* Send a request. */
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	switch (qp->s_state) {
+	default:
+		if (!(ib_ipath_state_ops[qp->state] &
+		    IPATH_PROCESS_NEXT_SEND_OK))
+			goto bail;
+		/*
+		 * Resend an old request or start a new one.
+		 *
+		 * We keep track of the current SWQE so that
+		 * we don't reset the "furthest progress" state
+		 * if we need to back up.
+		 */
+		newreq = 0;
+		if (qp->s_cur == qp->s_tail) {
+			/* Check if send work queue is empty. */
+			if (qp->s_tail == qp->s_head)
+				goto bail;
+			/*
+			 * If a fence is requested, wait for previous
+			 * RDMA read and atomic operations to finish.
+			 */
+			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+			    qp->s_num_rd_atomic) {
+				qp->s_flags |= IPATH_S_FENCE_PENDING;
+				goto bail;
+			}
+			wqe->psn = qp->s_next_psn;
+			newreq = 1;
+		}
+		/*
+		 * Note that we have to be careful not to modify the
+		 * original work request since we may need to resend
+		 * it.
+		 */
+		len = wqe->length;
+		ss = &qp->s_sge;
+		bth2 = 0;
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+			/* If no credit, return. */
+			if (qp->s_lsn != (u32) -1 &&
+			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+				qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+				goto bail;
+			}
+			wqe->lpsn = wqe->psn;
+			if (len > pmtu) {
+				wqe->lpsn += (len - 1) / pmtu;
+				qp->s_state = OP(SEND_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_SEND)
+				qp->s_state = OP(SEND_ONLY);
+			else {
+				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the BTH */
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+			}
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= 1 << 23;
+			bth2 = 1 << 31;	/* Request ACK. */
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+			if (newreq && qp->s_lsn != (u32) -1)
+				qp->s_lsn++;
+			/* FALLTHROUGH */
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			/* If no credit, return. */
+			if (qp->s_lsn != (u32) -1 &&
+			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+				qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+				goto bail;
+			}
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			hwords += sizeof(struct ib_reth) / sizeof(u32);
+			wqe->lpsn = wqe->psn;
+			if (len > pmtu) {
+				wqe->lpsn += (len - 1) / pmtu;
+				qp->s_state = OP(RDMA_WRITE_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+				qp->s_state = OP(RDMA_WRITE_ONLY);
+			else {
+				qp->s_state =
+					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after RETH */
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+					bth0 |= 1 << 23;
+			}
+			bth2 = 1 << 31;	/* Request ACK. */
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_READ:
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow.
+			 */
+			if (newreq) {
+				if (qp->s_num_rd_atomic >=
+				    qp->s_max_rd_atomic) {
+					qp->s_flags |= IPATH_S_RDMAR_PENDING;
+					goto bail;
+				}
+				qp->s_num_rd_atomic++;
+				if (qp->s_lsn != (u32) -1)
+					qp->s_lsn++;
+				/*
+				 * Adjust s_next_psn to count the
+				 * expected number of responses.
+				 */
+				if (len > pmtu)
+					qp->s_next_psn += (len - 1) / pmtu;
+				wqe->lpsn = qp->s_next_psn++;
+			}
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			qp->s_state = OP(RDMA_READ_REQUEST);
+			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+			ss = NULL;
+			len = 0;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_ATOMIC_CMP_AND_SWP:
+		case IB_WR_ATOMIC_FETCH_AND_ADD:
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow.
+			 */
+			if (newreq) {
+				if (qp->s_num_rd_atomic >=
+				    qp->s_max_rd_atomic) {
+					qp->s_flags |= IPATH_S_RDMAR_PENDING;
+					goto bail;
+				}
+				qp->s_num_rd_atomic++;
+				if (qp->s_lsn != (u32) -1)
+					qp->s_lsn++;
+				wqe->lpsn = wqe->psn;
+			}
+			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+				qp->s_state = OP(COMPARE_SWAP);
+				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+					wqe->wr.wr.atomic.swap);
+				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+					wqe->wr.wr.atomic.compare_add);
+			} else {
+				qp->s_state = OP(FETCH_ADD);
+				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+					wqe->wr.wr.atomic.compare_add);
+				ohdr->u.atomic_eth.compare_data = 0;
+			}
+			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+				wqe->wr.wr.atomic.remote_addr >> 32);
+			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+				wqe->wr.wr.atomic.remote_addr);
+			ohdr->u.atomic_eth.rkey = cpu_to_be32(
+				wqe->wr.wr.atomic.rkey);
+			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+			ss = NULL;
+			len = 0;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		default:
+			goto bail;
+		}
+		qp->s_sge.sge = wqe->sg_list[0];
+		qp->s_sge.sg_list = wqe->sg_list + 1;
+		qp->s_sge.num_sge = wqe->wr.num_sge;
+		qp->s_len = wqe->length;
+		if (newreq) {
+			qp->s_tail++;
+			if (qp->s_tail >= qp->s_size)
+				qp->s_tail = 0;
+		}
+		bth2 |= qp->s_psn & IPATH_PSN_MASK;
+		if (wqe->wr.opcode == IB_WR_RDMA_READ)
+			qp->s_psn = wqe->lpsn + 1;
+		else {
+			qp->s_psn++;
+			if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+				qp->s_next_psn = qp->s_psn;
+		}
+		/*
+		 * Put the QP on the pending list so lost ACKs will cause
+		 * a retry.  More than one request can be pending so the
+		 * QP may already be on the dev->pending list.
+		 */
+		spin_lock(&dev->pending_lock);
+		if (list_empty(&qp->timerwait))
+			list_add_tail(&qp->timerwait,
+				      &dev->pending[dev->pending_index]);
+		spin_unlock(&dev->pending_lock);
+		break;
+
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		/*
+		 * This case can only happen if a send is restarted.
+		 * See ipath_restart_rc().
+		 */
+		ipath_init_restart(qp, wqe);
+		/* FALLTHROUGH */
+	case OP(SEND_FIRST):
+		qp->s_state = OP(SEND_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+			qp->s_next_psn = qp->s_psn;
+		ss = &qp->s_sge;
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_SEND)
+			qp->s_state = OP(SEND_LAST);
+		else {
+			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+		}
+		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+			bth0 |= 1 << 23;
+		bth2 |= 1 << 31;	/* Request ACK. */
+		qp->s_cur++;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_LAST):
+		/*
+		 * This case can only happen if a RDMA write is restarted.
+		 * See ipath_restart_rc().
+		 */
+		ipath_init_restart(qp, wqe);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_FIRST):
+		qp->s_state = OP(RDMA_WRITE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+			qp->s_next_psn = qp->s_psn;
+		ss = &qp->s_sge;
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+			qp->s_state = OP(RDMA_WRITE_LAST);
+		else {
+			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= 1 << 23;
+		}
+		bth2 |= 1 << 31;	/* Request ACK. */
+		qp->s_cur++;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		/*
+		 * This case can only happen if a RDMA read is restarted.
+		 * See ipath_restart_rc().
+		 */
+		ipath_init_restart(qp, wqe);
+		len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+		ohdr->u.rc.reth.vaddr =
+			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+		ohdr->u.rc.reth.rkey =
+			cpu_to_be32(wqe->wr.wr.rdma.rkey);
+		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
+		qp->s_state = OP(RDMA_READ_REQUEST);
+		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+		bth2 = qp->s_psn & IPATH_PSN_MASK;
+		qp->s_psn = wqe->lpsn + 1;
+		ss = NULL;
+		len = 0;
+		qp->s_cur++;
+		if (qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		break;
+	}
+	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
+		bth2 |= 1 << 31;	/* Request ACK. */
+	qp->s_len -= len;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_sge = ss;
+	qp->s_cur_size = len;
+	ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from ipath_rc_rcv() and only uses the receive
+ * side QP state.
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+static void send_rc_ack(struct ipath_qp *qp)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_devdata *dd;
+	u16 lrh0;
+	u32 bth0;
+	u32 hwords;
+	u32 __iomem *piobuf;
+	struct ipath_ib_header hdr;
+	struct ipath_other_headers *ohdr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+	if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+	    (qp->s_flags & IPATH_S_ACK_PENDING) ||
+	    qp->s_ack_state != OP(ACKNOWLEDGE))
+		goto queue_ack;
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	/* Don't try to send ACKs if the link isn't ACTIVE */
+	dd = dev->dd;
+	if (!(dd->ipath_flags & IPATH_LINKACTIVE))
+		goto done;
+
+	piobuf = ipath_getpiobuf(dd, 0, NULL);
+	if (!piobuf) {
+		/*
+		 * We are out of PIO buffers at the moment.
+		 * Pass responsibility for sending the ACK to the
+		 * send tasklet so that when a PIO buffer becomes
+		 * available, the ACK is sent ahead of other outgoing
+		 * packets.
+		 */
+		spin_lock_irqsave(&qp->s_lock, flags);
+		goto queue_ack;
+	}
+
+	/* Construct the header. */
+	ohdr = &hdr.u.oth;
+	lrh0 = IPATH_LRH_BTH;
+	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
+	hwords = 6;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		hwords += ipath_make_grh(dev, &hdr.u.l.grh,
+					 &qp->remote_ah_attr.grh,
+					 hwords, 0);
+		ohdr = &hdr.u.l.oth;
+		lrh0 = IPATH_LRH_GRH;
+	}
+	/* read pkey_index w/o lock (its atomic) */
+	bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
+		(OP(ACKNOWLEDGE) << 24) | (1 << 22);
+	if (qp->r_nak_state)
+		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+					    (qp->r_nak_state <<
+					     IPATH_AETH_CREDIT_SHIFT));
+	else
+		ohdr->u.aeth = ipath_compute_aeth(qp);
+	lrh0 |= qp->remote_ah_attr.sl << 4;
+	hdr.lrh[0] = cpu_to_be16(lrh0);
+	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+	hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
+				 qp->remote_ah_attr.src_path_bits);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
+
+	writeq(hwords + 1, piobuf);
+
+	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+		u32 *hdrp = (u32 *) &hdr;
+
+		ipath_flush_wc();
+		__iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
+		ipath_flush_wc();
+		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
+	} else
+		__iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
+
+	ipath_flush_wc();
+
+	dev->n_unicast_xmit++;
+	goto done;
+
+queue_ack:
+	if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
+		dev->n_rc_qacks++;
+		qp->s_flags |= IPATH_S_ACK_PENDING;
+		qp->s_nak_state = qp->r_nak_state;
+		qp->s_ack_psn = qp->r_ack_psn;
+
+		/* Schedule the send tasklet. */
+		ipath_schedule_send(qp);
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return;
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct ipath_qp *qp, u32 psn)
+{
+	u32 n = qp->s_last;
+	struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
+	u32 opcode;
+
+	qp->s_cur = n;
+
+	/*
+	 * If we are starting the request from the beginning,
+	 * let the normal send code handle initialization.
+	 */
+	if (ipath_cmp24(psn, wqe->psn) <= 0) {
+		qp->s_state = OP(SEND_LAST);
+		goto done;
+	}
+
+	/* Find the work request opcode corresponding to the given PSN. */
+	opcode = wqe->wr.opcode;
+	for (;;) {
+		int diff;
+
+		if (++n == qp->s_size)
+			n = 0;
+		if (n == qp->s_tail)
+			break;
+		wqe = get_swqe_ptr(qp, n);
+		diff = ipath_cmp24(psn, wqe->psn);
+		if (diff < 0)
+			break;
+		qp->s_cur = n;
+		/*
+		 * If we are starting the request from the beginning,
+		 * let the normal send code handle initialization.
+		 */
+		if (diff == 0) {
+			qp->s_state = OP(SEND_LAST);
+			goto done;
+		}
+		opcode = wqe->wr.opcode;
+	}
+
+	/*
+	 * Set the state to restart in the middle of a request.
+	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
+	 * See ipath_make_rc_req().
+	 */
+	switch (opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+		break;
+
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+		break;
+
+	case IB_WR_RDMA_READ:
+		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+		break;
+
+	default:
+		/*
+		 * This case shouldn't happen since its only
+		 * one PSN per req.
+		 */
+		qp->s_state = OP(SEND_LAST);
+	}
+done:
+	qp->s_psn = psn;
+}
+
+/**
+ * ipath_restart_rc - back up requester to resend the last un-ACKed request
+ * @qp: the QP to restart
+ * @psn: packet sequence number for the request
+ * @wc: the work completion request
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
+{
+	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+	struct ipath_ibdev *dev;
+
+	if (qp->s_retry == 0) {
+		ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+		ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		goto bail;
+	}
+	qp->s_retry--;
+
+	/*
+	 * Remove the QP from the timeout queue.
+	 * Note: it may already have been removed by ipath_ib_timer().
+	 */
+	dev = to_idev(qp->ibqp.device);
+	spin_lock(&dev->pending_lock);
+	if (!list_empty(&qp->timerwait))
+		list_del_init(&qp->timerwait);
+	if (!list_empty(&qp->piowait))
+		list_del_init(&qp->piowait);
+	spin_unlock(&dev->pending_lock);
+
+	if (wqe->wr.opcode == IB_WR_RDMA_READ)
+		dev->n_rc_resends++;
+	else
+		dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
+
+	reset_psn(qp, psn);
+	ipath_schedule_send(qp);
+
+bail:
+	return;
+}
+
+static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
+{
+	qp->s_last_psn = psn;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held and interrupts disabled.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
+		     u64 val)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ib_wc wc;
+	enum ib_wc_status status;
+	struct ipath_swqe *wqe;
+	int ret = 0;
+	u32 ack_psn;
+	int diff;
+
+	/*
+	 * Remove the QP from the timeout queue (or RNR timeout queue).
+	 * If ipath_ib_timer() has already removed it,
+	 * it's OK since we hold the QP s_lock and ipath_restart_rc()
+	 * just won't find anything to restart if we ACK everything.
+	 */
+	spin_lock(&dev->pending_lock);
+	if (!list_empty(&qp->timerwait))
+		list_del_init(&qp->timerwait);
+	spin_unlock(&dev->pending_lock);
+
+	/*
+	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+	 * requests and implicitly NAK RDMA read and atomic requests issued
+	 * before the NAK'ed request.  The MSN won't include the NAK'ed
+	 * request but will include an ACK'ed request(s).
+	 */
+	ack_psn = psn;
+	if (aeth >> 29)
+		ack_psn--;
+	wqe = get_swqe_ptr(qp, qp->s_last);
+
+	/*
+	 * The MSN might be for a later WQE than the PSN indicates so
+	 * only complete WQEs that the PSN finishes.
+	 */
+	while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
+		/*
+		 * RDMA_READ_RESPONSE_ONLY is a special case since
+		 * we want to generate completion events for everything
+		 * before the RDMA read, copy the data, then generate
+		 * the completion for the read.
+		 */
+		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+		    diff == 0) {
+			ret = 1;
+			goto bail;
+		}
+		/*
+		 * If this request is a RDMA read or atomic, and the ACK is
+		 * for a later operation, this ACK NAKs the RDMA read or
+		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
+		 * can ACK a RDMA read and likewise for atomic ops.  Note
+		 * that the NAK case can only happen if relaxed ordering is
+		 * used and requests are sent after an RDMA read or atomic
+		 * is sent but before the response is received.
+		 */
+		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+			/*
+			 * The last valid PSN seen is the previous
+			 * request's.
+			 */
+			update_last_psn(qp, wqe->psn - 1);
+			/* Retry this request. */
+			ipath_restart_rc(qp, wqe->psn);
+			/*
+			 * No need to process the ACK/NAK since we are
+			 * restarting an earlier request.
+			 */
+			goto bail;
+		}
+		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+			*(u64 *) wqe->sg_list[0].vaddr = val;
+		if (qp->s_num_rd_atomic &&
+		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+			qp->s_num_rd_atomic--;
+			/* Restart sending task if fence is complete */
+			if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
+			     !qp->s_num_rd_atomic) ||
+			    qp->s_flags & IPATH_S_RDMAR_PENDING)
+				ipath_schedule_send(qp);
+		}
+		/* Post a send completion queue entry if requested. */
+		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+			memset(&wc, 0, sizeof wc);
+			wc.wr_id = wqe->wr.wr_id;
+			wc.status = IB_WC_SUCCESS;
+			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+			wc.byte_len = wqe->length;
+			wc.qp = &qp->ibqp;
+			wc.src_qp = qp->remote_qpn;
+			wc.slid = qp->remote_ah_attr.dlid;
+			wc.sl = qp->remote_ah_attr.sl;
+			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+		}
+		qp->s_retry = qp->s_retry_cnt;
+		/*
+		 * If we are completing a request which is in the process of
+		 * being resent, we can stop resending it since we know the
+		 * responder has already seen it.
+		 */
+		if (qp->s_last == qp->s_cur) {
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			qp->s_last = qp->s_cur;
+			if (qp->s_last == qp->s_tail)
+				break;
+			wqe = get_swqe_ptr(qp, qp->s_cur);
+			qp->s_state = OP(SEND_LAST);
+			qp->s_psn = wqe->psn;
+		} else {
+			if (++qp->s_last >= qp->s_size)
+				qp->s_last = 0;
+			if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
+				qp->s_draining = 0;
+			if (qp->s_last == qp->s_tail)
+				break;
+			wqe = get_swqe_ptr(qp, qp->s_last);
+		}
+	}
+
+	switch (aeth >> 29) {
+	case 0:		/* ACK */
+		dev->n_rc_acks++;
+		/* If this is a partial ACK, reset the retransmit timer. */
+		if (qp->s_last != qp->s_tail) {
+			spin_lock(&dev->pending_lock);
+			if (list_empty(&qp->timerwait))
+				list_add_tail(&qp->timerwait,
+					&dev->pending[dev->pending_index]);
+			spin_unlock(&dev->pending_lock);
+			/*
+			 * If we get a partial ACK for a resent operation,
+			 * we can stop resending the earlier packets and
+			 * continue with the next packet the receiver wants.
+			 */
+			if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+				reset_psn(qp, psn + 1);
+				ipath_schedule_send(qp);
+			}
+		} else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+			qp->s_state = OP(SEND_LAST);
+			qp->s_psn = psn + 1;
+		}
+		ipath_get_credit(qp, aeth);
+		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+		qp->s_retry = qp->s_retry_cnt;
+		update_last_psn(qp, psn);
+		ret = 1;
+		goto bail;
+
+	case 1:		/* RNR NAK */
+		dev->n_rnr_naks++;
+		if (qp->s_last == qp->s_tail)
+			goto bail;
+		if (qp->s_rnr_retry == 0) {
+			status = IB_WC_RNR_RETRY_EXC_ERR;
+			goto class_b;
+		}
+		if (qp->s_rnr_retry_cnt < 7)
+			qp->s_rnr_retry--;
+
+		/* The last valid PSN is the previous PSN. */
+		update_last_psn(qp, psn - 1);
+
+		if (wqe->wr.opcode == IB_WR_RDMA_READ)
+			dev->n_rc_resends++;
+		else
+			dev->n_rc_resends +=
+				(qp->s_psn - psn) & IPATH_PSN_MASK;
+
+		reset_psn(qp, psn);
+
+		qp->s_rnr_timeout =
+			ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
+					   IPATH_AETH_CREDIT_MASK];
+		ipath_insert_rnr_queue(qp);
+		ipath_schedule_send(qp);
+		goto bail;
+
+	case 3:		/* NAK */
+		if (qp->s_last == qp->s_tail)
+			goto bail;
+		/* The last valid PSN is the previous PSN. */
+		update_last_psn(qp, psn - 1);
+		switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
+			IPATH_AETH_CREDIT_MASK) {
+		case 0:	/* PSN sequence error */
+			dev->n_seq_naks++;
+			/*
+			 * Back up to the responder's expected PSN.
+			 * Note that we might get a NAK in the middle of an
+			 * RDMA READ response which terminates the RDMA
+			 * READ.
+			 */
+			ipath_restart_rc(qp, psn);
+			break;
+
+		case 1:	/* Invalid Request */
+			status = IB_WC_REM_INV_REQ_ERR;
+			dev->n_other_naks++;
+			goto class_b;
+
+		case 2:	/* Remote Access Error */
+			status = IB_WC_REM_ACCESS_ERR;
+			dev->n_other_naks++;
+			goto class_b;
+
+		case 3:	/* Remote Operation Error */
+			status = IB_WC_REM_OP_ERR;
+			dev->n_other_naks++;
+		class_b:
+			ipath_send_complete(qp, wqe, status);
+			ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			break;
+
+		default:
+			/* Ignore other reserved NAK error codes */
+			goto reserved;
+		}
+		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+		goto bail;
+
+	default:		/* 2: reserved */
+	reserved:
+		/* Ignore reserved NAK codes. */
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_rc_rcv_resp - process an incoming RC response packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
+				     struct ipath_other_headers *ohdr,
+				     void *data, u32 tlen,
+				     struct ipath_qp *qp,
+				     u32 opcode,
+				     u32 psn, u32 hdrsize, u32 pmtu,
+				     int header_in_data)
+{
+	struct ipath_swqe *wqe;
+	enum ib_wc_status status;
+	unsigned long flags;
+	int diff;
+	u32 pad;
+	u32 aeth;
+	u64 val;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Double check we can process this now that we hold the s_lock. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+		goto ack_done;
+
+	/* Ignore invalid responses. */
+	if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
+		goto ack_done;
+
+	/* Ignore duplicate responses. */
+	diff = ipath_cmp24(psn, qp->s_last_psn);
+	if (unlikely(diff <= 0)) {
+		/* Update credits for "ghost" ACKs */
+		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+			if (!header_in_data)
+				aeth = be32_to_cpu(ohdr->u.aeth);
+			else {
+				aeth = be32_to_cpu(((__be32 *) data)[0]);
+				data += sizeof(__be32);
+			}
+			if ((aeth >> 29) == 0)
+				ipath_get_credit(qp, aeth);
+		}
+		goto ack_done;
+	}
+
+	if (unlikely(qp->s_last == qp->s_tail))
+		goto ack_done;
+	wqe = get_swqe_ptr(qp, qp->s_last);
+	status = IB_WC_SUCCESS;
+
+	switch (opcode) {
+	case OP(ACKNOWLEDGE):
+	case OP(ATOMIC_ACKNOWLEDGE):
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		if (!header_in_data)
+			aeth = be32_to_cpu(ohdr->u.aeth);
+		else {
+			aeth = be32_to_cpu(((__be32 *) data)[0]);
+			data += sizeof(__be32);
+		}
+		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+			if (!header_in_data) {
+				__be32 *p = ohdr->u.at.atomic_ack_eth;
+
+				val = ((u64) be32_to_cpu(p[0]) << 32) |
+					be32_to_cpu(p[1]);
+			} else
+				val = be64_to_cpu(((__be64 *) data)[0]);
+		} else
+			val = 0;
+		if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
+		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
+			goto ack_done;
+		hdrsize += 4;
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+		qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
+		/*
+		 * If this is a response to a resent RDMA read, we
+		 * have to be careful to copy the data to the right
+		 * location.
+		 */
+		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+						  wqe, psn, pmtu);
+		goto read_middle;
+
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		/* no AETH, no ACK */
+		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+			dev->n_rdma_seq++;
+			if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+				goto ack_done;
+			qp->r_flags |= IPATH_R_RDMAR_SEQ;
+			ipath_restart_rc(qp, qp->s_last_psn + 1);
+			goto ack_done;
+		}
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+	read_middle:
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto ack_len_err;
+		if (unlikely(pmtu >= qp->s_rdma_read_len))
+			goto ack_len_err;
+
+		/* We got a response so update the timeout. */
+		spin_lock(&dev->pending_lock);
+		if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
+			list_move_tail(&qp->timerwait,
+				       &dev->pending[dev->pending_index]);
+		spin_unlock(&dev->pending_lock);
+
+		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+			qp->s_retry = qp->s_retry_cnt;
+
+		/*
+		 * Update the RDMA receive state but do the copy w/o
+		 * holding the locks and blocking interrupts.
+		 */
+		qp->s_rdma_read_len -= pmtu;
+		update_last_psn(qp, psn);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
+		goto bail;
+
+	case OP(RDMA_READ_RESPONSE_ONLY):
+		if (!header_in_data)
+			aeth = be32_to_cpu(ohdr->u.aeth);
+		else
+			aeth = be32_to_cpu(((__be32 *) data)[0]);
+		if (!do_rc_ack(qp, aeth, psn, opcode, 0))
+			goto ack_done;
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/*
+		 * Check that the data size is >= 0 && <= pmtu.
+		 * Remember to account for the AETH header (4) and
+		 * ICRC (4).
+		 */
+		if (unlikely(tlen < (hdrsize + pad + 8)))
+			goto ack_len_err;
+		/*
+		 * If this is a response to a resent RDMA read, we
+		 * have to be careful to copy the data to the right
+		 * location.
+		 */
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+						  wqe, psn, pmtu);
+		goto read_last;
+
+	case OP(RDMA_READ_RESPONSE_LAST):
+		/* ACKs READ req. */
+		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+			dev->n_rdma_seq++;
+			if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+				goto ack_done;
+			qp->r_flags |= IPATH_R_RDMAR_SEQ;
+			ipath_restart_rc(qp, qp->s_last_psn + 1);
+			goto ack_done;
+		}
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/*
+		 * Check that the data size is >= 1 && <= pmtu.
+		 * Remember to account for the AETH header (4) and
+		 * ICRC (4).
+		 */
+		if (unlikely(tlen <= (hdrsize + pad + 8)))
+			goto ack_len_err;
+	read_last:
+		tlen -= hdrsize + pad + 8;
+		if (unlikely(tlen != qp->s_rdma_read_len))
+			goto ack_len_err;
+		if (!header_in_data)
+			aeth = be32_to_cpu(ohdr->u.aeth);
+		else {
+			aeth = be32_to_cpu(((__be32 *) data)[0]);
+			data += sizeof(__be32);
+		}
+		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
+		(void) do_rc_ack(qp, aeth, psn,
+				 OP(RDMA_READ_RESPONSE_LAST), 0);
+		goto ack_done;
+	}
+
+ack_op_err:
+	status = IB_WC_LOC_QP_OP_ERR;
+	goto ack_err;
+
+ack_len_err:
+	status = IB_WC_LOC_LEN_ERR;
+ack_err:
+	ipath_send_complete(qp, wqe, status);
+	ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ack_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+	return;
+}
+
+/**
+ * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
+				     struct ipath_other_headers *ohdr,
+				     void *data,
+				     struct ipath_qp *qp,
+				     u32 opcode,
+				     u32 psn,
+				     int diff,
+				     int header_in_data)
+{
+	struct ipath_ack_entry *e;
+	u8 i, prev;
+	int old_req;
+	unsigned long flags;
+
+	if (diff > 0) {
+		/*
+		 * Packet sequence error.
+		 * A NAK will ACK earlier sends and RDMA writes.
+		 * Don't queue the NAK if we already sent one.
+		 */
+		if (!qp->r_nak_state) {
+			qp->r_nak_state = IB_NAK_PSN_ERROR;
+			/* Use the expected PSN. */
+			qp->r_ack_psn = qp->r_psn;
+			goto send_ack;
+		}
+		goto done;
+	}
+
+	/*
+	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
+	 * write or atomic op.  Don't NAK errors, just silently drop
+	 * the duplicate request.  Note that r_sge, r_len, and
+	 * r_rcv_len may be in use so don't modify them.
+	 *
+	 * We are supposed to ACK the earliest duplicate PSN but we
+	 * can coalesce an outstanding duplicate ACK.  We have to
+	 * send the earliest so that RDMA reads can be restarted at
+	 * the requester's expected PSN.
+	 *
+	 * First, find where this duplicate PSN falls within the
+	 * ACKs previously sent.
+	 */
+	psn &= IPATH_PSN_MASK;
+	e = NULL;
+	old_req = 1;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	/* Double check we can process this now that we hold the s_lock. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+		goto unlock_done;
+
+	for (i = qp->r_head_ack_queue; ; i = prev) {
+		if (i == qp->s_tail_ack_queue)
+			old_req = 0;
+		if (i)
+			prev = i - 1;
+		else
+			prev = IPATH_MAX_RDMA_ATOMIC;
+		if (prev == qp->r_head_ack_queue) {
+			e = NULL;
+			break;
+		}
+		e = &qp->s_ack_queue[prev];
+		if (!e->opcode) {
+			e = NULL;
+			break;
+		}
+		if (ipath_cmp24(psn, e->psn) >= 0) {
+			if (prev == qp->s_tail_ack_queue)
+				old_req = 0;
+			break;
+		}
+	}
+	switch (opcode) {
+	case OP(RDMA_READ_REQUEST): {
+		struct ib_reth *reth;
+		u32 offset;
+		u32 len;
+
+		/*
+		 * If we didn't find the RDMA read request in the ack queue,
+		 * or the send tasklet is already backed up to send an
+		 * earlier entry, we can ignore this request.
+		 */
+		if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
+			goto unlock_done;
+		/* RETH comes after BTH */
+		if (!header_in_data)
+			reth = &ohdr->u.rc.reth;
+		else {
+			reth = (struct ib_reth *)data;
+			data += sizeof(*reth);
+		}
+		/*
+		 * Address range must be a subset of the original
+		 * request and start on pmtu boundaries.
+		 * We reuse the old ack_queue slot since the requester
+		 * should not back up and request an earlier PSN for the
+		 * same request.
+		 */
+		offset = ((psn - e->psn) & IPATH_PSN_MASK) *
+			ib_mtu_enum_to_int(qp->path_mtu);
+		len = be32_to_cpu(reth->length);
+		if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
+			goto unlock_done;
+		if (len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			ok = ipath_rkey_ok(qp, &e->rdma_sge,
+					   len, vaddr, rkey,
+					   IB_ACCESS_REMOTE_READ);
+			if (unlikely(!ok))
+				goto unlock_done;
+		} else {
+			e->rdma_sge.sg_list = NULL;
+			e->rdma_sge.num_sge = 0;
+			e->rdma_sge.sge.mr = NULL;
+			e->rdma_sge.sge.vaddr = NULL;
+			e->rdma_sge.sge.length = 0;
+			e->rdma_sge.sge.sge_length = 0;
+		}
+		e->psn = psn;
+		qp->s_ack_state = OP(ACKNOWLEDGE);
+		qp->s_tail_ack_queue = prev;
+		break;
+	}
+
+	case OP(COMPARE_SWAP):
+	case OP(FETCH_ADD): {
+		/*
+		 * If we didn't find the atomic request in the ack queue
+		 * or the send tasklet is already backed up to send an
+		 * earlier entry, we can ignore this request.
+		 */
+		if (!e || e->opcode != (u8) opcode || old_req)
+			goto unlock_done;
+		qp->s_ack_state = OP(ACKNOWLEDGE);
+		qp->s_tail_ack_queue = prev;
+		break;
+	}
+
+	default:
+		if (old_req)
+			goto unlock_done;
+		/*
+		 * Resend the most recent ACK if this request is
+		 * after all the previous RDMA reads and atomics.
+		 */
+		if (i == qp->r_head_ack_queue) {
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qp->r_nak_state = 0;
+			qp->r_ack_psn = qp->r_psn - 1;
+			goto send_ack;
+		}
+		/*
+		 * Try to send a simple ACK to work around a Mellanox bug
+		 * which doesn't accept a RDMA read response or atomic
+		 * response as an ACK for earlier SENDs or RDMA writes.
+		 */
+		if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
+		    !(qp->s_flags & IPATH_S_ACK_PENDING) &&
+		    qp->s_ack_state == OP(ACKNOWLEDGE)) {
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qp->r_nak_state = 0;
+			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+			goto send_ack;
+		}
+		/*
+		 * Resend the RDMA read or atomic op which
+		 * ACKs this duplicate request.
+		 */
+		qp->s_ack_state = OP(ACKNOWLEDGE);
+		qp->s_tail_ack_queue = i;
+		break;
+	}
+	qp->r_nak_state = 0;
+	ipath_schedule_send(qp);
+
+unlock_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return 1;
+
+send_ack:
+	return 0;
+}
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
+{
+	unsigned long flags;
+	int lastwqe;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	lastwqe = ipath_error_qp(qp, err);
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	if (lastwqe) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+}
+
+static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
+{
+	unsigned next;
+
+	next = n + 1;
+	if (next > IPATH_MAX_RDMA_ATOMIC)
+		next = 0;
+	if (n == qp->s_tail_ack_queue) {
+		qp->s_tail_ack_queue = next;
+		qp->s_ack_state = OP(ACKNOWLEDGE);
+	}
+}
+
+/**
+ * ipath_rc_rcv - process an incoming RC packet
+ * @dev: the device this packet came in on
+ * @hdr: the header of this packet
+ * @has_grh: true if the header has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from ipath_qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+	struct ipath_other_headers *ohdr;
+	u32 opcode;
+	u32 hdrsize;
+	u32 psn;
+	u32 pad;
+	struct ib_wc wc;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	int diff;
+	struct ib_reth *reth;
+	int header_in_data;
+	unsigned long flags;
+
+	/* Validate the SLID. See Ch. 9.6.1.5 */
+	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+		goto done;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12;	/* LRH + BTH */
+		psn = be32_to_cpu(ohdr->bth[2]);
+		header_in_data = 0;
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
+		/*
+		 * The header with GRH is 60 bytes and the core driver sets
+		 * the eager header buffer size to 56 bytes so the last 4
+		 * bytes of the BTH header (PSN) is in the data buffer.
+		 */
+		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+		if (header_in_data) {
+			psn = be32_to_cpu(((__be32 *) data)[0]);
+			data += sizeof(__be32);
+		} else
+			psn = be32_to_cpu(ohdr->bth[2]);
+	}
+
+	/*
+	 * Process responses (ACKs) before anything else.  Note that the
+	 * packet sequence number will be for something in the send work
+	 * queue rather than the expected receive packet sequence number.
+	 * In other words, this QP is the requester.
+	 */
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+		ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
+				  hdrsize, pmtu, header_in_data);
+		goto done;
+	}
+
+	/* Compute 24 bits worth of difference. */
+	diff = ipath_cmp24(psn, qp->r_psn);
+	if (unlikely(diff)) {
+		if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
+				       psn, diff, header_in_data))
+			goto done;
+		goto send_ack;
+	}
+
+	/* Check for opcode sequence errors. */
+	switch (qp->r_state) {
+	case OP(SEND_FIRST):
+	case OP(SEND_MIDDLE):
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+			break;
+		goto nack_inv;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_MIDDLE):
+		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			break;
+		goto nack_inv;
+
+	default:
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			goto nack_inv;
+		/*
+		 * Note that it is up to the requester to not send a new
+		 * RDMA read or atomic operation before receiving an ACK
+		 * for the previous operation.
+		 */
+		break;
+	}
+
+	memset(&wc, 0, sizeof wc);
+
+	/* OK, process the packet. */
+	switch (opcode) {
+	case OP(SEND_FIRST):
+		if (!ipath_get_rwqe(qp, 0))
+			goto rnr_nak;
+		qp->r_rcv_len = 0;
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+	case OP(RDMA_WRITE_MIDDLE):
+	send_middle:
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto nack_inv;
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len))
+			goto nack_inv;
+		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		break;
+
+	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+		/* consume RWQE */
+		if (!ipath_get_rwqe(qp, 1))
+			goto rnr_nak;
+		goto send_last_imm;
+
+	case OP(SEND_ONLY):
+	case OP(SEND_ONLY_WITH_IMMEDIATE):
+		if (!ipath_get_rwqe(qp, 0))
+			goto rnr_nak;
+		qp->r_rcv_len = 0;
+		if (opcode == OP(SEND_ONLY))
+			goto send_last;
+		/* FALLTHROUGH */
+	case OP(SEND_LAST_WITH_IMMEDIATE):
+	send_last_imm:
+		if (header_in_data) {
+			wc.ex.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else {
+			/* Immediate data comes after BTH */
+			wc.ex.imm_data = ohdr->u.imm_data;
+		}
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		/* FALLTHROUGH */
+	case OP(SEND_LAST):
+	case OP(RDMA_WRITE_LAST):
+	send_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto nack_inv;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		wc.byte_len = tlen + qp->r_rcv_len;
+		if (unlikely(wc.byte_len > qp->r_len))
+			goto nack_inv;
+		ipath_copy_sge(&qp->r_sge, data, tlen);
+		qp->r_msn++;
+		if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+			break;
+		wc.wr_id = qp->r_wr_id;
+		wc.status = IB_WC_SUCCESS;
+		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		else
+			wc.opcode = IB_WC_RECV;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = qp->remote_qpn;
+		wc.slid = qp->remote_ah_attr.dlid;
+		wc.sl = qp->remote_ah_attr.sl;
+		/* Signal completion event if the solicited bit is set. */
+		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+			       (ohdr->bth[0] &
+				cpu_to_be32(1 << 23)) != 0);
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_ONLY):
+	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_WRITE)))
+			goto nack_inv;
+		/* consume RWQE */
+		/* RETH comes after BTH */
+		if (!header_in_data)
+			reth = &ohdr->u.rc.reth;
+		else {
+			reth = (struct ib_reth *)data;
+			data += sizeof(*reth);
+		}
+		hdrsize += sizeof(*reth);
+		qp->r_len = be32_to_cpu(reth->length);
+		qp->r_rcv_len = 0;
+		if (qp->r_len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey & NAK */
+			ok = ipath_rkey_ok(qp, &qp->r_sge,
+					   qp->r_len, vaddr, rkey,
+					   IB_ACCESS_REMOTE_WRITE);
+			if (unlikely(!ok))
+				goto nack_acc;
+		} else {
+			qp->r_sge.sg_list = NULL;
+			qp->r_sge.sge.mr = NULL;
+			qp->r_sge.sge.vaddr = NULL;
+			qp->r_sge.sge.length = 0;
+			qp->r_sge.sge.sge_length = 0;
+		}
+		if (opcode == OP(RDMA_WRITE_FIRST))
+			goto send_middle;
+		else if (opcode == OP(RDMA_WRITE_ONLY))
+			goto send_last;
+		if (!ipath_get_rwqe(qp, 1))
+			goto rnr_nak;
+		goto send_last_imm;
+
+	case OP(RDMA_READ_REQUEST): {
+		struct ipath_ack_entry *e;
+		u32 len;
+		u8 next;
+
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_READ)))
+			goto nack_inv;
+		next = qp->r_head_ack_queue + 1;
+		if (next > IPATH_MAX_RDMA_ATOMIC)
+			next = 0;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		/* Double check we can process this while holding the s_lock. */
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+			goto unlock;
+		if (unlikely(next == qp->s_tail_ack_queue)) {
+			if (!qp->s_ack_queue[next].sent)
+				goto nack_inv_unlck;
+			ipath_update_ack_queue(qp, next);
+		}
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		/* RETH comes after BTH */
+		if (!header_in_data)
+			reth = &ohdr->u.rc.reth;
+		else {
+			reth = (struct ib_reth *)data;
+			data += sizeof(*reth);
+		}
+		len = be32_to_cpu(reth->length);
+		if (len) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey & NAK */
+			ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+					   rkey, IB_ACCESS_REMOTE_READ);
+			if (unlikely(!ok))
+				goto nack_acc_unlck;
+			/*
+			 * Update the next expected PSN.  We add 1 later
+			 * below, so only add the remainder here.
+			 */
+			if (len > pmtu)
+				qp->r_psn += (len - 1) / pmtu;
+		} else {
+			e->rdma_sge.sg_list = NULL;
+			e->rdma_sge.num_sge = 0;
+			e->rdma_sge.sge.mr = NULL;
+			e->rdma_sge.sge.vaddr = NULL;
+			e->rdma_sge.sge.length = 0;
+			e->rdma_sge.sge.sge_length = 0;
+		}
+		e->opcode = opcode;
+		e->sent = 0;
+		e->psn = psn;
+		/*
+		 * We need to increment the MSN here instead of when we
+		 * finish sending the result since a duplicate request would
+		 * increment it more than once.
+		 */
+		qp->r_msn++;
+		qp->r_psn++;
+		qp->r_state = opcode;
+		qp->r_nak_state = 0;
+		qp->r_head_ack_queue = next;
+
+		/* Schedule the send tasklet. */
+		ipath_schedule_send(qp);
+
+		goto unlock;
+	}
+
+	case OP(COMPARE_SWAP):
+	case OP(FETCH_ADD): {
+		struct ib_atomic_eth *ateth;
+		struct ipath_ack_entry *e;
+		u64 vaddr;
+		atomic64_t *maddr;
+		u64 sdata;
+		u32 rkey;
+		u8 next;
+
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_ATOMIC)))
+			goto nack_inv;
+		next = qp->r_head_ack_queue + 1;
+		if (next > IPATH_MAX_RDMA_ATOMIC)
+			next = 0;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		/* Double check we can process this while holding the s_lock. */
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+			goto unlock;
+		if (unlikely(next == qp->s_tail_ack_queue)) {
+			if (!qp->s_ack_queue[next].sent)
+				goto nack_inv_unlck;
+			ipath_update_ack_queue(qp, next);
+		}
+		if (!header_in_data)
+			ateth = &ohdr->u.atomic_eth;
+		else
+			ateth = (struct ib_atomic_eth *)data;
+		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+			be32_to_cpu(ateth->vaddr[1]);
+		if (unlikely(vaddr & (sizeof(u64) - 1)))
+			goto nack_inv_unlck;
+		rkey = be32_to_cpu(ateth->rkey);
+		/* Check rkey & NAK */
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
+					    sizeof(u64), vaddr, rkey,
+					    IB_ACCESS_REMOTE_ATOMIC)))
+			goto nack_acc_unlck;
+		/* Perform atomic OP and save result. */
+		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+		sdata = be64_to_cpu(ateth->swap_data);
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+			(u64) atomic64_add_return(sdata, maddr) - sdata :
+			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+				      be64_to_cpu(ateth->compare_data),
+				      sdata);
+		e->opcode = opcode;
+		e->sent = 0;
+		e->psn = psn & IPATH_PSN_MASK;
+		qp->r_msn++;
+		qp->r_psn++;
+		qp->r_state = opcode;
+		qp->r_nak_state = 0;
+		qp->r_head_ack_queue = next;
+
+		/* Schedule the send tasklet. */
+		ipath_schedule_send(qp);
+
+		goto unlock;
+	}
+
+	default:
+		/* NAK unknown opcodes. */
+		goto nack_inv;
+	}
+	qp->r_psn++;
+	qp->r_state = opcode;
+	qp->r_ack_psn = psn;
+	qp->r_nak_state = 0;
+	/* Send an ACK if requested or required. */
+	if (psn & (1 << 31))
+		goto send_ack;
+	goto done;
+
+rnr_nak:
+	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+	qp->r_ack_psn = qp->r_psn;
+	goto send_ack;
+
+nack_inv_unlck:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+	ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+	qp->r_ack_psn = qp->r_psn;
+	goto send_ack;
+
+nack_acc_unlck:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+	ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+send_ack:
+	send_rc_ack(qp);
+	goto done;
+
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_registers.h b/drivers/staging/rdma/ipath/ipath_registers.h
new file mode 100644
index 0000000..8f44d0c
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_registers.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_REGISTERS_H
+#define _IPATH_REGISTERS_H
+
+/*
+ * This file should only be included by kernel source, and by the diags.  It
+ * defines the registers, and their contents, for InfiniPath chips.
+ */
+
+/*
+ * These are the InfiniPath register and buffer bit definitions,
+ * that are visible to software, and needed only by the kernel
+ * and diag code.  A few, that are visible to protocol and user
+ * code are in ipath_common.h.  Some bits are specific
+ * to a given chip implementation, and have been moved to the
+ * chip-specific source file
+ */
+
+/* kr_revision bits */
+#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
+#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
+#define INFINIPATH_R_ARCH_MASK 0xFF
+#define INFINIPATH_R_ARCH_SHIFT 16
+#define INFINIPATH_R_SOFTWARE_MASK 0xFF
+#define INFINIPATH_R_SOFTWARE_SHIFT 24
+#define INFINIPATH_R_BOARDID_MASK 0xFF
+#define INFINIPATH_R_BOARDID_SHIFT 32
+
+/* kr_control bits */
+#define INFINIPATH_C_FREEZEMODE 0x00000002
+#define INFINIPATH_C_LINKENABLE 0x00000004
+
+/* kr_sendctrl bits */
+#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
+#define INFINIPATH_S_UPDTHRESH_SHIFT 24
+#define INFINIPATH_S_UPDTHRESH_MASK 0x1f
+
+#define IPATH_S_ABORT		0
+#define IPATH_S_PIOINTBUFAVAIL	1
+#define IPATH_S_PIOBUFAVAILUPD	2
+#define IPATH_S_PIOENABLE	3
+#define IPATH_S_SDMAINTENABLE	9
+#define IPATH_S_SDMASINGLEDESCRIPTOR	10
+#define IPATH_S_SDMAENABLE	11
+#define IPATH_S_SDMAHALT	12
+#define IPATH_S_DISARM		31
+
+#define INFINIPATH_S_ABORT		(1U << IPATH_S_ABORT)
+#define INFINIPATH_S_PIOINTBUFAVAIL	(1U << IPATH_S_PIOINTBUFAVAIL)
+#define INFINIPATH_S_PIOBUFAVAILUPD	(1U << IPATH_S_PIOBUFAVAILUPD)
+#define INFINIPATH_S_PIOENABLE		(1U << IPATH_S_PIOENABLE)
+#define INFINIPATH_S_SDMAINTENABLE	(1U << IPATH_S_SDMAINTENABLE)
+#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
+					(1U << IPATH_S_SDMASINGLEDESCRIPTOR)
+#define INFINIPATH_S_SDMAENABLE		(1U << IPATH_S_SDMAENABLE)
+#define INFINIPATH_S_SDMAHALT		(1U << IPATH_S_SDMAHALT)
+#define INFINIPATH_S_DISARM		(1U << IPATH_S_DISARM)
+
+/* kr_rcvctrl bits that are the same on multiple chips */
+#define INFINIPATH_R_PORTENABLE_SHIFT 0
+#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_SDMAINT		0x8000000000000000ULL
+#define INFINIPATH_I_SDMADISABLED	0x4000000000000000ULL
+#define INFINIPATH_I_ERROR		0x0000000080000000ULL
+#define INFINIPATH_I_SPIOSENT		0x0000000040000000ULL
+#define INFINIPATH_I_SPIOBUFAVAIL	0x0000000020000000ULL
+#define INFINIPATH_I_GPIO		0x0000000010000000ULL
+#define INFINIPATH_I_JINT		0x0000000004000000ULL
+
+/* kr_errorstatus, kr_errorclear, kr_errormask bits */
+#define INFINIPATH_E_RFORMATERR			0x0000000000000001ULL
+#define INFINIPATH_E_RVCRC			0x0000000000000002ULL
+#define INFINIPATH_E_RICRC			0x0000000000000004ULL
+#define INFINIPATH_E_RMINPKTLEN			0x0000000000000008ULL
+#define INFINIPATH_E_RMAXPKTLEN			0x0000000000000010ULL
+#define INFINIPATH_E_RLONGPKTLEN		0x0000000000000020ULL
+#define INFINIPATH_E_RSHORTPKTLEN		0x0000000000000040ULL
+#define INFINIPATH_E_RUNEXPCHAR			0x0000000000000080ULL
+#define INFINIPATH_E_RUNSUPVL			0x0000000000000100ULL
+#define INFINIPATH_E_REBP			0x0000000000000200ULL
+#define INFINIPATH_E_RIBFLOW			0x0000000000000400ULL
+#define INFINIPATH_E_RBADVERSION		0x0000000000000800ULL
+#define INFINIPATH_E_RRCVEGRFULL		0x0000000000001000ULL
+#define INFINIPATH_E_RRCVHDRFULL		0x0000000000002000ULL
+#define INFINIPATH_E_RBADTID			0x0000000000004000ULL
+#define INFINIPATH_E_RHDRLEN			0x0000000000008000ULL
+#define INFINIPATH_E_RHDR			0x0000000000010000ULL
+#define INFINIPATH_E_RIBLOSTLINK		0x0000000000020000ULL
+#define INFINIPATH_E_SENDSPECIALTRIGGER		0x0000000008000000ULL
+#define INFINIPATH_E_SDMADISABLED		0x0000000010000000ULL
+#define INFINIPATH_E_SMINPKTLEN			0x0000000020000000ULL
+#define INFINIPATH_E_SMAXPKTLEN			0x0000000040000000ULL
+#define INFINIPATH_E_SUNDERRUN			0x0000000080000000ULL
+#define INFINIPATH_E_SPKTLEN			0x0000000100000000ULL
+#define INFINIPATH_E_SDROPPEDSMPPKT		0x0000000200000000ULL
+#define INFINIPATH_E_SDROPPEDDATAPKT		0x0000000400000000ULL
+#define INFINIPATH_E_SPIOARMLAUNCH		0x0000000800000000ULL
+#define INFINIPATH_E_SUNEXPERRPKTNUM		0x0000001000000000ULL
+#define INFINIPATH_E_SUNSUPVL			0x0000002000000000ULL
+#define INFINIPATH_E_SENDBUFMISUSE		0x0000004000000000ULL
+#define INFINIPATH_E_SDMAGENMISMATCH		0x0000008000000000ULL
+#define INFINIPATH_E_SDMAOUTOFBOUND		0x0000010000000000ULL
+#define INFINIPATH_E_SDMATAILOUTOFBOUND		0x0000020000000000ULL
+#define INFINIPATH_E_SDMABASE			0x0000040000000000ULL
+#define INFINIPATH_E_SDMA1STDESC		0x0000080000000000ULL
+#define INFINIPATH_E_SDMARPYTAG			0x0000100000000000ULL
+#define INFINIPATH_E_SDMADWEN			0x0000200000000000ULL
+#define INFINIPATH_E_SDMAMISSINGDW		0x0000400000000000ULL
+#define INFINIPATH_E_SDMAUNEXPDATA		0x0000800000000000ULL
+#define INFINIPATH_E_IBSTATUSCHANGED		0x0001000000000000ULL
+#define INFINIPATH_E_INVALIDADDR		0x0002000000000000ULL
+#define INFINIPATH_E_RESET			0x0004000000000000ULL
+#define INFINIPATH_E_HARDWARE			0x0008000000000000ULL
+#define INFINIPATH_E_SDMADESCADDRMISALIGN	0x0010000000000000ULL
+#define INFINIPATH_E_INVALIDEEPCMD		0x0020000000000000ULL
+
+/*
+ * this is used to print "common" packet errors only when the
+ * __IPATH_ERRPKTDBG bit is set in ipath_debug.
+ */
+#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \
+		| INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \
+		| INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \
+		| INFINIPATH_E_REBP )
+
+/* Convenience for decoding Send DMA errors */
+#define INFINIPATH_E_SDMAERRS ( \
+	INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \
+	INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \
+	INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \
+	INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \
+	INFINIPATH_E_SDMAUNEXPDATA | \
+	INFINIPATH_E_SDMADESCADDRMISALIGN | \
+	INFINIPATH_E_SDMADISABLED | \
+	INFINIPATH_E_SENDBUFMISUSE)
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
+ * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2:  expTID, 3: eagerTID
+ * 		bit 4: flag buffer, 5: datainfo, 6: header info */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF	0x1ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC	0x2ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
+/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF   0x01ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ  0x02ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID   0x04ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF  0x10ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO  0x40ULL
+/* waldo specific -- find the rest in ipath_6110.c */
+#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
+/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */
+#define INFINIPATH_HWE_MEMBISTFAILED	0x0040000000000000ULL
+
+/* kr_hwdiagctrl bits */
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR  0x0000000400000000ULL
+#define INFINIPATH_DC_COUNTERDISABLE            0x1000000000000000ULL
+#define INFINIPATH_DC_COUNTERWREN               0x2000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+
+/* kr_ibcctrl bits */
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
+#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define INFINIPATH_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3
+#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
+#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKCMD_DOWN 1		/* move to 0x11 */
+#define INFINIPATH_IBCC_LINKCMD_ARMED 2		/* move to 0x21 */
+#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3	/* move to 0x31 */
+#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
+#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
+#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
+#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
+#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
+#define INFINIPATH_IBCC_LOOPBACK             0x8000000000000000ULL
+#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
+
+/* kr_ibcstatus bits */
+#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
+#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
+
+#define INFINIPATH_IBCS_TXREADY       0x40000000
+#define INFINIPATH_IBCS_TXCREDITOK    0x80000000
+/* link training states (shift by
+   INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
+#define INFINIPATH_IBCS_LT_STATE_DISABLED	0x00
+#define INFINIPATH_IBCS_LT_STATE_LINKUP		0x01
+#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE	0x02
+#define INFINIPATH_IBCS_LT_STATE_POLLQUIET	0x03
+#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY	0x04
+#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET	0x05
+#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE	0x08
+#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG	0x09
+#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT	0x0a
+#define INFINIPATH_IBCS_LT_STATE_CFGIDLE	0x0b
+#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN	0x0c
+#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT	0x0e
+#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE	0x0f
+/* link state machine states (shift by ibcs_ls_shift) */
+#define INFINIPATH_IBCS_L_STATE_DOWN		0x0
+#define INFINIPATH_IBCS_L_STATE_INIT		0x1
+#define INFINIPATH_IBCS_L_STATE_ARM		0x2
+#define INFINIPATH_IBCS_L_STATE_ACTIVE		0x3
+#define INFINIPATH_IBCS_L_STATE_ACT_DEFER	0x4
+
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
+#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
+#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
+
+/* kr_extctrl bits */
+#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
+#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
+#define INFINIPATH_EXTC_SERDESENABLE         0x80000000ULL
+#define INFINIPATH_EXTC_SERDESCONNECT        0x40000000ULL
+#define INFINIPATH_EXTC_SERDESENTRUNKING     0x20000000ULL
+#define INFINIPATH_EXTC_SERDESDISRXFIFO      0x10000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK1       0x08000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK2       0x04000000ULL
+#define INFINIPATH_EXTC_SERDESENENCDEC       0x02000000ULL
+#define INFINIPATH_EXTC_LED1SECPORT_ON       0x00000020ULL
+#define INFINIPATH_EXTC_LED2SECPORT_ON       0x00000010ULL
+#define INFINIPATH_EXTC_LED1PRIPORT_ON       0x00000008ULL
+#define INFINIPATH_EXTC_LED2PRIPORT_ON       0x00000004ULL
+#define INFINIPATH_EXTC_LEDGBLOK_ON          0x00000002ULL
+#define INFINIPATH_EXTC_LEDGBLERR_OFF        0x00000001ULL
+
+/* kr_partitionkey bits */
+#define INFINIPATH_PKEY_SIZE 16
+#define INFINIPATH_PKEY_MASK 0xFFFF
+#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
+
+/* kr_serdesconfig0 bits */
+#define INFINIPATH_SERDC0_RESET_MASK  0xfULL	/* overal reset bits */
+#define INFINIPATH_SERDC0_RESET_PLL   0x10000000ULL	/* pll reset */
+/* tx idle enables (per lane) */
+#define INFINIPATH_SERDC0_TXIDLE      0xF000ULL
+/* rx detect enables (per lane) */
+#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL
+/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */
+#define INFINIPATH_SERDC0_L1PWR_DN	 0xF0ULL
+
+/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */
+#define INFINIPATH_XGXS_RX_POL_SHIFT 19
+#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
+
+
+/*
+ * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
+ * PIO send buffers.  This is well beyond anything currently
+ * defined in the InfiniBand spec.
+ */
+#define IPATH_PIO_MAXIBHDR 128
+
+typedef u64 ipath_err_t;
+
+/* The following change with the type of device, so
+ * need to be part of the ipath_devdata struct, or
+ * we could have problems plugging in devices of
+ * different types (e.g. one HT, one PCIE)
+ * in one system, to be managed by one driver.
+ * On the other hand, this file is may also be included
+ * by other code, so leave the declarations here
+ * temporarily. Minor footprint issue if common-model
+ * linker used, none if C89+ linker used.
+ */
+
+/* mask of defined bits for various registers */
+extern u64 infinipath_i_bitsextant;
+extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
+
+/* masks that are different in various chips, or only exist in some chips */
+extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
+
+/*
+ * These are the infinipath general register numbers (not offsets).
+ * The kernel registers are used directly, those beyond the kernel
+ * registers are calculated from one of the base registers.  The use of
+ * an integer type doesn't allow type-checking as thorough as, say,
+ * an enum but allows for better hiding of chip differences.
+ */
+typedef const u16 ipath_kreg,	/* infinipath general registers */
+ ipath_creg,			/* infinipath counter registers */
+ ipath_sreg;			/* kernel-only, infinipath send registers */
+
+/*
+ * These are the chip registers common to all infinipath chips, and
+ * used both by the kernel and the diagnostics or other user code.
+ * They are all implemented such that 64 bit accesses work.
+ * Some implement no more than 32 bits.  Because 64 bit reads
+ * require 2 HT cmds on opteron, we access those with 32 bit
+ * reads for efficiency (they are written as 64 bits, since
+ * the extra 32 bits are nearly free on writes, and it slightly reduces
+ * complexity).  The rest are all accessed as 64 bits.
+ */
+struct ipath_kregs {
+	/* These are the 32 bit group */
+	ipath_kreg kr_control;
+	ipath_kreg kr_counterregbase;
+	ipath_kreg kr_intmask;
+	ipath_kreg kr_intstatus;
+	ipath_kreg kr_pagealign;
+	ipath_kreg kr_portcnt;
+	ipath_kreg kr_rcvtidbase;
+	ipath_kreg kr_rcvtidcnt;
+	ipath_kreg kr_rcvegrbase;
+	ipath_kreg kr_rcvegrcnt;
+	ipath_kreg kr_scratch;
+	ipath_kreg kr_sendctrl;
+	ipath_kreg kr_sendpiobufbase;
+	ipath_kreg kr_sendpiobufcnt;
+	ipath_kreg kr_sendpiosize;
+	ipath_kreg kr_sendregbase;
+	ipath_kreg kr_userregbase;
+	/* These are the 64 bit group */
+	ipath_kreg kr_debugport;
+	ipath_kreg kr_debugportselect;
+	ipath_kreg kr_errorclear;
+	ipath_kreg kr_errormask;
+	ipath_kreg kr_errorstatus;
+	ipath_kreg kr_extctrl;
+	ipath_kreg kr_extstatus;
+	ipath_kreg kr_gpio_clear;
+	ipath_kreg kr_gpio_mask;
+	ipath_kreg kr_gpio_out;
+	ipath_kreg kr_gpio_status;
+	ipath_kreg kr_hwdiagctrl;
+	ipath_kreg kr_hwerrclear;
+	ipath_kreg kr_hwerrmask;
+	ipath_kreg kr_hwerrstatus;
+	ipath_kreg kr_ibcctrl;
+	ipath_kreg kr_ibcstatus;
+	ipath_kreg kr_intblocked;
+	ipath_kreg kr_intclear;
+	ipath_kreg kr_interruptconfig;
+	ipath_kreg kr_mdio;
+	ipath_kreg kr_partitionkey;
+	ipath_kreg kr_rcvbthqp;
+	ipath_kreg kr_rcvbufbase;
+	ipath_kreg kr_rcvbufsize;
+	ipath_kreg kr_rcvctrl;
+	ipath_kreg kr_rcvhdrcnt;
+	ipath_kreg kr_rcvhdrentsize;
+	ipath_kreg kr_rcvhdrsize;
+	ipath_kreg kr_rcvintmembase;
+	ipath_kreg kr_rcvintmemsize;
+	ipath_kreg kr_revision;
+	ipath_kreg kr_sendbuffererror;
+	ipath_kreg kr_sendpioavailaddr;
+	ipath_kreg kr_serdesconfig0;
+	ipath_kreg kr_serdesconfig1;
+	ipath_kreg kr_serdesstatus;
+	ipath_kreg kr_txintmembase;
+	ipath_kreg kr_txintmemsize;
+	ipath_kreg kr_xgxsconfig;
+	ipath_kreg kr_ibpllcfg;
+	/* use these two (and the following N ports) only with
+	 * ipath_k*_kreg64_port(); not *kreg64() */
+	ipath_kreg kr_rcvhdraddr;
+	ipath_kreg kr_rcvhdrtailaddr;
+
+	/* remaining registers are not present on all types of infinipath
+	   chips  */
+	ipath_kreg kr_rcvpktledcnt;
+	ipath_kreg kr_pcierbuftestreg0;
+	ipath_kreg kr_pcierbuftestreg1;
+	ipath_kreg kr_pcieq0serdesconfig0;
+	ipath_kreg kr_pcieq0serdesconfig1;
+	ipath_kreg kr_pcieq0serdesstatus;
+	ipath_kreg kr_pcieq1serdesconfig0;
+	ipath_kreg kr_pcieq1serdesconfig1;
+	ipath_kreg kr_pcieq1serdesstatus;
+	ipath_kreg kr_hrtbt_guid;
+	ipath_kreg kr_ibcddrctrl;
+	ipath_kreg kr_ibcddrstatus;
+	ipath_kreg kr_jintreload;
+
+	/* send dma related regs */
+	ipath_kreg kr_senddmabase;
+	ipath_kreg kr_senddmalengen;
+	ipath_kreg kr_senddmatail;
+	ipath_kreg kr_senddmahead;
+	ipath_kreg kr_senddmaheadaddr;
+	ipath_kreg kr_senddmabufmask0;
+	ipath_kreg kr_senddmabufmask1;
+	ipath_kreg kr_senddmabufmask2;
+	ipath_kreg kr_senddmastatus;
+
+	/* SerDes related regs (IBA7220-only) */
+	ipath_kreg kr_ibserdesctrl;
+	ipath_kreg kr_ib_epbacc;
+	ipath_kreg kr_ib_epbtrans;
+	ipath_kreg kr_pcie_epbacc;
+	ipath_kreg kr_pcie_epbtrans;
+	ipath_kreg kr_ib_ddsrxeq;
+};
+
+struct ipath_cregs {
+	ipath_creg cr_badformatcnt;
+	ipath_creg cr_erricrccnt;
+	ipath_creg cr_errlinkcnt;
+	ipath_creg cr_errlpcrccnt;
+	ipath_creg cr_errpkey;
+	ipath_creg cr_errrcvflowctrlcnt;
+	ipath_creg cr_err_rlencnt;
+	ipath_creg cr_errslencnt;
+	ipath_creg cr_errtidfull;
+	ipath_creg cr_errtidvalid;
+	ipath_creg cr_errvcrccnt;
+	ipath_creg cr_ibstatuschange;
+	ipath_creg cr_intcnt;
+	ipath_creg cr_invalidrlencnt;
+	ipath_creg cr_invalidslencnt;
+	ipath_creg cr_lbflowstallcnt;
+	ipath_creg cr_iblinkdowncnt;
+	ipath_creg cr_iblinkerrrecovcnt;
+	ipath_creg cr_ibsymbolerrcnt;
+	ipath_creg cr_pktrcvcnt;
+	ipath_creg cr_pktrcvflowctrlcnt;
+	ipath_creg cr_pktsendcnt;
+	ipath_creg cr_pktsendflowcnt;
+	ipath_creg cr_portovflcnt;
+	ipath_creg cr_rcvebpcnt;
+	ipath_creg cr_rcvovflcnt;
+	ipath_creg cr_rxdroppktcnt;
+	ipath_creg cr_senddropped;
+	ipath_creg cr_sendstallcnt;
+	ipath_creg cr_sendunderruncnt;
+	ipath_creg cr_unsupvlcnt;
+	ipath_creg cr_wordrcvcnt;
+	ipath_creg cr_wordsendcnt;
+	ipath_creg cr_vl15droppedpktcnt;
+	ipath_creg cr_rxotherlocalphyerrcnt;
+	ipath_creg cr_excessbufferovflcnt;
+	ipath_creg cr_locallinkintegrityerrcnt;
+	ipath_creg cr_rxvlerrcnt;
+	ipath_creg cr_rxdlidfltrcnt;
+	ipath_creg cr_psstat;
+	ipath_creg cr_psstart;
+	ipath_creg cr_psinterval;
+	ipath_creg cr_psrcvdatacount;
+	ipath_creg cr_psrcvpktscount;
+	ipath_creg cr_psxmitdatacount;
+	ipath_creg cr_psxmitpktscount;
+	ipath_creg cr_psxmitwaitcount;
+};
+
+#endif				/* _IPATH_REGISTERS_H */
diff --git a/drivers/staging/rdma/ipath/ipath_ruc.c b/drivers/staging/rdma/ipath/ipath_ruc.c
new file mode 100644
index 0000000..1f95bba
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_ruc.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of milliseconds.
+ */
+const u32 ib_ipath_rnr_table[32] = {
+	656,			/* 0 */
+	1,			/* 1 */
+	1,			/* 2 */
+	1,			/* 3 */
+	1,			/* 4 */
+	1,			/* 5 */
+	1,			/* 6 */
+	1,			/* 7 */
+	1,			/* 8 */
+	1,			/* 9 */
+	1,			/* A */
+	1,			/* B */
+	1,			/* C */
+	1,			/* D */
+	2,			/* E */
+	2,			/* F */
+	3,			/* 10 */
+	4,			/* 11 */
+	6,			/* 12 */
+	8,			/* 13 */
+	11,			/* 14 */
+	16,			/* 15 */
+	21,			/* 16 */
+	31,			/* 17 */
+	41,			/* 18 */
+	62,			/* 19 */
+	82,			/* 1A */
+	123,			/* 1B */
+	164,			/* 1C */
+	246,			/* 1D */
+	328,			/* 1E */
+	492			/* 1F */
+};
+
+/**
+ * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
+ * @qp: the QP
+ *
+ * Called with the QP s_lock held and interrupts disabled.
+ * XXX Use a simple list for now.  We might need a priority
+ * queue if we have lots of QPs waiting for RNR timeouts
+ * but that should be rare.
+ */
+void ipath_insert_rnr_queue(struct ipath_qp *qp)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+
+	/* We already did a spin_lock_irqsave(), so just use spin_lock */
+	spin_lock(&dev->pending_lock);
+	if (list_empty(&dev->rnrwait))
+		list_add(&qp->timerwait, &dev->rnrwait);
+	else {
+		struct list_head *l = &dev->rnrwait;
+		struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
+						  timerwait);
+
+		while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
+			qp->s_rnr_timeout -= nqp->s_rnr_timeout;
+			l = l->next;
+			if (l->next == &dev->rnrwait) {
+				nqp = NULL;
+				break;
+			}
+			nqp = list_entry(l->next, struct ipath_qp,
+					 timerwait);
+		}
+		if (nqp)
+			nqp->s_rnr_timeout -= qp->s_rnr_timeout;
+		list_add(&qp->timerwait, l);
+	}
+	spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_init_sge - Validate a RWQE and fill in the SGE state
+ * @qp: the QP
+ *
+ * Return 1 if OK.
+ */
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+		   u32 *lengthp, struct ipath_sge_state *ss)
+{
+	int i, j, ret;
+	struct ib_wc wc;
+
+	*lengthp = 0;
+	for (i = j = 0; i < wqe->num_sge; i++) {
+		if (wqe->sg_list[i].length == 0)
+			continue;
+		/* Check LKEY */
+		if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
+				   &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+			goto bad_lkey;
+		*lengthp += wqe->sg_list[i].length;
+		j++;
+	}
+	ss->num_sge = j;
+	ret = 1;
+	goto bail;
+
+bad_lkey:
+	memset(&wc, 0, sizeof(wc));
+	wc.wr_id = wqe->wr_id;
+	wc.status = IB_WC_LOC_PROT_ERR;
+	wc.opcode = IB_WC_RECV;
+	wc.qp = &qp->ibqp;
+	/* Signal solicited completion event. */
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return 0 if no RWQE is available, otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
+{
+	unsigned long flags;
+	struct ipath_rq *rq;
+	struct ipath_rwq *wq;
+	struct ipath_srq *srq;
+	struct ipath_rwqe *wqe;
+	void (*handler)(struct ib_event *, void *);
+	u32 tail;
+	int ret;
+
+	if (qp->ibqp.srq) {
+		srq = to_isrq(qp->ibqp.srq);
+		handler = srq->ibsrq.event_handler;
+		rq = &srq->rq;
+	} else {
+		srq = NULL;
+		handler = NULL;
+		rq = &qp->r_rq;
+	}
+
+	spin_lock_irqsave(&rq->lock, flags);
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		ret = 0;
+		goto unlock;
+	}
+
+	wq = rq->wq;
+	tail = wq->tail;
+	/* Validate tail before using it since it is user writable. */
+	if (tail >= rq->size)
+		tail = 0;
+	do {
+		if (unlikely(tail == wq->head)) {
+			ret = 0;
+			goto unlock;
+		}
+		/* Make sure entry is read after head index is read. */
+		smp_rmb();
+		wqe = get_rwqe_ptr(rq, tail);
+		if (++tail >= rq->size)
+			tail = 0;
+		if (wr_id_only)
+			break;
+		qp->r_sge.sg_list = qp->r_sg_list;
+	} while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge));
+	qp->r_wr_id = wqe->wr_id;
+	wq->tail = tail;
+
+	ret = 1;
+	set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
+	if (handler) {
+		u32 n;
+
+		/*
+		 * validate head pointer value and compute
+		 * the number of remaining WQEs.
+		 */
+		n = wq->head;
+		if (n >= rq->size)
+			n = 0;
+		if (n < tail)
+			n += rq->size - tail;
+		else
+			n -= tail;
+		if (n < srq->limit) {
+			struct ib_event ev;
+
+			srq->limit = 0;
+			spin_unlock_irqrestore(&rq->lock, flags);
+			ev.device = qp->ibqp.device;
+			ev.element.srq = qp->ibqp.srq;
+			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			handler(&ev, srq->ibsrq.srq_context);
+			goto bail;
+		}
+	}
+unlock:
+	spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+	return ret;
+}
+
+/**
+ * ipath_ruc_loopback - handle UC and RC lookback requests
+ * @sqp: the sending QP
+ *
+ * This is called from ipath_do_send() to
+ * forward a WQE addressed to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ipath_ruc_loopback(struct ipath_qp *sqp)
+{
+	struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+	struct ipath_qp *qp;
+	struct ipath_swqe *wqe;
+	struct ipath_sge *sge;
+	unsigned long flags;
+	struct ib_wc wc;
+	u64 sdata;
+	atomic64_t *maddr;
+	enum ib_wc_status send_status;
+
+	/*
+	 * Note that we check the responder QP state after
+	 * checking the requester's state.
+	 */
+	qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
+
+	spin_lock_irqsave(&sqp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+	    !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+		goto unlock;
+
+	sqp->s_flags |= IPATH_S_BUSY;
+
+again:
+	if (sqp->s_last == sqp->s_head)
+		goto clr_busy;
+	wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+	/* Return if it is not OK to start a new work reqeust. */
+	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
+			goto clr_busy;
+		/* We are in the error state, flush the work request. */
+		send_status = IB_WC_WR_FLUSH_ERR;
+		goto flush_send;
+	}
+
+	/*
+	 * We can rely on the entry not changing without the s_lock
+	 * being held until we update s_last.
+	 * We increment s_cur to indicate s_last is in progress.
+	 */
+	if (sqp->s_last == sqp->s_cur) {
+		if (++sqp->s_cur >= sqp->s_size)
+			sqp->s_cur = 0;
+	}
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+	if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		dev->n_pkt_drops++;
+		/*
+		 * For RC, the requester would timeout and retry so
+		 * shortcut the timeouts and just signal too many retries.
+		 */
+		if (sqp->ibqp.qp_type == IB_QPT_RC)
+			send_status = IB_WC_RETRY_EXC_ERR;
+		else
+			send_status = IB_WC_SUCCESS;
+		goto serr;
+	}
+
+	memset(&wc, 0, sizeof wc);
+	send_status = IB_WC_SUCCESS;
+
+	sqp->s_sge.sge = wqe->sg_list[0];
+	sqp->s_sge.sg_list = wqe->sg_list + 1;
+	sqp->s_sge.num_sge = wqe->wr.num_sge;
+	sqp->s_len = wqe->length;
+	switch (wqe->wr.opcode) {
+	case IB_WR_SEND_WITH_IMM:
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		/* FALLTHROUGH */
+	case IB_WR_SEND:
+		if (!ipath_get_rwqe(qp, 0))
+			goto rnr_nak;
+		break;
+
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		if (!ipath_get_rwqe(qp, 1))
+			goto rnr_nak;
+		/* FALLTHROUGH */
+	case IB_WR_RDMA_WRITE:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		if (wqe->length == 0)
+			break;
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
+					    wqe->wr.wr.rdma.remote_addr,
+					    wqe->wr.wr.rdma.rkey,
+					    IB_ACCESS_REMOTE_WRITE)))
+			goto acc_err;
+		break;
+
+	case IB_WR_RDMA_READ:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+			goto inv_err;
+		if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
+					    wqe->wr.wr.rdma.remote_addr,
+					    wqe->wr.wr.rdma.rkey,
+					    IB_ACCESS_REMOTE_READ)))
+			goto acc_err;
+		qp->r_sge.sge = wqe->sg_list[0];
+		qp->r_sge.sg_list = wqe->sg_list + 1;
+		qp->r_sge.num_sge = wqe->wr.num_sge;
+		break;
+
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+			goto inv_err;
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
+					    wqe->wr.wr.atomic.remote_addr,
+					    wqe->wr.wr.atomic.rkey,
+					    IB_ACCESS_REMOTE_ATOMIC)))
+			goto acc_err;
+		/* Perform atomic OP and save result. */
+		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+		sdata = wqe->wr.wr.atomic.compare_add;
+		*(u64 *) sqp->s_sge.sge.vaddr =
+			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+			(u64) atomic64_add_return(sdata, maddr) - sdata :
+			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+				      sdata, wqe->wr.wr.atomic.swap);
+		goto send_comp;
+
+	default:
+		send_status = IB_WC_LOC_QP_OP_ERR;
+		goto serr;
+	}
+
+	sge = &sqp->s_sge.sge;
+	while (sqp->s_len) {
+		u32 len = sqp->s_len;
+
+		if (len > sge->length)
+			len = sge->length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--sqp->s_sge.num_sge)
+				*sge = *sqp->s_sge.sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		sqp->s_len -= len;
+	}
+
+	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+		goto send_comp;
+
+	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+	else
+		wc.opcode = IB_WC_RECV;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.byte_len = wqe->length;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = qp->remote_qpn;
+	wc.slid = qp->remote_ah_attr.dlid;
+	wc.sl = qp->remote_ah_attr.sl;
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		       wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+flush_send:
+	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+	ipath_send_complete(sqp, wqe, send_status);
+	goto again;
+
+rnr_nak:
+	/* Handle RNR NAK */
+	if (qp->ibqp.qp_type == IB_QPT_UC)
+		goto send_comp;
+	/*
+	 * Note: we don't need the s_lock held since the BUSY flag
+	 * makes this single threaded.
+	 */
+	if (sqp->s_rnr_retry == 0) {
+		send_status = IB_WC_RNR_RETRY_EXC_ERR;
+		goto serr;
+	}
+	if (sqp->s_rnr_retry_cnt < 7)
+		sqp->s_rnr_retry--;
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
+		goto clr_busy;
+	sqp->s_flags |= IPATH_S_WAITING;
+	dev->n_rnr_naks++;
+	sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
+	ipath_insert_rnr_queue(sqp);
+	goto clr_busy;
+
+inv_err:
+	send_status = IB_WC_REM_INV_REQ_ERR;
+	wc.status = IB_WC_LOC_QP_OP_ERR;
+	goto err;
+
+acc_err:
+	send_status = IB_WC_REM_ACCESS_ERR;
+	wc.status = IB_WC_LOC_PROT_ERR;
+err:
+	/* responder goes to error state */
+	ipath_rc_error(qp, wc.status);
+
+serr:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	ipath_send_complete(sqp, wqe, send_status);
+	if (sqp->ibqp.qp_type == IB_QPT_RC) {
+		int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+		sqp->s_flags &= ~IPATH_S_BUSY;
+		spin_unlock_irqrestore(&sqp->s_lock, flags);
+		if (lastwqe) {
+			struct ib_event ev;
+
+			ev.device = sqp->ibqp.device;
+			ev.element.qp = &sqp->ibqp;
+			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+		}
+		goto done;
+	}
+clr_busy:
+	sqp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+	if (qp && atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
+{
+	if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
+	    qp->ibqp.qp_type == IB_QPT_SMI) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+				 dd->ipath_sendctrl);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+	}
+}
+
+/**
+ * ipath_no_bufs_available - tell the layer driver we need buffers
+ * @qp: the QP that caused the problem
+ * @dev: the device we ran out of buffers on
+ *
+ * Called when we run out of PIO buffers.
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int ipath_no_bufs_available(struct ipath_qp *qp,
+				    struct ipath_ibdev *dev)
+{
+	unsigned long flags;
+	int ret = 1;
+
+	/*
+	 * Note that as soon as want_buffer() is called and
+	 * possibly before it returns, ipath_ib_piobufavail()
+	 * could be called. Therefore, put QP on the piowait list before
+	 * enabling the PIO avail interrupt.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+		dev->n_piowait++;
+		qp->s_flags |= IPATH_S_WAITING;
+		qp->s_flags &= ~IPATH_S_BUSY;
+		spin_lock(&dev->pending_lock);
+		if (list_empty(&qp->piowait))
+			list_add_tail(&qp->piowait, &dev->piowait);
+		spin_unlock(&dev->pending_lock);
+	} else
+		ret = 0;
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	if (ret)
+		want_buffer(dev->dd, qp);
+	return ret;
+}
+
+/**
+ * ipath_make_grh - construct a GRH header
+ * @dev: a pointer to the ipath device
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+		   struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+	hdr->version_tclass_flow =
+		cpu_to_be32((6 << 28) |
+			    (grh->traffic_class << 20) |
+			    grh->flow_label);
+	hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+	/* next_hdr is defined by C8-7 in ch. 8.4.1 */
+	hdr->next_hdr = 0x1B;
+	hdr->hop_limit = grh->hop_limit;
+	/* The SGID is 32-bit aligned. */
+	hdr->sgid.global.subnet_prefix = dev->gid_prefix;
+	hdr->sgid.global.interface_id = dev->dd->ipath_guid;
+	hdr->dgid = grh->dgid;
+
+	/* GRH header size in 32-bit words. */
+	return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			   struct ipath_other_headers *ohdr,
+			   u32 bth0, u32 bth2)
+{
+	u16 lrh0;
+	u32 nwords;
+	u32 extra_bytes;
+
+	/* Construct the header. */
+	extra_bytes = -qp->s_cur_size & 3;
+	nwords = (qp->s_cur_size + extra_bytes) >> 2;
+	lrh0 = IPATH_LRH_BTH;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+						 &qp->remote_ah_attr.grh,
+						 qp->s_hdrwords, nwords);
+		lrh0 = IPATH_LRH_GRH;
+	}
+	lrh0 |= qp->remote_ah_attr.sl << 4;
+	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+	qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
+				       qp->remote_ah_attr.src_path_bits);
+	bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
+	bth0 |= extra_bytes << 20;
+	ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
+	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/**
+ * ipath_do_send - perform a send on a QP
+ * @data: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void ipath_do_send(unsigned long data)
+{
+	struct ipath_qp *qp = (struct ipath_qp *)data;
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	int (*make_req)(struct ipath_qp *qp);
+	unsigned long flags;
+
+	if ((qp->ibqp.qp_type == IB_QPT_RC ||
+	     qp->ibqp.qp_type == IB_QPT_UC) &&
+	    qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
+		ipath_ruc_loopback(qp);
+		goto bail;
+	}
+
+	if (qp->ibqp.qp_type == IB_QPT_RC)
+	       make_req = ipath_make_rc_req;
+	else if (qp->ibqp.qp_type == IB_QPT_UC)
+	       make_req = ipath_make_uc_req;
+	else
+	       make_req = ipath_make_ud_req;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+	    !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		goto bail;
+	}
+
+	qp->s_flags |= IPATH_S_BUSY;
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+again:
+	/* Check for a constructed packet to be sent. */
+	if (qp->s_hdrwords != 0) {
+		/*
+		 * If no PIO bufs are available, return.  An interrupt will
+		 * call ipath_ib_piobufavail() when one is available.
+		 */
+		if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
+				     qp->s_cur_sge, qp->s_cur_size)) {
+			if (ipath_no_bufs_available(qp, dev))
+				goto bail;
+		}
+		dev->n_unicast_xmit++;
+		/* Record that we sent the packet and s_hdr is empty. */
+		qp->s_hdrwords = 0;
+	}
+
+	if (make_req(qp))
+		goto again;
+
+bail:;
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+			 enum ib_wc_status status)
+{
+	u32 old_last, last;
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+		return;
+
+	/* See ch. 11.2.4.1 and 10.7.3.1 */
+	if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+	    status != IB_WC_SUCCESS) {
+		struct ib_wc wc;
+
+		memset(&wc, 0, sizeof wc);
+		wc.wr_id = wqe->wr.wr_id;
+		wc.status = status;
+		wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+		wc.qp = &qp->ibqp;
+		if (status == IB_WC_SUCCESS)
+			wc.byte_len = wqe->length;
+		ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+			       status != IB_WC_SUCCESS);
+	}
+
+	old_last = last = qp->s_last;
+	if (++last >= qp->s_size)
+		last = 0;
+	qp->s_last = last;
+	if (qp->s_cur == old_last)
+		qp->s_cur = last;
+	if (qp->s_tail == old_last)
+		qp->s_tail = last;
+	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+		qp->s_draining = 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_sdma.c b/drivers/staging/rdma/ipath/ipath_sdma.c
new file mode 100644
index 0000000..17a5177
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_sdma.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */
+
+static void vl15_watchdog_enq(struct ipath_devdata *dd)
+{
+	/* ipath_sdma_lock must already be held */
+	if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) {
+		unsigned long interval = (HZ + 19) / 20;
+		dd->ipath_sdma_vl15_timer.expires = jiffies + interval;
+		add_timer(&dd->ipath_sdma_vl15_timer);
+	}
+}
+
+static void vl15_watchdog_deq(struct ipath_devdata *dd)
+{
+	/* ipath_sdma_lock must already be held */
+	if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) {
+		unsigned long interval = (HZ + 19) / 20;
+		mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval);
+	} else {
+		del_timer(&dd->ipath_sdma_vl15_timer);
+	}
+}
+
+static void vl15_watchdog_timeout(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+	if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) {
+		ipath_dbg("vl15 watchdog timeout - clearing\n");
+		ipath_cancel_sends(dd, 1);
+		ipath_hol_down(dd);
+	} else {
+		ipath_dbg("vl15 watchdog timeout - "
+			  "condition already cleared\n");
+	}
+}
+
+static void unmap_desc(struct ipath_devdata *dd, unsigned head)
+{
+	__le64 *descqp = &dd->ipath_sdma_descq[head].qw[0];
+	u64 desc[2];
+	dma_addr_t addr;
+	size_t len;
+
+	desc[0] = le64_to_cpu(descqp[0]);
+	desc[1] = le64_to_cpu(descqp[1]);
+
+	addr = (desc[1] << 32) | (desc[0] >> 32);
+	len = (desc[0] >> 14) & (0x7ffULL << 2);
+	dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+}
+
+/*
+ * ipath_sdma_lock should be locked before calling this.
+ */
+int ipath_sdma_make_progress(struct ipath_devdata *dd)
+{
+	struct list_head *lp = NULL;
+	struct ipath_sdma_txreq *txp = NULL;
+	u16 dmahead;
+	u16 start_idx = 0;
+	int progress = 0;
+
+	if (!list_empty(&dd->ipath_sdma_activelist)) {
+		lp = dd->ipath_sdma_activelist.next;
+		txp = list_entry(lp, struct ipath_sdma_txreq, list);
+		start_idx = txp->start_idx;
+	}
+
+	/*
+	 * Read the SDMA head register in order to know that the
+	 * interrupt clear has been written to the chip.
+	 * Otherwise, we may not get an interrupt for the last
+	 * descriptor in the queue.
+	 */
+	dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead);
+	/* sanity check return value for error handling (chip reset, etc.) */
+	if (dmahead >= dd->ipath_sdma_descq_cnt)
+		goto done;
+
+	while (dd->ipath_sdma_descq_head != dmahead) {
+		if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC &&
+		    dd->ipath_sdma_descq_head == start_idx) {
+			unmap_desc(dd, dd->ipath_sdma_descq_head);
+			start_idx++;
+			if (start_idx == dd->ipath_sdma_descq_cnt)
+				start_idx = 0;
+		}
+
+		/* increment free count and head */
+		dd->ipath_sdma_descq_removed++;
+		if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt)
+			dd->ipath_sdma_descq_head = 0;
+
+		if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
+			/* move to notify list */
+			if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+				vl15_watchdog_deq(dd);
+			list_move_tail(lp, &dd->ipath_sdma_notifylist);
+			if (!list_empty(&dd->ipath_sdma_activelist)) {
+				lp = dd->ipath_sdma_activelist.next;
+				txp = list_entry(lp, struct ipath_sdma_txreq,
+						 list);
+				start_idx = txp->start_idx;
+			} else {
+				lp = NULL;
+				txp = NULL;
+			}
+		}
+		progress = 1;
+	}
+
+	if (progress)
+		tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+done:
+	return progress;
+}
+
+static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
+{
+	struct ipath_sdma_txreq *txp, *txp_next;
+
+	list_for_each_entry_safe(txp, txp_next, list, list) {
+		list_del_init(&txp->list);
+
+		if (txp->callback)
+			(*txp->callback)(txp->callback_cookie,
+					 txp->callback_status);
+	}
+}
+
+static void sdma_notify_taskbody(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+	struct list_head list;
+
+	INIT_LIST_HEAD(&list);
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	list_splice_init(&dd->ipath_sdma_notifylist, &list);
+
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	ipath_sdma_notify(dd, &list);
+
+	/*
+	 * The IB verbs layer needs to see the callback before getting
+	 * the call to ipath_ib_piobufavail() because the callback
+	 * handles releasing resources the next send will need.
+	 * Otherwise, we could do these calls in
+	 * ipath_sdma_make_progress().
+	 */
+	ipath_ib_piobufavail(dd->verbs_dev);
+}
+
+static void sdma_notify_task(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+	if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+		sdma_notify_taskbody(dd);
+}
+
+static void dump_sdma_state(struct ipath_devdata *dd)
+{
+	unsigned long reg;
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus);
+	ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl);
+	ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0);
+	ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1);
+	ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2);
+	ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+	ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+	ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg);
+}
+
+static void sdma_abort_task(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+	u64 status;
+	unsigned long flags;
+
+	if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+		return;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;
+
+	/* nothing to do */
+	if (status == IPATH_SDMA_ABORT_NONE)
+		goto unlock;
+
+	/* ipath_sdma_abort() is done, waiting for interrupt */
+	if (status == IPATH_SDMA_ABORT_DISARMED) {
+		if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))
+			goto resched_noprint;
+		/* give up, intr got lost somewhere */
+		ipath_dbg("give up waiting for SDMADISABLED intr\n");
+		__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+		status = IPATH_SDMA_ABORT_ABORTED;
+	}
+
+	/* everything is stopped, time to clean up and restart */
+	if (status == IPATH_SDMA_ABORT_ABORTED) {
+		struct ipath_sdma_txreq *txp, *txpnext;
+		u64 hwstatus;
+		int notify = 0;
+
+		hwstatus = ipath_read_kreg64(dd,
+				dd->ipath_kregs->kr_senddmastatus);
+
+		if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
+				 IPATH_SDMA_STATUS_ABORT_IN_PROG	     |
+				 IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
+		    !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
+			if (dd->ipath_sdma_reset_wait > 0) {
+				/* not done shutting down sdma */
+				--dd->ipath_sdma_reset_wait;
+				goto resched;
+			}
+			ipath_cdbg(VERBOSE, "gave up waiting for quiescent "
+				"status after SDMA reset, continuing\n");
+			dump_sdma_state(dd);
+		}
+
+		/* dequeue all "sent" requests */
+		list_for_each_entry_safe(txp, txpnext,
+					 &dd->ipath_sdma_activelist, list) {
+			txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
+			if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+				vl15_watchdog_deq(dd);
+			list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+			notify = 1;
+		}
+		if (notify)
+			tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+		/* reset our notion of head and tail */
+		dd->ipath_sdma_descq_tail = 0;
+		dd->ipath_sdma_descq_head = 0;
+		dd->ipath_sdma_head_dma[0] = 0;
+		dd->ipath_sdma_generation = 0;
+		dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added;
+
+		/* Reset SendDmaLenGen */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen,
+			(u64) dd->ipath_sdma_descq_cnt | (1ULL << 18));
+
+		/* done with sdma state for a bit */
+		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+		/*
+		 * Don't restart sdma here (with the exception
+		 * below). Wait until link is up to ACTIVE.  VL15 MADs
+		 * used to bring the link up use PIO, and multiple link
+		 * transitions otherwise cause the sdma engine to be
+		 * stopped and started multiple times.
+		 * The disable is done here, including the shadow,
+		 * so the state is kept consistent.
+		 * See ipath_restart_sdma() for the actual starting
+		 * of sdma.
+		 */
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+				 dd->ipath_sendctrl);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+		/* make sure I see next message */
+		dd->ipath_sdma_abort_jiffies = 0;
+
+		/*
+		 * Not everything that takes SDMA offline is a link
+		 * status change.  If the link was up, restart SDMA.
+		 */
+		if (dd->ipath_flags & IPATH_LINKACTIVE)
+			ipath_restart_sdma(dd);
+
+		goto done;
+	}
+
+resched:
+	/*
+	 * for now, keep spinning
+	 * JAG - this is bad to just have default be a loop without
+	 * state change
+	 */
+	if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {
+		ipath_dbg("looping with status 0x%08lx\n",
+			  dd->ipath_sdma_status);
+		dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
+	}
+resched_noprint:
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+	if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+		tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+	return;
+
+unlock:
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+done:
+	return;
+}
+
+/*
+ * This is called from interrupt context.
+ */
+void ipath_sdma_intr(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	(void) ipath_sdma_make_progress(dd);
+
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+}
+
+static int alloc_sdma(struct ipath_devdata *dd)
+{
+	int ret = 0;
+
+	/* Allocate memory for SendDMA descriptor FIFO */
+	dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev,
+		SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL);
+
+	if (!dd->ipath_sdma_descq) {
+		ipath_dev_err(dd, "failed to allocate SendDMA descriptor "
+			"FIFO memory\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	dd->ipath_sdma_descq_cnt =
+		SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc);
+
+	/* Allocate memory for DMA of head register to memory */
+	dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev,
+		PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL);
+	if (!dd->ipath_sdma_head_dma) {
+		ipath_dev_err(dd, "failed to allocate SendDMA head memory\n");
+		ret = -ENOMEM;
+		goto cleanup_descq;
+	}
+	dd->ipath_sdma_head_dma[0] = 0;
+
+	init_timer(&dd->ipath_sdma_vl15_timer);
+	dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout;
+	dd->ipath_sdma_vl15_timer.data = (unsigned long)dd;
+	atomic_set(&dd->ipath_sdma_vl15_count, 0);
+
+	goto done;
+
+cleanup_descq:
+	dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+		(void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys);
+	dd->ipath_sdma_descq = NULL;
+	dd->ipath_sdma_descq_phys = 0;
+done:
+	return ret;
+}
+
+int setup_sdma(struct ipath_devdata *dd)
+{
+	int ret = 0;
+	unsigned i, n;
+	u64 tmp64;
+	u64 senddmabufmask[3] = { 0 };
+	unsigned long flags;
+
+	ret = alloc_sdma(dd);
+	if (ret)
+		goto done;
+
+	if (!dd->ipath_sdma_descq) {
+		ipath_dev_err(dd, "SendDMA memory not allocated\n");
+		goto done;
+	}
+
+	/*
+	 * Set initial status as if we had been up, then gone down.
+	 * This lets initial start on transition to ACTIVE be the
+	 * same as restart after link flap.
+	 */
+	dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED;
+	dd->ipath_sdma_abort_jiffies = 0;
+	dd->ipath_sdma_generation = 0;
+	dd->ipath_sdma_descq_tail = 0;
+	dd->ipath_sdma_descq_head = 0;
+	dd->ipath_sdma_descq_removed = 0;
+	dd->ipath_sdma_descq_added = 0;
+
+	/* Set SendDmaBase */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase,
+			 dd->ipath_sdma_descq_phys);
+	/* Set SendDmaLenGen */
+	tmp64 = dd->ipath_sdma_descq_cnt;
+	tmp64 |= 1<<18; /* enable generation checking */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64);
+	/* Set SendDmaTail */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail,
+			 dd->ipath_sdma_descq_tail);
+	/* Set SendDmaHeadAddr */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
+			 dd->ipath_sdma_head_phys);
+
+	/*
+	 * Reserve all the former "kernel" piobufs, using high number range
+	 * so we get as many 4K buffers as possible
+	 */
+	n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+	i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
+	ipath_chg_pioavailkernel(dd, i, n - i , 0);
+	for (; i < n; ++i) {
+		unsigned word = i / 64;
+		unsigned bit = i & 63;
+		BUG_ON(word >= 3);
+		senddmabufmask[word] |= 1ULL << bit;
+	}
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
+			 senddmabufmask[0]);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,
+			 senddmabufmask[1]);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2,
+			 senddmabufmask[2]);
+
+	INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
+	INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
+
+	tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
+		     (unsigned long) dd);
+	tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
+		     (unsigned long) dd);
+
+	/*
+	 * No use to turn on SDMA here, as link is probably not ACTIVE
+	 * Just mark it RUNNING and enable the interrupt, and let the
+	 * ipath_restart_sdma() on link transition to ACTIVE actually
+	 * enable it.
+	 */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	__set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+done:
+	return ret;
+}
+
+void teardown_sdma(struct ipath_devdata *dd)
+{
+	struct ipath_sdma_txreq *txp, *txpnext;
+	unsigned long flags;
+	dma_addr_t sdma_head_phys = 0;
+	dma_addr_t sdma_descq_phys = 0;
+	void *sdma_descq = NULL;
+	void *sdma_head_dma = NULL;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	__clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+	__set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+	__set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status);
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	tasklet_kill(&dd->ipath_sdma_abort_task);
+	tasklet_kill(&dd->ipath_sdma_notify_task);
+
+	/* turn off sdma */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+		dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	/* dequeue all "sent" requests */
+	list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
+				 list) {
+		txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
+		if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+			vl15_watchdog_deq(dd);
+		list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+	}
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	sdma_notify_taskbody(dd);
+
+	del_timer_sync(&dd->ipath_sdma_vl15_timer);
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	dd->ipath_sdma_abort_jiffies = 0;
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0);
+
+	if (dd->ipath_sdma_head_dma) {
+		sdma_head_dma = (void *) dd->ipath_sdma_head_dma;
+		sdma_head_phys = dd->ipath_sdma_head_phys;
+		dd->ipath_sdma_head_dma = NULL;
+		dd->ipath_sdma_head_phys = 0;
+	}
+
+	if (dd->ipath_sdma_descq) {
+		sdma_descq = dd->ipath_sdma_descq;
+		sdma_descq_phys = dd->ipath_sdma_descq_phys;
+		dd->ipath_sdma_descq = NULL;
+		dd->ipath_sdma_descq_phys = 0;
+	}
+
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	if (sdma_head_dma)
+		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+				  sdma_head_dma, sdma_head_phys);
+
+	if (sdma_descq)
+		dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+				  sdma_descq, sdma_descq_phys);
+}
+
+/*
+ * [Re]start SDMA, if we use it, and it's not already OK.
+ * This is called on transition to link ACTIVE, either the first or
+ * subsequent times.
+ */
+void ipath_restart_sdma(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+	int needed = 1;
+
+	if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+		goto bail;
+
+	/*
+	 * First, make sure we should, which is to say,
+	 * check that we are "RUNNING" (not in teardown)
+	 * and not "SHUTDOWN"
+	 */
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)
+		|| test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+			needed = 0;
+	else {
+		__clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+		__clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+		__clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+	}
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+	if (!needed) {
+		ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
+			dd->ipath_sdma_status);
+		goto bail;
+	}
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	/*
+	 * First clear, just to be safe. Enable is only done
+	 * in chip on 0->1 transition
+	 */
+	dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	/* notify upper layers */
+	ipath_ib_piobufavail(dd->verbs_dev);
+
+bail:
+	return;
+}
+
+static inline void make_sdma_desc(struct ipath_devdata *dd,
+	u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset)
+{
+	WARN_ON(addr & 3);
+	/* SDmaPhyAddr[47:32] */
+	sdmadesc[1] = addr >> 32;
+	/* SDmaPhyAddr[31:0] */
+	sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
+	/* SDmaGeneration[1:0] */
+	sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30;
+	/* SDmaDwordCount[10:0] */
+	sdmadesc[0] |= (dwlen & 0x7ffULL) << 16;
+	/* SDmaBufOffset[12:2] */
+	sdmadesc[0] |= dwoffset & 0x7ffULL;
+}
+
+/*
+ * This function queues one IB packet onto the send DMA queue per call.
+ * The caller is responsible for checking:
+ * 1) The number of send DMA descriptor entries is less than the size of
+ *    the descriptor queue.
+ * 2) The IB SGE addresses and lengths are 32-bit aligned
+ *    (except possibly the last SGE's length)
+ * 3) The SGE addresses are suitable for passing to dma_map_single().
+ */
+int ipath_sdma_verbs_send(struct ipath_devdata *dd,
+	struct ipath_sge_state *ss, u32 dwords,
+	struct ipath_verbs_txreq *tx)
+{
+
+	unsigned long flags;
+	struct ipath_sge *sge;
+	int ret = 0;
+	u16 tail;
+	__le64 *descqp;
+	u64 sdmadesc[2];
+	u32 dwoffset;
+	dma_addr_t addr;
+
+	if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) {
+		ipath_dbg("packet size %X > ibmax %X, fail\n",
+			tx->map_len + (dwords<<2), dd->ipath_ibmaxlen);
+		ret = -EMSGSIZE;
+		goto fail;
+	}
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+retry:
+	if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) {
+		if (ipath_sdma_make_progress(dd))
+			goto retry;
+		ret = -ENOBUFS;
+		goto unlock;
+	}
+
+	addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
+			      tx->map_len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&dd->pcidev->dev, addr))
+		goto ioerr;
+
+	dwoffset = tx->map_len >> 2;
+	make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
+
+	/* SDmaFirstDesc */
+	sdmadesc[0] |= 1ULL << 12;
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+		sdmadesc[0] |= 1ULL << 14;	/* SDmaUseLargeBuf */
+
+	/* write to the descq */
+	tail = dd->ipath_sdma_descq_tail;
+	descqp = &dd->ipath_sdma_descq[tail].qw[0];
+	*descqp++ = cpu_to_le64(sdmadesc[0]);
+	*descqp++ = cpu_to_le64(sdmadesc[1]);
+
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC)
+		tx->txreq.start_idx = tail;
+
+	/* increment the tail */
+	if (++tail == dd->ipath_sdma_descq_cnt) {
+		tail = 0;
+		descqp = &dd->ipath_sdma_descq[0].qw[0];
+		++dd->ipath_sdma_generation;
+	}
+
+	sge = &ss->sge;
+	while (dwords) {
+		u32 dw;
+		u32 len;
+
+		len = dwords << 2;
+		if (len > sge->length)
+			len = sge->length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		dw = (len + 3) >> 2;
+		addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
+				      DMA_TO_DEVICE);
+		if (dma_mapping_error(&dd->pcidev->dev, addr))
+			goto unmap;
+		make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
+		/* SDmaUseLargeBuf has to be set in every descriptor */
+		if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+			sdmadesc[0] |= 1ULL << 14;
+		/* write to the descq */
+		*descqp++ = cpu_to_le64(sdmadesc[0]);
+		*descqp++ = cpu_to_le64(sdmadesc[1]);
+
+		/* increment the tail */
+		if (++tail == dd->ipath_sdma_descq_cnt) {
+			tail = 0;
+			descqp = &dd->ipath_sdma_descq[0].qw[0];
+			++dd->ipath_sdma_generation;
+		}
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+
+		dwoffset += dw;
+		dwords -= dw;
+	}
+
+	if (!tail)
+		descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
+	descqp -= 2;
+	/* SDmaLastDesc */
+	descqp[0] |= cpu_to_le64(1ULL << 11);
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
+		/* SDmaIntReq */
+		descqp[0] |= cpu_to_le64(1ULL << 15);
+	}
+
+	/* Commit writes to memory and advance the tail on the chip */
+	wmb();
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+
+	tx->txreq.next_descq_idx = tail;
+	tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
+	dd->ipath_sdma_descq_tail = tail;
+	dd->ipath_sdma_descq_added += tx->txreq.sg_count;
+	list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
+		vl15_watchdog_enq(dd);
+	goto unlock;
+
+unmap:
+	while (tail != dd->ipath_sdma_descq_tail) {
+		if (!tail)
+			tail = dd->ipath_sdma_descq_cnt - 1;
+		else
+			tail--;
+		unmap_desc(dd, tail);
+	}
+ioerr:
+	ret = -EIO;
+unlock:
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+fail:
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_srq.c b/drivers/staging/rdma/ipath/ipath_srq.c
new file mode 100644
index 0000000..2627198
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_srq.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			   struct ib_recv_wr **bad_wr)
+{
+	struct ipath_srq *srq = to_isrq(ibsrq);
+	struct ipath_rwq *wq;
+	unsigned long flags;
+	int ret;
+
+	for (; wr; wr = wr->next) {
+		struct ipath_rwqe *wqe;
+		u32 next;
+		int i;
+
+		if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		spin_lock_irqsave(&srq->rq.lock, flags);
+		wq = srq->rq.wq;
+		next = wq->head + 1;
+		if (next >= srq->rq.size)
+			next = 0;
+		if (next == wq->tail) {
+			spin_unlock_irqrestore(&srq->rq.lock, flags);
+			*bad_wr = wr;
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		wqe = get_rwqe_ptr(&srq->rq, wq->head);
+		wqe->wr_id = wr->wr_id;
+		wqe->num_sge = wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++)
+			wqe->sg_list[i] = wr->sg_list[i];
+		/* Make sure queue entry is written before the head index. */
+		smp_wmb();
+		wq->head = next;
+		spin_unlock_irqrestore(&srq->rq.lock, flags);
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libipathverbs when creating a user SRQ
+ */
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+				struct ib_srq_init_attr *srq_init_attr,
+				struct ib_udata *udata)
+{
+	struct ipath_ibdev *dev = to_idev(ibpd->device);
+	struct ipath_srq *srq;
+	u32 sz;
+	struct ib_srq *ret;
+
+	if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+		ret = ERR_PTR(-ENOSYS);
+		goto done;
+	}
+
+	if (srq_init_attr->attr.max_wr == 0) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
+	    (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto done;
+	}
+
+	/*
+	 * Need to use vmalloc() if we want to support large #s of entries.
+	 */
+	srq->rq.size = srq_init_attr->attr.max_wr + 1;
+	srq->rq.max_sge = srq_init_attr->attr.max_sge;
+	sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+		sizeof(struct ipath_rwqe);
+	srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
+	if (!srq->rq.wq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_srq;
+	}
+
+	/*
+	 * Return the address of the RWQ as the offset to mmap.
+	 * See ipath_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+		u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz;
+
+		srq->ip =
+		    ipath_create_mmap_info(dev, s,
+					   ibpd->uobject->context,
+					   srq->rq.wq);
+		if (!srq->ip) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_wq;
+		}
+
+		err = ib_copy_to_udata(udata, &srq->ip->offset,
+				       sizeof(srq->ip->offset));
+		if (err) {
+			ret = ERR_PTR(err);
+			goto bail_ip;
+		}
+	} else
+		srq->ip = NULL;
+
+	/*
+	 * ib_create_srq() will initialize srq->ibsrq.
+	 */
+	spin_lock_init(&srq->rq.lock);
+	srq->rq.wq->head = 0;
+	srq->rq.wq->tail = 0;
+	srq->limit = srq_init_attr->attr.srq_limit;
+
+	spin_lock(&dev->n_srqs_lock);
+	if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
+		spin_unlock(&dev->n_srqs_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+ 	dev->n_srqs_allocated++;
+	spin_unlock(&dev->n_srqs_lock);
+
+	if (srq->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = &srq->ibsrq;
+	goto done;
+
+bail_ip:
+	kfree(srq->ip);
+bail_wq:
+	vfree(srq->rq.wq);
+bail_srq:
+	kfree(srq);
+done:
+	return ret;
+}
+
+/**
+ * ipath_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for ipathverbs.so
+ */
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		     enum ib_srq_attr_mask attr_mask,
+		     struct ib_udata *udata)
+{
+	struct ipath_srq *srq = to_isrq(ibsrq);
+	struct ipath_rwq *wq;
+	int ret = 0;
+
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		struct ipath_rwq *owq;
+		struct ipath_rwqe *p;
+		u32 sz, size, n, head, tail;
+
+		/* Check that the requested sizes are below the limits. */
+		if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
+		    ((attr_mask & IB_SRQ_LIMIT) ?
+		     attr->srq_limit : srq->limit) > attr->max_wr) {
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		sz = sizeof(struct ipath_rwqe) +
+			srq->rq.max_sge * sizeof(struct ib_sge);
+		size = attr->max_wr + 1;
+		wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
+		if (!wq) {
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		/* Check that we can write the offset to mmap. */
+		if (udata && udata->inlen >= sizeof(__u64)) {
+			__u64 offset_addr;
+			__u64 offset = 0;
+
+			ret = ib_copy_from_udata(&offset_addr, udata,
+						 sizeof(offset_addr));
+			if (ret)
+				goto bail_free;
+			udata->outbuf =
+				(void __user *) (unsigned long) offset_addr;
+			ret = ib_copy_to_udata(udata, &offset,
+					       sizeof(offset));
+			if (ret)
+				goto bail_free;
+		}
+
+		spin_lock_irq(&srq->rq.lock);
+		/*
+		 * validate head pointer value and compute
+		 * the number of remaining WQEs.
+		 */
+		owq = srq->rq.wq;
+		head = owq->head;
+		if (head >= srq->rq.size)
+			head = 0;
+		tail = owq->tail;
+		if (tail >= srq->rq.size)
+			tail = 0;
+		n = head;
+		if (n < tail)
+			n += srq->rq.size - tail;
+		else
+			n -= tail;
+		if (size <= n) {
+			ret = -EINVAL;
+			goto bail_unlock;
+		}
+		n = 0;
+		p = wq->wq;
+		while (tail != head) {
+			struct ipath_rwqe *wqe;
+			int i;
+
+			wqe = get_rwqe_ptr(&srq->rq, tail);
+			p->wr_id = wqe->wr_id;
+			p->num_sge = wqe->num_sge;
+			for (i = 0; i < wqe->num_sge; i++)
+				p->sg_list[i] = wqe->sg_list[i];
+			n++;
+			p = (struct ipath_rwqe *)((char *) p + sz);
+			if (++tail >= srq->rq.size)
+				tail = 0;
+		}
+		srq->rq.wq = wq;
+		srq->rq.size = size;
+		wq->head = n;
+		wq->tail = 0;
+		if (attr_mask & IB_SRQ_LIMIT)
+			srq->limit = attr->srq_limit;
+		spin_unlock_irq(&srq->rq.lock);
+
+		vfree(owq);
+
+		if (srq->ip) {
+			struct ipath_mmap_info *ip = srq->ip;
+			struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
+			u32 s = sizeof(struct ipath_rwq) + size * sz;
+
+			ipath_update_mmap_info(dev, ip, s, wq);
+
+			/*
+			 * Return the offset to mmap.
+			 * See ipath_mmap() for details.
+			 */
+			if (udata && udata->inlen >= sizeof(__u64)) {
+				ret = ib_copy_to_udata(udata, &ip->offset,
+						       sizeof(ip->offset));
+				if (ret)
+					goto bail;
+			}
+
+			spin_lock_irq(&dev->pending_lock);
+			if (list_empty(&ip->pending_mmaps))
+				list_add(&ip->pending_mmaps,
+					 &dev->pending_mmaps);
+			spin_unlock_irq(&dev->pending_lock);
+		}
+	} else if (attr_mask & IB_SRQ_LIMIT) {
+		spin_lock_irq(&srq->rq.lock);
+		if (attr->srq_limit >= srq->rq.size)
+			ret = -EINVAL;
+		else
+			srq->limit = attr->srq_limit;
+		spin_unlock_irq(&srq->rq.lock);
+	}
+	goto bail;
+
+bail_unlock:
+	spin_unlock_irq(&srq->rq.lock);
+bail_free:
+	vfree(wq);
+bail:
+	return ret;
+}
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+	struct ipath_srq *srq = to_isrq(ibsrq);
+
+	attr->max_wr = srq->rq.size - 1;
+	attr->max_sge = srq->rq.max_sge;
+	attr->srq_limit = srq->limit;
+	return 0;
+}
+
+/**
+ * ipath_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int ipath_destroy_srq(struct ib_srq *ibsrq)
+{
+	struct ipath_srq *srq = to_isrq(ibsrq);
+	struct ipath_ibdev *dev = to_idev(ibsrq->device);
+
+	spin_lock(&dev->n_srqs_lock);
+	dev->n_srqs_allocated--;
+	spin_unlock(&dev->n_srqs_lock);
+	if (srq->ip)
+		kref_put(&srq->ip->ref, ipath_release_mmap_info);
+	else
+		vfree(srq->rq.wq);
+	kfree(srq);
+
+	return 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_stats.c b/drivers/staging/rdma/ipath/ipath_stats.c
new file mode 100644
index 0000000..f63e143
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_stats.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_kernel.h"
+
+struct infinipath_stats ipath_stats;
+
+/**
+ * ipath_snap_cntr - snapshot a chip counter
+ * @dd: the infinipath device
+ * @creg: the counter to snapshot
+ *
+ * called from add_timer and user counter read calls, to deal with
+ * counters that wrap in "human time".  The words sent and received, and
+ * the packets sent and received are all that we worry about.  For now,
+ * at least, we don't worry about error counters, because if they wrap
+ * that quickly, we probably don't care.  We may eventually just make this
+ * handle all the counters.  word counters can wrap in about 20 seconds
+ * of full bandwidth traffic, packet counters in a few hours.
+ */
+
+u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
+{
+	u32 val, reg64 = 0;
+	u64 val64;
+	unsigned long t0, t1;
+	u64 ret;
+
+	t0 = jiffies;
+	/* If fast increment counters are only 32 bits, snapshot them,
+	 * and maintain them as 64bit values in the driver */
+	if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
+	    (creg == dd->ipath_cregs->cr_wordsendcnt ||
+	     creg == dd->ipath_cregs->cr_wordrcvcnt ||
+	     creg == dd->ipath_cregs->cr_pktsendcnt ||
+	     creg == dd->ipath_cregs->cr_pktrcvcnt)) {
+		val64 = ipath_read_creg(dd, creg);
+		val = val64 == ~0ULL ? ~0U : 0;
+		reg64 = 1;
+	} else			/* val64 just to keep gcc quiet... */
+		val64 = val = ipath_read_creg32(dd, creg);
+	/*
+	 * See if a second has passed.  This is just a way to detect things
+	 * that are quite broken.  Normally this should take just a few
+	 * cycles (the check is for long enough that we don't care if we get
+	 * pre-empted.)  An Opteron HT O read timeout is 4 seconds with
+	 * normal NB values
+	 */
+	t1 = jiffies;
+	if (time_before(t0 + HZ, t1) && val == -1) {
+		ipath_dev_err(dd, "Error!  Read counter 0x%x timed out\n",
+			      creg);
+		ret = 0ULL;
+		goto bail;
+	}
+	if (reg64) {
+		ret = val64;
+		goto bail;
+	}
+
+	if (creg == dd->ipath_cregs->cr_wordsendcnt) {
+		if (val != dd->ipath_lastsword) {
+			dd->ipath_sword += val - dd->ipath_lastsword;
+			dd->ipath_lastsword = val;
+		}
+		val64 = dd->ipath_sword;
+	} else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
+		if (val != dd->ipath_lastrword) {
+			dd->ipath_rword += val - dd->ipath_lastrword;
+			dd->ipath_lastrword = val;
+		}
+		val64 = dd->ipath_rword;
+	} else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
+		if (val != dd->ipath_lastspkts) {
+			dd->ipath_spkts += val - dd->ipath_lastspkts;
+			dd->ipath_lastspkts = val;
+		}
+		val64 = dd->ipath_spkts;
+	} else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
+		if (val != dd->ipath_lastrpkts) {
+			dd->ipath_rpkts += val - dd->ipath_lastrpkts;
+			dd->ipath_lastrpkts = val;
+		}
+		val64 = dd->ipath_rpkts;
+	} else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
+		if (dd->ibdeltainprog)
+			val64 -= val64 - dd->ibsymsnap;
+		val64 -= dd->ibsymdelta;
+	} else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
+		if (dd->ibdeltainprog)
+			val64 -= val64 - dd->iblnkerrsnap;
+		val64 -= dd->iblnkerrdelta;
+	} else
+		val64 = (u64) val;
+
+	ret = val64;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
+ * @dd: the infinipath device
+ *
+ * print the delta of egrfull/hdrqfull errors for kernel ports no more than
+ * every 5 seconds.  User processes are printed at close, but kernel doesn't
+ * close, so...  Separate routine so may call from other places someday, and
+ * so function name when printed by _IPATH_INFO is meaningfull
+ */
+static void ipath_qcheck(struct ipath_devdata *dd)
+{
+	static u64 last_tot_hdrqfull;
+	struct ipath_portdata *pd = dd->ipath_pd[0];
+	size_t blen = 0;
+	char buf[128];
+	u32 hdrqtail;
+
+	*buf = 0;
+	if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
+		blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
+				pd->port_hdrqfull -
+				dd->ipath_p0_hdrqfull);
+		dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
+	}
+	if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
+		blen += snprintf(buf + blen, sizeof buf - blen,
+				 "%srcvegrfull %llu",
+				 blen ? ", " : "",
+				 (unsigned long long)
+				 (ipath_stats.sps_etidfull -
+				  dd->ipath_last_tidfull));
+		dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
+	}
+
+	/*
+	 * this is actually the number of hdrq full interrupts, not actual
+	 * events, but at the moment that's mostly what I'm interested in.
+	 * Actual count, etc. is in the counters, if needed.  For production
+	 * users this won't ordinarily be printed.
+	 */
+
+	if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
+	    ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
+		blen += snprintf(buf + blen, sizeof buf - blen,
+				 "%shdrqfull %llu (all ports)",
+				 blen ? ", " : "",
+				 (unsigned long long)
+				 (ipath_stats.sps_hdrqfull -
+				  last_tot_hdrqfull));
+		last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
+	}
+	if (blen)
+		ipath_dbg("%s\n", buf);
+
+	hdrqtail = ipath_get_hdrqtail(pd);
+	if (pd->port_head != hdrqtail) {
+		if (dd->ipath_lastport0rcv_cnt ==
+		    ipath_stats.sps_port0pkts) {
+			ipath_cdbg(PKT, "missing rcv interrupts? "
+				   "port0 hd=%x tl=%x; port0pkts %llx; write"
+				   " hd (w/intr)\n",
+				   pd->port_head, hdrqtail,
+				   (unsigned long long)
+				   ipath_stats.sps_port0pkts);
+			ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail |
+				dd->ipath_rhdrhead_intr_off, pd->port_port);
+		}
+		dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
+	}
+}
+
+static void ipath_chk_errormask(struct ipath_devdata *dd)
+{
+	static u32 fixed;
+	u32 ctrl;
+	unsigned long errormask;
+	unsigned long hwerrs;
+
+	if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
+		return;
+
+	errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+
+	if (errormask == dd->ipath_errormask)
+		return;
+	fixed++;
+
+	hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+
+	if ((hwerrs & dd->ipath_hwerrmask) ||
+		(ctrl & INFINIPATH_C_FREEZEMODE)) {
+		/* force re-interrupt of pending events, just in case */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+		dev_info(&dd->pcidev->dev,
+			"errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
+			fixed, errormask, (unsigned long)dd->ipath_errormask,
+			ctrl, hwerrs);
+	} else
+		ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
+			fixed, errormask,
+			(unsigned long)dd->ipath_errormask);
+}
+
+
+/**
+ * ipath_get_faststats - get word counters from chip before they overflow
+ * @opaque - contains a pointer to the infinipath device ipath_devdata
+ *
+ * called from add_timer
+ */
+void ipath_get_faststats(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+	int i;
+	static unsigned cnt;
+	unsigned long flags;
+	u64 traffic_wds;
+
+	/*
+	 * don't access the chip while running diags, or memory diags can
+	 * fail
+	 */
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) ||
+	    ipath_diag_inuse)
+		/* but re-arm the timer, for diags case; won't hurt other */
+		goto done;
+
+	/*
+	 * We now try to maintain a "active timer", based on traffic
+	 * exceeding a threshold, so we need to check the word-counts
+	 * even if they are 64-bit.
+	 */
+	traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
+		ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+	traffic_wds -= dd->ipath_traffic_wds;
+	dd->ipath_traffic_wds += traffic_wds;
+	if (traffic_wds  >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
+		atomic_add(5, &dd->ipath_active_time); /* S/B #define */
+	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+
+	if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
+		ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+		ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+	}
+
+	ipath_qcheck(dd);
+
+	/*
+	 * deal with repeat error suppression.  Doesn't really matter if
+	 * last error was almost a full interval ago, or just a few usecs
+	 * ago; still won't get more than 2 per interval.  We may want
+	 * longer intervals for this eventually, could do with mod, counter
+	 * or separate timer.  Also see code in ipath_handle_errors() and
+	 * ipath_handle_hwerrors().
+	 */
+
+	if (dd->ipath_lasterror)
+		dd->ipath_lasterror = 0;
+	if (dd->ipath_lasthwerror)
+		dd->ipath_lasthwerror = 0;
+	if (dd->ipath_maskederrs
+	    && time_after(jiffies, dd->ipath_unmasktime)) {
+		char ebuf[256];
+		int iserr;
+		iserr = ipath_decode_err(dd, ebuf, sizeof ebuf,
+					 dd->ipath_maskederrs);
+		if (dd->ipath_maskederrs &
+		    ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+		      INFINIPATH_E_PKTERRS))
+			ipath_dev_err(dd, "Re-enabling masked errors "
+				      "(%s)\n", ebuf);
+		else {
+			/*
+			 * rcvegrfull and rcvhdrqfull are "normal", for some
+			 * types of processes (mostly benchmarks) that send
+			 * huge numbers of messages, while not processing
+			 * them.  So only complain about these at debug
+			 * level.
+			 */
+			if (iserr)
+				ipath_dbg(
+					"Re-enabling queue full errors (%s)\n",
+					ebuf);
+			else
+				ipath_cdbg(ERRPKT, "Re-enabling packet"
+					" problem interrupt (%s)\n", ebuf);
+		}
+
+		/* re-enable masked errors */
+		dd->ipath_errormask |= dd->ipath_maskederrs;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+				 dd->ipath_errormask);
+		dd->ipath_maskederrs = 0;
+	}
+
+	/* limit qfull messages to ~one per minute per port */
+	if ((++cnt & 0x10)) {
+		for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
+			struct ipath_portdata *pd = dd->ipath_pd[i];
+
+			if (pd && pd->port_lastrcvhdrqtail != -1)
+				pd->port_lastrcvhdrqtail = -1;
+		}
+	}
+
+	ipath_chk_errormask(dd);
+done:
+	mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_sysfs.c b/drivers/staging/rdma/ipath/ipath_sysfs.c
new file mode 100644
index 0000000..75558f3
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_sysfs.c
@@ -0,0 +1,1238 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/ctype.h>
+#include <linux/stat.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+/**
+ * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
+ * @str: the string containing the number
+ * @valp: where to put the result
+ *
+ * returns the number of bytes consumed, or negative value on error
+ */
+int ipath_parse_ushort(const char *str, unsigned short *valp)
+{
+	unsigned long val;
+	char *end;
+	int ret;
+
+	if (!isdigit(str[0])) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	val = simple_strtoul(str, &end, 0);
+
+	if (val > 0xffff) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	*valp = val;
+
+	ret = end + 1 - str;
+	if (ret == 0)
+		ret = -EINVAL;
+
+bail:
+	return ret;
+}
+
+static ssize_t show_version(struct device_driver *dev, char *buf)
+{
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version);
+}
+
+static ssize_t show_num_units(struct device_driver *dev, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%d\n",
+			 ipath_count_units(NULL, NULL, NULL));
+}
+
+static ssize_t show_status(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	ssize_t ret;
+
+	if (!dd->ipath_statusp) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			(unsigned long long) *(dd->ipath_statusp));
+
+bail:
+	return ret;
+}
+
+static const char *ipath_status_str[] = {
+	"Initted",
+	"Disabled",
+	"Admin_Disabled",
+	"", /* This used to be the old "OIB_SMA" status. */
+	"", /* This used to be the old "SMA" status. */
+	"Present",
+	"IB_link_up",
+	"IB_configured",
+	"NoIBcable",
+	"Fatal_Hardware_Error",
+	NULL,
+};
+
+static ssize_t show_status_str(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int i, any;
+	u64 s;
+	ssize_t ret;
+
+	if (!dd->ipath_statusp) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	s = *(dd->ipath_statusp);
+	*buf = '\0';
+	for (any = i = 0; s && ipath_status_str[i]; i++) {
+		if (s & 1) {
+			if (any && strlcat(buf, " ", PAGE_SIZE) >=
+			    PAGE_SIZE)
+				/* overflow */
+				break;
+			if (strlcat(buf, ipath_status_str[i],
+				    PAGE_SIZE) >= PAGE_SIZE)
+				break;
+			any = 1;
+		}
+		s >>= 1;
+	}
+	if (any)
+		strlcat(buf, "\n", PAGE_SIZE);
+
+	ret = strlen(buf);
+
+bail:
+	return ret;
+}
+
+static ssize_t show_boardversion(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
+}
+
+static ssize_t show_localbus_info(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info);
+}
+
+static ssize_t show_lmc(struct device *dev,
+			struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
+}
+
+static ssize_t store_lmc(struct device *dev,
+			 struct device_attribute *attr,
+			 const char *buf,
+			 size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 lmc = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &lmc);
+	if (ret < 0)
+		goto invalid;
+
+	if (lmc > 7) {
+		ret = -EINVAL;
+		goto invalid;
+	}
+
+	ipath_set_lid(dd, dd->ipath_lid, lmc);
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
+bail:
+	return ret;
+}
+
+static ssize_t show_lid(struct device *dev,
+			struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
+}
+
+static ssize_t store_lid(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 lid = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &lid);
+	if (ret < 0)
+		goto invalid;
+
+	if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) {
+		ret = -EINVAL;
+		goto invalid;
+	}
+
+	ipath_set_lid(dd, lid, dd->ipath_lmc);
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid);
+bail:
+	return ret;
+}
+
+static ssize_t show_mlid(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
+}
+
+static ssize_t store_mlid(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 mlid;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &mlid);
+	if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE)
+		goto invalid;
+
+	dd->ipath_mlid = mlid;
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid MLID\n");
+bail:
+	return ret;
+}
+
+static ssize_t show_guid(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u8 *guid;
+
+	guid = (u8 *) & (dd->ipath_guid);
+
+	return scnprintf(buf, PAGE_SIZE,
+			 "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+			 guid[0], guid[1], guid[2], guid[3],
+			 guid[4], guid[5], guid[6], guid[7]);
+}
+
+static ssize_t store_guid(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	ssize_t ret;
+	unsigned short guid[8];
+	__be64 new_guid;
+	u8 *ng;
+	int i;
+
+	if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
+		   &guid[0], &guid[1], &guid[2], &guid[3],
+		   &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
+		goto invalid;
+
+	ng = (u8 *) &new_guid;
+
+	for (i = 0; i < 8; i++) {
+		if (guid[i] > 0xff)
+			goto invalid;
+		ng[i] = guid[i];
+	}
+
+	if (new_guid == 0)
+		goto invalid;
+
+	dd->ipath_guid = new_guid;
+	dd->ipath_nguid = 1;
+	if (dd->verbs_dev)
+		dd->verbs_dev->ibdev.node_guid = new_guid;
+
+	ret = strlen(buf);
+	goto bail;
+
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid GUID\n");
+	ret = -EINVAL;
+
+bail:
+	return ret;
+}
+
+static ssize_t show_nguid(struct device *dev,
+			  struct device_attribute *attr,
+			  char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
+}
+
+static ssize_t show_nports(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	/* Return the number of user ports available. */
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
+}
+
+static ssize_t show_serial(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	buf[sizeof dd->ipath_serial] = '\0';
+	memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+static ssize_t show_unit(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
+}
+
+static ssize_t show_jint_max_packets(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
+}
+
+static ssize_t store_jint_max_packets(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf,
+				      size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 v = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &v);
+	if (ret < 0)
+		ipath_dev_err(dd, "invalid jint_max_packets.\n");
+	else
+		dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
+
+	return ret;
+}
+
+static ssize_t show_jint_idle_ticks(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
+}
+
+static ssize_t store_jint_idle_ticks(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf,
+				     size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 v = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &v);
+	if (ret < 0)
+		ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
+	else
+		dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
+
+	return ret;
+}
+
+#define DEVICE_COUNTER(name, attr) \
+	static ssize_t show_counter_##name(struct device *dev, \
+					   struct device_attribute *attr, \
+					   char *buf) \
+	{ \
+		struct ipath_devdata *dd = dev_get_drvdata(dev); \
+		return scnprintf(\
+			buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
+			ipath_snap_cntr( \
+				dd, offsetof(struct infinipath_counters, \
+					     attr) / sizeof(u64)));	\
+	} \
+	static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
+
+DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
+DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
+DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
+DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
+DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
+DEVICE_COUNTER(lb_ints, LBIntCnt);
+DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
+DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
+DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
+DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
+DEVICE_COUNTER(rx_dwords, RxDwordCnt);
+DEVICE_COUNTER(rx_ebps, RxEBPCnt);
+DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
+DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
+DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
+DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
+DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
+DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
+DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
+DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
+DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
+DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
+DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
+DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
+DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
+DEVICE_COUNTER(tx_dwords, TxDwordCnt);
+DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
+DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
+DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
+DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
+DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
+DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
+
+static struct attribute *dev_counter_attributes[] = {
+	&dev_attr_ib_link_downeds.attr,
+	&dev_attr_ib_link_err_recoveries.attr,
+	&dev_attr_ib_status_changes.attr,
+	&dev_attr_ib_symbol_errs.attr,
+	&dev_attr_lb_flow_stalls.attr,
+	&dev_attr_lb_ints.attr,
+	&dev_attr_rx_bad_formats.attr,
+	&dev_attr_rx_buf_ovfls.attr,
+	&dev_attr_rx_data_pkts.attr,
+	&dev_attr_rx_dropped_pkts.attr,
+	&dev_attr_rx_dwords.attr,
+	&dev_attr_rx_ebps.attr,
+	&dev_attr_rx_flow_ctrl_errs.attr,
+	&dev_attr_rx_flow_pkts.attr,
+	&dev_attr_rx_icrc_errs.attr,
+	&dev_attr_rx_len_errs.attr,
+	&dev_attr_rx_link_problems.attr,
+	&dev_attr_rx_lpcrc_errs.attr,
+	&dev_attr_rx_max_min_len_errs.attr,
+	&dev_attr_rx_p0_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p1_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p2_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p3_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p4_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p5_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p6_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p7_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p8_hdr_egr_ovfls.attr,
+	&dev_attr_rx_pkey_mismatches.attr,
+	&dev_attr_rx_tid_full_errs.attr,
+	&dev_attr_rx_tid_valid_errs.attr,
+	&dev_attr_rx_vcrc_errs.attr,
+	&dev_attr_tx_data_pkts.attr,
+	&dev_attr_tx_dropped_pkts.attr,
+	&dev_attr_tx_dwords.attr,
+	&dev_attr_tx_flow_pkts.attr,
+	&dev_attr_tx_flow_stalls.attr,
+	&dev_attr_tx_len_errs.attr,
+	&dev_attr_tx_max_min_len_errs.attr,
+	&dev_attr_tx_underruns.attr,
+	&dev_attr_tx_unsup_vl_errs.attr,
+	NULL
+};
+
+static struct attribute_group dev_counter_attr_group = {
+	.name = "counters",
+	.attrs = dev_counter_attributes
+};
+
+static ssize_t store_reset(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	if (count < 5 || memcmp(buf, "reset", 5)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (dd->ipath_flags & IPATH_DISABLED) {
+		/*
+		 * post-reset init would re-enable interrupts, etc.
+		 * so don't allow reset on disabled devices.  Not
+		 * perfect error, but about the best choice.
+		 */
+		dev_info(dev,"Unit %d is disabled, can't reset\n",
+			 dd->ipath_unit);
+		ret = -EINVAL;
+		goto bail;
+	}
+	ret = ipath_reset_device(dd->ipath_unit);
+bail:
+	return ret<0 ? ret : count;
+}
+
+static ssize_t store_link_state(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 state;
+
+	ret = ipath_parse_ushort(buf, &state);
+	if (ret < 0)
+		goto invalid;
+
+	r = ipath_set_linkstate(dd, state);
+	if (r < 0) {
+		ret = r;
+		goto bail;
+	}
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid link state\n");
+bail:
+	return ret;
+}
+
+static ssize_t show_mtu(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
+}
+
+static ssize_t store_mtu(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	ssize_t ret;
+	u16 mtu = 0;
+	int r;
+
+	ret = ipath_parse_ushort(buf, &mtu);
+	if (ret < 0)
+		goto invalid;
+
+	r = ipath_set_mtu(dd, mtu);
+	if (r < 0)
+		ret = r;
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid MTU\n");
+bail:
+	return ret;
+}
+
+static ssize_t show_enabled(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			 (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
+}
+
+static ssize_t store_enabled(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	ssize_t ret;
+	u16 enable = 0;
+
+	ret = ipath_parse_ushort(buf, &enable);
+	if (ret < 0) {
+		ipath_dev_err(dd, "attempt to use non-numeric on enable\n");
+		goto bail;
+	}
+
+	if (enable) {
+		if (!(dd->ipath_flags & IPATH_DISABLED))
+			goto bail;
+
+		dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
+		/* same as post-reset */
+		ret = ipath_init_chip(dd, 1);
+		if (ret)
+			ipath_dev_err(dd, "Failed to enable unit %d\n",
+				      dd->ipath_unit);
+		else {
+			dd->ipath_flags &= ~IPATH_DISABLED;
+			*dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
+		}
+	}
+	else if (!(dd->ipath_flags & IPATH_DISABLED)) {
+		dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
+		ipath_shutdown_device(dd);
+		dd->ipath_flags |= IPATH_DISABLED;
+		*dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
+	}
+
+bail:
+	return ret;
+}
+
+static ssize_t store_rx_pol_inv(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret < 0)
+		goto invalid;
+
+	r = ipath_set_rx_pol_inv(dd, val);
+	if (r < 0) {
+		ret = r;
+		goto bail;
+	}
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n");
+bail:
+	return ret;
+}
+
+static ssize_t store_led_override(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret > 0)
+		ipath_set_led_override(dd, val);
+	else
+		ipath_dev_err(dd, "attempt to set invalid LED override\n");
+	return ret;
+}
+
+static ssize_t show_logged_errs(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int idx, count;
+
+	/* force consistency with actual EEPROM */
+	if (ipath_update_eeprom_log(dd) != 0)
+		return -ENXIO;
+
+	count = 0;
+	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
+			dd->ipath_eep_st_errs[idx],
+			idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' ');
+	}
+
+	return count;
+}
+
+/*
+ * New sysfs entries to control various IB config. These all turn into
+ * accesses via ipath_f_get/set_ib_cfg.
+ *
+ * Get/Set heartbeat enable. Or of 1=enabled, 2=auto
+ */
+static ssize_t show_hrtbt_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_hrtbt_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && val > 3)
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
+		goto bail;
+	}
+
+	/*
+	 * Set the "intentional" heartbeat enable per either of
+	 * "Enable" and "Auto", as these are normally set together.
+	 * This bit is consulted when leaving loopback mode,
+	 * because entering loopback mode overrides it and automatically
+	 * disables heartbeat.
+	 */
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
+	if (r < 0)
+		ret = r;
+	else if (val == IPATH_IB_HRTBT_OFF)
+		dd->ipath_flags |= IPATH_NO_HRTBT;
+	else
+		dd->ipath_flags &= ~IPATH_NO_HRTBT;
+
+bail:
+	return ret;
+}
+
+/*
+ * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric,
+ * _not_ the particular encoding of any given chip)
+ */
+static ssize_t show_lwid_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_lwid_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && (val == 0 || val > 3))
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd,
+			"attempt to set invalid Link Width (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+/* Get current link width */
+static ssize_t show_lwid(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+/*
+ * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR.
+ */
+static ssize_t show_spd_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_spd_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd,
+			"attempt to set invalid Link Speed (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+/* Get current link speed */
+static ssize_t show_spd(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+/*
+ * Get/Set RX polarity-invert enable. 0=no, 1=yes.
+ */
+static ssize_t show_rx_polinv_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_rx_polinv_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && val > 1) {
+		ipath_dev_err(dd,
+			"attempt to set invalid Rx Polarity (enable)\n");
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+/*
+ * Get/Set RX lane-reversal enable. 0=no, 1=yes.
+ */
+static ssize_t show_lanerev_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_lanerev_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && val > 1) {
+		ret = -EINVAL;
+		ipath_dev_err(dd,
+			"attempt to set invalid Lane reversal (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
+static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
+
+static struct attribute *driver_attributes[] = {
+	&driver_attr_num_units.attr,
+	&driver_attr_version.attr,
+	NULL
+};
+
+static struct attribute_group driver_attr_group = {
+	.attrs = driver_attributes
+};
+
+static ssize_t store_tempsense(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf,
+			       size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, stat;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret <= 0) {
+		ipath_dev_err(dd, "attempt to set invalid tempsense config\n");
+		goto bail;
+	}
+	/* If anything but the highest limit, enable T_CRIT_A "interrupt" */
+	stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0);
+	if (stat) {
+		ipath_dev_err(dd, "Unable to set tempsense config\n");
+		ret = -1;
+		goto bail;
+	}
+	stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF));
+	if (stat) {
+		ipath_dev_err(dd, "Unable to set local Tcrit\n");
+		ret = -1;
+		goto bail;
+	}
+	stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8));
+	if (stat) {
+		ipath_dev_err(dd, "Unable to set remote Tcrit\n");
+		ret = -1;
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+/*
+ * dump tempsense regs. in decimal, to ease shell-scripts.
+ */
+static ssize_t show_tempsense(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+	int idx;
+	u8 regvals[8];
+
+	ret = -ENXIO;
+	for (idx = 0; idx < 8; ++idx) {
+		if (idx == 6)
+			continue;
+		ret = ipath_tempsense_read(dd, idx);
+		if (ret < 0)
+			break;
+		regvals[idx] = ret;
+	}
+	if (idx == 8)
+		ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
+			*(signed char *)(regvals),
+			*(signed char *)(regvals + 1),
+			regvals[2], regvals[3],
+			*(signed char *)(regvals + 5),
+			*(signed char *)(regvals + 7));
+	return ret;
+}
+
+const struct attribute_group *ipath_driver_attr_groups[] = {
+	&driver_attr_group,
+	NULL,
+};
+
+static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
+static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
+static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
+static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
+static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
+static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
+static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
+static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
+static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
+static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
+static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
+static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
+static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
+static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
+static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
+		   show_jint_max_packets, store_jint_max_packets);
+static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
+		   show_jint_idle_ticks, store_jint_idle_ticks);
+static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
+		   show_tempsense, store_tempsense);
+
+static struct attribute *dev_attributes[] = {
+	&dev_attr_guid.attr,
+	&dev_attr_lmc.attr,
+	&dev_attr_lid.attr,
+	&dev_attr_link_state.attr,
+	&dev_attr_mlid.attr,
+	&dev_attr_mtu.attr,
+	&dev_attr_nguid.attr,
+	&dev_attr_nports.attr,
+	&dev_attr_serial.attr,
+	&dev_attr_status.attr,
+	&dev_attr_status_str.attr,
+	&dev_attr_boardversion.attr,
+	&dev_attr_unit.attr,
+	&dev_attr_enabled.attr,
+	&dev_attr_rx_pol_inv.attr,
+	&dev_attr_led_override.attr,
+	&dev_attr_logged_errors.attr,
+	&dev_attr_tempsense.attr,
+	&dev_attr_localbus_info.attr,
+	NULL
+};
+
+static struct attribute_group dev_attr_group = {
+	.attrs = dev_attributes
+};
+
+static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
+		   store_hrtbt_enb);
+static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
+		   store_lwid_enb);
+static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
+static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
+		   store_spd_enb);
+static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
+static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
+		   store_rx_polinv_enb);
+static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
+		   store_lanerev_enb);
+
+static struct attribute *dev_ibcfg_attributes[] = {
+	&dev_attr_hrtbt_enable.attr,
+	&dev_attr_link_width_enable.attr,
+	&dev_attr_link_width.attr,
+	&dev_attr_link_speed_enable.attr,
+	&dev_attr_link_speed.attr,
+	&dev_attr_rx_pol_inv_enable.attr,
+	&dev_attr_rx_lane_rev_enable.attr,
+	NULL
+};
+
+static struct attribute_group dev_ibcfg_attr_group = {
+	.attrs = dev_ibcfg_attributes
+};
+
+/**
+ * ipath_expose_reset - create a device reset file
+ * @dev: the device structure
+ *
+ * Only expose a file that lets us reset the device after someone
+ * enters diag mode.  A device reset is quite likely to crash the
+ * machine entirely, so we don't want to normally make it
+ * available.
+ *
+ * Called with ipath_mutex held.
+ */
+int ipath_expose_reset(struct device *dev)
+{
+	static int exposed;
+	int ret;
+
+	if (!exposed) {
+		ret = device_create_file(dev, &dev_attr_reset);
+		exposed = 1;
+	}
+	else
+		ret = 0;
+
+	return ret;
+}
+
+int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
+{
+	int ret;
+
+	ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
+	if (ret)
+		goto bail;
+
+	ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
+	if (ret)
+		goto bail_attrs;
+
+	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+		ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
+		if (ret)
+			goto bail_counter;
+		ret = device_create_file(dev, &dev_attr_jint_max_packets);
+		if (ret)
+			goto bail_idle;
+
+		ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
+		if (ret)
+			goto bail_max;
+	}
+
+	return 0;
+
+bail_max:
+	device_remove_file(dev, &dev_attr_jint_max_packets);
+bail_idle:
+	device_remove_file(dev, &dev_attr_jint_idle_ticks);
+bail_counter:
+	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+bail_attrs:
+	sysfs_remove_group(&dev->kobj, &dev_attr_group);
+bail:
+	return ret;
+}
+
+void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
+{
+	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+
+	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+		sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
+		device_remove_file(dev, &dev_attr_jint_idle_ticks);
+		device_remove_file(dev, &dev_attr_jint_max_packets);
+	}
+
+	sysfs_remove_group(&dev->kobj, &dev_attr_group);
+
+	device_remove_file(dev, &dev_attr_reset);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_uc.c b/drivers/staging/rdma/ipath/ipath_uc.c
new file mode 100644
index 0000000..22e6099
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_uc.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_uc_req(struct ipath_qp *qp)
+{
+	struct ipath_other_headers *ohdr;
+	struct ipath_swqe *wqe;
+	unsigned long flags;
+	u32 hwords;
+	u32 bth0;
+	u32 len;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	int ret = 0;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= IPATH_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	ohdr = &qp->s_hdr.u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr.u.l.oth;
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+	bth0 = 1 << 22; /* Set M bit */
+
+	/* Get the next send request. */
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	qp->s_wqe = NULL;
+	switch (qp->s_state) {
+	default:
+		if (!(ib_ipath_state_ops[qp->state] &
+		    IPATH_PROCESS_NEXT_SEND_OK))
+			goto bail;
+		/* Check if send work queue is empty. */
+		if (qp->s_cur == qp->s_head)
+			goto bail;
+		/*
+		 * Start a new request.
+		 */
+		qp->s_psn = wqe->psn = qp->s_next_psn;
+		qp->s_sge.sge = wqe->sg_list[0];
+		qp->s_sge.sg_list = wqe->sg_list + 1;
+		qp->s_sge.num_sge = wqe->wr.num_sge;
+		qp->s_len = len = wqe->length;
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+			if (len > pmtu) {
+				qp->s_state = OP(SEND_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_SEND)
+				qp->s_state = OP(SEND_ONLY);
+			else {
+				qp->s_state =
+					OP(SEND_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the BTH */
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+			}
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= 1 << 23;
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			hwords += sizeof(struct ib_reth) / 4;
+			if (len > pmtu) {
+				qp->s_state = OP(RDMA_WRITE_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+				qp->s_state = OP(RDMA_WRITE_ONLY);
+			else {
+				qp->s_state =
+					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the RETH */
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+					bth0 |= 1 << 23;
+			}
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		default:
+			goto bail;
+		}
+		break;
+
+	case OP(SEND_FIRST):
+		qp->s_state = OP(SEND_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_SEND)
+			qp->s_state = OP(SEND_LAST);
+		else {
+			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+		}
+		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+			bth0 |= 1 << 23;
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+		qp->s_state = OP(RDMA_WRITE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+			qp->s_state = OP(RDMA_WRITE_LAST);
+		else {
+			qp->s_state =
+				OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= 1 << 23;
+		}
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+	}
+	qp->s_len -= len;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_cur_size = len;
+	ipath_make_ruc_header(to_idev(qp->ibqp.device),
+			      qp, ohdr, bth0 | (qp->s_state << 24),
+			      qp->s_next_psn++ & IPATH_PSN_MASK);
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * ipath_uc_rcv - handle an incoming UC packet
+ * @dev: the device the packet came in on
+ * @hdr: the header of the packet
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+	struct ipath_other_headers *ohdr;
+	int opcode;
+	u32 hdrsize;
+	u32 psn;
+	u32 pad;
+	struct ib_wc wc;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	struct ib_reth *reth;
+	int header_in_data;
+
+	/* Validate the SLID. See Ch. 9.6.1.5 */
+	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+		goto done;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12;	/* LRH + BTH */
+		psn = be32_to_cpu(ohdr->bth[2]);
+		header_in_data = 0;
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
+		/*
+		 * The header with GRH is 60 bytes and the
+		 * core driver sets the eager header buffer
+		 * size to 56 bytes so the last 4 bytes of
+		 * the BTH header (PSN) is in the data buffer.
+		 */
+		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+		if (header_in_data) {
+			psn = be32_to_cpu(((__be32 *) data)[0]);
+			data += sizeof(__be32);
+		} else
+			psn = be32_to_cpu(ohdr->bth[2]);
+	}
+	/*
+	 * The opcode is in the low byte when its in network order
+	 * (top byte when in host order).
+	 */
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+	memset(&wc, 0, sizeof wc);
+
+	/* Compare the PSN verses the expected PSN. */
+	if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
+		/*
+		 * Handle a sequence error.
+		 * Silently drop any current message.
+		 */
+		qp->r_psn = psn;
+	inv:
+		qp->r_state = OP(SEND_LAST);
+		switch (opcode) {
+		case OP(SEND_FIRST):
+		case OP(SEND_ONLY):
+		case OP(SEND_ONLY_WITH_IMMEDIATE):
+			goto send_first;
+
+		case OP(RDMA_WRITE_FIRST):
+		case OP(RDMA_WRITE_ONLY):
+		case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+			goto rdma_first;
+
+		default:
+			dev->n_pkt_drops++;
+			goto done;
+		}
+	}
+
+	/* Check for opcode sequence errors. */
+	switch (qp->r_state) {
+	case OP(SEND_FIRST):
+	case OP(SEND_MIDDLE):
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+			break;
+		goto inv;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_MIDDLE):
+		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			break;
+		goto inv;
+
+	default:
+		if (opcode == OP(SEND_FIRST) ||
+		    opcode == OP(SEND_ONLY) ||
+		    opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_FIRST) ||
+		    opcode == OP(RDMA_WRITE_ONLY) ||
+		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			break;
+		goto inv;
+	}
+
+	/* OK, process the packet. */
+	switch (opcode) {
+	case OP(SEND_FIRST):
+	case OP(SEND_ONLY):
+	case OP(SEND_ONLY_WITH_IMMEDIATE):
+	send_first:
+		if (qp->r_flags & IPATH_R_REUSE_SGE) {
+			qp->r_flags &= ~IPATH_R_REUSE_SGE;
+			qp->r_sge = qp->s_rdma_read_sge;
+		} else if (!ipath_get_rwqe(qp, 0)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		/* Save the WQE so we can reuse it in case of an error. */
+		qp->s_rdma_read_sge = qp->r_sge;
+		qp->r_rcv_len = 0;
+		if (opcode == OP(SEND_ONLY))
+			goto send_last;
+		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+			goto send_last_imm;
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+			qp->r_flags |= IPATH_R_REUSE_SGE;
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len)) {
+			qp->r_flags |= IPATH_R_REUSE_SGE;
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		break;
+
+	case OP(SEND_LAST_WITH_IMMEDIATE):
+	send_last_imm:
+		if (header_in_data) {
+			wc.ex.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else {
+			/* Immediate data comes after BTH */
+			wc.ex.imm_data = ohdr->u.imm_data;
+		}
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		/* FALLTHROUGH */
+	case OP(SEND_LAST):
+	send_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4))) {
+			qp->r_flags |= IPATH_R_REUSE_SGE;
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		wc.byte_len = tlen + qp->r_rcv_len;
+		if (unlikely(wc.byte_len > qp->r_len)) {
+			qp->r_flags |= IPATH_R_REUSE_SGE;
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		wc.opcode = IB_WC_RECV;
+	last_imm:
+		ipath_copy_sge(&qp->r_sge, data, tlen);
+		wc.wr_id = qp->r_wr_id;
+		wc.status = IB_WC_SUCCESS;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = qp->remote_qpn;
+		wc.slid = qp->remote_ah_attr.dlid;
+		wc.sl = qp->remote_ah_attr.sl;
+		/* Signal completion event if the solicited bit is set. */
+		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+			       (ohdr->bth[0] &
+				cpu_to_be32(1 << 23)) != 0);
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_ONLY):
+	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+	rdma_first:
+		/* RETH comes after BTH */
+		if (!header_in_data)
+			reth = &ohdr->u.rc.reth;
+		else {
+			reth = (struct ib_reth *)data;
+			data += sizeof(*reth);
+		}
+		hdrsize += sizeof(*reth);
+		qp->r_len = be32_to_cpu(reth->length);
+		qp->r_rcv_len = 0;
+		if (qp->r_len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey */
+			ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
+					   vaddr, rkey,
+					   IB_ACCESS_REMOTE_WRITE);
+			if (unlikely(!ok)) {
+				dev->n_pkt_drops++;
+				goto done;
+			}
+		} else {
+			qp->r_sge.sg_list = NULL;
+			qp->r_sge.sge.mr = NULL;
+			qp->r_sge.sge.vaddr = NULL;
+			qp->r_sge.sge.length = 0;
+			qp->r_sge.sge.sge_length = 0;
+		}
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_WRITE))) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		if (opcode == OP(RDMA_WRITE_ONLY))
+			goto rdma_last;
+		else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			goto rdma_last_imm;
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		break;
+
+	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+	rdma_last_imm:
+		if (header_in_data) {
+			wc.ex.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else {
+			/* Immediate data comes after BTH */
+			wc.ex.imm_data = ohdr->u.imm_data;
+		}
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4))) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		if (qp->r_flags & IPATH_R_REUSE_SGE)
+			qp->r_flags &= ~IPATH_R_REUSE_SGE;
+		else if (!ipath_get_rwqe(qp, 1)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		wc.byte_len = qp->r_len;
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		goto last_imm;
+
+	case OP(RDMA_WRITE_LAST):
+	rdma_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4))) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		ipath_copy_sge(&qp->r_sge, data, tlen);
+		break;
+
+	default:
+		/* Drop packet for unknown opcodes. */
+		dev->n_pkt_drops++;
+		goto done;
+	}
+	qp->r_psn++;
+	qp->r_state = opcode;
+done:
+	return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_ud.c b/drivers/staging/rdma/ipath/ipath_ud.c
new file mode 100644
index 0000000..e8a2a91
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_ud.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from ipath_make_ud_req() to forward a WQE addressed
+ * to the same HCA.
+ * Note that the receive interrupt handler may be calling ipath_ud_rcv()
+ * while this is being called.
+ */
+static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
+{
+	struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+	struct ipath_qp *qp;
+	struct ib_ah_attr *ah_attr;
+	unsigned long flags;
+	struct ipath_rq *rq;
+	struct ipath_srq *srq;
+	struct ipath_sge_state rsge;
+	struct ipath_sge *sge;
+	struct ipath_rwq *wq;
+	struct ipath_rwqe *wqe;
+	void (*handler)(struct ib_event *, void *);
+	struct ib_wc wc;
+	u32 tail;
+	u32 rlen;
+	u32 length;
+
+	qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
+	if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		dev->n_pkt_drops++;
+		goto done;
+	}
+
+	/*
+	 * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+	 * Qkeys with the high order bit set mean use the
+	 * qkey from the QP context instead of the WR (see 10.2.5).
+	 */
+	if (unlikely(qp->ibqp.qp_num &&
+		     ((int) swqe->wr.wr.ud.remote_qkey < 0 ?
+		      sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) {
+		/* XXX OK to lose a count once in a while. */
+		dev->qkey_violations++;
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+
+	/*
+	 * A GRH is expected to precede the data even if not
+	 * present on the wire.
+	 */
+	length = swqe->length;
+	memset(&wc, 0, sizeof wc);
+	wc.byte_len = length + sizeof(struct ib_grh);
+
+	if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = swqe->wr.ex.imm_data;
+	}
+
+	/*
+	 * This would be a lot simpler if we could call ipath_get_rwqe()
+	 * but that uses state that the receive interrupt handler uses
+	 * so we would need to lock out receive interrupts while doing
+	 * local loopback.
+	 */
+	if (qp->ibqp.srq) {
+		srq = to_isrq(qp->ibqp.srq);
+		handler = srq->ibsrq.event_handler;
+		rq = &srq->rq;
+	} else {
+		srq = NULL;
+		handler = NULL;
+		rq = &qp->r_rq;
+	}
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 * Note that it is safe to drop the lock after changing rq->tail
+	 * since ipath_post_receive() won't fill the empty slot.
+	 */
+	spin_lock_irqsave(&rq->lock, flags);
+	wq = rq->wq;
+	tail = wq->tail;
+	/* Validate tail before using it since it is user writable. */
+	if (tail >= rq->size)
+		tail = 0;
+	if (unlikely(tail == wq->head)) {
+		spin_unlock_irqrestore(&rq->lock, flags);
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+	wqe = get_rwqe_ptr(rq, tail);
+	rsge.sg_list = qp->r_ud_sg_list;
+	if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
+		spin_unlock_irqrestore(&rq->lock, flags);
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+	/* Silently drop packets which are too big. */
+	if (wc.byte_len > rlen) {
+		spin_unlock_irqrestore(&rq->lock, flags);
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+	if (++tail >= rq->size)
+		tail = 0;
+	wq->tail = tail;
+	wc.wr_id = wqe->wr_id;
+	if (handler) {
+		u32 n;
+
+		/*
+		 * validate head pointer value and compute
+		 * the number of remaining WQEs.
+		 */
+		n = wq->head;
+		if (n >= rq->size)
+			n = 0;
+		if (n < tail)
+			n += rq->size - tail;
+		else
+			n -= tail;
+		if (n < srq->limit) {
+			struct ib_event ev;
+
+			srq->limit = 0;
+			spin_unlock_irqrestore(&rq->lock, flags);
+			ev.device = qp->ibqp.device;
+			ev.element.srq = qp->ibqp.srq;
+			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			handler(&ev, srq->ibsrq.srq_context);
+		} else
+			spin_unlock_irqrestore(&rq->lock, flags);
+	} else
+		spin_unlock_irqrestore(&rq->lock, flags);
+
+	ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
+		wc.wc_flags |= IB_WC_GRH;
+	} else
+		ipath_skip_sge(&rsge, sizeof(struct ib_grh));
+	sge = swqe->sg_list;
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		ipath_copy_sge(&rsge, sge->vaddr, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--swqe->wr.num_sge)
+				sge++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = sqp->ibqp.qp_num;
+	/* XXX do we know which pkey matched? Only needed for GSI. */
+	wc.pkey_index = 0;
+	wc.slid = dev->dd->ipath_lid |
+		(ah_attr->src_path_bits &
+		 ((1 << dev->dd->ipath_lmc) - 1));
+	wc.sl = ah_attr->sl;
+	wc.dlid_path_bits =
+		ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		       swqe->wr.send_flags & IB_SEND_SOLICITED);
+drop:
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+done:;
+}
+
+/**
+ * ipath_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_ud_req(struct ipath_qp *qp)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_other_headers *ohdr;
+	struct ib_ah_attr *ah_attr;
+	struct ipath_swqe *wqe;
+	unsigned long flags;
+	u32 nwords;
+	u32 extra_bytes;
+	u32 bth0;
+	u16 lrh0;
+	u16 lid;
+	int ret = 0;
+	int next_cur;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= IPATH_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	if (qp->s_cur == qp->s_head)
+		goto bail;
+
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	next_cur = qp->s_cur + 1;
+	if (next_cur >= qp->s_size)
+		next_cur = 0;
+
+	/* Construct the header. */
+	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+	if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
+		if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
+			dev->n_multicast_xmit++;
+		else
+			dev->n_unicast_xmit++;
+	} else {
+		dev->n_unicast_xmit++;
+		lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
+		if (unlikely(lid == dev->dd->ipath_lid)) {
+			/*
+			 * If DMAs are in progress, we can't generate
+			 * a completion for the loopback packet since
+			 * it would be out of order.
+			 * XXX Instead of waiting, we could queue a
+			 * zero length descriptor so we get a callback.
+			 */
+			if (atomic_read(&qp->s_dma_busy)) {
+				qp->s_flags |= IPATH_S_WAIT_DMA;
+				goto bail;
+			}
+			qp->s_cur = next_cur;
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			ipath_ud_loopback(qp, wqe);
+			spin_lock_irqsave(&qp->s_lock, flags);
+			ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
+			goto done;
+		}
+	}
+
+	qp->s_cur = next_cur;
+	extra_bytes = -wqe->length & 3;
+	nwords = (wqe->length + extra_bytes) >> 2;
+
+	/* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+	qp->s_hdrwords = 7;
+	qp->s_cur_size = wqe->length;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_dmult = ah_attr->static_rate;
+	qp->s_wqe = wqe;
+	qp->s_sge.sge = wqe->sg_list[0];
+	qp->s_sge.sg_list = wqe->sg_list + 1;
+	qp->s_sge.num_sge = wqe->wr.num_sge;
+
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		/* Header size in 32-bit words. */
+		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+						 &ah_attr->grh,
+						 qp->s_hdrwords, nwords);
+		lrh0 = IPATH_LRH_GRH;
+		ohdr = &qp->s_hdr.u.l.oth;
+		/*
+		 * Don't worry about sending to locally attached multicast
+		 * QPs.  It is unspecified by the spec. what happens.
+		 */
+	} else {
+		/* Header size in 32-bit words. */
+		lrh0 = IPATH_LRH_BTH;
+		ohdr = &qp->s_hdr.u.oth;
+	}
+	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		qp->s_hdrwords++;
+		ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+		bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+	} else
+		bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+	lrh0 |= ah_attr->sl << 4;
+	if (qp->ibqp.qp_type == IB_QPT_SMI)
+		lrh0 |= 0xF000;	/* Set VL (see ch. 13.5.3.1) */
+	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);	/* DEST LID */
+	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
+					   SIZE_OF_CRC);
+	lid = dev->dd->ipath_lid;
+	if (lid) {
+		lid |= ah_attr->src_path_bits &
+			((1 << dev->dd->ipath_lmc) - 1);
+		qp->s_hdr.lrh[3] = cpu_to_be16(lid);
+	} else
+		qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
+	if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+		bth0 |= 1 << 23;
+	bth0 |= extra_bytes << 20;
+	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
+		ipath_get_pkey(dev->dd, qp->s_pkey_index);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	/*
+	 * Use the multicast QP if the destination LID is a multicast LID.
+	 */
+	ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+		ah_attr->dlid != IPATH_PERMISSIVE_LID ?
+		cpu_to_be32(IPATH_MULTICAST_QPN) :
+		cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
+	/*
+	 * Qkeys with the high order bit set mean use the
+	 * qkey from the QP context instead of the WR (see 10.2.5).
+	 */
+	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+					 qp->qkey : wqe->wr.wr.ud.remote_qkey);
+	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * ipath_ud_rcv - receive an incoming UD packet
+ * @dev: the device the packet came in on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+	struct ipath_other_headers *ohdr;
+	int opcode;
+	u32 hdrsize;
+	u32 pad;
+	struct ib_wc wc;
+	u32 qkey;
+	u32 src_qp;
+	u16 dlid;
+	int header_in_data;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12 + 8;	/* LRH + BTH + DETH */
+		qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+		src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+		header_in_data = 0;
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
+		/*
+		 * The header with GRH is 68 bytes and the core driver sets
+		 * the eager header buffer size to 56 bytes so the last 12
+		 * bytes of the IB header is in the data buffer.
+		 */
+		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+		if (header_in_data) {
+			qkey = be32_to_cpu(((__be32 *) data)[1]);
+			src_qp = be32_to_cpu(((__be32 *) data)[2]);
+			data += 12;
+		} else {
+			qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+			src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+		}
+	}
+	src_qp &= IPATH_QPN_MASK;
+
+	/*
+	 * Check that the permissive LID is only used on QP0
+	 * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+	 */
+	if (qp->ibqp.qp_num) {
+		if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+			     hdr->lrh[3] == IB_LID_PERMISSIVE)) {
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+		if (unlikely(qkey != qp->qkey)) {
+			/* XXX OK to lose a count once in a while. */
+			dev->qkey_violations++;
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+	} else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
+		   hdr->lrh[3] == IB_LID_PERMISSIVE) {
+		struct ib_smp *smp = (struct ib_smp *) data;
+
+		if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+	}
+
+	/*
+	 * The opcode is in the low byte when its in network order
+	 * (top byte when in host order).
+	 */
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	if (qp->ibqp.qp_num > 1 &&
+	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+		if (header_in_data) {
+			wc.ex.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else
+			wc.ex.imm_data = ohdr->u.ud.imm_data;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		hdrsize += sizeof(u32);
+	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+		wc.ex.imm_data = 0;
+		wc.wc_flags = 0;
+	} else {
+		dev->n_pkt_drops++;
+		goto bail;
+	}
+
+	/* Get the number of bytes the message was padded by. */
+	pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+	if (unlikely(tlen < (hdrsize + pad + 4))) {
+		/* Drop incomplete packets. */
+		dev->n_pkt_drops++;
+		goto bail;
+	}
+	tlen -= hdrsize + pad + 4;
+
+	/* Drop invalid MAD packets (see 13.5.3.1). */
+	if (unlikely((qp->ibqp.qp_num == 0 &&
+		      (tlen != 256 ||
+		       (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
+		     (qp->ibqp.qp_num == 1 &&
+		      (tlen != 256 ||
+		       (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
+		dev->n_pkt_drops++;
+		goto bail;
+	}
+
+	/*
+	 * A GRH is expected to precede the data even if not
+	 * present on the wire.
+	 */
+	wc.byte_len = tlen + sizeof(struct ib_grh);
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 */
+	if (qp->r_flags & IPATH_R_REUSE_SGE)
+		qp->r_flags &= ~IPATH_R_REUSE_SGE;
+	else if (!ipath_get_rwqe(qp, 0)) {
+		/*
+		 * Count VL15 packets dropped due to no receive buffer.
+		 * Otherwise, count them as buffer overruns since usually,
+		 * the HW will be able to receive packets even if there are
+		 * no QPs with posted receive buffers.
+		 */
+		if (qp->ibqp.qp_num == 0)
+			dev->n_vl15_dropped++;
+		else
+			dev->rcv_errors++;
+		goto bail;
+	}
+	/* Silently drop packets which are too big. */
+	if (wc.byte_len > qp->r_len) {
+		qp->r_flags |= IPATH_R_REUSE_SGE;
+		dev->n_pkt_drops++;
+		goto bail;
+	}
+	if (has_grh) {
+		ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+			       sizeof(struct ib_grh));
+		wc.wc_flags |= IB_WC_GRH;
+	} else
+		ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
+	ipath_copy_sge(&qp->r_sge, data,
+		       wc.byte_len - sizeof(struct ib_grh));
+	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+		goto bail;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.vendor_err = 0;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = src_qp;
+	/* XXX do we know which pkey matched? Only needed for GSI. */
+	wc.pkey_index = 0;
+	wc.slid = be16_to_cpu(hdr->lrh[3]);
+	wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
+	dlid = be16_to_cpu(hdr->lrh[1]);
+	/*
+	 * Save the LMC lower bits if the destination LID is a unicast LID.
+	 */
+	wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
+		dlid & ((1 << dev->dd->ipath_lmc) - 1);
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		       (ohdr->bth[0] &
+			cpu_to_be32(1 << 23)) != 0);
+
+bail:;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_user_pages.c b/drivers/staging/rdma/ipath/ipath_user_pages.c
new file mode 100644
index 0000000..1da1252
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_user_pages.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+
+static void __ipath_release_user_pages(struct page **p, size_t num_pages,
+				   int dirty)
+{
+	size_t i;
+
+	for (i = 0; i < num_pages; i++) {
+		ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
+			   (unsigned long) num_pages, p[i]);
+		if (dirty)
+			set_page_dirty_lock(p[i]);
+		put_page(p[i]);
+	}
+}
+
+/* call with current->mm->mmap_sem held */
+static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+				  struct page **p)
+{
+	unsigned long lock_limit;
+	size_t got;
+	int ret;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (num_pages > lock_limit) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
+		   (unsigned long) num_pages, start_page);
+
+	for (got = 0; got < num_pages; got += ret) {
+		ret = get_user_pages(current, current->mm,
+				     start_page + got * PAGE_SIZE,
+				     num_pages - got, 1, 1,
+				     p + got, NULL);
+		if (ret < 0)
+			goto bail_release;
+	}
+
+	current->mm->pinned_vm += num_pages;
+
+	ret = 0;
+	goto bail;
+
+bail_release:
+	__ipath_release_user_pages(p, got, 0);
+bail:
+	return ret;
+}
+
+/**
+ * ipath_map_page - a safety wrapper around pci_map_page()
+ *
+ * A dma_addr of all 0's is interpreted by the chip as "disabled".
+ * Unfortunately, it can also be a valid dma_addr returned on some
+ * architectures.
+ *
+ * The powerpc iommu assigns dma_addrs in ascending order, so we don't
+ * have to bother with retries or mapping a dummy page to insure we
+ * don't just get the same mapping again.
+ *
+ * I'm sure we won't be so lucky with other iommu's, so FIXME.
+ */
+dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
+	unsigned long offset, size_t size, int direction)
+{
+	dma_addr_t phys;
+
+	phys = pci_map_page(hwdev, page, offset, size, direction);
+
+	if (phys == 0) {
+		pci_unmap_page(hwdev, phys, size, direction);
+		phys = pci_map_page(hwdev, page, offset, size, direction);
+		/*
+		 * FIXME: If we get 0 again, we should keep this page,
+		 * map another, then free the 0 page.
+		 */
+	}
+
+	return phys;
+}
+
+/**
+ * ipath_map_single - a safety wrapper around pci_map_single()
+ *
+ * Same idea as ipath_map_page().
+ */
+dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
+	int direction)
+{
+	dma_addr_t phys;
+
+	phys = pci_map_single(hwdev, ptr, size, direction);
+
+	if (phys == 0) {
+		pci_unmap_single(hwdev, phys, size, direction);
+		phys = pci_map_single(hwdev, ptr, size, direction);
+		/*
+		 * FIXME: If we get 0 again, we should keep this page,
+		 * map another, then free the 0 page.
+		 */
+	}
+
+	return phys;
+}
+
+/**
+ * ipath_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages.  For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+			 struct page **p)
+{
+	int ret;
+
+	down_write(&current->mm->mmap_sem);
+
+	ret = __ipath_get_user_pages(start_page, num_pages, p);
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+void ipath_release_user_pages(struct page **p, size_t num_pages)
+{
+	down_write(&current->mm->mmap_sem);
+
+	__ipath_release_user_pages(p, num_pages, 1);
+
+	current->mm->pinned_vm -= num_pages;
+
+	up_write(&current->mm->mmap_sem);
+}
+
+struct ipath_user_pages_work {
+	struct work_struct work;
+	struct mm_struct *mm;
+	unsigned long num_pages;
+};
+
+static void user_pages_account(struct work_struct *_work)
+{
+	struct ipath_user_pages_work *work =
+		container_of(_work, struct ipath_user_pages_work, work);
+
+	down_write(&work->mm->mmap_sem);
+	work->mm->pinned_vm -= work->num_pages;
+	up_write(&work->mm->mmap_sem);
+	mmput(work->mm);
+	kfree(work);
+}
+
+void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
+{
+	struct ipath_user_pages_work *work;
+	struct mm_struct *mm;
+
+	__ipath_release_user_pages(p, num_pages, 1);
+
+	mm = get_task_mm(current);
+	if (!mm)
+		return;
+
+	work = kmalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		goto bail_mm;
+
+	INIT_WORK(&work->work, user_pages_account);
+	work->mm = mm;
+	work->num_pages = num_pages;
+
+	queue_work(ib_wq, &work->work);
+	return;
+
+bail_mm:
+	mmput(mm);
+	return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.c b/drivers/staging/rdma/ipath/ipath_user_sdma.c
new file mode 100644
index 0000000..cc04b7b
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_user_sdma.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "ipath_kernel.h"
+#include "ipath_user_sdma.h"
+
+/* minimum size of header */
+#define IPATH_USER_SDMA_MIN_HEADER_LENGTH	64
+/* expected size of headers (for dma_pool) */
+#define IPATH_USER_SDMA_EXP_HEADER_LENGTH	64
+/* length mask in PBC (lower 11 bits) */
+#define IPATH_PBC_LENGTH_MASK			((1 << 11) - 1)
+
+struct ipath_user_sdma_pkt {
+	u8 naddr;		/* dimension of addr (1..3) ... */
+	u32 counter;		/* sdma pkts queued counter for this entry */
+	u64 added;		/* global descq number of entries */
+
+	struct {
+		u32 offset;			/* offset for kvaddr, addr */
+		u32 length;			/* length in page */
+		u8  put_page;			/* should we put_page? */
+		u8  dma_mapped;			/* is page dma_mapped? */
+		struct page *page;		/* may be NULL (coherent mem) */
+		void *kvaddr;			/* FIXME: only for pio hack */
+		dma_addr_t addr;
+	} addr[4];   /* max pages, any more and we coalesce */
+	struct list_head list;	/* list element */
+};
+
+struct ipath_user_sdma_queue {
+	/*
+	 * pkts sent to dma engine are queued on this
+	 * list head.  the type of the elements of this
+	 * list are struct ipath_user_sdma_pkt...
+	 */
+	struct list_head sent;
+
+	/* headers with expected length are allocated from here... */
+	char header_cache_name[64];
+	struct dma_pool *header_cache;
+
+	/* packets are allocated from the slab cache... */
+	char pkt_slab_name[64];
+	struct kmem_cache *pkt_slab;
+
+	/* as packets go on the queued queue, they are counted... */
+	u32 counter;
+	u32 sent_counter;
+
+	/* dma page table */
+	struct rb_root dma_pages_root;
+
+	/* protect everything above... */
+	struct mutex lock;
+};
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport)
+{
+	struct ipath_user_sdma_queue *pq =
+		kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL);
+
+	if (!pq)
+		goto done;
+
+	pq->counter = 0;
+	pq->sent_counter = 0;
+	INIT_LIST_HEAD(&pq->sent);
+
+	mutex_init(&pq->lock);
+
+	snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
+		 "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport);
+	pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
+					 sizeof(struct ipath_user_sdma_pkt),
+					 0, 0, NULL);
+
+	if (!pq->pkt_slab)
+		goto err_kfree;
+
+	snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
+		 "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport);
+	pq->header_cache = dma_pool_create(pq->header_cache_name,
+					   dev,
+					   IPATH_USER_SDMA_EXP_HEADER_LENGTH,
+					   4, 0);
+	if (!pq->header_cache)
+		goto err_slab;
+
+	pq->dma_pages_root = RB_ROOT;
+
+	goto done;
+
+err_slab:
+	kmem_cache_destroy(pq->pkt_slab);
+err_kfree:
+	kfree(pq);
+	pq = NULL;
+
+done:
+	return pq;
+}
+
+static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt,
+				      int i, size_t offset, size_t len,
+				      int put_page, int dma_mapped,
+				      struct page *page,
+				      void *kvaddr, dma_addr_t dma_addr)
+{
+	pkt->addr[i].offset = offset;
+	pkt->addr[i].length = len;
+	pkt->addr[i].put_page = put_page;
+	pkt->addr[i].dma_mapped = dma_mapped;
+	pkt->addr[i].page = page;
+	pkt->addr[i].kvaddr = kvaddr;
+	pkt->addr[i].addr = dma_addr;
+}
+
+static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt,
+					u32 counter, size_t offset,
+					size_t len, int dma_mapped,
+					struct page *page,
+					void *kvaddr, dma_addr_t dma_addr)
+{
+	pkt->naddr = 1;
+	pkt->counter = counter;
+	ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
+				  kvaddr, dma_addr);
+}
+
+/* we've too many pages in the iovec, coalesce to a single page */
+static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
+				    struct ipath_user_sdma_pkt *pkt,
+				    const struct iovec *iov,
+				    unsigned long niov) {
+	int ret = 0;
+	struct page *page = alloc_page(GFP_KERNEL);
+	void *mpage_save;
+	char *mpage;
+	int i;
+	int len = 0;
+	dma_addr_t dma_addr;
+
+	if (!page) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	mpage = kmap(page);
+	mpage_save = mpage;
+	for (i = 0; i < niov; i++) {
+		int cfur;
+
+		cfur = copy_from_user(mpage,
+				      iov[i].iov_base, iov[i].iov_len);
+		if (cfur) {
+			ret = -EFAULT;
+			goto free_unmap;
+		}
+
+		mpage += iov[i].iov_len;
+		len += iov[i].iov_len;
+	}
+
+	dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
+				DMA_TO_DEVICE);
+	if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+		ret = -ENOMEM;
+		goto free_unmap;
+	}
+
+	ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
+				  dma_addr);
+	pkt->naddr = 2;
+
+	goto done;
+
+free_unmap:
+	kunmap(page);
+	__free_page(page);
+done:
+	return ret;
+}
+
+/* how many pages in this iovec element? */
+static int ipath_user_sdma_num_pages(const struct iovec *iov)
+{
+	const unsigned long addr  = (unsigned long) iov->iov_base;
+	const unsigned long  len  = iov->iov_len;
+	const unsigned long spage = addr & PAGE_MASK;
+	const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+	return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+/* truncate length to page boundary */
+static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len)
+{
+	const unsigned long offset = addr & ~PAGE_MASK;
+
+	return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
+}
+
+static void ipath_user_sdma_free_pkt_frag(struct device *dev,
+					  struct ipath_user_sdma_queue *pq,
+					  struct ipath_user_sdma_pkt *pkt,
+					  int frag)
+{
+	const int i = frag;
+
+	if (pkt->addr[i].page) {
+		if (pkt->addr[i].dma_mapped)
+			dma_unmap_page(dev,
+				       pkt->addr[i].addr,
+				       pkt->addr[i].length,
+				       DMA_TO_DEVICE);
+
+		if (pkt->addr[i].kvaddr)
+			kunmap(pkt->addr[i].page);
+
+		if (pkt->addr[i].put_page)
+			put_page(pkt->addr[i].page);
+		else
+			__free_page(pkt->addr[i].page);
+	} else if (pkt->addr[i].kvaddr)
+		/* free coherent mem from cache... */
+		dma_pool_free(pq->header_cache,
+			      pkt->addr[i].kvaddr, pkt->addr[i].addr);
+}
+
+/* return number of pages pinned... */
+static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
+				     struct ipath_user_sdma_pkt *pkt,
+				     unsigned long addr, int tlen, int npages)
+{
+	struct page *pages[2];
+	int j;
+	int ret;
+
+	ret = get_user_pages_fast(addr, npages, 0, pages);
+	if (ret != npages) {
+		int i;
+
+		for (i = 0; i < ret; i++)
+			put_page(pages[i]);
+
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	for (j = 0; j < npages; j++) {
+		/* map the pages... */
+		const int flen =
+			ipath_user_sdma_page_length(addr, tlen);
+		dma_addr_t dma_addr =
+			dma_map_page(&dd->pcidev->dev,
+				     pages[j], 0, flen, DMA_TO_DEVICE);
+		unsigned long fofs = addr & ~PAGE_MASK;
+
+		if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+			ret = -ENOMEM;
+			goto done;
+		}
+
+		ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
+					  pages[j], kmap(pages[j]),
+					  dma_addr);
+
+		pkt->naddr++;
+		addr += flen;
+		tlen -= flen;
+	}
+
+done:
+	return ret;
+}
+
+static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd,
+				   struct ipath_user_sdma_queue *pq,
+				   struct ipath_user_sdma_pkt *pkt,
+				   const struct iovec *iov,
+				   unsigned long niov)
+{
+	int ret = 0;
+	unsigned long idx;
+
+	for (idx = 0; idx < niov; idx++) {
+		const int npages = ipath_user_sdma_num_pages(iov + idx);
+		const unsigned long addr = (unsigned long) iov[idx].iov_base;
+
+		ret = ipath_user_sdma_pin_pages(dd, pkt,
+						addr, iov[idx].iov_len,
+						npages);
+		if (ret < 0)
+			goto free_pkt;
+	}
+
+	goto done;
+
+free_pkt:
+	for (idx = 0; idx < pkt->naddr; idx++)
+		ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
+
+done:
+	return ret;
+}
+
+static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd,
+					struct ipath_user_sdma_queue *pq,
+					struct ipath_user_sdma_pkt *pkt,
+					const struct iovec *iov,
+					unsigned long niov, int npages)
+{
+	int ret = 0;
+
+	if (npages >= ARRAY_SIZE(pkt->addr))
+		ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov);
+	else
+		ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
+
+	return ret;
+}
+
+/* free a packet list -- return counter value of last packet */
+static void ipath_user_sdma_free_pkt_list(struct device *dev,
+					  struct ipath_user_sdma_queue *pq,
+					  struct list_head *list)
+{
+	struct ipath_user_sdma_pkt *pkt, *pkt_next;
+
+	list_for_each_entry_safe(pkt, pkt_next, list, list) {
+		int i;
+
+		for (i = 0; i < pkt->naddr; i++)
+			ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i);
+
+		kmem_cache_free(pq->pkt_slab, pkt);
+	}
+}
+
+/*
+ * copy headers, coalesce etc -- pq->lock must be held
+ *
+ * we queue all the packets to list, returning the
+ * number of bytes total.  list must be empty initially,
+ * as, if there is an error we clean it...
+ */
+static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
+				      struct ipath_user_sdma_queue *pq,
+				      struct list_head *list,
+				      const struct iovec *iov,
+				      unsigned long niov,
+				      int maxpkts)
+{
+	unsigned long idx = 0;
+	int ret = 0;
+	int npkts = 0;
+	struct page *page = NULL;
+	__le32 *pbc;
+	dma_addr_t dma_addr;
+	struct ipath_user_sdma_pkt *pkt = NULL;
+	size_t len;
+	size_t nw;
+	u32 counter = pq->counter;
+	int dma_mapped = 0;
+
+	while (idx < niov && npkts < maxpkts) {
+		const unsigned long addr = (unsigned long) iov[idx].iov_base;
+		const unsigned long idx_save = idx;
+		unsigned pktnw;
+		unsigned pktnwc;
+		int nfrags = 0;
+		int npages = 0;
+		int cfur;
+
+		dma_mapped = 0;
+		len = iov[idx].iov_len;
+		nw = len >> 2;
+		page = NULL;
+
+		pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
+		if (!pkt) {
+			ret = -ENOMEM;
+			goto free_list;
+		}
+
+		if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH ||
+		    len > PAGE_SIZE || len & 3 || addr & 3) {
+			ret = -EINVAL;
+			goto free_pkt;
+		}
+
+		if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH)
+			pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
+					     &dma_addr);
+		else
+			pbc = NULL;
+
+		if (!pbc) {
+			page = alloc_page(GFP_KERNEL);
+			if (!page) {
+				ret = -ENOMEM;
+				goto free_pkt;
+			}
+			pbc = kmap(page);
+		}
+
+		cfur = copy_from_user(pbc, iov[idx].iov_base, len);
+		if (cfur) {
+			ret = -EFAULT;
+			goto free_pbc;
+		}
+
+		/*
+		 * this assignment is a bit strange.  it's because the
+		 * the pbc counts the number of 32 bit words in the full
+		 * packet _except_ the first word of the pbc itself...
+		 */
+		pktnwc = nw - 1;
+
+		/*
+		 * pktnw computation yields the number of 32 bit words
+		 * that the caller has indicated in the PBC.  note that
+		 * this is one less than the total number of words that
+		 * goes to the send DMA engine as the first 32 bit word
+		 * of the PBC itself is not counted.  Armed with this count,
+		 * we can verify that the packet is consistent with the
+		 * iovec lengths.
+		 */
+		pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK;
+		if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
+			ret = -EINVAL;
+			goto free_pbc;
+		}
+
+
+		idx++;
+		while (pktnwc < pktnw && idx < niov) {
+			const size_t slen = iov[idx].iov_len;
+			const unsigned long faddr =
+				(unsigned long) iov[idx].iov_base;
+
+			if (slen & 3 || faddr & 3 || !slen ||
+			    slen > PAGE_SIZE) {
+				ret = -EINVAL;
+				goto free_pbc;
+			}
+
+			npages++;
+			if ((faddr & PAGE_MASK) !=
+			    ((faddr + slen - 1) & PAGE_MASK))
+				npages++;
+
+			pktnwc += slen >> 2;
+			idx++;
+			nfrags++;
+		}
+
+		if (pktnwc != pktnw) {
+			ret = -EINVAL;
+			goto free_pbc;
+		}
+
+		if (page) {
+			dma_addr = dma_map_page(&dd->pcidev->dev,
+						page, 0, len, DMA_TO_DEVICE);
+			if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+				ret = -ENOMEM;
+				goto free_pbc;
+			}
+
+			dma_mapped = 1;
+		}
+
+		ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
+					    page, pbc, dma_addr);
+
+		if (nfrags) {
+			ret = ipath_user_sdma_init_payload(dd, pq, pkt,
+							   iov + idx_save + 1,
+							   nfrags, npages);
+			if (ret < 0)
+				goto free_pbc_dma;
+		}
+
+		counter++;
+		npkts++;
+
+		list_add_tail(&pkt->list, list);
+	}
+
+	ret = idx;
+	goto done;
+
+free_pbc_dma:
+	if (dma_mapped)
+		dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
+free_pbc:
+	if (page) {
+		kunmap(page);
+		__free_page(page);
+	} else
+		dma_pool_free(pq->header_cache, pbc, dma_addr);
+free_pkt:
+	kmem_cache_free(pq->pkt_slab, pkt);
+free_list:
+	ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
+done:
+	return ret;
+}
+
+static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
+						 u32 c)
+{
+	pq->sent_counter = c;
+}
+
+/* try to clean out queue -- needs pq->lock */
+static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
+				       struct ipath_user_sdma_queue *pq)
+{
+	struct list_head free_list;
+	struct ipath_user_sdma_pkt *pkt;
+	struct ipath_user_sdma_pkt *pkt_prev;
+	int ret = 0;
+
+	INIT_LIST_HEAD(&free_list);
+
+	list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
+		s64 descd = dd->ipath_sdma_descq_removed - pkt->added;
+
+		if (descd < 0)
+			break;
+
+		list_move_tail(&pkt->list, &free_list);
+
+		/* one more packet cleaned */
+		ret++;
+	}
+
+	if (!list_empty(&free_list)) {
+		u32 counter;
+
+		pkt = list_entry(free_list.prev,
+				 struct ipath_user_sdma_pkt, list);
+		counter = pkt->counter;
+
+		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+		ipath_user_sdma_set_complete_counter(pq, counter);
+	}
+
+	return ret;
+}
+
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq)
+{
+	if (!pq)
+		return;
+
+	kmem_cache_destroy(pq->pkt_slab);
+	dma_pool_destroy(pq->header_cache);
+	kfree(pq);
+}
+
+/* clean descriptor queue, returns > 0 if some elements cleaned */
+static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd)
+{
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	ret = ipath_sdma_make_progress(dd);
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	return ret;
+}
+
+/* we're in close, drain packets so that we can cleanup successfully... */
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+				 struct ipath_user_sdma_queue *pq)
+{
+	int i;
+
+	if (!pq)
+		return;
+
+	for (i = 0; i < 100; i++) {
+		mutex_lock(&pq->lock);
+		if (list_empty(&pq->sent)) {
+			mutex_unlock(&pq->lock);
+			break;
+		}
+		ipath_user_sdma_hwqueue_clean(dd);
+		ipath_user_sdma_queue_clean(dd, pq);
+		mutex_unlock(&pq->lock);
+		msleep(10);
+	}
+
+	if (!list_empty(&pq->sent)) {
+		struct list_head free_list;
+
+		printk(KERN_INFO "drain: lists not empty: forcing!\n");
+		INIT_LIST_HEAD(&free_list);
+		mutex_lock(&pq->lock);
+		list_splice_init(&pq->sent, &free_list);
+		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+		mutex_unlock(&pq->lock);
+	}
+}
+
+static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
+					   u64 addr, u64 dwlen, u64 dwoffset)
+{
+	return cpu_to_le64(/* SDmaPhyAddr[31:0] */
+			   ((addr & 0xfffffffcULL) << 32) |
+			   /* SDmaGeneration[1:0] */
+			   ((dd->ipath_sdma_generation & 3ULL) << 30) |
+			   /* SDmaDwordCount[10:0] */
+			   ((dwlen & 0x7ffULL) << 16) |
+			   /* SDmaBufOffset[12:2] */
+			   (dwoffset & 0x7ffULL));
+}
+
+static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
+{
+	return descq | cpu_to_le64(1ULL << 12);
+}
+
+static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
+{
+					      /* last */  /* dma head */
+	return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
+}
+
+static inline __le64 ipath_sdma_make_desc1(u64 addr)
+{
+	/* SDmaPhyAddr[47:32] */
+	return cpu_to_le64(addr >> 32);
+}
+
+static void ipath_user_sdma_send_frag(struct ipath_devdata *dd,
+				      struct ipath_user_sdma_pkt *pkt, int idx,
+				      unsigned ofs, u16 tail)
+{
+	const u64 addr = (u64) pkt->addr[idx].addr +
+		(u64) pkt->addr[idx].offset;
+	const u64 dwlen = (u64) pkt->addr[idx].length / 4;
+	__le64 *descqp;
+	__le64 descq0;
+
+	descqp = &dd->ipath_sdma_descq[tail].qw[0];
+
+	descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs);
+	if (idx == 0)
+		descq0 = ipath_sdma_make_first_desc0(descq0);
+	if (idx == pkt->naddr - 1)
+		descq0 = ipath_sdma_make_last_desc0(descq0);
+
+	descqp[0] = descq0;
+	descqp[1] = ipath_sdma_make_desc1(addr);
+}
+
+/* pq->lock must be held, get packets on the wire... */
+static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
+				     struct ipath_user_sdma_queue *pq,
+				     struct list_head *pktlist)
+{
+	int ret = 0;
+	unsigned long flags;
+	u16 tail;
+
+	if (list_empty(pktlist))
+		return 0;
+
+	if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE)))
+		return -ECOMM;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
+		ret = -ECOMM;
+		goto unlock;
+	}
+
+	tail = dd->ipath_sdma_descq_tail;
+	while (!list_empty(pktlist)) {
+		struct ipath_user_sdma_pkt *pkt =
+			list_entry(pktlist->next, struct ipath_user_sdma_pkt,
+				   list);
+		int i;
+		unsigned ofs = 0;
+		u16 dtail = tail;
+
+		if (pkt->naddr > ipath_sdma_descq_freecnt(dd))
+			goto unlock_check_tail;
+
+		for (i = 0; i < pkt->naddr; i++) {
+			ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail);
+			ofs += pkt->addr[i].length >> 2;
+
+			if (++tail == dd->ipath_sdma_descq_cnt) {
+				tail = 0;
+				++dd->ipath_sdma_generation;
+			}
+		}
+
+		if ((ofs<<2) > dd->ipath_ibmaxlen) {
+			ipath_dbg("packet size %X > ibmax %X, fail\n",
+				ofs<<2, dd->ipath_ibmaxlen);
+			ret = -EMSGSIZE;
+			goto unlock;
+		}
+
+		/*
+		 * if the packet is >= 2KB mtu equivalent, we have to use
+		 * the large buffers, and have to mark each descriptor as
+		 * part of a large buffer packet.
+		 */
+		if (ofs >= IPATH_SMALLBUF_DWORDS) {
+			for (i = 0; i < pkt->naddr; i++) {
+				dd->ipath_sdma_descq[dtail].qw[0] |=
+					cpu_to_le64(1ULL << 14);
+				if (++dtail == dd->ipath_sdma_descq_cnt)
+					dtail = 0;
+			}
+		}
+
+		dd->ipath_sdma_descq_added += pkt->naddr;
+		pkt->added = dd->ipath_sdma_descq_added;
+		list_move_tail(&pkt->list, &pq->sent);
+		ret++;
+	}
+
+unlock_check_tail:
+	/* advance the tail on the chip if necessary */
+	if (dd->ipath_sdma_descq_tail != tail) {
+		wmb();
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+		dd->ipath_sdma_descq_tail = tail;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	return ret;
+}
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+			   struct ipath_user_sdma_queue *pq,
+			   const struct iovec *iov,
+			   unsigned long dim)
+{
+	int ret = 0;
+	struct list_head list;
+	int npkts = 0;
+
+	INIT_LIST_HEAD(&list);
+
+	mutex_lock(&pq->lock);
+
+	if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) {
+		ipath_user_sdma_hwqueue_clean(dd);
+		ipath_user_sdma_queue_clean(dd, pq);
+	}
+
+	while (dim) {
+		const int mxp = 8;
+
+		ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
+		if (ret <= 0)
+			goto done_unlock;
+		else {
+			dim -= ret;
+			iov += ret;
+		}
+
+		/* force packets onto the sdma hw queue... */
+		if (!list_empty(&list)) {
+			/*
+			 * lazily clean hw queue.  the 4 is a guess of about
+			 * how many sdma descriptors a packet will take (it
+			 * doesn't have to be perfect).
+			 */
+			if (ipath_sdma_descq_freecnt(dd) < ret * 4) {
+				ipath_user_sdma_hwqueue_clean(dd);
+				ipath_user_sdma_queue_clean(dd, pq);
+			}
+
+			ret = ipath_user_sdma_push_pkts(dd, pq, &list);
+			if (ret < 0)
+				goto done_unlock;
+			else {
+				npkts += ret;
+				pq->counter += ret;
+
+				if (!list_empty(&list))
+					goto done_unlock;
+			}
+		}
+	}
+
+done_unlock:
+	if (!list_empty(&list))
+		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
+	mutex_unlock(&pq->lock);
+
+	return (ret < 0) ? ret : npkts;
+}
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+				  struct ipath_user_sdma_queue *pq)
+{
+	int ret = 0;
+
+	mutex_lock(&pq->lock);
+	ipath_user_sdma_hwqueue_clean(dd);
+	ret = ipath_user_sdma_queue_clean(dd, pq);
+	mutex_unlock(&pq->lock);
+
+	return ret;
+}
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
+{
+	return pq->sent_counter;
+}
+
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
+{
+	return pq->counter;
+}
+
diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.h b/drivers/staging/rdma/ipath/ipath_user_sdma.h
new file mode 100644
index 0000000..fc76316
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_user_sdma.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/device.h>
+
+struct ipath_user_sdma_queue;
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq);
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+			   struct ipath_user_sdma_queue *pq,
+			   const struct iovec *iov,
+			   unsigned long dim);
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+				  struct ipath_user_sdma_queue *pq);
+
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+				 struct ipath_user_sdma_queue *pq);
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c
new file mode 100644
index 0000000..30ba49c
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_verbs.c
@@ -0,0 +1,2364 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+static unsigned int ib_ipath_qp_table_size = 251;
+module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+unsigned int ib_ipath_lkey_table_size = 12;
+module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
+		   S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+		 "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int ib_ipath_max_pds = 0xFFFF;
+module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+		 "Maximum number of protection domains to support");
+
+static unsigned int ib_ipath_max_ahs = 0xFFFF;
+module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int ib_ipath_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+		 "Maximum number of completion queue entries to support");
+
+unsigned int ib_ipath_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int ib_ipath_max_qps = 16384;
+module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int ib_ipath_max_sges = 0x60;
+module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int ib_ipath_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+		 "Maximum number of multicast groups to support");
+
+unsigned int ib_ipath_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
+		   uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+		 "Maximum number of attached QPs to support");
+
+unsigned int ib_ipath_max_srqs = 1024;
+module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int ib_ipath_max_srq_sges = 128;
+module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
+		   uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
+		   uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+static unsigned int ib_ipath_disable_sma;
+module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; ipath_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = 0,
+	[IB_QPS_INIT] = IPATH_POST_RECV_OK,
+	[IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+	[IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
+	    IPATH_PROCESS_NEXT_SEND_OK,
+	[IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+	[IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+	    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+	[IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
+	    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+};
+
+struct ipath_ucontext {
+	struct ib_ucontext ibucontext;
+};
+
+static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
+						  *ibucontext)
+{
+	return container_of(ibucontext, struct ipath_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
+	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+	[IB_WR_SEND] = IB_WC_SEND,
+	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * System image GUID.
+ */
+static __be64 sys_image_guid;
+
+/**
+ * ipath_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
+{
+	struct ipath_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		memcpy(sge->vaddr, data, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		data += len;
+		length -= len;
+	}
+}
+
+/**
+ * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
+{
+	struct ipath_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+}
+
+/*
+ * Count the number of DMA descriptors needed to send length bytes of data.
+ * Don't modify the ipath_sge_state to get the count.
+ * Return zero if any of the segments is not aligned.
+ */
+static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
+{
+	struct ipath_sge *sg_list = ss->sg_list;
+	struct ipath_sge sge = ss->sge;
+	u8 num_sge = ss->num_sge;
+	u32 ndesc = 1;	/* count the header */
+
+	while (length) {
+		u32 len = sge.length;
+
+		if (len > length)
+			len = length;
+		if (len > sge.sge_length)
+			len = sge.sge_length;
+		BUG_ON(len == 0);
+		if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
+		    (len != length && (len & (sizeof(u32) - 1)))) {
+			ndesc = 0;
+			break;
+		}
+		ndesc++;
+		sge.vaddr += len;
+		sge.length -= len;
+		sge.sge_length -= len;
+		if (sge.sge_length == 0) {
+			if (--num_sge)
+				sge = *sg_list++;
+		} else if (sge.length == 0 && sge.mr != NULL) {
+			if (++sge.n >= IPATH_SEGSZ) {
+				if (++sge.m >= sge.mr->mapsz)
+					break;
+				sge.n = 0;
+			}
+			sge.vaddr =
+				sge.mr->map[sge.m]->segs[sge.n].vaddr;
+			sge.length =
+				sge.mr->map[sge.m]->segs[sge.n].length;
+		}
+		length -= len;
+	}
+	return ndesc;
+}
+
+/*
+ * Copy from the SGEs to the data buffer.
+ */
+static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
+				u32 length)
+{
+	struct ipath_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		memcpy(data, sge->vaddr, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		data += len;
+		length -= len;
+	}
+}
+
+/**
+ * ipath_post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+	struct ipath_swqe *wqe;
+	u32 next;
+	int i;
+	int j;
+	int acc;
+	int ret;
+	unsigned long flags;
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (qp->ibqp.qp_type != IB_QPT_SMI &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		ret = -ENETDOWN;
+		goto bail;
+	}
+
+	/* Check that state is OK to post send. */
+	if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
+		goto bail_inval;
+
+	/* IB spec says that num_sge == 0 is OK. */
+	if (wr->num_sge > qp->s_max_sge)
+		goto bail_inval;
+
+	/*
+	 * Don't allow RDMA reads or atomic operations on UC or
+	 * undefined operations.
+	 * Make sure buffer is large enough to hold the result for atomics.
+	 */
+	if (qp->ibqp.qp_type == IB_QPT_UC) {
+		if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+			goto bail_inval;
+	} else if (qp->ibqp.qp_type == IB_QPT_UD) {
+		/* Check UD opcode */
+		if (wr->opcode != IB_WR_SEND &&
+		    wr->opcode != IB_WR_SEND_WITH_IMM)
+			goto bail_inval;
+		/* Check UD destination address PD */
+		if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+			goto bail_inval;
+	} else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+		goto bail_inval;
+	else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+		   (wr->num_sge == 0 ||
+		    wr->sg_list[0].length < sizeof(u64) ||
+		    wr->sg_list[0].addr & (sizeof(u64) - 1)))
+		goto bail_inval;
+	else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+		goto bail_inval;
+
+	next = qp->s_head + 1;
+	if (next >= qp->s_size)
+		next = 0;
+	if (next == qp->s_last) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	wqe = get_swqe_ptr(qp, qp->s_head);
+	wqe->wr = *wr;
+	wqe->length = 0;
+	if (wr->num_sge) {
+		acc = wr->opcode >= IB_WR_RDMA_READ ?
+			IB_ACCESS_LOCAL_WRITE : 0;
+		for (i = 0, j = 0; i < wr->num_sge; i++) {
+			u32 length = wr->sg_list[i].length;
+			int ok;
+
+			if (length == 0)
+				continue;
+			ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
+					   &wr->sg_list[i], acc);
+			if (!ok)
+				goto bail_inval;
+			wqe->length += length;
+			j++;
+		}
+		wqe->wr.num_sge = j;
+	}
+	if (qp->ibqp.qp_type == IB_QPT_UC ||
+	    qp->ibqp.qp_type == IB_QPT_RC) {
+		if (wqe->length > 0x80000000U)
+			goto bail_inval;
+	} else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
+		goto bail_inval;
+	wqe->ssn = qp->s_ssn++;
+	qp->s_head = next;
+
+	ret = 0;
+	goto bail;
+
+bail_inval:
+	ret = -EINVAL;
+bail:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * ipath_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			   struct ib_send_wr **bad_wr)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	int err = 0;
+
+	for (; wr; wr = wr->next) {
+		err = ipath_post_one_send(qp, wr);
+		if (err) {
+			*bad_wr = wr;
+			goto bail;
+		}
+	}
+
+	/* Try to do the send work in the caller's context. */
+	ipath_do_send((unsigned long) qp);
+
+bail:
+	return err;
+}
+
+/**
+ * ipath_post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	struct ipath_rwq *wq = qp->r_rq.wq;
+	unsigned long flags;
+	int ret;
+
+	/* Check that state is OK to post receive. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
+		*bad_wr = wr;
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	for (; wr; wr = wr->next) {
+		struct ipath_rwqe *wqe;
+		u32 next;
+		int i;
+
+		if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		spin_lock_irqsave(&qp->r_rq.lock, flags);
+		next = wq->head + 1;
+		if (next >= qp->r_rq.size)
+			next = 0;
+		if (next == wq->tail) {
+			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+			*bad_wr = wr;
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+		wqe->wr_id = wr->wr_id;
+		wqe->num_sge = wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++)
+			wqe->sg_list[i] = wr->sg_list[i];
+		/* Make sure queue entry is written before the head index. */
+		smp_wmb();
+		wq->head = next;
+		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_qp_rcv - processing an incoming packet on a QP
+ * @dev: the device the packet came on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_ib_rcv() to process an incoming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void ipath_qp_rcv(struct ipath_ibdev *dev,
+			 struct ipath_ib_header *hdr, int has_grh,
+			 void *data, u32 tlen, struct ipath_qp *qp)
+{
+	/* Check for valid receive state. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		dev->n_pkt_drops++;
+		return;
+	}
+
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		if (ib_ipath_disable_sma)
+			break;
+		/* FALLTHROUGH */
+	case IB_QPT_UD:
+		ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
+		break;
+
+	case IB_QPT_RC:
+		ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
+		break;
+
+	case IB_QPT_UC:
+		ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/**
+ * ipath_ib_rcv - process an incoming packet
+ * @arg: the device pointer
+ * @rhdr: the header of the packet
+ * @data: the packet data
+ * @tlen: the packet length
+ *
+ * This is called from ipath_kreceive() to process an incoming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
+		  u32 tlen)
+{
+	struct ipath_ib_header *hdr = rhdr;
+	struct ipath_other_headers *ohdr;
+	struct ipath_qp *qp;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+	u16 lid;
+
+	if (unlikely(dev == NULL))
+		goto bail;
+
+	if (unlikely(tlen < 24)) {	/* LRH+BTH+CRC */
+		dev->rcv_errors++;
+		goto bail;
+	}
+
+	/* Check for a valid destination LID (see ch. 7.11.1). */
+	lid = be16_to_cpu(hdr->lrh[1]);
+	if (lid < IPATH_MULTICAST_LID_BASE) {
+		lid &= ~((1 << dev->dd->ipath_lmc) - 1);
+		if (unlikely(lid != dev->dd->ipath_lid)) {
+			dev->rcv_errors++;
+			goto bail;
+		}
+	}
+
+	/* Check for GRH */
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh == IPATH_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else if (lnh == IPATH_LRH_GRH)
+		ohdr = &hdr->u.l.oth;
+	else {
+		dev->rcv_errors++;
+		goto bail;
+	}
+
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
+	dev->opstats[opcode].n_bytes += tlen;
+	dev->opstats[opcode].n_packets++;
+
+	/* Get the destination QP number. */
+	qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
+	if (qp_num == IPATH_MULTICAST_QPN) {
+		struct ipath_mcast *mcast;
+		struct ipath_mcast_qp *p;
+
+		if (lnh != IPATH_LRH_GRH) {
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+		mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
+		if (mcast == NULL) {
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+		dev->n_multicast_rcv++;
+		list_for_each_entry_rcu(p, &mcast->qp_list, list)
+			ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
+		/*
+		 * Notify ipath_multicast_detach() if it is waiting for us
+		 * to finish.
+		 */
+		if (atomic_dec_return(&mcast->refcount) <= 1)
+			wake_up(&mcast->wait);
+	} else {
+		qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
+		if (qp) {
+			dev->n_unicast_rcv++;
+			ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
+				     tlen, qp);
+			/*
+			 * Notify ipath_destroy_qp() if it is waiting
+			 * for us to finish.
+			 */
+			if (atomic_dec_and_test(&qp->refcount))
+				wake_up(&qp->wait);
+		} else
+			dev->n_pkt_drops++;
+	}
+
+bail:;
+}
+
+/**
+ * ipath_ib_timer - verbs timer
+ * @arg: the device pointer
+ *
+ * This is called from ipath_do_rcv_timer() at interrupt level to check for
+ * QPs which need retransmits and to collect performance numbers.
+ */
+static void ipath_ib_timer(struct ipath_ibdev *dev)
+{
+	struct ipath_qp *resend = NULL;
+	struct ipath_qp *rnr = NULL;
+	struct list_head *last;
+	struct ipath_qp *qp;
+	unsigned long flags;
+
+	if (dev == NULL)
+		return;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	/* Start filling the next pending queue. */
+	if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
+		dev->pending_index = 0;
+	/* Save any requests still in the new queue, they have timed out. */
+	last = &dev->pending[dev->pending_index];
+	while (!list_empty(last)) {
+		qp = list_entry(last->next, struct ipath_qp, timerwait);
+		list_del_init(&qp->timerwait);
+		qp->timer_next = resend;
+		resend = qp;
+		atomic_inc(&qp->refcount);
+	}
+	last = &dev->rnrwait;
+	if (!list_empty(last)) {
+		qp = list_entry(last->next, struct ipath_qp, timerwait);
+		if (--qp->s_rnr_timeout == 0) {
+			do {
+				list_del_init(&qp->timerwait);
+				qp->timer_next = rnr;
+				rnr = qp;
+				atomic_inc(&qp->refcount);
+				if (list_empty(last))
+					break;
+				qp = list_entry(last->next, struct ipath_qp,
+						timerwait);
+			} while (qp->s_rnr_timeout == 0);
+		}
+	}
+	/*
+	 * We should only be in the started state if pma_sample_start != 0
+	 */
+	if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
+	    --dev->pma_sample_start == 0) {
+		dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+		ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
+					&dev->ipath_rword,
+					&dev->ipath_spkts,
+					&dev->ipath_rpkts,
+					&dev->ipath_xmit_wait);
+	}
+	if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+		if (dev->pma_sample_interval == 0) {
+			u64 ta, tb, tc, td, te;
+
+			dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+			ipath_snapshot_counters(dev->dd, &ta, &tb,
+						&tc, &td, &te);
+
+			dev->ipath_sword = ta - dev->ipath_sword;
+			dev->ipath_rword = tb - dev->ipath_rword;
+			dev->ipath_spkts = tc - dev->ipath_spkts;
+			dev->ipath_rpkts = td - dev->ipath_rpkts;
+			dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
+		}
+		else
+			dev->pma_sample_interval--;
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	/* XXX What if timer fires again while this is running? */
+	while (resend != NULL) {
+		qp = resend;
+		resend = qp->timer_next;
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (qp->s_last != qp->s_tail &&
+		    ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+			dev->n_timeouts++;
+			ipath_restart_rc(qp, qp->s_last_psn + 1);
+		}
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify ipath_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+	while (rnr != NULL) {
+		qp = rnr;
+		rnr = qp->timer_next;
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify ipath_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+}
+
+static void update_sge(struct ipath_sge_state *ss, u32 length)
+{
+	struct ipath_sge *sge = &ss->sge;
+
+	sge->vaddr += length;
+	sge->length -= length;
+	sge->sge_length -= length;
+	if (sge->sge_length == 0) {
+		if (--ss->num_sge)
+			*sge = *ss->sg_list++;
+	} else if (sge->length == 0 && sge->mr != NULL) {
+		if (++sge->n >= IPATH_SEGSZ) {
+			if (++sge->m >= sge->mr->mapsz)
+				return;
+			sge->n = 0;
+		}
+		sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+		sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+	}
+}
+
+#ifdef __LITTLE_ENDIAN
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+	return data >> shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+	return data << shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+	data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
+	data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+	return data;
+}
+#else
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+	return data << shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+	return data >> shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+	data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
+	data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+	return data;
+}
+#endif
+
+static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
+		    u32 length, unsigned flush_wc)
+{
+	u32 extra = 0;
+	u32 data = 0;
+	u32 last;
+
+	while (1) {
+		u32 len = ss->sge.length;
+		u32 off;
+
+		if (len > length)
+			len = length;
+		if (len > ss->sge.sge_length)
+			len = ss->sge.sge_length;
+		BUG_ON(len == 0);
+		/* If the source address is not aligned, try to align it. */
+		off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
+		if (off) {
+			u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
+					    ~(sizeof(u32) - 1));
+			u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
+			u32 y;
+
+			y = sizeof(u32) - off;
+			if (len > y)
+				len = y;
+			if (len + extra >= sizeof(u32)) {
+				data |= set_upper_bits(v, extra *
+						       BITS_PER_BYTE);
+				len = sizeof(u32) - extra;
+				if (len == length) {
+					last = data;
+					break;
+				}
+				__raw_writel(data, piobuf);
+				piobuf++;
+				extra = 0;
+				data = 0;
+			} else {
+				/* Clear unused upper bytes */
+				data |= clear_upper_bytes(v, len, extra);
+				if (len == length) {
+					last = data;
+					break;
+				}
+				extra += len;
+			}
+		} else if (extra) {
+			/* Source address is aligned. */
+			u32 *addr = (u32 *) ss->sge.vaddr;
+			int shift = extra * BITS_PER_BYTE;
+			int ushift = 32 - shift;
+			u32 l = len;
+
+			while (l >= sizeof(u32)) {
+				u32 v = *addr;
+
+				data |= set_upper_bits(v, shift);
+				__raw_writel(data, piobuf);
+				data = get_upper_bits(v, ushift);
+				piobuf++;
+				addr++;
+				l -= sizeof(u32);
+			}
+			/*
+			 * We still have 'extra' number of bytes leftover.
+			 */
+			if (l) {
+				u32 v = *addr;
+
+				if (l + extra >= sizeof(u32)) {
+					data |= set_upper_bits(v, shift);
+					len -= l + extra - sizeof(u32);
+					if (len == length) {
+						last = data;
+						break;
+					}
+					__raw_writel(data, piobuf);
+					piobuf++;
+					extra = 0;
+					data = 0;
+				} else {
+					/* Clear unused upper bytes */
+					data |= clear_upper_bytes(v, l,
+								  extra);
+					if (len == length) {
+						last = data;
+						break;
+					}
+					extra += l;
+				}
+			} else if (len == length) {
+				last = data;
+				break;
+			}
+		} else if (len == length) {
+			u32 w;
+
+			/*
+			 * Need to round up for the last dword in the
+			 * packet.
+			 */
+			w = (len + 3) >> 2;
+			__iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
+			piobuf += w - 1;
+			last = ((u32 *) ss->sge.vaddr)[w - 1];
+			break;
+		} else {
+			u32 w = len >> 2;
+
+			__iowrite32_copy(piobuf, ss->sge.vaddr, w);
+			piobuf += w;
+
+			extra = len & (sizeof(u32) - 1);
+			if (extra) {
+				u32 v = ((u32 *) ss->sge.vaddr)[w];
+
+				/* Clear unused upper bytes */
+				data = clear_upper_bytes(v, extra, 0);
+			}
+		}
+		update_sge(ss, len);
+		length -= len;
+	}
+	/* Update address before sending packet. */
+	update_sge(ss, length);
+	if (flush_wc) {
+		/* must flush early everything before trigger word */
+		ipath_flush_wc();
+		__raw_writel(last, piobuf);
+		/* be sure trigger word is written */
+		ipath_flush_wc();
+	} else
+		__raw_writel(last, piobuf);
+}
+
+/*
+ * Convert IB rate to delay multiplier.
+ */
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
+{
+	switch (rate) {
+	case IB_RATE_2_5_GBPS: return 8;
+	case IB_RATE_5_GBPS:   return 4;
+	case IB_RATE_10_GBPS:  return 2;
+	case IB_RATE_20_GBPS:  return 1;
+	default:	       return 0;
+	}
+}
+
+/*
+ * Convert delay multiplier to IB rate
+ */
+static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
+{
+	switch (mult) {
+	case 8:  return IB_RATE_2_5_GBPS;
+	case 4:  return IB_RATE_5_GBPS;
+	case 2:  return IB_RATE_10_GBPS;
+	case 1:  return IB_RATE_20_GBPS;
+	default: return IB_RATE_PORT_CURRENT;
+	}
+}
+
+static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
+{
+	struct ipath_verbs_txreq *tx = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	if (!list_empty(&dev->txreq_free)) {
+		struct list_head *l = dev->txreq_free.next;
+
+		list_del(l);
+		tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+	return tx;
+}
+
+static inline void put_txreq(struct ipath_ibdev *dev,
+			     struct ipath_verbs_txreq *tx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	list_add(&tx->txreq.list, &dev->txreq_free);
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
+static void sdma_complete(void *cookie, int status)
+{
+	struct ipath_verbs_txreq *tx = cookie;
+	struct ipath_qp *qp = tx->qp;
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	unsigned long flags;
+	enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
+		IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
+
+	if (atomic_dec_and_test(&qp->s_dma_busy)) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (tx->wqe)
+			ipath_send_complete(qp, tx->wqe, ibs);
+		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+		     qp->s_last != qp->s_head) ||
+		    (qp->s_flags & IPATH_S_WAIT_DMA))
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		wake_up(&qp->wait_dma);
+	} else if (tx->wqe) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		ipath_send_complete(qp, tx->wqe, ibs);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+		kfree(tx->txreq.map_addr);
+	put_txreq(dev, tx);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+static void decrement_dma_busy(struct ipath_qp *qp)
+{
+	unsigned long flags;
+
+	if (atomic_dec_and_test(&qp->s_dma_busy)) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+		     qp->s_last != qp->s_head) ||
+		    (qp->s_flags & IPATH_S_WAIT_DMA))
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		wake_up(&qp->wait_dma);
+	}
+}
+
+/*
+ * Compute the number of clock cycles of delay before sending the next packet.
+ * The multipliers reflect the number of clocks for the fastest rate so
+ * one tick at 4xDDR is 8 ticks at 1xSDR.
+ * If the destination port will take longer to receive a packet than
+ * the outgoing link can send it, we need to delay sending the next packet
+ * by the difference in time it takes the receiver to receive and the sender
+ * to send this packet.
+ * Note that this delay is always correct for UC and RC but not always
+ * optimal for UD. For UD, the destination HCA can be different for each
+ * packet, in which case, we could send packets to a different destination
+ * while "waiting" for the delay. The overhead for doing this without
+ * HW support is more than just paying the cost of delaying some packets
+ * unnecessarily.
+ */
+static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
+{
+	return (rcv_mult > snd_mult) ?
+		(plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
+}
+
+static int ipath_verbs_send_dma(struct ipath_qp *qp,
+				struct ipath_ib_header *hdr, u32 hdrwords,
+				struct ipath_sge_state *ss, u32 len,
+				u32 plen, u32 dwords)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_devdata *dd = dev->dd;
+	struct ipath_verbs_txreq *tx;
+	u32 *piobuf;
+	u32 control;
+	u32 ndesc;
+	int ret;
+
+	tx = qp->s_tx;
+	if (tx) {
+		qp->s_tx = NULL;
+		/* resend previously constructed packet */
+		atomic_inc(&qp->s_dma_busy);
+		ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
+		if (ret) {
+			qp->s_tx = tx;
+			decrement_dma_busy(qp);
+		}
+		goto bail;
+	}
+
+	tx = get_txreq(dev);
+	if (!tx) {
+		ret = -EBUSY;
+		goto bail;
+	}
+
+	/*
+	 * Get the saved delay count we computed for the previous packet
+	 * and save the delay count for this packet to be used next time
+	 * we get here.
+	 */
+	control = qp->s_pkt_delay;
+	qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+	tx->qp = qp;
+	atomic_inc(&qp->refcount);
+	tx->wqe = qp->s_wqe;
+	tx->txreq.callback = sdma_complete;
+	tx->txreq.callback_cookie = tx;
+	tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
+		IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
+	if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+		tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
+
+	/* VL15 packets bypass credit check */
+	if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
+		control |= 1ULL << 31;
+		tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
+	}
+
+	if (len) {
+		/*
+		 * Don't try to DMA if it takes more descriptors than
+		 * the queue holds.
+		 */
+		ndesc = ipath_count_sge(ss, len);
+		if (ndesc >= dd->ipath_sdma_descq_cnt)
+			ndesc = 0;
+	} else
+		ndesc = 1;
+	if (ndesc) {
+		tx->hdr.pbc[0] = cpu_to_le32(plen);
+		tx->hdr.pbc[1] = cpu_to_le32(control);
+		memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
+		tx->txreq.sg_count = ndesc;
+		tx->map_len = (hdrwords + 2) << 2;
+		tx->txreq.map_addr = &tx->hdr;
+		atomic_inc(&qp->s_dma_busy);
+		ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
+		if (ret) {
+			/* save ss and length in dwords */
+			tx->ss = ss;
+			tx->len = dwords;
+			qp->s_tx = tx;
+			decrement_dma_busy(qp);
+		}
+		goto bail;
+	}
+
+	/* Allocate a buffer and copy the header and payload to it. */
+	tx->map_len = (plen + 1) << 2;
+	piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
+	if (unlikely(piobuf == NULL)) {
+		ret = -EBUSY;
+		goto err_tx;
+	}
+	tx->txreq.map_addr = piobuf;
+	tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
+	tx->txreq.sg_count = 1;
+
+	*piobuf++ = (__force u32) cpu_to_le32(plen);
+	*piobuf++ = (__force u32) cpu_to_le32(control);
+	memcpy(piobuf, hdr, hdrwords << 2);
+	ipath_copy_from_sge(piobuf + hdrwords, ss, len);
+
+	atomic_inc(&qp->s_dma_busy);
+	ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
+	/*
+	 * If we couldn't queue the DMA request, save the info
+	 * and try again later rather than destroying the
+	 * buffer and undoing the side effects of the copy.
+	 */
+	if (ret) {
+		tx->ss = NULL;
+		tx->len = 0;
+		qp->s_tx = tx;
+		decrement_dma_busy(qp);
+	}
+	dev->n_unaligned++;
+	goto bail;
+
+err_tx:
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+	put_txreq(dev, tx);
+bail:
+	return ret;
+}
+
+static int ipath_verbs_send_pio(struct ipath_qp *qp,
+				struct ipath_ib_header *ibhdr, u32 hdrwords,
+				struct ipath_sge_state *ss, u32 len,
+				u32 plen, u32 dwords)
+{
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+	u32 *hdr = (u32 *) ibhdr;
+	u32 __iomem *piobuf;
+	unsigned flush_wc;
+	u32 control;
+	int ret;
+	unsigned long flags;
+
+	piobuf = ipath_getpiobuf(dd, plen, NULL);
+	if (unlikely(piobuf == NULL)) {
+		ret = -EBUSY;
+		goto bail;
+	}
+
+	/*
+	 * Get the saved delay count we computed for the previous packet
+	 * and save the delay count for this packet to be used next time
+	 * we get here.
+	 */
+	control = qp->s_pkt_delay;
+	qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+	/* VL15 packets bypass credit check */
+	if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
+		control |= 1ULL << 31;
+
+	/*
+	 * Write the length to the control qword plus any needed flags.
+	 * We have to flush after the PBC for correctness on some cpus
+	 * or WC buffer can be written out of order.
+	 */
+	writeq(((u64) control << 32) | plen, piobuf);
+	piobuf += 2;
+
+	flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
+	if (len == 0) {
+		/*
+		 * If there is just the header portion, must flush before
+		 * writing last word of header for correctness, and after
+		 * the last header word (trigger word).
+		 */
+		if (flush_wc) {
+			ipath_flush_wc();
+			__iowrite32_copy(piobuf, hdr, hdrwords - 1);
+			ipath_flush_wc();
+			__raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+			ipath_flush_wc();
+		} else
+			__iowrite32_copy(piobuf, hdr, hdrwords);
+		goto done;
+	}
+
+	if (flush_wc)
+		ipath_flush_wc();
+	__iowrite32_copy(piobuf, hdr, hdrwords);
+	piobuf += hdrwords;
+
+	/* The common case is aligned and contained in one segment. */
+	if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
+		   !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
+		u32 *addr = (u32 *) ss->sge.vaddr;
+
+		/* Update address before sending packet. */
+		update_sge(ss, len);
+		if (flush_wc) {
+			__iowrite32_copy(piobuf, addr, dwords - 1);
+			/* must flush early everything before trigger word */
+			ipath_flush_wc();
+			__raw_writel(addr[dwords - 1], piobuf + dwords - 1);
+			/* be sure trigger word is written */
+			ipath_flush_wc();
+		} else
+			__iowrite32_copy(piobuf, addr, dwords);
+		goto done;
+	}
+	copy_io(piobuf, ss, len, flush_wc);
+done:
+	if (qp->s_wqe) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @hdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ */
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+		     u32 hdrwords, struct ipath_sge_state *ss, u32 len)
+{
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+	u32 plen;
+	int ret;
+	u32 dwords = (len + 3) >> 2;
+
+	/*
+	 * Calculate the send buffer trigger address.
+	 * The +1 counts for the pbc control dword following the pbc length.
+	 */
+	plen = hdrwords + dwords + 1;
+
+	/*
+	 * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+	 * can defer SDMA restart until link goes ACTIVE without
+	 * worrying about just how we got there.
+	 */
+	if (qp->ibqp.qp_type == IB_QPT_SMI ||
+	    !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+		ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+					   plen, dwords);
+	else
+		ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
+					   plen, dwords);
+
+	return ret;
+}
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+			    u64 *rwords, u64 *spkts, u64 *rpkts,
+			    u64 *xmit_wait)
+{
+	int ret;
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		ret = -EINVAL;
+		goto bail;
+	}
+	*swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
+	*rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+	*spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+	*rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+	*xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_get_counters - get various chip counters
+ * @dd: the infinipath device
+ * @cntrs: counters are placed here
+ *
+ * Return the counters needed by recv_pma_get_portcounters().
+ */
+int ipath_get_counters(struct ipath_devdata *dd,
+		       struct ipath_verbs_counters *cntrs)
+{
+	struct ipath_cregs const *crp = dd->ipath_cregs;
+	int ret;
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		ret = -EINVAL;
+		goto bail;
+	}
+	cntrs->symbol_error_counter =
+		ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
+	cntrs->link_error_recovery_counter =
+		ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
+	/*
+	 * The link downed counter counts when the other side downs the
+	 * connection.  We add in the number of times we downed the link
+	 * due to local link integrity errors to compensate.
+	 */
+	cntrs->link_downed_counter =
+		ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
+	cntrs->port_rcv_errors =
+		ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
+		ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
+		ipath_snap_cntr(dd, crp->cr_portovflcnt) +
+		ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
+		ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
+		ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
+		ipath_snap_cntr(dd, crp->cr_erricrccnt) +
+		ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
+		ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
+		ipath_snap_cntr(dd, crp->cr_badformatcnt) +
+		dd->ipath_rxfc_unsupvl_errs;
+	if (crp->cr_rxotherlocalphyerrcnt)
+		cntrs->port_rcv_errors +=
+			ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
+	if (crp->cr_rxvlerrcnt)
+		cntrs->port_rcv_errors +=
+			ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
+	cntrs->port_rcv_remphys_errors =
+		ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
+	cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
+	cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
+	cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
+	cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
+	cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
+	cntrs->local_link_integrity_errors =
+		crp->cr_locallinkintegrityerrcnt ?
+		ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
+		((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+		 dd->ipath_lli_errs : dd->ipath_lli_errors);
+	cntrs->excessive_buffer_overrun_errors =
+		crp->cr_excessbufferovflcnt ?
+		ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
+		dd->ipath_overrun_thresh_errs;
+	cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
+		ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_ib_piobufavail - callback when a PIO buffer is available
+ * @arg: the device pointer
+ *
+ * This is called from ipath_intr() at interrupt level when a PIO buffer is
+ * available after ipath_verbs_send() returned an error that no buffers were
+ * available.  Return 1 if we consumed all the PIO buffers and we still have
+ * QPs waiting for buffers (for now, just restart the send tasklet and
+ * return zero).
+ */
+int ipath_ib_piobufavail(struct ipath_ibdev *dev)
+{
+	struct list_head *list;
+	struct ipath_qp *qplist;
+	struct ipath_qp *qp;
+	unsigned long flags;
+
+	if (dev == NULL)
+		goto bail;
+
+	list = &dev->piowait;
+	qplist = NULL;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	while (!list_empty(list)) {
+		qp = list_entry(list->next, struct ipath_qp, piowait);
+		list_del_init(&qp->piowait);
+		qp->pio_next = qplist;
+		qplist = qp;
+		atomic_inc(&qp->refcount);
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	while (qplist != NULL) {
+		qp = qplist;
+		qplist = qp->pio_next;
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify ipath_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+
+bail:
+	return 0;
+}
+
+static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+			      struct ib_udata *uhw)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+
+	if (uhw->inlen || uhw->outlen)
+		return -EINVAL;
+
+	memset(props, 0, sizeof(*props));
+
+	props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+		IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+		IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+	props->page_size_cap = PAGE_SIZE;
+	props->vendor_id =
+		IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
+	props->vendor_part_id = dev->dd->ipath_deviceid;
+	props->hw_ver = dev->dd->ipath_pcirev;
+
+	props->sys_image_guid = dev->sys_image_guid;
+
+	props->max_mr_size = ~0ull;
+	props->max_qp = ib_ipath_max_qps;
+	props->max_qp_wr = ib_ipath_max_qp_wrs;
+	props->max_sge = ib_ipath_max_sges;
+	props->max_cq = ib_ipath_max_cqs;
+	props->max_ah = ib_ipath_max_ahs;
+	props->max_cqe = ib_ipath_max_cqes;
+	props->max_mr = dev->lk_table.max;
+	props->max_fmr = dev->lk_table.max;
+	props->max_map_per_fmr = 32767;
+	props->max_pd = ib_ipath_max_pds;
+	props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
+	props->max_qp_init_rd_atom = 255;
+	/* props->max_res_rd_atom */
+	props->max_srq = ib_ipath_max_srqs;
+	props->max_srq_wr = ib_ipath_max_srq_wrs;
+	props->max_srq_sge = ib_ipath_max_srq_sges;
+	/* props->local_ca_ack_delay */
+	props->atomic_cap = IB_ATOMIC_GLOB;
+	props->max_pkeys = ipath_get_npkeys(dev->dd);
+	props->max_mcast_grp = ib_ipath_max_mcast_grps;
+	props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+		props->max_mcast_grp;
+
+	return 0;
+}
+
+const u8 ipath_cvt_physportstate[32] = {
+	[INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+	[INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+	[INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+	[INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+	[INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+	[INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+	[INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
+{
+	return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
+}
+
+static int ipath_query_port(struct ib_device *ibdev,
+			    u8 port, struct ib_port_attr *props)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_devdata *dd = dev->dd;
+	enum ib_mtu mtu;
+	u16 lid = dd->ipath_lid;
+	u64 ibcstat;
+
+	memset(props, 0, sizeof(*props));
+	props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
+	props->lmc = dd->ipath_lmc;
+	props->sm_lid = dev->sm_lid;
+	props->sm_sl = dev->sm_sl;
+	ibcstat = dd->ipath_lastibcstat;
+	/* map LinkState to IB portinfo values.  */
+	props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
+
+	/* See phys_state_show() */
+	props->phys_state = /* MEA: assumes shift == 0 */
+		ipath_cvt_physportstate[dd->ipath_lastibcstat &
+		dd->ibcs_lts_mask];
+	props->port_cap_flags = dev->port_cap_flags;
+	props->gid_tbl_len = 1;
+	props->max_msg_sz = 0x80000000;
+	props->pkey_tbl_len = ipath_get_npkeys(dd);
+	props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
+		dev->z_pkey_violations;
+	props->qkey_viol_cntr = dev->qkey_violations;
+	props->active_width = dd->ipath_link_width_active;
+	/* See rate_show() */
+	props->active_speed = dd->ipath_link_speed_active;
+	props->max_vl_num = 1;		/* VLCap = VL0 */
+	props->init_type_reply = 0;
+
+	props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+	switch (dd->ipath_ibmtu) {
+	case 4096:
+		mtu = IB_MTU_4096;
+		break;
+	case 2048:
+		mtu = IB_MTU_2048;
+		break;
+	case 1024:
+		mtu = IB_MTU_1024;
+		break;
+	case 512:
+		mtu = IB_MTU_512;
+		break;
+	case 256:
+		mtu = IB_MTU_256;
+		break;
+	default:
+		mtu = IB_MTU_2048;
+	}
+	props->active_mtu = mtu;
+	props->subnet_timeout = dev->subnet_timeout;
+
+	return 0;
+}
+
+static int ipath_modify_device(struct ib_device *device,
+			       int device_modify_mask,
+			       struct ib_device_modify *device_modify)
+{
+	int ret;
+
+	if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+				   IB_DEVICE_MODIFY_NODE_DESC)) {
+		ret = -EOPNOTSUPP;
+		goto bail;
+	}
+
+	if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
+		memcpy(device->node_desc, device_modify->node_desc, 64);
+
+	if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+		to_idev(device)->sys_image_guid =
+			cpu_to_be64(device_modify->sys_image_guid);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int ipath_modify_port(struct ib_device *ibdev,
+			     u8 port, int port_modify_mask,
+			     struct ib_port_modify *props)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+
+	dev->port_cap_flags |= props->set_port_cap_mask;
+	dev->port_cap_flags &= ~props->clr_port_cap_mask;
+	if (port_modify_mask & IB_PORT_SHUTDOWN)
+		ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
+	if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+		dev->qkey_violations = 0;
+	return 0;
+}
+
+static int ipath_query_gid(struct ib_device *ibdev, u8 port,
+			   int index, union ib_gid *gid)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	int ret;
+
+	if (index >= 1) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	gid->global.subnet_prefix = dev->gid_prefix;
+	gid->global.interface_id = dev->dd->ipath_guid;
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
+				    struct ib_ucontext *context,
+				    struct ib_udata *udata)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_pd *pd;
+	struct ib_pd *ret;
+
+	/*
+	 * This is actually totally arbitrary.	Some correctness tests
+	 * assume there's a maximum number of PDs that can be allocated.
+	 * We don't actually have this limit, but we fail the test if
+	 * we allow allocations of more than we report for this value.
+	 */
+
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	spin_lock(&dev->n_pds_lock);
+	if (dev->n_pds_allocated == ib_ipath_max_pds) {
+		spin_unlock(&dev->n_pds_lock);
+		kfree(pd);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	dev->n_pds_allocated++;
+	spin_unlock(&dev->n_pds_lock);
+
+	/* ib_alloc_pd() will initialize pd->ibpd. */
+	pd->user = udata != NULL;
+
+	ret = &pd->ibpd;
+
+bail:
+	return ret;
+}
+
+static int ipath_dealloc_pd(struct ib_pd *ibpd)
+{
+	struct ipath_pd *pd = to_ipd(ibpd);
+	struct ipath_ibdev *dev = to_idev(ibpd->device);
+
+	spin_lock(&dev->n_pds_lock);
+	dev->n_pds_allocated--;
+	spin_unlock(&dev->n_pds_lock);
+
+	kfree(pd);
+
+	return 0;
+}
+
+/**
+ * ipath_create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
+				     struct ib_ah_attr *ah_attr)
+{
+	struct ipath_ah *ah;
+	struct ib_ah *ret;
+	struct ipath_ibdev *dev = to_idev(pd->device);
+	unsigned long flags;
+
+	/* A multicast address requires a GRH (see ch. 8.4.1). */
+	if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+	    ah_attr->dlid != IPATH_PERMISSIVE_LID &&
+	    !(ah_attr->ah_flags & IB_AH_GRH)) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	if (ah_attr->dlid == 0) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	if (ah_attr->port_num < 1 ||
+	    ah_attr->port_num > pd->device->phys_port_cnt) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+	if (!ah) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dev->n_ahs_lock, flags);
+	if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
+		spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+		kfree(ah);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	dev->n_ahs_allocated++;
+	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+	/* ib_create_ah() will initialize ah->ibah. */
+	ah->attr = *ah_attr;
+	ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
+
+	ret = &ah->ibah;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_destroy_ah(struct ib_ah *ibah)
+{
+	struct ipath_ibdev *dev = to_idev(ibah->device);
+	struct ipath_ah *ah = to_iah(ibah);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->n_ahs_lock, flags);
+	dev->n_ahs_allocated--;
+	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+	kfree(ah);
+
+	return 0;
+}
+
+static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct ipath_ah *ah = to_iah(ibah);
+
+	*ah_attr = ah->attr;
+	ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
+
+	return 0;
+}
+
+/**
+ * ipath_get_npkeys - return the size of the PKEY table for port 0
+ * @dd: the infinipath device
+ */
+unsigned ipath_get_npkeys(struct ipath_devdata *dd)
+{
+	return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
+}
+
+/**
+ * ipath_get_pkey - return the indexed PKEY from the port PKEY table
+ * @dd: the infinipath device
+ * @index: the PKEY index
+ */
+unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
+{
+	unsigned ret;
+
+	/* always a kernel port, no locking needed */
+	if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
+		ret = 0;
+	else
+		ret = dd->ipath_pd[0]->port_pkeys[index];
+
+	return ret;
+}
+
+static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			    u16 *pkey)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	int ret;
+
+	if (index >= ipath_get_npkeys(dev->dd)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	*pkey = ipath_get_pkey(dev->dd, index);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_alloc_ucontext - allocate a ucontest
+ * @ibdev: the infiniband device
+ * @udata: not used by the InfiniPath driver
+ */
+
+static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
+						struct ib_udata *udata)
+{
+	struct ipath_ucontext *context;
+	struct ib_ucontext *ret;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	ret = &context->ibucontext;
+
+bail:
+	return ret;
+}
+
+static int ipath_dealloc_ucontext(struct ib_ucontext *context)
+{
+	kfree(to_iucontext(context));
+	return 0;
+}
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev);
+
+static void __verbs_timer(unsigned long arg)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *) arg;
+
+	/* Handle verbs layer timeouts. */
+	ipath_ib_timer(dd->verbs_dev);
+
+	mod_timer(&dd->verbs_timer, jiffies + 1);
+}
+
+static int enable_timer(struct ipath_devdata *dd)
+{
+	/*
+	 * Early chips had a design flaw where the chip and kernel idea
+	 * of the tail register don't always agree, and therefore we won't
+	 * get an interrupt on the next packet received.
+	 * If the board supports per packet receive interrupts, use it.
+	 * Otherwise, the timer function periodically checks for packets
+	 * to cover this case.
+	 * Either way, the timer is needed for verbs layer related
+	 * processing.
+	 */
+	if (dd->ipath_flags & IPATH_GPIO_INTR) {
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
+				 0x2074076542310ULL);
+		/* Enable GPIO bit 2 interrupt */
+		dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+				 dd->ipath_gpio_mask);
+	}
+
+	init_timer(&dd->verbs_timer);
+	dd->verbs_timer.function = __verbs_timer;
+	dd->verbs_timer.data = (unsigned long)dd;
+	dd->verbs_timer.expires = jiffies + 1;
+	add_timer(&dd->verbs_timer);
+
+	return 0;
+}
+
+static int disable_timer(struct ipath_devdata *dd)
+{
+	/* Disable GPIO bit 2 interrupt */
+	if (dd->ipath_flags & IPATH_GPIO_INTR) {
+                /* Disable GPIO bit 2 interrupt */
+		dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+				 dd->ipath_gpio_mask);
+		/*
+		 * We might want to undo changes to debugportselect,
+		 * but how?
+		 */
+	}
+
+	del_timer_sync(&dd->verbs_timer);
+
+	return 0;
+}
+
+static int ipath_port_immutable(struct ib_device *ibdev, u8 port_num,
+			        struct ib_port_immutable *immutable)
+{
+	struct ib_port_attr attr;
+	int err;
+
+	err = ipath_query_port(ibdev, port_num, &attr);
+	if (err)
+		return err;
+
+	immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	immutable->gid_tbl_len = attr.gid_tbl_len;
+	immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+	return 0;
+}
+
+/**
+ * ipath_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return the allocated ipath_ibdev pointer or NULL on error.
+ */
+int ipath_register_ib_device(struct ipath_devdata *dd)
+{
+	struct ipath_verbs_counters cntrs;
+	struct ipath_ibdev *idev;
+	struct ib_device *dev;
+	struct ipath_verbs_txreq *tx;
+	unsigned i;
+	int ret;
+
+	idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
+	if (idev == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	dev = &idev->ibdev;
+
+	if (dd->ipath_sdma_descq_cnt) {
+		tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
+			     GFP_KERNEL);
+		if (tx == NULL) {
+			ret = -ENOMEM;
+			goto err_tx;
+		}
+	} else
+		tx = NULL;
+	idev->txreq_bufs = tx;
+
+	/* Only need to initialize non-zero fields. */
+	spin_lock_init(&idev->n_pds_lock);
+	spin_lock_init(&idev->n_ahs_lock);
+	spin_lock_init(&idev->n_cqs_lock);
+	spin_lock_init(&idev->n_qps_lock);
+	spin_lock_init(&idev->n_srqs_lock);
+	spin_lock_init(&idev->n_mcast_grps_lock);
+
+	spin_lock_init(&idev->qp_table.lock);
+	spin_lock_init(&idev->lk_table.lock);
+	idev->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE);
+	/* Set the prefix to the default value (see ch. 4.1.1) */
+	idev->gid_prefix = cpu_to_be64(0xfe80000000000000ULL);
+
+	ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
+	if (ret)
+		goto err_qp;
+
+	/*
+	 * The top ib_ipath_lkey_table_size bits are used to index the
+	 * table.  The lower 8 bits can be owned by the user (copied from
+	 * the LKEY).  The remaining bits act as a generation number or tag.
+	 */
+	idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
+	idev->lk_table.table = kzalloc(idev->lk_table.max *
+				       sizeof(*idev->lk_table.table),
+				       GFP_KERNEL);
+	if (idev->lk_table.table == NULL) {
+		ret = -ENOMEM;
+		goto err_lk;
+	}
+	INIT_LIST_HEAD(&idev->pending_mmaps);
+	spin_lock_init(&idev->pending_lock);
+	idev->mmap_offset = PAGE_SIZE;
+	spin_lock_init(&idev->mmap_offset_lock);
+	INIT_LIST_HEAD(&idev->pending[0]);
+	INIT_LIST_HEAD(&idev->pending[1]);
+	INIT_LIST_HEAD(&idev->pending[2]);
+	INIT_LIST_HEAD(&idev->piowait);
+	INIT_LIST_HEAD(&idev->rnrwait);
+	INIT_LIST_HEAD(&idev->txreq_free);
+	idev->pending_index = 0;
+	idev->port_cap_flags =
+		IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
+	if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
+		idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
+	idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+	idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+	idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+	idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+	idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+	/* Snapshot current HW counters to "clear" them. */
+	ipath_get_counters(dd, &cntrs);
+	idev->z_symbol_error_counter = cntrs.symbol_error_counter;
+	idev->z_link_error_recovery_counter =
+		cntrs.link_error_recovery_counter;
+	idev->z_link_downed_counter = cntrs.link_downed_counter;
+	idev->z_port_rcv_errors = cntrs.port_rcv_errors;
+	idev->z_port_rcv_remphys_errors =
+		cntrs.port_rcv_remphys_errors;
+	idev->z_port_xmit_discards = cntrs.port_xmit_discards;
+	idev->z_port_xmit_data = cntrs.port_xmit_data;
+	idev->z_port_rcv_data = cntrs.port_rcv_data;
+	idev->z_port_xmit_packets = cntrs.port_xmit_packets;
+	idev->z_port_rcv_packets = cntrs.port_rcv_packets;
+	idev->z_local_link_integrity_errors =
+		cntrs.local_link_integrity_errors;
+	idev->z_excessive_buffer_overrun_errors =
+		cntrs.excessive_buffer_overrun_errors;
+	idev->z_vl15_dropped = cntrs.vl15_dropped;
+
+	for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
+		list_add(&tx->txreq.list, &idev->txreq_free);
+
+	/*
+	 * The system image GUID is supposed to be the same for all
+	 * IB HCAs in a single system but since there can be other
+	 * device types in the system, we can't be sure this is unique.
+	 */
+	if (!sys_image_guid)
+		sys_image_guid = dd->ipath_guid;
+	idev->sys_image_guid = sys_image_guid;
+	idev->ib_unit = dd->ipath_unit;
+	idev->dd = dd;
+
+	strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
+	dev->owner = THIS_MODULE;
+	dev->node_guid = dd->ipath_guid;
+	dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
+	dev->uverbs_cmd_mask =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_AH)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_AH)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_POLL_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_POST_SEND)		|
+		(1ull << IB_USER_VERBS_CMD_POST_RECV)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+	dev->node_type = RDMA_NODE_IB_CA;
+	dev->phys_port_cnt = 1;
+	dev->num_comp_vectors = 1;
+	dev->dma_device = &dd->pcidev->dev;
+	dev->query_device = ipath_query_device;
+	dev->modify_device = ipath_modify_device;
+	dev->query_port = ipath_query_port;
+	dev->modify_port = ipath_modify_port;
+	dev->query_pkey = ipath_query_pkey;
+	dev->query_gid = ipath_query_gid;
+	dev->alloc_ucontext = ipath_alloc_ucontext;
+	dev->dealloc_ucontext = ipath_dealloc_ucontext;
+	dev->alloc_pd = ipath_alloc_pd;
+	dev->dealloc_pd = ipath_dealloc_pd;
+	dev->create_ah = ipath_create_ah;
+	dev->destroy_ah = ipath_destroy_ah;
+	dev->query_ah = ipath_query_ah;
+	dev->create_srq = ipath_create_srq;
+	dev->modify_srq = ipath_modify_srq;
+	dev->query_srq = ipath_query_srq;
+	dev->destroy_srq = ipath_destroy_srq;
+	dev->create_qp = ipath_create_qp;
+	dev->modify_qp = ipath_modify_qp;
+	dev->query_qp = ipath_query_qp;
+	dev->destroy_qp = ipath_destroy_qp;
+	dev->post_send = ipath_post_send;
+	dev->post_recv = ipath_post_receive;
+	dev->post_srq_recv = ipath_post_srq_receive;
+	dev->create_cq = ipath_create_cq;
+	dev->destroy_cq = ipath_destroy_cq;
+	dev->resize_cq = ipath_resize_cq;
+	dev->poll_cq = ipath_poll_cq;
+	dev->req_notify_cq = ipath_req_notify_cq;
+	dev->get_dma_mr = ipath_get_dma_mr;
+	dev->reg_phys_mr = ipath_reg_phys_mr;
+	dev->reg_user_mr = ipath_reg_user_mr;
+	dev->dereg_mr = ipath_dereg_mr;
+	dev->alloc_fmr = ipath_alloc_fmr;
+	dev->map_phys_fmr = ipath_map_phys_fmr;
+	dev->unmap_fmr = ipath_unmap_fmr;
+	dev->dealloc_fmr = ipath_dealloc_fmr;
+	dev->attach_mcast = ipath_multicast_attach;
+	dev->detach_mcast = ipath_multicast_detach;
+	dev->process_mad = ipath_process_mad;
+	dev->mmap = ipath_mmap;
+	dev->dma_ops = &ipath_dma_mapping_ops;
+	dev->get_port_immutable = ipath_port_immutable;
+
+	snprintf(dev->node_desc, sizeof(dev->node_desc),
+		 IPATH_IDSTR " %s", init_utsname()->nodename);
+
+	ret = ib_register_device(dev, NULL);
+	if (ret)
+		goto err_reg;
+
+	ret = ipath_verbs_register_sysfs(dev);
+	if (ret)
+		goto err_class;
+
+	enable_timer(dd);
+
+	goto bail;
+
+err_class:
+	ib_unregister_device(dev);
+err_reg:
+	kfree(idev->lk_table.table);
+err_lk:
+	kfree(idev->qp_table.table);
+err_qp:
+	kfree(idev->txreq_bufs);
+err_tx:
+	ib_dealloc_device(dev);
+	ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+	idev = NULL;
+
+bail:
+	dd->verbs_dev = idev;
+	return ret;
+}
+
+void ipath_unregister_ib_device(struct ipath_ibdev *dev)
+{
+	struct ib_device *ibdev = &dev->ibdev;
+	u32 qps_inuse;
+
+	ib_unregister_device(ibdev);
+
+	disable_timer(dev->dd);
+
+	if (!list_empty(&dev->pending[0]) ||
+	    !list_empty(&dev->pending[1]) ||
+	    !list_empty(&dev->pending[2]))
+		ipath_dev_err(dev->dd, "pending list not empty!\n");
+	if (!list_empty(&dev->piowait))
+		ipath_dev_err(dev->dd, "piowait list not empty!\n");
+	if (!list_empty(&dev->rnrwait))
+		ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
+	if (!ipath_mcast_tree_empty())
+		ipath_dev_err(dev->dd, "multicast table memory leak!\n");
+	/*
+	 * Note that ipath_unregister_ib_device() can be called before all
+	 * the QPs are destroyed!
+	 */
+	qps_inuse = ipath_free_all_qps(&dev->qp_table);
+	if (qps_inuse)
+		ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
+			qps_inuse);
+	kfree(dev->qp_table.table);
+	kfree(dev->lk_table.table);
+	kfree(dev->txreq_bufs);
+	ib_dealloc_device(ibdev);
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_ibdev *dev =
+		container_of(device, struct ipath_ibdev, ibdev.dev);
+
+	return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_ibdev *dev =
+		container_of(device, struct ipath_ibdev, ibdev.dev);
+	int ret;
+
+	ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
+	if (ret < 0)
+		goto bail;
+	strcat(buf, "\n");
+	ret = strlen(buf);
+
+bail:
+	return ret;
+}
+
+static ssize_t show_stats(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ipath_ibdev *dev =
+		container_of(device, struct ipath_ibdev, ibdev.dev);
+	int i;
+	int len;
+
+	len = sprintf(buf,
+		      "RC resends  %d\n"
+		      "RC no QACK  %d\n"
+		      "RC ACKs     %d\n"
+		      "RC SEQ NAKs %d\n"
+		      "RC RDMA seq %d\n"
+		      "RC RNR NAKs %d\n"
+		      "RC OTH NAKs %d\n"
+		      "RC timeouts %d\n"
+		      "RC RDMA dup %d\n"
+		      "piobuf wait %d\n"
+		      "unaligned   %d\n"
+		      "PKT drops   %d\n"
+		      "WQE errs    %d\n",
+		      dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
+		      dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
+		      dev->n_other_naks, dev->n_timeouts,
+		      dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
+		      dev->n_pkt_drops, dev->n_wqe_errs);
+	for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
+		const struct ipath_opcode_stats *si = &dev->opstats[i];
+
+		if (!si->n_packets && !si->n_bytes)
+			continue;
+		len += sprintf(buf + len, "%02x %llu/%llu\n", i,
+			       (unsigned long long) si->n_packets,
+			       (unsigned long long) si->n_bytes);
+	}
+	return len;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+
+static struct device_attribute *ipath_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_hca_type,
+	&dev_attr_board_id,
+	&dev_attr_stats
+};
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
+		ret = device_create_file(&dev->dev,
+				       ipath_class_attributes[i]);
+		if (ret)
+			goto bail;
+	}
+	return 0;
+bail:
+	for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+		device_remove_file(&dev->dev, ipath_class_attributes[i]);
+	return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h
new file mode 100644
index 0000000..ec167e5
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_verbs.h
@@ -0,0 +1,939 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IPATH_VERBS_H
+#define IPATH_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "ipath_kernel.h"
+
+#define IPATH_MAX_RDMA_ATOMIC	4
+
+#define QPN_MAX                 (1 << 24)
+#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IPATH_UVERBS_ABI_VERSION       2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE	(IB_CQ_NEXT_COMP + 1)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK			0x20
+#define IB_NAK_PSN_ERROR		0x60
+#define IB_NAK_INVALID_REQUEST		0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR	0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST	0x64
+
+/* Flags for checking QP state (see ib_ipath_state_ops[]) */
+#define IPATH_POST_SEND_OK		0x01
+#define IPATH_POST_RECV_OK		0x02
+#define IPATH_PROCESS_RECV_OK		0x04
+#define IPATH_PROCESS_SEND_OK		0x08
+#define IPATH_PROCESS_NEXT_SEND_OK	0x10
+#define IPATH_FLUSH_SEND		0x20
+#define IPATH_FLUSH_RECV		0x40
+#define IPATH_PROCESS_OR_FLUSH_SEND \
+	(IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE	0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED	0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING	0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA	cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA	cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS	cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS	cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT	cpu_to_be16(0x0005)
+
+struct ib_reth {
+	__be64 vaddr;
+	__be32 rkey;
+	__be32 length;
+} __attribute__ ((packed));
+
+struct ib_atomic_eth {
+	__be32 vaddr[2];	/* unaligned so access as 2 32-bit words */
+	__be32 rkey;
+	__be64 swap_data;
+	__be64 compare_data;
+} __attribute__ ((packed));
+
+struct ipath_other_headers {
+	__be32 bth[3];
+	union {
+		struct {
+			__be32 deth[2];
+			__be32 imm_data;
+		} ud;
+		struct {
+			struct ib_reth reth;
+			__be32 imm_data;
+		} rc;
+		struct {
+			__be32 aeth;
+			__be32 atomic_ack_eth[2];
+		} at;
+		__be32 imm_data;
+		__be32 aeth;
+		struct ib_atomic_eth atomic_eth;
+	} u;
+} __attribute__ ((packed));
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct ipath_ib_header {
+	__be16 lrh[4];
+	union {
+		struct {
+			struct ib_grh grh;
+			struct ipath_other_headers oth;
+		} l;
+		struct ipath_other_headers oth;
+	} u;
+} __attribute__ ((packed));
+
+struct ipath_pio_header {
+	__le32 pbc[2];
+	struct ipath_ib_header hdr;
+} __attribute__ ((packed));
+
+/*
+ * There is one struct ipath_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct ipath_mcast_qp.
+ */
+struct ipath_mcast_qp {
+	struct list_head list;
+	struct ipath_qp *qp;
+};
+
+struct ipath_mcast {
+	struct rb_node rb_node;
+	union ib_gid mgid;
+	struct list_head qp_list;
+	wait_queue_head_t wait;
+	atomic_t refcount;
+	int n_attached;
+};
+
+/* Protection domain */
+struct ipath_pd {
+	struct ib_pd ibpd;
+	int user;		/* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct ipath_ah {
+	struct ib_ah ibah;
+	struct ib_ah_attr attr;
+};
+
+/*
+ * This structure is used by ipath_mmap() to validate an offset
+ * when an mmap() request is made.  The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct ipath_mmap_info {
+	struct list_head pending_mmaps;
+	struct ib_ucontext *context;
+	void *obj;
+	__u64 offset;
+	struct kref ref;
+	unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct ipath_cq_wc {
+	u32 head;		/* index of next entry to fill */
+	u32 tail;		/* index of next ib_poll_cq() entry */
+	union {
+		/* these are actually size ibcq.cqe + 1 */
+		struct ib_uverbs_wc uqueue[0];
+		struct ib_wc kqueue[0];
+	};
+};
+
+/*
+ * The completion queue structure.
+ */
+struct ipath_cq {
+	struct ib_cq ibcq;
+	struct tasklet_struct comptask;
+	spinlock_t lock;
+	u8 notify;
+	u8 triggered;
+	struct ipath_cq_wc *queue;
+	struct ipath_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * XXX Maybe we should use phys addr here and kmap()/kunmap().
+ * Used by the verbs layer.
+ */
+struct ipath_seg {
+	void *vaddr;
+	size_t length;
+};
+
+/* The number of ipath_segs that fit in a page. */
+#define IPATH_SEGSZ     (PAGE_SIZE / sizeof (struct ipath_seg))
+
+struct ipath_segarray {
+	struct ipath_seg segs[IPATH_SEGSZ];
+};
+
+struct ipath_mregion {
+	struct ib_pd *pd;	/* shares refcnt of ibmr.pd */
+	u64 user_base;		/* User's address for this region */
+	u64 iova;		/* IB start address of this region */
+	size_t length;
+	u32 lkey;
+	u32 offset;		/* offset (bytes) to start of region */
+	int access_flags;
+	u32 max_segs;		/* number of ipath_segs in all the arrays */
+	u32 mapsz;		/* size of the map array */
+	struct ipath_segarray *map[0];	/* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct ipath_sge {
+	struct ipath_mregion *mr;
+	void *vaddr;		/* kernel virtual address of segment */
+	u32 sge_length;		/* length of the SGE */
+	u32 length;		/* remaining length of the segment */
+	u16 m;			/* current index: mr->map[m] */
+	u16 n;			/* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct ipath_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	struct ipath_mregion mr;	/* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct ipath_swqe {
+	struct ib_send_wr wr;	/* don't use wr.sg_list */
+	u32 psn;		/* first packet sequence number */
+	u32 lpsn;		/* last packet sequence number */
+	u32 ssn;		/* send sequence number */
+	u32 length;		/* total length of data in sg_list */
+	struct ipath_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct ipath_rwqe {
+	u64 wr_id;
+	u8 num_sge;
+	struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct ipath_rwq {
+	u32 head;		/* new work requests posted to the head */
+	u32 tail;		/* receives pull requests from here. */
+	struct ipath_rwqe wq[0];
+};
+
+struct ipath_rq {
+	struct ipath_rwq *wq;
+	spinlock_t lock;
+	u32 size;		/* size of RWQE array */
+	u8 max_sge;
+};
+
+struct ipath_srq {
+	struct ib_srq ibsrq;
+	struct ipath_rq rq;
+	struct ipath_mmap_info *ip;
+	/* send signal when number of RWQEs < limit */
+	u32 limit;
+};
+
+struct ipath_sge_state {
+	struct ipath_sge *sg_list;      /* next SGE to be used if any */
+	struct ipath_sge sge;   /* progress state for the current SGE */
+	u8 num_sge;
+	u8 static_rate;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct ipath_ack_entry {
+	u8 opcode;
+	u8 sent;
+	u32 psn;
+	union {
+		struct ipath_sge_state rdma_sge;
+		u64 atomic_data;
+	};
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct ipath_qp {
+	struct ib_qp ibqp;
+	struct ipath_qp *next;		/* link list for QPN hash table */
+	struct ipath_qp *timer_next;	/* link list for ipath_ib_timer() */
+	struct ipath_qp *pio_next;	/* link for ipath_ib_piobufavail() */
+	struct list_head piowait;	/* link for wait PIO buf */
+	struct list_head timerwait;	/* link for waiting for timeouts */
+	struct ib_ah_attr remote_ah_attr;
+	struct ipath_ib_header s_hdr;	/* next packet header to send */
+	atomic_t refcount;
+	wait_queue_head_t wait;
+	wait_queue_head_t wait_dma;
+	struct tasklet_struct s_task;
+	struct ipath_mmap_info *ip;
+	struct ipath_sge_state *s_cur_sge;
+	struct ipath_verbs_txreq *s_tx;
+	struct ipath_sge_state s_sge;	/* current send request data */
+	struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
+	struct ipath_sge_state s_ack_rdma_sge;
+	struct ipath_sge_state s_rdma_read_sge;
+	struct ipath_sge_state r_sge;	/* current receive data */
+	spinlock_t s_lock;
+	atomic_t s_dma_busy;
+	u16 s_pkt_delay;
+	u16 s_hdrwords;		/* size of s_hdr in 32 bit words */
+	u32 s_cur_size;		/* size of send packet in bytes */
+	u32 s_len;		/* total length of s_sge */
+	u32 s_rdma_read_len;	/* total length of s_rdma_read_sge */
+	u32 s_next_psn;		/* PSN for next request */
+	u32 s_last_psn;		/* last response PSN processed */
+	u32 s_psn;		/* current packet sequence number */
+	u32 s_ack_rdma_psn;	/* PSN for sending RDMA read responses */
+	u32 s_ack_psn;		/* PSN for acking sends and RDMA writes */
+	u32 s_rnr_timeout;	/* number of milliseconds for RNR timeout */
+	u32 r_ack_psn;		/* PSN for next ACK or atomic ACK */
+	u64 r_wr_id;		/* ID for current receive WQE */
+	unsigned long r_aflags;
+	u32 r_len;		/* total length of r_sge */
+	u32 r_rcv_len;		/* receive data len processed */
+	u32 r_psn;		/* expected rcv packet sequence number */
+	u32 r_msn;		/* message sequence number */
+	u8 state;		/* QP state */
+	u8 s_state;		/* opcode of last packet sent */
+	u8 s_ack_state;		/* opcode of packet to ACK */
+	u8 s_nak_state;		/* non-zero if NAK is pending */
+	u8 r_state;		/* opcode of last packet received */
+	u8 r_nak_state;		/* non-zero if NAK is pending */
+	u8 r_min_rnr_timer;	/* retry timeout value for RNR NAKs */
+	u8 r_flags;
+	u8 r_max_rd_atomic;	/* max number of RDMA read/atomic to receive */
+	u8 r_head_ack_queue;	/* index into s_ack_queue[] */
+	u8 qp_access_flags;
+	u8 s_max_sge;		/* size of s_wq->sg_list */
+	u8 s_retry_cnt;		/* number of times to retry */
+	u8 s_rnr_retry_cnt;
+	u8 s_retry;		/* requester retry counter */
+	u8 s_rnr_retry;		/* requester RNR retry counter */
+	u8 s_pkey_index;	/* PKEY index to use */
+	u8 s_max_rd_atomic;	/* max number of RDMA read/atomic to send */
+	u8 s_num_rd_atomic;	/* number of RDMA read/atomic pending */
+	u8 s_tail_ack_queue;	/* index into s_ack_queue[] */
+	u8 s_flags;
+	u8 s_dmult;
+	u8 s_draining;
+	u8 timeout;		/* Timeout for this QP */
+	enum ib_mtu path_mtu;
+	u32 remote_qpn;
+	u32 qkey;		/* QKEY for this QP (for UD or RD) */
+	u32 s_size;		/* send work queue size */
+	u32 s_head;		/* new entries added here */
+	u32 s_tail;		/* next entry to process */
+	u32 s_cur;		/* current work queue entry */
+	u32 s_last;		/* last un-ACK'ed entry */
+	u32 s_ssn;		/* SSN of tail entry */
+	u32 s_lsn;		/* limit sequence number (credit) */
+	struct ipath_swqe *s_wq;	/* send work queue */
+	struct ipath_swqe *s_wqe;
+	struct ipath_sge *r_ud_sg_list;
+	struct ipath_rq r_rq;		/* receive work queue */
+	struct ipath_sge r_sg_list[0];	/* verified SGEs */
+};
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define IPATH_R_WRID_VALID	0
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define IPATH_R_REUSE_SGE	0x01
+#define IPATH_R_RDMAR_SEQ	0x02
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
+ *			   before processing the next SWQE
+ * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
+ *			   before processing the next SWQE
+ * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
+ * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ *		      next send completion entry not via send DMA.
+ */
+#define IPATH_S_SIGNAL_REQ_WR	0x01
+#define IPATH_S_FENCE_PENDING	0x02
+#define IPATH_S_RDMAR_PENDING	0x04
+#define IPATH_S_ACK_PENDING	0x08
+#define IPATH_S_BUSY		0x10
+#define IPATH_S_WAITING		0x20
+#define IPATH_S_WAIT_SSN_CREDIT	0x40
+#define IPATH_S_WAIT_DMA	0x80
+
+#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
+	IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
+
+#define IPATH_PSN_CREDIT	512
+
+/*
+ * Since struct ipath_swqe is not a fixed size, we can't simply index into
+ * struct ipath_qp.s_wq.  This function does the array index computation.
+ */
+static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
+					      unsigned n)
+{
+	return (struct ipath_swqe *)((char *)qp->s_wq +
+				     (sizeof(struct ipath_swqe) +
+				      qp->s_max_sge *
+				      sizeof(struct ipath_sge)) * n);
+}
+
+/*
+ * Since struct ipath_rwqe is not a fixed size, we can't simply index into
+ * struct ipath_rwq.wq.  This function does the array index computation.
+ */
+static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
+					      unsigned n)
+{
+	return (struct ipath_rwqe *)
+		((char *) rq->wq->wq +
+		 (sizeof(struct ipath_rwqe) +
+		  rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+	atomic_t n_free;
+	void *page;
+};
+
+struct ipath_qp_table {
+	spinlock_t lock;
+	u32 last;		/* last QP number allocated */
+	u32 max;		/* size of the hash table */
+	u32 nmaps;		/* size of the map table */
+	struct ipath_qp **table;
+	/* bit map of free numbers */
+	struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct ipath_lkey_table {
+	spinlock_t lock;
+	u32 next;		/* next unused index (speeds search) */
+	u32 gen;		/* generation count */
+	u32 max;		/* size of the table */
+	struct ipath_mregion **table;
+};
+
+struct ipath_opcode_stats {
+	u64 n_packets;		/* number of packets */
+	u64 n_bytes;		/* total number of bytes */
+};
+
+struct ipath_ibdev {
+	struct ib_device ibdev;
+	struct ipath_devdata *dd;
+	struct list_head pending_mmaps;
+	spinlock_t mmap_offset_lock;
+	u32 mmap_offset;
+	int ib_unit;		/* This is the device number */
+	u16 sm_lid;		/* in host order */
+	u8 sm_sl;
+	u8 mkeyprot;
+	/* non-zero when timer is set */
+	unsigned long mkey_lease_timeout;
+
+	/* The following fields are really per port. */
+	struct ipath_qp_table qp_table;
+	struct ipath_lkey_table lk_table;
+	struct list_head pending[3];	/* FIFO of QPs waiting for ACKs */
+	struct list_head piowait;	/* list for wait PIO buf */
+	struct list_head txreq_free;
+	void *txreq_bufs;
+	/* list of QPs waiting for RNR timer */
+	struct list_head rnrwait;
+	spinlock_t pending_lock;
+	__be64 sys_image_guid;	/* in network order */
+	__be64 gid_prefix;	/* in network order */
+	__be64 mkey;
+
+	u32 n_pds_allocated;	/* number of PDs allocated for device */
+	spinlock_t n_pds_lock;
+	u32 n_ahs_allocated;	/* number of AHs allocated for device */
+	spinlock_t n_ahs_lock;
+	u32 n_cqs_allocated;	/* number of CQs allocated for device */
+	spinlock_t n_cqs_lock;
+	u32 n_qps_allocated;	/* number of QPs allocated for device */
+	spinlock_t n_qps_lock;
+	u32 n_srqs_allocated;	/* number of SRQs allocated for device */
+	spinlock_t n_srqs_lock;
+	u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+	spinlock_t n_mcast_grps_lock;
+
+	u64 ipath_sword;	/* total dwords sent (sample result) */
+	u64 ipath_rword;	/* total dwords received (sample result) */
+	u64 ipath_spkts;	/* total packets sent (sample result) */
+	u64 ipath_rpkts;	/* total packets received (sample result) */
+	/* # of ticks no data sent (sample result) */
+	u64 ipath_xmit_wait;
+	u64 rcv_errors;		/* # of packets with SW detected rcv errs */
+	u64 n_unicast_xmit;	/* total unicast packets sent */
+	u64 n_unicast_rcv;	/* total unicast packets received */
+	u64 n_multicast_xmit;	/* total multicast packets sent */
+	u64 n_multicast_rcv;	/* total multicast packets received */
+	u64 z_symbol_error_counter;		/* starting count for PMA */
+	u64 z_link_error_recovery_counter;	/* starting count for PMA */
+	u64 z_link_downed_counter;		/* starting count for PMA */
+	u64 z_port_rcv_errors;			/* starting count for PMA */
+	u64 z_port_rcv_remphys_errors;		/* starting count for PMA */
+	u64 z_port_xmit_discards;		/* starting count for PMA */
+	u64 z_port_xmit_data;			/* starting count for PMA */
+	u64 z_port_rcv_data;			/* starting count for PMA */
+	u64 z_port_xmit_packets;		/* starting count for PMA */
+	u64 z_port_rcv_packets;			/* starting count for PMA */
+	u32 z_pkey_violations;			/* starting count for PMA */
+	u32 z_local_link_integrity_errors;	/* starting count for PMA */
+	u32 z_excessive_buffer_overrun_errors;	/* starting count for PMA */
+	u32 z_vl15_dropped;			/* starting count for PMA */
+	u32 n_rc_resends;
+	u32 n_rc_acks;
+	u32 n_rc_qacks;
+	u32 n_seq_naks;
+	u32 n_rdma_seq;
+	u32 n_rnr_naks;
+	u32 n_other_naks;
+	u32 n_timeouts;
+	u32 n_pkt_drops;
+	u32 n_vl15_dropped;
+	u32 n_wqe_errs;
+	u32 n_rdma_dup_busy;
+	u32 n_piowait;
+	u32 n_unaligned;
+	u32 port_cap_flags;
+	u32 pma_sample_start;
+	u32 pma_sample_interval;
+	__be16 pma_counter_select[5];
+	u16 pma_tag;
+	u16 qkey_violations;
+	u16 mkey_violations;
+	u16 mkey_lease_period;
+	u16 pending_index;	/* which pending queue is active */
+	u8 pma_sample_status;
+	u8 subnet_timeout;
+	u8 vl_high_limit;
+	struct ipath_opcode_stats opstats[128];
+};
+
+struct ipath_verbs_counters {
+	u64 symbol_error_counter;
+	u64 link_error_recovery_counter;
+	u64 link_downed_counter;
+	u64 port_rcv_errors;
+	u64 port_rcv_remphys_errors;
+	u64 port_xmit_discards;
+	u64 port_xmit_data;
+	u64 port_rcv_data;
+	u64 port_xmit_packets;
+	u64 port_rcv_packets;
+	u32 local_link_integrity_errors;
+	u32 excessive_buffer_overrun_errors;
+	u32 vl15_dropped;
+};
+
+struct ipath_verbs_txreq {
+	struct ipath_qp         *qp;
+	struct ipath_swqe       *wqe;
+	u32                      map_len;
+	u32                      len;
+	struct ipath_sge_state  *ss;
+	struct ipath_pio_header  hdr;
+	struct ipath_sdma_txreq  txreq;
+};
+
+static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct ipath_mr, ibmr);
+}
+
+static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct ipath_pd, ibpd);
+}
+
+static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct ipath_ah, ibah);
+}
+
+static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct ipath_cq, ibcq);
+}
+
+static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct ipath_srq, ibsrq);
+}
+
+static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct ipath_qp, ibqp);
+}
+
+static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct ipath_ibdev, ibdev);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+static inline void ipath_schedule_send(struct ipath_qp *qp)
+{
+	if (qp->s_flags & IPATH_S_ANY_WAIT)
+		qp->s_flags &= ~IPATH_S_ANY_WAIT;
+	if (!(qp->s_flags & IPATH_S_BUSY))
+		tasklet_hi_schedule(&qp->s_task);
+}
+
+int ipath_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      const struct ib_wc *in_wc,
+		      const struct ib_grh *in_grh,
+		      const struct ib_mad_hdr *in, size_t in_mad_size,
+		      struct ib_mad_hdr *out, size_t *out_mad_size,
+		      u16 *out_mad_pkey_index);
+
+/*
+ * Compare the lower 24 bits of the two values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int ipath_cmp24(u32 a, u32 b)
+{
+	return (((int) a) - ((int) b)) << 8;
+}
+
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+			    u64 *rwords, u64 *spkts, u64 *rpkts,
+			    u64 *xmit_wait);
+
+int ipath_get_counters(struct ipath_devdata *dd,
+		       struct ipath_verbs_counters *cntrs);
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_mcast_tree_empty(void);
+
+__be32 ipath_compute_aeth(struct ipath_qp *qp);
+
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
+
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+			      struct ib_qp_init_attr *init_attr,
+			      struct ib_udata *udata);
+
+int ipath_destroy_qp(struct ib_qp *ibqp);
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		    int attr_mask, struct ib_udata *udata);
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		   int attr_mask, struct ib_qp_init_attr *init_attr);
+
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
+
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
+
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
+
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
+
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+		     u32 hdrwords, struct ipath_sge_state *ss, u32 len);
+
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
+
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
+
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
+
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
+		     struct ipath_mregion *mr);
+
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
+
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+		  struct ib_sge *sge, int acc);
+
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+		  u32 len, u64 vaddr, u32 rkey, int acc);
+
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			   struct ib_recv_wr **bad_wr);
+
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+				struct ib_srq_init_attr *srq_init_attr,
+				struct ib_udata *udata);
+
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		     enum ib_srq_attr_mask attr_mask,
+		     struct ib_udata *udata);
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int ipath_destroy_srq(struct ib_srq *ibsrq);
+
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
+
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
+			      const struct ib_cq_init_attr *attr,
+			      struct ib_ucontext *context,
+			      struct ib_udata *udata);
+
+int ipath_destroy_cq(struct ib_cq *ibcq);
+
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
+
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+				struct ib_phys_buf *buffer_list,
+				int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				u64 virt_addr, int mr_access_flags,
+				struct ib_udata *udata);
+
+int ipath_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			       struct ib_fmr_attr *fmr_attr);
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+		       int list_len, u64 iova);
+
+int ipath_unmap_fmr(struct list_head *fmr_list);
+
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
+
+void ipath_release_mmap_info(struct kref *ref);
+
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+					       u32 size,
+					       struct ib_ucontext *context,
+					       void *obj);
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+			    struct ipath_mmap_info *ip,
+			    u32 size, void *obj);
+
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+void ipath_insert_rnr_queue(struct ipath_qp *qp);
+
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+		   u32 *lengthp, struct ipath_sge_state *ss);
+
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
+
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+		   struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			   struct ipath_other_headers *ohdr,
+			   u32 bth0, u32 bth2);
+
+void ipath_do_send(unsigned long data);
+
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+			 enum ib_wc_status status);
+
+int ipath_make_rc_req(struct ipath_qp *qp);
+
+int ipath_make_uc_req(struct ipath_qp *qp);
+
+int ipath_make_ud_req(struct ipath_qp *qp);
+
+int ipath_register_ib_device(struct ipath_devdata *);
+
+void ipath_unregister_ib_device(struct ipath_ibdev *);
+
+void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32);
+
+int ipath_ib_piobufavail(struct ipath_ibdev *);
+
+unsigned ipath_get_npkeys(struct ipath_devdata *);
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *);
+
+unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
+
+extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
+
+/*
+ * Below converts HCA-specific LinkTrainingState to IB PhysPortState
+ * values.
+ */
+extern const u8 ipath_cvt_physportstate[];
+#define IB_PHYSPORTSTATE_SLEEP 1
+#define IB_PHYSPORTSTATE_POLL 2
+#define IB_PHYSPORTSTATE_DISABLED 3
+#define IB_PHYSPORTSTATE_CFG_TRAIN 4
+#define IB_PHYSPORTSTATE_LINKUP 5
+#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
+
+extern const int ib_ipath_state_ops[];
+
+extern unsigned int ib_ipath_lkey_table_size;
+
+extern unsigned int ib_ipath_max_cqes;
+
+extern unsigned int ib_ipath_max_cqs;
+
+extern unsigned int ib_ipath_max_qp_wrs;
+
+extern unsigned int ib_ipath_max_qps;
+
+extern unsigned int ib_ipath_max_sges;
+
+extern unsigned int ib_ipath_max_mcast_grps;
+
+extern unsigned int ib_ipath_max_mcast_qp_attached;
+
+extern unsigned int ib_ipath_max_srqs;
+
+extern unsigned int ib_ipath_max_srq_sges;
+
+extern unsigned int ib_ipath_max_srq_wrs;
+
+extern const u32 ib_ipath_rnr_table[];
+
+extern struct ib_dma_mapping_ops ipath_dma_mapping_ops;
+
+#endif				/* IPATH_VERBS_H */
diff --git a/drivers/staging/rdma/ipath/ipath_verbs_mcast.c b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c
new file mode 100644
index 0000000..6216ea9
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "ipath_verbs.h"
+
+/*
+ * Global table of GID to attached QPs.
+ * The table is global to all ipath devices since a send from one QP/device
+ * needs to be locally routed to any locally attached QPs on the same
+ * or different device.
+ */
+static struct rb_root mcast_tree;
+static DEFINE_SPINLOCK(mcast_lock);
+
+/**
+ * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
+{
+	struct ipath_mcast_qp *mqp;
+
+	mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
+	if (!mqp)
+		goto bail;
+
+	mqp->qp = qp;
+	atomic_inc(&qp->refcount);
+
+bail:
+	return mqp;
+}
+
+static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
+{
+	struct ipath_qp *qp = mqp->qp;
+
+	/* Notify ipath_destroy_qp() if it is waiting. */
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+
+	kfree(mqp);
+}
+
+/**
+ * ipath_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
+{
+	struct ipath_mcast *mcast;
+
+	mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+	if (!mcast)
+		goto bail;
+
+	mcast->mgid = *mgid;
+	INIT_LIST_HEAD(&mcast->qp_list);
+	init_waitqueue_head(&mcast->wait);
+	atomic_set(&mcast->refcount, 0);
+	mcast->n_attached = 0;
+
+bail:
+	return mcast;
+}
+
+static void ipath_mcast_free(struct ipath_mcast *mcast)
+{
+	struct ipath_mcast_qp *p, *tmp;
+
+	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+		ipath_mcast_qp_free(p);
+
+	kfree(mcast);
+}
+
+/**
+ * ipath_mcast_find - search the global table for the given multicast GID
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
+{
+	struct rb_node *n;
+	unsigned long flags;
+	struct ipath_mcast *mcast;
+
+	spin_lock_irqsave(&mcast_lock, flags);
+	n = mcast_tree.rb_node;
+	while (n) {
+		int ret;
+
+		mcast = rb_entry(n, struct ipath_mcast, rb_node);
+
+		ret = memcmp(mgid->raw, mcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else {
+			atomic_inc(&mcast->refcount);
+			spin_unlock_irqrestore(&mcast_lock, flags);
+			goto bail;
+		}
+	}
+	spin_unlock_irqrestore(&mcast_lock, flags);
+
+	mcast = NULL;
+
+bail:
+	return mcast;
+}
+
+/**
+ * ipath_mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added.  Return EEXIST if the GID was already in
+ * the table but the QP was added.  Return ESRCH if the QP was already
+ * attached and neither structure was added.
+ */
+static int ipath_mcast_add(struct ipath_ibdev *dev,
+			   struct ipath_mcast *mcast,
+			   struct ipath_mcast_qp *mqp)
+{
+	struct rb_node **n = &mcast_tree.rb_node;
+	struct rb_node *pn = NULL;
+	int ret;
+
+	spin_lock_irq(&mcast_lock);
+
+	while (*n) {
+		struct ipath_mcast *tmcast;
+		struct ipath_mcast_qp *p;
+
+		pn = *n;
+		tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
+
+		ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0) {
+			n = &pn->rb_left;
+			continue;
+		}
+		if (ret > 0) {
+			n = &pn->rb_right;
+			continue;
+		}
+
+		/* Search the QP list to see if this is already there. */
+		list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+			if (p->qp == mqp->qp) {
+				ret = ESRCH;
+				goto bail;
+			}
+		}
+		if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) {
+			ret = ENOMEM;
+			goto bail;
+		}
+
+		tmcast->n_attached++;
+
+		list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+		ret = EEXIST;
+		goto bail;
+	}
+
+	spin_lock(&dev->n_mcast_grps_lock);
+	if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) {
+		spin_unlock(&dev->n_mcast_grps_lock);
+		ret = ENOMEM;
+		goto bail;
+	}
+
+	dev->n_mcast_grps_allocated++;
+	spin_unlock(&dev->n_mcast_grps_lock);
+
+	mcast->n_attached++;
+
+	list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+	atomic_inc(&mcast->refcount);
+	rb_link_node(&mcast->rb_node, pn, n);
+	rb_insert_color(&mcast->rb_node, &mcast_tree);
+
+	ret = 0;
+
+bail:
+	spin_unlock_irq(&mcast_lock);
+
+	return ret;
+}
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	struct ipath_ibdev *dev = to_idev(ibqp->device);
+	struct ipath_mcast *mcast;
+	struct ipath_mcast_qp *mqp;
+	int ret;
+
+	/*
+	 * Allocate data structures since its better to do this outside of
+	 * spin locks and it will most likely be needed.
+	 */
+	mcast = ipath_mcast_alloc(gid);
+	if (mcast == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	mqp = ipath_mcast_qp_alloc(qp);
+	if (mqp == NULL) {
+		ipath_mcast_free(mcast);
+		ret = -ENOMEM;
+		goto bail;
+	}
+	switch (ipath_mcast_add(dev, mcast, mqp)) {
+	case ESRCH:
+		/* Neither was used: can't attach the same QP twice. */
+		ipath_mcast_qp_free(mqp);
+		ipath_mcast_free(mcast);
+		ret = -EINVAL;
+		goto bail;
+	case EEXIST:		/* The mcast wasn't used */
+		ipath_mcast_free(mcast);
+		break;
+	case ENOMEM:
+		/* Exceeded the maximum number of mcast groups. */
+		ipath_mcast_qp_free(mqp);
+		ipath_mcast_free(mcast);
+		ret = -ENOMEM;
+		goto bail;
+	default:
+		break;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	struct ipath_ibdev *dev = to_idev(ibqp->device);
+	struct ipath_mcast *mcast = NULL;
+	struct ipath_mcast_qp *p, *tmp;
+	struct rb_node *n;
+	int last = 0;
+	int ret;
+
+	spin_lock_irq(&mcast_lock);
+
+	/* Find the GID in the mcast table. */
+	n = mcast_tree.rb_node;
+	while (1) {
+		if (n == NULL) {
+			spin_unlock_irq(&mcast_lock);
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		mcast = rb_entry(n, struct ipath_mcast, rb_node);
+		ret = memcmp(gid->raw, mcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			break;
+	}
+
+	/* Search the QP list. */
+	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+		if (p->qp != qp)
+			continue;
+		/*
+		 * We found it, so remove it, but don't poison the forward
+		 * link until we are sure there are no list walkers.
+		 */
+		list_del_rcu(&p->list);
+		mcast->n_attached--;
+
+		/* If this was the last attached QP, remove the GID too. */
+		if (list_empty(&mcast->qp_list)) {
+			rb_erase(&mcast->rb_node, &mcast_tree);
+			last = 1;
+		}
+		break;
+	}
+
+	spin_unlock_irq(&mcast_lock);
+
+	if (p) {
+		/*
+		 * Wait for any list walkers to finish before freeing the
+		 * list element.
+		 */
+		wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+		ipath_mcast_qp_free(p);
+	}
+	if (last) {
+		atomic_dec(&mcast->refcount);
+		wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+		ipath_mcast_free(mcast);
+		spin_lock_irq(&dev->n_mcast_grps_lock);
+		dev->n_mcast_grps_allocated--;
+		spin_unlock_irq(&dev->n_mcast_grps_lock);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int ipath_mcast_tree_empty(void)
+{
+	return mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_wc_ppc64.c b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c
new file mode 100644
index 0000000..1a7e20a
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on PowerPC only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * Nothing to do on PowerPC, so just return without error.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+	return 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_wc_x86_64.c b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c
new file mode 100644
index 0000000..7b6e4c8
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on x86_64 only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include <linux/pci.h>
+#include <asm/processor.h>
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
+ * write combining.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+	int ret = 0;
+	u64 pioaddr, piolen;
+	unsigned bits;
+	const unsigned long addr = pci_resource_start(dd->pcidev, 0);
+	const size_t len = pci_resource_len(dd->pcidev, 0);
+
+	/*
+	 * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
+	 * chip.  Linux (possibly the hardware) requires it to be on a power
+	 * of 2 address matching the length (which has to be a power of 2).
+	 * For rev1, that means the base address, for rev2, it will be just
+	 * the PIO buffers themselves.
+	 * For chips with two sets of buffers, the calculations are
+	 * somewhat more complicated; we need to sum, and the piobufbase
+	 * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
+	 * The buffers are still packed, so a single range covers both.
+	 */
+	if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */
+		unsigned long pio2kbase, pio4kbase;
+		pio2kbase = dd->ipath_piobufbase & 0xffffffffUL;
+		pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL;
+		if (pio2kbase < pio4kbase) { /* all, for now */
+			pioaddr = addr + pio2kbase;
+			piolen = pio4kbase - pio2kbase +
+				dd->ipath_piobcnt4k * dd->ipath_4kalign;
+		} else {
+			pioaddr = addr + pio4kbase;
+			piolen = pio2kbase - pio4kbase +
+				dd->ipath_piobcnt2k * dd->ipath_palign;
+		}
+	} else {  /* single buffer size (2K, currently) */
+		pioaddr = addr + dd->ipath_piobufbase;
+		piolen = dd->ipath_piobcnt2k * dd->ipath_palign +
+			dd->ipath_piobcnt4k * dd->ipath_4kalign;
+	}
+
+	for (bits = 0; !(piolen & (1ULL << bits)); bits++)
+		/* do nothing */ ;
+
+	if (piolen != (1ULL << bits)) {
+		piolen >>= bits;
+		while (piolen >>= 1)
+			bits++;
+		piolen = 1ULL << (bits + 1);
+	}
+	if (pioaddr & (piolen - 1)) {
+		u64 atmp;
+		ipath_dbg("pioaddr %llx not on right boundary for size "
+			  "%llx, fixing\n",
+			  (unsigned long long) pioaddr,
+			  (unsigned long long) piolen);
+		atmp = pioaddr & ~(piolen - 1);
+		if (atmp < addr || (atmp + piolen) > (addr + len)) {
+			ipath_dev_err(dd, "No way to align address/size "
+				      "(%llx/%llx), no WC mtrr\n",
+				      (unsigned long long) atmp,
+				      (unsigned long long) piolen << 1);
+			ret = -ENODEV;
+		} else {
+			ipath_dbg("changing WC base from %llx to %llx, "
+				  "len from %llx to %llx\n",
+				  (unsigned long long) pioaddr,
+				  (unsigned long long) atmp,
+				  (unsigned long long) piolen,
+				  (unsigned long long) piolen << 1);
+			pioaddr = atmp;
+			piolen <<= 1;
+		}
+	}
+
+	if (!ret) {
+		dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen);
+		if (dd->wc_cookie < 0) {
+			ipath_dev_err(dd, "Seting mtrr failed on PIO buffers\n");
+			ret = -ENODEV;
+		} else if (dd->wc_cookie == 0)
+			ipath_cdbg(VERBOSE, "Set mtrr for chip to WC not needed\n");
+		else
+			ipath_cdbg(VERBOSE, "Set mtrr for chip to WC\n");
+	}
+
+	return ret;
+}
+
+/**
+ * ipath_disable_wc - disable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ */
+void ipath_disable_wc(struct ipath_devdata *dd)
+{
+	arch_phys_wc_del(dd->wc_cookie);
+}
-- 
cgit v0.10.2


From 072bf1f7e4b5963034df35460f5f311396347a36 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Wed, 29 Jul 2015 09:44:14 -0500
Subject: RDMA/amso1100: Deprecate the amso1100 driver and move to staging

The HW hasn't been sold since 2005, and the SW has definite bit rot.
Its time to remove it.  So move it to staging for a few releases and
then remove it after that.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 7d6034f..da4c697 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -57,7 +57,6 @@ config INFINIBAND_ADDR_TRANS
 source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
 source "drivers/infiniband/hw/ehca/Kconfig"
-source "drivers/infiniband/hw/amso1100/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index d1212c4..1bdb999 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -1,7 +1,6 @@
 obj-$(CONFIG_INFINIBAND_MTHCA)		+= mthca/
 obj-$(CONFIG_INFINIBAND_QIB)		+= qib/
 obj-$(CONFIG_INFINIBAND_EHCA)		+= ehca/
-obj-$(CONFIG_INFINIBAND_AMSO1100)	+= amso1100/
 obj-$(CONFIG_INFINIBAND_CXGB3)		+= cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4)		+= cxgb4/
 obj-$(CONFIG_MLX4_INFINIBAND)		+= mlx4/
diff --git a/drivers/infiniband/hw/amso1100/Kbuild b/drivers/infiniband/hw/amso1100/Kbuild
deleted file mode 100644
index 950dfab..0000000
--- a/drivers/infiniband/hw/amso1100/Kbuild
+++ /dev/null
@@ -1,6 +0,0 @@
-ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG
-
-obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o
-
-iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \
-	c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o
diff --git a/drivers/infiniband/hw/amso1100/Kconfig b/drivers/infiniband/hw/amso1100/Kconfig
deleted file mode 100644
index e6ce5f2..0000000
--- a/drivers/infiniband/hw/amso1100/Kconfig
+++ /dev/null
@@ -1,15 +0,0 @@
-config INFINIBAND_AMSO1100
-	tristate "Ammasso 1100 HCA support"
-	depends on PCI && INET
-	---help---
-	  This is a low-level driver for the Ammasso 1100 host
-	  channel adapter (HCA).
-
-config INFINIBAND_AMSO1100_DEBUG
-	bool "Verbose debugging output"
-	depends on INFINIBAND_AMSO1100
-	default n
-	---help---
-	  This option causes the amso1100 driver to produce a bunch of
-	  debug messages.  Select this if you are developing the driver
-	  or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c
deleted file mode 100644
index 766a71c..0000000
--- a/drivers/infiniband/hw/amso1100/c2.c
+++ /dev/null
@@ -1,1241 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/slab.h>
-#include <linux/prefetch.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include <rdma/ib_smi.h>
-#include "c2.h"
-#include "c2_provider.h"
-
-MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
-MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
-
-static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
-    | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
-
-static int debug = -1;		/* defaults above */
-module_param(debug, int, 0);
-MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
-
-static int c2_up(struct net_device *netdev);
-static int c2_down(struct net_device *netdev);
-static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
-static void c2_tx_interrupt(struct net_device *netdev);
-static void c2_rx_interrupt(struct net_device *netdev);
-static irqreturn_t c2_interrupt(int irq, void *dev_id);
-static void c2_tx_timeout(struct net_device *netdev);
-static int c2_change_mtu(struct net_device *netdev, int new_mtu);
-static void c2_reset(struct c2_port *c2_port);
-
-static struct pci_device_id c2_pci_table[] = {
-	{ PCI_DEVICE(0x18b8, 0xb001) },
-	{ 0 }
-};
-
-MODULE_DEVICE_TABLE(pci, c2_pci_table);
-
-static void c2_print_macaddr(struct net_device *netdev)
-{
-	pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq);
-}
-
-static void c2_set_rxbufsize(struct c2_port *c2_port)
-{
-	struct net_device *netdev = c2_port->netdev;
-
-	if (netdev->mtu > RX_BUF_SIZE)
-		c2_port->rx_buf_size =
-		    netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) +
-		    NET_IP_ALIGN;
-	else
-		c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE;
-}
-
-/*
- * Allocate TX ring elements and chain them together.
- * One-to-one association of adapter descriptors with ring elements.
- */
-static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr,
-			    dma_addr_t base, void __iomem * mmio_txp_ring)
-{
-	struct c2_tx_desc *tx_desc;
-	struct c2_txp_desc __iomem *txp_desc;
-	struct c2_element *elem;
-	int i;
-
-	tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL);
-	if (!tx_ring->start)
-		return -ENOMEM;
-
-	elem = tx_ring->start;
-	tx_desc = vaddr;
-	txp_desc = mmio_txp_ring;
-	for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) {
-		tx_desc->len = 0;
-		tx_desc->status = 0;
-
-		/* Set TXP_HTXD_UNINIT */
-		__raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
-			     (void __iomem *) txp_desc + C2_TXP_ADDR);
-		__raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN);
-		__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
-			     (void __iomem *) txp_desc + C2_TXP_FLAGS);
-
-		elem->skb = NULL;
-		elem->ht_desc = tx_desc;
-		elem->hw_desc = txp_desc;
-
-		if (i == tx_ring->count - 1) {
-			elem->next = tx_ring->start;
-			tx_desc->next_offset = base;
-		} else {
-			elem->next = elem + 1;
-			tx_desc->next_offset =
-			    base + (i + 1) * sizeof(*tx_desc);
-		}
-	}
-
-	tx_ring->to_use = tx_ring->to_clean = tx_ring->start;
-
-	return 0;
-}
-
-/*
- * Allocate RX ring elements and chain them together.
- * One-to-one association of adapter descriptors with ring elements.
- */
-static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr,
-			    dma_addr_t base, void __iomem * mmio_rxp_ring)
-{
-	struct c2_rx_desc *rx_desc;
-	struct c2_rxp_desc __iomem *rxp_desc;
-	struct c2_element *elem;
-	int i;
-
-	rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL);
-	if (!rx_ring->start)
-		return -ENOMEM;
-
-	elem = rx_ring->start;
-	rx_desc = vaddr;
-	rxp_desc = mmio_rxp_ring;
-	for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) {
-		rx_desc->len = 0;
-		rx_desc->status = 0;
-
-		/* Set RXP_HRXD_UNINIT */
-		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK),
-		       (void __iomem *) rxp_desc + C2_RXP_STATUS);
-		__raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT);
-		__raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN);
-		__raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
-			     (void __iomem *) rxp_desc + C2_RXP_ADDR);
-		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
-			     (void __iomem *) rxp_desc + C2_RXP_FLAGS);
-
-		elem->skb = NULL;
-		elem->ht_desc = rx_desc;
-		elem->hw_desc = rxp_desc;
-
-		if (i == rx_ring->count - 1) {
-			elem->next = rx_ring->start;
-			rx_desc->next_offset = base;
-		} else {
-			elem->next = elem + 1;
-			rx_desc->next_offset =
-			    base + (i + 1) * sizeof(*rx_desc);
-		}
-	}
-
-	rx_ring->to_use = rx_ring->to_clean = rx_ring->start;
-
-	return 0;
-}
-
-/* Setup buffer for receiving */
-static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem)
-{
-	struct c2_dev *c2dev = c2_port->c2dev;
-	struct c2_rx_desc *rx_desc = elem->ht_desc;
-	struct sk_buff *skb;
-	dma_addr_t mapaddr;
-	u32 maplen;
-	struct c2_rxp_hdr *rxp_hdr;
-
-	skb = dev_alloc_skb(c2_port->rx_buf_size);
-	if (unlikely(!skb)) {
-		pr_debug("%s: out of memory for receive\n",
-			c2_port->netdev->name);
-		return -ENOMEM;
-	}
-
-	/* Zero out the rxp hdr in the sk_buff */
-	memset(skb->data, 0, sizeof(*rxp_hdr));
-
-	skb->dev = c2_port->netdev;
-
-	maplen = c2_port->rx_buf_size;
-	mapaddr =
-	    pci_map_single(c2dev->pcidev, skb->data, maplen,
-			   PCI_DMA_FROMDEVICE);
-
-	/* Set the sk_buff RXP_header to RXP_HRXD_READY */
-	rxp_hdr = (struct c2_rxp_hdr *) skb->data;
-	rxp_hdr->flags = RXP_HRXD_READY;
-
-	__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-	__raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)),
-		     elem->hw_desc + C2_RXP_LEN);
-	__raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR);
-	__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-		     elem->hw_desc + C2_RXP_FLAGS);
-
-	elem->skb = skb;
-	elem->mapaddr = mapaddr;
-	elem->maplen = maplen;
-	rx_desc->len = maplen;
-
-	return 0;
-}
-
-/*
- * Allocate buffers for the Rx ring
- * For receive:  rx_ring.to_clean is next received frame
- */
-static int c2_rx_fill(struct c2_port *c2_port)
-{
-	struct c2_ring *rx_ring = &c2_port->rx_ring;
-	struct c2_element *elem;
-	int ret = 0;
-
-	elem = rx_ring->start;
-	do {
-		if (c2_rx_alloc(c2_port, elem)) {
-			ret = 1;
-			break;
-		}
-	} while ((elem = elem->next) != rx_ring->start);
-
-	rx_ring->to_clean = rx_ring->start;
-	return ret;
-}
-
-/* Free all buffers in RX ring, assumes receiver stopped */
-static void c2_rx_clean(struct c2_port *c2_port)
-{
-	struct c2_dev *c2dev = c2_port->c2dev;
-	struct c2_ring *rx_ring = &c2_port->rx_ring;
-	struct c2_element *elem;
-	struct c2_rx_desc *rx_desc;
-
-	elem = rx_ring->start;
-	do {
-		rx_desc = elem->ht_desc;
-		rx_desc->len = 0;
-
-		__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-		__raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
-		__raw_writew(0, elem->hw_desc + C2_RXP_LEN);
-		__raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
-			     elem->hw_desc + C2_RXP_ADDR);
-		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
-			     elem->hw_desc + C2_RXP_FLAGS);
-
-		if (elem->skb) {
-			pci_unmap_single(c2dev->pcidev, elem->mapaddr,
-					 elem->maplen, PCI_DMA_FROMDEVICE);
-			dev_kfree_skb(elem->skb);
-			elem->skb = NULL;
-		}
-	} while ((elem = elem->next) != rx_ring->start);
-}
-
-static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem)
-{
-	struct c2_tx_desc *tx_desc = elem->ht_desc;
-
-	tx_desc->len = 0;
-
-	pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen,
-			 PCI_DMA_TODEVICE);
-
-	if (elem->skb) {
-		dev_kfree_skb_any(elem->skb);
-		elem->skb = NULL;
-	}
-
-	return 0;
-}
-
-/* Free all buffers in TX ring, assumes transmitter stopped */
-static void c2_tx_clean(struct c2_port *c2_port)
-{
-	struct c2_ring *tx_ring = &c2_port->tx_ring;
-	struct c2_element *elem;
-	struct c2_txp_desc txp_htxd;
-	int retry;
-	unsigned long flags;
-
-	spin_lock_irqsave(&c2_port->tx_lock, flags);
-
-	elem = tx_ring->start;
-
-	do {
-		retry = 0;
-		do {
-			txp_htxd.flags =
-			    readw(elem->hw_desc + C2_TXP_FLAGS);
-
-			if (txp_htxd.flags == TXP_HTXD_READY) {
-				retry = 1;
-				__raw_writew(0,
-					     elem->hw_desc + C2_TXP_LEN);
-				__raw_writeq(0,
-					     elem->hw_desc + C2_TXP_ADDR);
-				__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE),
-					     elem->hw_desc + C2_TXP_FLAGS);
-				c2_port->netdev->stats.tx_dropped++;
-				break;
-			} else {
-				__raw_writew(0,
-					     elem->hw_desc + C2_TXP_LEN);
-				__raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
-					     elem->hw_desc + C2_TXP_ADDR);
-				__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
-					     elem->hw_desc + C2_TXP_FLAGS);
-			}
-
-			c2_tx_free(c2_port->c2dev, elem);
-
-		} while ((elem = elem->next) != tx_ring->start);
-	} while (retry);
-
-	c2_port->tx_avail = c2_port->tx_ring.count - 1;
-	c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start;
-
-	if (c2_port->tx_avail > MAX_SKB_FRAGS + 1)
-		netif_wake_queue(c2_port->netdev);
-
-	spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-}
-
-/*
- * Process transmit descriptors marked 'DONE' by the firmware,
- * freeing up their unneeded sk_buffs.
- */
-static void c2_tx_interrupt(struct net_device *netdev)
-{
-	struct c2_port *c2_port = netdev_priv(netdev);
-	struct c2_dev *c2dev = c2_port->c2dev;
-	struct c2_ring *tx_ring = &c2_port->tx_ring;
-	struct c2_element *elem;
-	struct c2_txp_desc txp_htxd;
-
-	spin_lock(&c2_port->tx_lock);
-
-	for (elem = tx_ring->to_clean; elem != tx_ring->to_use;
-	     elem = elem->next) {
-		txp_htxd.flags =
-		    be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS));
-
-		if (txp_htxd.flags != TXP_HTXD_DONE)
-			break;
-
-		if (netif_msg_tx_done(c2_port)) {
-			/* PCI reads are expensive in fast path */
-			txp_htxd.len =
-			    be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN));
-			pr_debug("%s: tx done slot %3Zu status 0x%x len "
-				"%5u bytes\n",
-				netdev->name, elem - tx_ring->start,
-				txp_htxd.flags, txp_htxd.len);
-		}
-
-		c2_tx_free(c2dev, elem);
-		++(c2_port->tx_avail);
-	}
-
-	tx_ring->to_clean = elem;
-
-	if (netif_queue_stopped(netdev)
-	    && c2_port->tx_avail > MAX_SKB_FRAGS + 1)
-		netif_wake_queue(netdev);
-
-	spin_unlock(&c2_port->tx_lock);
-}
-
-static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem)
-{
-	struct c2_rx_desc *rx_desc = elem->ht_desc;
-	struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
-
-	if (rxp_hdr->status != RXP_HRXD_OK ||
-	    rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) {
-		pr_debug("BAD RXP_HRXD\n");
-		pr_debug("  rx_desc : %p\n", rx_desc);
-		pr_debug("    index : %Zu\n",
-			elem - c2_port->rx_ring.start);
-		pr_debug("    len   : %u\n", rx_desc->len);
-		pr_debug("  rxp_hdr : %p [PA %p]\n", rxp_hdr,
-			(void *) __pa((unsigned long) rxp_hdr));
-		pr_debug("    flags : 0x%x\n", rxp_hdr->flags);
-		pr_debug("    status: 0x%x\n", rxp_hdr->status);
-		pr_debug("    len   : %u\n", rxp_hdr->len);
-		pr_debug("    rsvd  : 0x%x\n", rxp_hdr->rsvd);
-	}
-
-	/* Setup the skb for reuse since we're dropping this pkt */
-	elem->skb->data = elem->skb->head;
-	skb_reset_tail_pointer(elem->skb);
-
-	/* Zero out the rxp hdr in the sk_buff */
-	memset(elem->skb->data, 0, sizeof(*rxp_hdr));
-
-	/* Write the descriptor to the adapter's rx ring */
-	__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-	__raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
-	__raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)),
-		     elem->hw_desc + C2_RXP_LEN);
-	__raw_writeq((__force u64) cpu_to_be64(elem->mapaddr),
-		     elem->hw_desc + C2_RXP_ADDR);
-	__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-		     elem->hw_desc + C2_RXP_FLAGS);
-
-	pr_debug("packet dropped\n");
-	c2_port->netdev->stats.rx_dropped++;
-}
-
-static void c2_rx_interrupt(struct net_device *netdev)
-{
-	struct c2_port *c2_port = netdev_priv(netdev);
-	struct c2_dev *c2dev = c2_port->c2dev;
-	struct c2_ring *rx_ring = &c2_port->rx_ring;
-	struct c2_element *elem;
-	struct c2_rx_desc *rx_desc;
-	struct c2_rxp_hdr *rxp_hdr;
-	struct sk_buff *skb;
-	dma_addr_t mapaddr;
-	u32 maplen, buflen;
-	unsigned long flags;
-
-	spin_lock_irqsave(&c2dev->lock, flags);
-
-	/* Begin where we left off */
-	rx_ring->to_clean = rx_ring->start + c2dev->cur_rx;
-
-	for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean;
-	     elem = elem->next) {
-		rx_desc = elem->ht_desc;
-		mapaddr = elem->mapaddr;
-		maplen = elem->maplen;
-		skb = elem->skb;
-		rxp_hdr = (struct c2_rxp_hdr *) skb->data;
-
-		if (rxp_hdr->flags != RXP_HRXD_DONE)
-			break;
-		buflen = rxp_hdr->len;
-
-		/* Sanity check the RXP header */
-		if (rxp_hdr->status != RXP_HRXD_OK ||
-		    buflen > (rx_desc->len - sizeof(*rxp_hdr))) {
-			c2_rx_error(c2_port, elem);
-			continue;
-		}
-
-		/*
-		 * Allocate and map a new skb for replenishing the host
-		 * RX desc
-		 */
-		if (c2_rx_alloc(c2_port, elem)) {
-			c2_rx_error(c2_port, elem);
-			continue;
-		}
-
-		/* Unmap the old skb */
-		pci_unmap_single(c2dev->pcidev, mapaddr, maplen,
-				 PCI_DMA_FROMDEVICE);
-
-		prefetch(skb->data);
-
-		/*
-		 * Skip past the leading 8 bytes comprising of the
-		 * "struct c2_rxp_hdr", prepended by the adapter
-		 * to the usual Ethernet header ("struct ethhdr"),
-		 * to the start of the raw Ethernet packet.
-		 *
-		 * Fix up the various fields in the sk_buff before
-		 * passing it up to netif_rx(). The transfer size
-		 * (in bytes) specified by the adapter len field of
-		 * the "struct rxp_hdr_t" does NOT include the
-		 * "sizeof(struct c2_rxp_hdr)".
-		 */
-		skb->data += sizeof(*rxp_hdr);
-		skb_set_tail_pointer(skb, buflen);
-		skb->len = buflen;
-		skb->protocol = eth_type_trans(skb, netdev);
-
-		netif_rx(skb);
-
-		netdev->stats.rx_packets++;
-		netdev->stats.rx_bytes += buflen;
-	}
-
-	/* Save where we left off */
-	rx_ring->to_clean = elem;
-	c2dev->cur_rx = elem - rx_ring->start;
-	C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
-
-	spin_unlock_irqrestore(&c2dev->lock, flags);
-}
-
-/*
- * Handle netisr0 TX & RX interrupts.
- */
-static irqreturn_t c2_interrupt(int irq, void *dev_id)
-{
-	unsigned int netisr0, dmaisr;
-	int handled = 0;
-	struct c2_dev *c2dev = (struct c2_dev *) dev_id;
-
-	/* Process CCILNET interrupts */
-	netisr0 = readl(c2dev->regs + C2_NISR0);
-	if (netisr0) {
-
-		/*
-		 * There is an issue with the firmware that always
-		 * provides the status of RX for both TX & RX
-		 * interrupts.  So process both queues here.
-		 */
-		c2_rx_interrupt(c2dev->netdev);
-		c2_tx_interrupt(c2dev->netdev);
-
-		/* Clear the interrupt */
-		writel(netisr0, c2dev->regs + C2_NISR0);
-		handled++;
-	}
-
-	/* Process RNIC interrupts */
-	dmaisr = readl(c2dev->regs + C2_DISR);
-	if (dmaisr) {
-		writel(dmaisr, c2dev->regs + C2_DISR);
-		c2_rnic_interrupt(c2dev);
-		handled++;
-	}
-
-	if (handled) {
-		return IRQ_HANDLED;
-	} else {
-		return IRQ_NONE;
-	}
-}
-
-static int c2_up(struct net_device *netdev)
-{
-	struct c2_port *c2_port = netdev_priv(netdev);
-	struct c2_dev *c2dev = c2_port->c2dev;
-	struct c2_element *elem;
-	struct c2_rxp_hdr *rxp_hdr;
-	struct in_device *in_dev;
-	size_t rx_size, tx_size;
-	int ret, i;
-	unsigned int netimr0;
-
-	if (netif_msg_ifup(c2_port))
-		pr_debug("%s: enabling interface\n", netdev->name);
-
-	/* Set the Rx buffer size based on MTU */
-	c2_set_rxbufsize(c2_port);
-
-	/* Allocate DMA'able memory for Tx/Rx host descriptor rings */
-	rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc);
-	tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc);
-
-	c2_port->mem_size = tx_size + rx_size;
-	c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size,
-					     &c2_port->dma);
-	if (c2_port->mem == NULL) {
-		pr_debug("Unable to allocate memory for "
-			"host descriptor rings\n");
-		return -ENOMEM;
-	}
-
-	/* Create the Rx host descriptor ring */
-	if ((ret =
-	     c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma,
-			      c2dev->mmio_rxp_ring))) {
-		pr_debug("Unable to create RX ring\n");
-		goto bail0;
-	}
-
-	/* Allocate Rx buffers for the host descriptor ring */
-	if (c2_rx_fill(c2_port)) {
-		pr_debug("Unable to fill RX ring\n");
-		goto bail1;
-	}
-
-	/* Create the Tx host descriptor ring */
-	if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size,
-				    c2_port->dma + rx_size,
-				    c2dev->mmio_txp_ring))) {
-		pr_debug("Unable to create TX ring\n");
-		goto bail1;
-	}
-
-	/* Set the TX pointer to where we left off */
-	c2_port->tx_avail = c2_port->tx_ring.count - 1;
-	c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean =
-	    c2_port->tx_ring.start + c2dev->cur_tx;
-
-	/* missing: Initialize MAC */
-
-	BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean);
-
-	/* Reset the adapter, ensures the driver is in sync with the RXP */
-	c2_reset(c2_port);
-
-	/* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */
-	for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count;
-	     i++, elem++) {
-		rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
-		rxp_hdr->flags = 0;
-		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-			     elem->hw_desc + C2_RXP_FLAGS);
-	}
-
-	/* Enable network packets */
-	netif_start_queue(netdev);
-
-	/* Enable IRQ */
-	writel(0, c2dev->regs + C2_IDIS);
-	netimr0 = readl(c2dev->regs + C2_NIMR0);
-	netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT);
-	writel(netimr0, c2dev->regs + C2_NIMR0);
-
-	/* Tell the stack to ignore arp requests for ipaddrs bound to
-	 * other interfaces.  This is needed to prevent the host stack
-	 * from responding to arp requests to the ipaddr bound on the
-	 * rdma interface.
-	 */
-	in_dev = in_dev_get(netdev);
-	IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
-	in_dev_put(in_dev);
-
-	return 0;
-
-      bail1:
-	c2_rx_clean(c2_port);
-	kfree(c2_port->rx_ring.start);
-
-      bail0:
-	pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
-			    c2_port->dma);
-
-	return ret;
-}
-
-static int c2_down(struct net_device *netdev)
-{
-	struct c2_port *c2_port = netdev_priv(netdev);
-	struct c2_dev *c2dev = c2_port->c2dev;
-
-	if (netif_msg_ifdown(c2_port))
-		pr_debug("%s: disabling interface\n",
-			netdev->name);
-
-	/* Wait for all the queued packets to get sent */
-	c2_tx_interrupt(netdev);
-
-	/* Disable network packets */
-	netif_stop_queue(netdev);
-
-	/* Disable IRQs by clearing the interrupt mask */
-	writel(1, c2dev->regs + C2_IDIS);
-	writel(0, c2dev->regs + C2_NIMR0);
-
-	/* missing: Stop transmitter */
-
-	/* missing: Stop receiver */
-
-	/* Reset the adapter, ensures the driver is in sync with the RXP */
-	c2_reset(c2_port);
-
-	/* missing: Turn off LEDs here */
-
-	/* Free all buffers in the host descriptor rings */
-	c2_tx_clean(c2_port);
-	c2_rx_clean(c2_port);
-
-	/* Free the host descriptor rings */
-	kfree(c2_port->rx_ring.start);
-	kfree(c2_port->tx_ring.start);
-	pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
-			    c2_port->dma);
-
-	return 0;
-}
-
-static void c2_reset(struct c2_port *c2_port)
-{
-	struct c2_dev *c2dev = c2_port->c2dev;
-	unsigned int cur_rx = c2dev->cur_rx;
-
-	/* Tell the hardware to quiesce */
-	C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI);
-
-	/*
-	 * The hardware will reset the C2_PCI_HRX_QUI bit once
-	 * the RXP is quiesced.  Wait 2 seconds for this.
-	 */
-	ssleep(2);
-
-	cur_rx = C2_GET_CUR_RX(c2dev);
-
-	if (cur_rx & C2_PCI_HRX_QUI)
-		pr_debug("c2_reset: failed to quiesce the hardware!\n");
-
-	cur_rx &= ~C2_PCI_HRX_QUI;
-
-	c2dev->cur_rx = cur_rx;
-
-	pr_debug("Current RX: %u\n", c2dev->cur_rx);
-}
-
-static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
-{
-	struct c2_port *c2_port = netdev_priv(netdev);
-	struct c2_dev *c2dev = c2_port->c2dev;
-	struct c2_ring *tx_ring = &c2_port->tx_ring;
-	struct c2_element *elem;
-	dma_addr_t mapaddr;
-	u32 maplen;
-	unsigned long flags;
-	unsigned int i;
-
-	spin_lock_irqsave(&c2_port->tx_lock, flags);
-
-	if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) {
-		netif_stop_queue(netdev);
-		spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-
-		pr_debug("%s: Tx ring full when queue awake!\n",
-			netdev->name);
-		return NETDEV_TX_BUSY;
-	}
-
-	maplen = skb_headlen(skb);
-	mapaddr =
-	    pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE);
-
-	elem = tx_ring->to_use;
-	elem->skb = skb;
-	elem->mapaddr = mapaddr;
-	elem->maplen = maplen;
-
-	/* Tell HW to xmit */
-	__raw_writeq((__force u64) cpu_to_be64(mapaddr),
-		     elem->hw_desc + C2_TXP_ADDR);
-	__raw_writew((__force u16) cpu_to_be16(maplen),
-		     elem->hw_desc + C2_TXP_LEN);
-	__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
-		     elem->hw_desc + C2_TXP_FLAGS);
-
-	netdev->stats.tx_packets++;
-	netdev->stats.tx_bytes += maplen;
-
-	/* Loop thru additional data fragments and queue them */
-	if (skb_shinfo(skb)->nr_frags) {
-		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-			maplen = skb_frag_size(frag);
-			mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag,
-						   0, maplen, DMA_TO_DEVICE);
-			elem = elem->next;
-			elem->skb = NULL;
-			elem->mapaddr = mapaddr;
-			elem->maplen = maplen;
-
-			/* Tell HW to xmit */
-			__raw_writeq((__force u64) cpu_to_be64(mapaddr),
-				     elem->hw_desc + C2_TXP_ADDR);
-			__raw_writew((__force u16) cpu_to_be16(maplen),
-				     elem->hw_desc + C2_TXP_LEN);
-			__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
-				     elem->hw_desc + C2_TXP_FLAGS);
-
-			netdev->stats.tx_packets++;
-			netdev->stats.tx_bytes += maplen;
-		}
-	}
-
-	tx_ring->to_use = elem->next;
-	c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1);
-
-	if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) {
-		netif_stop_queue(netdev);
-		if (netif_msg_tx_queued(c2_port))
-			pr_debug("%s: transmit queue full\n",
-				netdev->name);
-	}
-
-	spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-
-	netdev->trans_start = jiffies;
-
-	return NETDEV_TX_OK;
-}
-
-static void c2_tx_timeout(struct net_device *netdev)
-{
-	struct c2_port *c2_port = netdev_priv(netdev);
-
-	if (netif_msg_timer(c2_port))
-		pr_debug("%s: tx timeout\n", netdev->name);
-
-	c2_tx_clean(c2_port);
-}
-
-static int c2_change_mtu(struct net_device *netdev, int new_mtu)
-{
-	int ret = 0;
-
-	if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
-		return -EINVAL;
-
-	netdev->mtu = new_mtu;
-
-	if (netif_running(netdev)) {
-		c2_down(netdev);
-
-		c2_up(netdev);
-	}
-
-	return ret;
-}
-
-static const struct net_device_ops c2_netdev = {
-	.ndo_open 		= c2_up,
-	.ndo_stop 		= c2_down,
-	.ndo_start_xmit		= c2_xmit_frame,
-	.ndo_tx_timeout		= c2_tx_timeout,
-	.ndo_change_mtu		= c2_change_mtu,
-	.ndo_set_mac_address 	= eth_mac_addr,
-	.ndo_validate_addr	= eth_validate_addr,
-};
-
-/* Initialize network device */
-static struct net_device *c2_devinit(struct c2_dev *c2dev,
-				     void __iomem * mmio_addr)
-{
-	struct c2_port *c2_port = NULL;
-	struct net_device *netdev = alloc_etherdev(sizeof(*c2_port));
-
-	if (!netdev) {
-		pr_debug("c2_port etherdev alloc failed");
-		return NULL;
-	}
-
-	SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
-
-	netdev->netdev_ops = &c2_netdev;
-	netdev->watchdog_timeo = C2_TX_TIMEOUT;
-	netdev->irq = c2dev->pcidev->irq;
-
-	c2_port = netdev_priv(netdev);
-	c2_port->netdev = netdev;
-	c2_port->c2dev = c2dev;
-	c2_port->msg_enable = netif_msg_init(debug, default_msg);
-	c2_port->tx_ring.count = C2_NUM_TX_DESC;
-	c2_port->rx_ring.count = C2_NUM_RX_DESC;
-
-	spin_lock_init(&c2_port->tx_lock);
-
-	/* Copy our 48-bit ethernet hardware address */
-	memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6);
-
-	/* Validate the MAC address */
-	if (!is_valid_ether_addr(netdev->dev_addr)) {
-		pr_debug("Invalid MAC Address\n");
-		c2_print_macaddr(netdev);
-		free_netdev(netdev);
-		return NULL;
-	}
-
-	c2dev->netdev = netdev;
-
-	return netdev;
-}
-
-static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
-{
-	int ret = 0, i;
-	unsigned long reg0_start, reg0_flags, reg0_len;
-	unsigned long reg2_start, reg2_flags, reg2_len;
-	unsigned long reg4_start, reg4_flags, reg4_len;
-	unsigned kva_map_size;
-	struct net_device *netdev = NULL;
-	struct c2_dev *c2dev = NULL;
-	void __iomem *mmio_regs = NULL;
-
-	printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n",
-		DRV_VERSION);
-
-	/* Enable PCI device */
-	ret = pci_enable_device(pcidev);
-	if (ret) {
-		printk(KERN_ERR PFX "%s: Unable to enable PCI device\n",
-			pci_name(pcidev));
-		goto bail0;
-	}
-
-	reg0_start = pci_resource_start(pcidev, BAR_0);
-	reg0_len = pci_resource_len(pcidev, BAR_0);
-	reg0_flags = pci_resource_flags(pcidev, BAR_0);
-
-	reg2_start = pci_resource_start(pcidev, BAR_2);
-	reg2_len = pci_resource_len(pcidev, BAR_2);
-	reg2_flags = pci_resource_flags(pcidev, BAR_2);
-
-	reg4_start = pci_resource_start(pcidev, BAR_4);
-	reg4_len = pci_resource_len(pcidev, BAR_4);
-	reg4_flags = pci_resource_flags(pcidev, BAR_4);
-
-	pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len);
-	pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len);
-	pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len);
-
-	/* Make sure PCI base addr are MMIO */
-	if (!(reg0_flags & IORESOURCE_MEM) ||
-	    !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) {
-		printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
-		ret = -ENODEV;
-		goto bail1;
-	}
-
-	/* Check for weird/broken PCI region reporting */
-	if ((reg0_len < C2_REG0_SIZE) ||
-	    (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) {
-		printk(KERN_ERR PFX "Invalid PCI region sizes\n");
-		ret = -ENODEV;
-		goto bail1;
-	}
-
-	/* Reserve PCI I/O and memory resources */
-	ret = pci_request_regions(pcidev, DRV_NAME);
-	if (ret) {
-		printk(KERN_ERR PFX "%s: Unable to request regions\n",
-			pci_name(pcidev));
-		goto bail1;
-	}
-
-	if ((sizeof(dma_addr_t) > 4)) {
-		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
-		if (ret < 0) {
-			printk(KERN_ERR PFX "64b DMA configuration failed\n");
-			goto bail2;
-		}
-	} else {
-		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
-		if (ret < 0) {
-			printk(KERN_ERR PFX "32b DMA configuration failed\n");
-			goto bail2;
-		}
-	}
-
-	/* Enables bus-mastering on the device */
-	pci_set_master(pcidev);
-
-	/* Remap the adapter PCI registers in BAR4 */
-	mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
-				    sizeof(struct c2_adapter_pci_regs));
-	if (!mmio_regs) {
-		printk(KERN_ERR PFX
-			"Unable to remap adapter PCI registers in BAR4\n");
-		ret = -EIO;
-		goto bail2;
-	}
-
-	/* Validate PCI regs magic */
-	for (i = 0; i < sizeof(c2_magic); i++) {
-		if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) {
-			printk(KERN_ERR PFX "Downlevel Firmware boot loader "
-				"[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash "
-			       "utility to update your boot loader\n",
-				i + 1, sizeof(c2_magic),
-				readb(mmio_regs + C2_REGS_MAGIC + i),
-				c2_magic[i]);
-			printk(KERN_ERR PFX "Adapter not claimed\n");
-			iounmap(mmio_regs);
-			ret = -EIO;
-			goto bail2;
-		}
-	}
-
-	/* Validate the adapter version */
-	if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) {
-		printk(KERN_ERR PFX "Version mismatch "
-			"[fw=%u, c2=%u], Adapter not claimed\n",
-			be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)),
-			C2_VERSION);
-		ret = -EINVAL;
-		iounmap(mmio_regs);
-		goto bail2;
-	}
-
-	/* Validate the adapter IVN */
-	if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) {
-		printk(KERN_ERR PFX "Downlevel FIrmware level. You should be using "
-		       "the OpenIB device support kit. "
-		       "[fw=0x%x, c2=0x%x], Adapter not claimed\n",
-		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)),
-		       C2_IVN);
-		ret = -EINVAL;
-		iounmap(mmio_regs);
-		goto bail2;
-	}
-
-	/* Allocate hardware structure */
-	c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev));
-	if (!c2dev) {
-		printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n",
-			pci_name(pcidev));
-		ret = -ENOMEM;
-		iounmap(mmio_regs);
-		goto bail2;
-	}
-
-	memset(c2dev, 0, sizeof(*c2dev));
-	spin_lock_init(&c2dev->lock);
-	c2dev->pcidev = pcidev;
-	c2dev->cur_tx = 0;
-
-	/* Get the last RX index */
-	c2dev->cur_rx =
-	    (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) -
-	     0xffffc000) / sizeof(struct c2_rxp_desc);
-
-	/* Request an interrupt line for the driver */
-	ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev);
-	if (ret) {
-		printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
-			pci_name(pcidev), pcidev->irq);
-		iounmap(mmio_regs);
-		goto bail3;
-	}
-
-	/* Set driver specific data */
-	pci_set_drvdata(pcidev, c2dev);
-
-	/* Initialize network device */
-	if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) {
-		ret = -ENOMEM;
-		iounmap(mmio_regs);
-		goto bail4;
-	}
-
-	/* Save off the actual size prior to unmapping mmio_regs */
-	kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE));
-
-	/* Unmap the adapter PCI registers in BAR4 */
-	iounmap(mmio_regs);
-
-	/* Register network device */
-	ret = register_netdev(netdev);
-	if (ret) {
-		printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n",
-			ret);
-		goto bail5;
-	}
-
-	/* Disable network packets */
-	netif_stop_queue(netdev);
-
-	/* Remap the adapter HRXDQ PA space to kernel VA space */
-	c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET,
-					       C2_RXP_HRXDQ_SIZE);
-	if (!c2dev->mmio_rxp_ring) {
-		printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n");
-		ret = -EIO;
-		goto bail6;
-	}
-
-	/* Remap the adapter HTXDQ PA space to kernel VA space */
-	c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET,
-					       C2_TXP_HTXDQ_SIZE);
-	if (!c2dev->mmio_txp_ring) {
-		printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n");
-		ret = -EIO;
-		goto bail7;
-	}
-
-	/* Save off the current RX index in the last 4 bytes of the TXP Ring */
-	C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
-
-	/* Remap the PCI registers in adapter BAR0 to kernel VA space */
-	c2dev->regs = ioremap_nocache(reg0_start, reg0_len);
-	if (!c2dev->regs) {
-		printk(KERN_ERR PFX "Unable to remap BAR0\n");
-		ret = -EIO;
-		goto bail8;
-	}
-
-	/* Remap the PCI registers in adapter BAR4 to kernel VA space */
-	c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET;
-	c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
-				     kva_map_size);
-	if (!c2dev->kva) {
-		printk(KERN_ERR PFX "Unable to remap BAR4\n");
-		ret = -EIO;
-		goto bail9;
-	}
-
-	/* Print out the MAC address */
-	c2_print_macaddr(netdev);
-
-	ret = c2_rnic_init(c2dev);
-	if (ret) {
-		printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret);
-		goto bail10;
-	}
-
-	ret = c2_register_device(c2dev);
-	if (ret)
-		goto bail10;
-
-	return 0;
-
- bail10:
-	iounmap(c2dev->kva);
-
- bail9:
-	iounmap(c2dev->regs);
-
- bail8:
-	iounmap(c2dev->mmio_txp_ring);
-
- bail7:
-	iounmap(c2dev->mmio_rxp_ring);
-
- bail6:
-	unregister_netdev(netdev);
-
- bail5:
-	free_netdev(netdev);
-
- bail4:
-	free_irq(pcidev->irq, c2dev);
-
- bail3:
-	ib_dealloc_device(&c2dev->ibdev);
-
- bail2:
-	pci_release_regions(pcidev);
-
- bail1:
-	pci_disable_device(pcidev);
-
- bail0:
-	return ret;
-}
-
-static void c2_remove(struct pci_dev *pcidev)
-{
-	struct c2_dev *c2dev = pci_get_drvdata(pcidev);
-	struct net_device *netdev = c2dev->netdev;
-
-	/* Unregister with OpenIB */
-	c2_unregister_device(c2dev);
-
-	/* Clean up the RNIC resources */
-	c2_rnic_term(c2dev);
-
-	/* Remove network device from the kernel */
-	unregister_netdev(netdev);
-
-	/* Free network device */
-	free_netdev(netdev);
-
-	/* Free the interrupt line */
-	free_irq(pcidev->irq, c2dev);
-
-	/* missing: Turn LEDs off here */
-
-	/* Unmap adapter PA space */
-	iounmap(c2dev->kva);
-	iounmap(c2dev->regs);
-	iounmap(c2dev->mmio_txp_ring);
-	iounmap(c2dev->mmio_rxp_ring);
-
-	/* Free the hardware structure */
-	ib_dealloc_device(&c2dev->ibdev);
-
-	/* Release reserved PCI I/O and memory resources */
-	pci_release_regions(pcidev);
-
-	/* Disable PCI device */
-	pci_disable_device(pcidev);
-
-	/* Clear driver specific data */
-	pci_set_drvdata(pcidev, NULL);
-}
-
-static struct pci_driver c2_pci_driver = {
-	.name = DRV_NAME,
-	.id_table = c2_pci_table,
-	.probe = c2_probe,
-	.remove = c2_remove,
-};
-
-module_pci_driver(c2_pci_driver);
diff --git a/drivers/infiniband/hw/amso1100/c2.h b/drivers/infiniband/hw/amso1100/c2.h
deleted file mode 100644
index d619d73..0000000
--- a/drivers/infiniband/hw/amso1100/c2.h
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __C2_H
-#define __C2_H
-
-#include <linux/netdevice.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/idr.h>
-
-#include "c2_provider.h"
-#include "c2_mq.h"
-#include "c2_status.h"
-
-#define DRV_NAME     "c2"
-#define DRV_VERSION  "1.1"
-#define PFX          DRV_NAME ": "
-
-#define BAR_0                0
-#define BAR_2                2
-#define BAR_4                4
-
-#define RX_BUF_SIZE         (1536 + 8)
-#define ETH_JUMBO_MTU        9000
-#define C2_MAGIC            "CEPHEUS"
-#define C2_VERSION           4
-#define C2_IVN              (18 & 0x7fffffff)
-
-#define C2_REG0_SIZE        (16 * 1024)
-#define C2_REG2_SIZE        (2 * 1024 * 1024)
-#define C2_REG4_SIZE        (256 * 1024 * 1024)
-#define C2_NUM_TX_DESC       341
-#define C2_NUM_RX_DESC       256
-#define C2_PCI_REGS_OFFSET  (0x10000)
-#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2))
-#define C2_RXP_HRXDQ_SIZE   (4096)
-#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE)
-#define C2_TXP_HTXDQ_SIZE   (4096)
-#define C2_TX_TIMEOUT	    (6*HZ)
-
-/* CEPHEUS */
-static const u8 c2_magic[] = {
-	0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53
-};
-
-enum adapter_pci_regs {
-	C2_REGS_MAGIC = 0x0000,
-	C2_REGS_VERS = 0x0008,
-	C2_REGS_IVN = 0x000C,
-	C2_REGS_PCI_WINSIZE = 0x0010,
-	C2_REGS_Q0_QSIZE = 0x0014,
-	C2_REGS_Q0_MSGSIZE = 0x0018,
-	C2_REGS_Q0_POOLSTART = 0x001C,
-	C2_REGS_Q0_SHARED = 0x0020,
-	C2_REGS_Q1_QSIZE = 0x0024,
-	C2_REGS_Q1_MSGSIZE = 0x0028,
-	C2_REGS_Q1_SHARED = 0x0030,
-	C2_REGS_Q2_QSIZE = 0x0034,
-	C2_REGS_Q2_MSGSIZE = 0x0038,
-	C2_REGS_Q2_SHARED = 0x0040,
-	C2_REGS_ENADDR = 0x004C,
-	C2_REGS_RDMA_ENADDR = 0x0054,
-	C2_REGS_HRX_CUR = 0x006C,
-};
-
-struct c2_adapter_pci_regs {
-	char reg_magic[8];
-	u32 version;
-	u32 ivn;
-	u32 pci_window_size;
-	u32 q0_q_size;
-	u32 q0_msg_size;
-	u32 q0_pool_start;
-	u32 q0_shared;
-	u32 q1_q_size;
-	u32 q1_msg_size;
-	u32 q1_pool_start;
-	u32 q1_shared;
-	u32 q2_q_size;
-	u32 q2_msg_size;
-	u32 q2_pool_start;
-	u32 q2_shared;
-	u32 log_start;
-	u32 log_size;
-	u8 host_enaddr[8];
-	u8 rdma_enaddr[8];
-	u32 crash_entry;
-	u32 crash_ready[2];
-	u32 fw_txd_cur;
-	u32 fw_hrxd_cur;
-	u32 fw_rxd_cur;
-};
-
-enum pci_regs {
-	C2_HISR = 0x0000,
-	C2_DISR = 0x0004,
-	C2_HIMR = 0x0008,
-	C2_DIMR = 0x000C,
-	C2_NISR0 = 0x0010,
-	C2_NISR1 = 0x0014,
-	C2_NIMR0 = 0x0018,
-	C2_NIMR1 = 0x001C,
-	C2_IDIS = 0x0020,
-};
-
-enum {
-	C2_PCI_HRX_INT = 1 << 8,
-	C2_PCI_HTX_INT = 1 << 17,
-	C2_PCI_HRX_QUI = 1 << 31,
-};
-
-/*
- * Cepheus registers in BAR0.
- */
-struct c2_pci_regs {
-	u32 hostisr;
-	u32 dmaisr;
-	u32 hostimr;
-	u32 dmaimr;
-	u32 netisr0;
-	u32 netisr1;
-	u32 netimr0;
-	u32 netimr1;
-	u32 int_disable;
-};
-
-/* TXP flags */
-enum c2_txp_flags {
-	TXP_HTXD_DONE = 0,
-	TXP_HTXD_READY = 1 << 0,
-	TXP_HTXD_UNINIT = 1 << 1,
-};
-
-/* RXP flags */
-enum c2_rxp_flags {
-	RXP_HRXD_UNINIT = 0,
-	RXP_HRXD_READY = 1 << 0,
-	RXP_HRXD_DONE = 1 << 1,
-};
-
-/* RXP status */
-enum c2_rxp_status {
-	RXP_HRXD_ZERO = 0,
-	RXP_HRXD_OK = 1 << 0,
-	RXP_HRXD_BUF_OV = 1 << 1,
-};
-
-/* TXP descriptor fields */
-enum txp_desc {
-	C2_TXP_FLAGS = 0x0000,
-	C2_TXP_LEN = 0x0002,
-	C2_TXP_ADDR = 0x0004,
-};
-
-/* RXP descriptor fields */
-enum rxp_desc {
-	C2_RXP_FLAGS = 0x0000,
-	C2_RXP_STATUS = 0x0002,
-	C2_RXP_COUNT = 0x0004,
-	C2_RXP_LEN = 0x0006,
-	C2_RXP_ADDR = 0x0008,
-};
-
-struct c2_txp_desc {
-	u16 flags;
-	u16 len;
-	u64 addr;
-} __attribute__ ((packed));
-
-struct c2_rxp_desc {
-	u16 flags;
-	u16 status;
-	u16 count;
-	u16 len;
-	u64 addr;
-} __attribute__ ((packed));
-
-struct c2_rxp_hdr {
-	u16 flags;
-	u16 status;
-	u16 len;
-	u16 rsvd;
-} __attribute__ ((packed));
-
-struct c2_tx_desc {
-	u32 len;
-	u32 status;
-	dma_addr_t next_offset;
-};
-
-struct c2_rx_desc {
-	u32 len;
-	u32 status;
-	dma_addr_t next_offset;
-};
-
-struct c2_alloc {
-	u32 last;
-	u32 max;
-	spinlock_t lock;
-	unsigned long *table;
-};
-
-struct c2_array {
-	struct {
-		void **page;
-		int used;
-	} *page_list;
-};
-
-/*
- * The MQ shared pointer pool is organized as a linked list of
- * chunks. Each chunk contains a linked list of free shared pointers
- * that can be allocated to a given user mode client.
- *
- */
-struct sp_chunk {
-	struct sp_chunk *next;
-	dma_addr_t dma_addr;
-	DEFINE_DMA_UNMAP_ADDR(mapping);
-	u16 head;
-	u16 shared_ptr[0];
-};
-
-struct c2_pd_table {
-	u32 last;
-	u32 max;
-	spinlock_t lock;
-	unsigned long *table;
-};
-
-struct c2_qp_table {
-	struct idr idr;
-	spinlock_t lock;
-};
-
-struct c2_element {
-	struct c2_element *next;
-	void *ht_desc;		/* host     descriptor */
-	void __iomem *hw_desc;	/* hardware descriptor */
-	struct sk_buff *skb;
-	dma_addr_t mapaddr;
-	u32 maplen;
-};
-
-struct c2_ring {
-	struct c2_element *to_clean;
-	struct c2_element *to_use;
-	struct c2_element *start;
-	unsigned long count;
-};
-
-struct c2_dev {
-	struct ib_device ibdev;
-	void __iomem *regs;
-	void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */
-	void __iomem *mmio_rxp_ring;
-	spinlock_t lock;
-	struct pci_dev *pcidev;
-	struct net_device *netdev;
-	struct net_device *pseudo_netdev;
-	unsigned int cur_tx;
-	unsigned int cur_rx;
-	u32 adapter_handle;
-	int device_cap_flags;
-	void __iomem *kva;	/* KVA device memory */
-	unsigned long pa;	/* PA device memory */
-	void **qptr_array;
-
-	struct kmem_cache *host_msg_cache;
-
-	struct list_head cca_link;		/* adapter list */
-	struct list_head eh_wakeup_list;	/* event wakeup list */
-	wait_queue_head_t req_vq_wo;
-
-	/* Cached RNIC properties */
-	struct ib_device_attr props;
-
-	struct c2_pd_table pd_table;
-	struct c2_qp_table qp_table;
-	int ports;		/* num of GigE ports */
-	int devnum;
-	spinlock_t vqlock;	/* sync vbs req MQ */
-
-	/* Verbs Queues */
-	struct c2_mq req_vq;	/* Verbs Request MQ */
-	struct c2_mq rep_vq;	/* Verbs Reply MQ */
-	struct c2_mq aeq;	/* Async Events MQ */
-
-	/* Kernel client MQs */
-	struct sp_chunk *kern_mqsp_pool;
-
-	/* Device updates these values when posting messages to a host
-	 * target queue */
-	u16 req_vq_shared;
-	u16 rep_vq_shared;
-	u16 aeq_shared;
-	u16 irq_claimed;
-
-	/*
-	 * Shared host target pages for user-accessible MQs.
-	 */
-	int hthead;		/* index of first free entry */
-	void *htpages;		/* kernel vaddr */
-	int htlen;		/* length of htpages memory */
-	void *htuva;		/* user mapped vaddr */
-	spinlock_t htlock;	/* serialize allocation */
-
-	u64 adapter_hint_uva;	/* access to the activity FIFO */
-
-	//	spinlock_t aeq_lock;
-	//	spinlock_t rnic_lock;
-
-	__be16 *hint_count;
-	dma_addr_t hint_count_dma;
-	u16 hints_read;
-
-	int init;		/* TRUE if it's ready */
-	char ae_cache_name[16];
-	char vq_cache_name[16];
-};
-
-struct c2_port {
-	u32 msg_enable;
-	struct c2_dev *c2dev;
-	struct net_device *netdev;
-
-	spinlock_t tx_lock;
-	u32 tx_avail;
-	struct c2_ring tx_ring;
-	struct c2_ring rx_ring;
-
-	void *mem;		/* PCI memory for host rings */
-	dma_addr_t dma;
-	unsigned long mem_size;
-
-	u32 rx_buf_size;
-};
-
-/*
- * Activity FIFO registers in BAR0.
- */
-#define PCI_BAR0_HOST_HINT	0x100
-#define PCI_BAR0_ADAPTER_HINT	0x2000
-
-/*
- * Ammasso PCI vendor id and Cepheus PCI device id.
- */
-#define CQ_ARMED 	0x01
-#define CQ_WAIT_FOR_DMA	0x80
-
-/*
- * The format of a hint is as follows:
- * Lower 16 bits are the count of hints for the queue.
- * Next 15 bits are the qp_index
- * Upper most bit depends on who reads it:
- *    If read by producer, then it means Full (1) or Not-Full (0)
- *    If read by consumer, then it means Empty (1) or Not-Empty (0)
- */
-#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
-#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
-#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
-
-
-/*
- * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t
- * struct.
- */
-#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000
-
-#ifndef readq
-static inline u64 readq(const void __iomem * addr)
-{
-	u64 ret = readl(addr + 4);
-	ret <<= 32;
-	ret |= readl(addr);
-
-	return ret;
-}
-#endif
-
-#ifndef writeq
-static inline void __raw_writeq(u64 val, void __iomem * addr)
-{
-	__raw_writel((u32) (val), addr);
-	__raw_writel((u32) (val >> 32), (addr + 4));
-}
-#endif
-
-#define C2_SET_CUR_RX(c2dev, cur_rx) \
-	__raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092)
-
-#define C2_GET_CUR_RX(c2dev) \
-	be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092))
-
-static inline struct c2_dev *to_c2dev(struct ib_device *ibdev)
-{
-	return container_of(ibdev, struct c2_dev, ibdev);
-}
-
-static inline int c2_errno(void *reply)
-{
-	switch (c2_wr_get_result(reply)) {
-	case C2_OK:
-		return 0;
-	case CCERR_NO_BUFS:
-	case CCERR_INSUFFICIENT_RESOURCES:
-	case CCERR_ZERO_RDMA_READ_RESOURCES:
-		return -ENOMEM;
-	case CCERR_MR_IN_USE:
-	case CCERR_QP_IN_USE:
-		return -EBUSY;
-	case CCERR_ADDR_IN_USE:
-		return -EADDRINUSE;
-	case CCERR_ADDR_NOT_AVAIL:
-		return -EADDRNOTAVAIL;
-	case CCERR_CONN_RESET:
-		return -ECONNRESET;
-	case CCERR_NOT_IMPLEMENTED:
-	case CCERR_INVALID_WQE:
-		return -ENOSYS;
-	case CCERR_QP_NOT_PRIVILEGED:
-		return -EPERM;
-	case CCERR_STACK_ERROR:
-		return -EPROTO;
-	case CCERR_ACCESS_VIOLATION:
-	case CCERR_BASE_AND_BOUNDS_VIOLATION:
-		return -EFAULT;
-	case CCERR_STAG_STATE_NOT_INVALID:
-	case CCERR_INVALID_ADDRESS:
-	case CCERR_INVALID_CQ:
-	case CCERR_INVALID_EP:
-	case CCERR_INVALID_MODIFIER:
-	case CCERR_INVALID_MTU:
-	case CCERR_INVALID_PD_ID:
-	case CCERR_INVALID_QP:
-	case CCERR_INVALID_RNIC:
-	case CCERR_INVALID_STAG:
-		return -EINVAL;
-	default:
-		return -EAGAIN;
-	}
-}
-
-/* Device */
-extern int c2_register_device(struct c2_dev *c2dev);
-extern void c2_unregister_device(struct c2_dev *c2dev);
-extern int c2_rnic_init(struct c2_dev *c2dev);
-extern void c2_rnic_term(struct c2_dev *c2dev);
-extern void c2_rnic_interrupt(struct c2_dev *c2dev);
-extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
-extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
-
-/* QPs */
-extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd,
-		       struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp);
-extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp);
-extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn);
-extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
-			struct ib_qp_attr *attr, int attr_mask);
-extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
-				 int ord, int ird);
-extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
-			struct ib_send_wr **bad_wr);
-extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
-			   struct ib_recv_wr **bad_wr);
-extern void c2_init_qp_table(struct c2_dev *c2dev);
-extern void c2_cleanup_qp_table(struct c2_dev *c2dev);
-extern void c2_set_qp_state(struct c2_qp *, int);
-extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn);
-
-/* PDs */
-extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd);
-extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd);
-extern int c2_init_pd_table(struct c2_dev *c2dev);
-extern void c2_cleanup_pd_table(struct c2_dev *c2dev);
-
-/* CQs */
-extern int c2_init_cq(struct c2_dev *c2dev, int entries,
-		      struct c2_ucontext *ctx, struct c2_cq *cq);
-extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq);
-extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
-extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
-extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
-extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
-
-/* CM */
-extern int c2_llp_connect(struct iw_cm_id *cm_id,
-			  struct iw_cm_conn_param *iw_param);
-extern int c2_llp_accept(struct iw_cm_id *cm_id,
-			 struct iw_cm_conn_param *iw_param);
-extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata,
-			 u8 pdata_len);
-extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog);
-extern int c2_llp_service_destroy(struct iw_cm_id *cm_id);
-
-/* MM */
-extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
- 				      int page_size, int pbl_depth, u32 length,
- 				      u32 off, u64 *va, enum c2_acf acf,
-				      struct c2_mr *mr);
-extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index);
-
-/* AE */
-extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index);
-
-/* MQSP Allocator */
-extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
-			     struct sp_chunk **root);
-extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root);
-extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
-			     dma_addr_t *dma_addr, gfp_t gfp_mask);
-extern void c2_free_mqsp(__be16* mqsp);
-#endif
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c
deleted file mode 100644
index cedda25..0000000
--- a/drivers/infiniband/hw/amso1100/c2_ae.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include <rdma/iw_cm.h>
-#include "c2_status.h"
-#include "c2_ae.h"
-
-static int c2_convert_cm_status(u32 c2_status)
-{
-	switch (c2_status) {
-	case C2_CONN_STATUS_SUCCESS:
-		return 0;
-	case C2_CONN_STATUS_REJECTED:
-		return -ENETRESET;
-	case C2_CONN_STATUS_REFUSED:
-		return -ECONNREFUSED;
-	case C2_CONN_STATUS_TIMEDOUT:
-		return -ETIMEDOUT;
-	case C2_CONN_STATUS_NETUNREACH:
-		return -ENETUNREACH;
-	case C2_CONN_STATUS_HOSTUNREACH:
-		return -EHOSTUNREACH;
-	case C2_CONN_STATUS_INVALID_RNIC:
-		return -EINVAL;
-	case C2_CONN_STATUS_INVALID_QP:
-		return -EINVAL;
-	case C2_CONN_STATUS_INVALID_QP_STATE:
-		return -EINVAL;
-	case C2_CONN_STATUS_ADDR_NOT_AVAIL:
-		return -EADDRNOTAVAIL;
-	default:
-		printk(KERN_ERR PFX
-		       "%s - Unable to convert CM status: %d\n",
-		       __func__, c2_status);
-		return -EIO;
-	}
-}
-
-static const char* to_event_str(int event)
-{
-	static const char* event_str[] = {
-		"CCAE_REMOTE_SHUTDOWN",
-		"CCAE_ACTIVE_CONNECT_RESULTS",
-		"CCAE_CONNECTION_REQUEST",
-		"CCAE_LLP_CLOSE_COMPLETE",
-		"CCAE_TERMINATE_MESSAGE_RECEIVED",
-		"CCAE_LLP_CONNECTION_RESET",
-		"CCAE_LLP_CONNECTION_LOST",
-		"CCAE_LLP_SEGMENT_SIZE_INVALID",
-		"CCAE_LLP_INVALID_CRC",
-		"CCAE_LLP_BAD_FPDU",
-		"CCAE_INVALID_DDP_VERSION",
-		"CCAE_INVALID_RDMA_VERSION",
-		"CCAE_UNEXPECTED_OPCODE",
-		"CCAE_INVALID_DDP_QUEUE_NUMBER",
-		"CCAE_RDMA_READ_NOT_ENABLED",
-		"CCAE_RDMA_WRITE_NOT_ENABLED",
-		"CCAE_RDMA_READ_TOO_SMALL",
-		"CCAE_NO_L_BIT",
-		"CCAE_TAGGED_INVALID_STAG",
-		"CCAE_TAGGED_BASE_BOUNDS_VIOLATION",
-		"CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION",
-		"CCAE_TAGGED_INVALID_PD",
-		"CCAE_WRAP_ERROR",
-		"CCAE_BAD_CLOSE",
-		"CCAE_BAD_LLP_CLOSE",
-		"CCAE_INVALID_MSN_RANGE",
-		"CCAE_INVALID_MSN_GAP",
-		"CCAE_IRRQ_OVERFLOW",
-		"CCAE_IRRQ_MSN_GAP",
-		"CCAE_IRRQ_MSN_RANGE",
-		"CCAE_IRRQ_INVALID_STAG",
-		"CCAE_IRRQ_BASE_BOUNDS_VIOLATION",
-		"CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION",
-		"CCAE_IRRQ_INVALID_PD",
-		"CCAE_IRRQ_WRAP_ERROR",
-		"CCAE_CQ_SQ_COMPLETION_OVERFLOW",
-		"CCAE_CQ_RQ_COMPLETION_ERROR",
-		"CCAE_QP_SRQ_WQE_ERROR",
-		"CCAE_QP_LOCAL_CATASTROPHIC_ERROR",
-		"CCAE_CQ_OVERFLOW",
-		"CCAE_CQ_OPERATION_ERROR",
-		"CCAE_SRQ_LIMIT_REACHED",
-		"CCAE_QP_RQ_LIMIT_REACHED",
-		"CCAE_SRQ_CATASTROPHIC_ERROR",
-		"CCAE_RNIC_CATASTROPHIC_ERROR"
-	};
-
-	if (event < CCAE_REMOTE_SHUTDOWN ||
-	    event > CCAE_RNIC_CATASTROPHIC_ERROR)
-		return "<invalid event>";
-
-	event -= CCAE_REMOTE_SHUTDOWN;
-	return event_str[event];
-}
-
-static const char *to_qp_state_str(int state)
-{
-	switch (state) {
-	case C2_QP_STATE_IDLE:
-		return "C2_QP_STATE_IDLE";
-	case C2_QP_STATE_CONNECTING:
-		return "C2_QP_STATE_CONNECTING";
-	case C2_QP_STATE_RTS:
-		return "C2_QP_STATE_RTS";
-	case C2_QP_STATE_CLOSING:
-		return "C2_QP_STATE_CLOSING";
-	case C2_QP_STATE_TERMINATE:
-		return "C2_QP_STATE_TERMINATE";
-	case C2_QP_STATE_ERROR:
-		return "C2_QP_STATE_ERROR";
-	default:
-		return "<invalid QP state>";
-	}
-}
-
-void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
-{
-	struct c2_mq *mq = c2dev->qptr_array[mq_index];
-	union c2wr *wr;
-	void *resource_user_context;
-	struct iw_cm_event cm_event;
-	struct ib_event ib_event;
-	enum c2_resource_indicator resource_indicator;
-	enum c2_event_id event_id;
-	unsigned long flags;
-	int status;
-	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr;
-	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr;
-
-	/*
-	 * retrieve the message
-	 */
-	wr = c2_mq_consume(mq);
-	if (!wr)
-		return;
-
-	memset(&ib_event, 0, sizeof(ib_event));
-	memset(&cm_event, 0, sizeof(cm_event));
-
-	event_id = c2_wr_get_id(wr);
-	resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type);
-	resource_user_context =
-	    (void *) (unsigned long) wr->ae.ae_generic.user_context;
-
-	status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr));
-
-	pr_debug("event received c2_dev=%p, event_id=%d, "
-		"resource_indicator=%d, user_context=%p, status = %d\n",
-		c2dev, event_id, resource_indicator, resource_user_context,
-		status);
-
-	switch (resource_indicator) {
-	case C2_RES_IND_QP:{
-
-		struct c2_qp *qp = (struct c2_qp *)resource_user_context;
-		struct iw_cm_id *cm_id = qp->cm_id;
-		struct c2wr_ae_active_connect_results *res;
-
-		if (!cm_id) {
-			pr_debug("event received, but cm_id is <nul>, qp=%p!\n",
-				qp);
-			goto ignore_it;
-		}
-		pr_debug("%s: event = %s, user_context=%llx, "
-			"resource_type=%x, "
-			"resource=%x, qp_state=%s\n",
-			__func__,
-			to_event_str(event_id),
-			(unsigned long long) wr->ae.ae_generic.user_context,
-			be32_to_cpu(wr->ae.ae_generic.resource_type),
-			be32_to_cpu(wr->ae.ae_generic.resource),
-			to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
-
-		c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state));
-
-		switch (event_id) {
-		case CCAE_ACTIVE_CONNECT_RESULTS:
-			res = &wr->ae.ae_active_connect_results;
-			cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-			laddr->sin_addr.s_addr = res->laddr;
-			raddr->sin_addr.s_addr = res->raddr;
-			laddr->sin_port = res->lport;
-			raddr->sin_port = res->rport;
-			if (status == 0) {
-				cm_event.private_data_len =
-					be32_to_cpu(res->private_data_length);
-				cm_event.private_data = res->private_data;
-			} else {
-				spin_lock_irqsave(&qp->lock, flags);
-				if (qp->cm_id) {
-					qp->cm_id->rem_ref(qp->cm_id);
-					qp->cm_id = NULL;
-				}
-				spin_unlock_irqrestore(&qp->lock, flags);
-				cm_event.private_data_len = 0;
-				cm_event.private_data = NULL;
-			}
-			if (cm_id->event_handler)
-				cm_id->event_handler(cm_id, &cm_event);
-			break;
-		case CCAE_TERMINATE_MESSAGE_RECEIVED:
-		case CCAE_CQ_SQ_COMPLETION_OVERFLOW:
-			ib_event.device = &c2dev->ibdev;
-			ib_event.element.qp = &qp->ibqp;
-			ib_event.event = IB_EVENT_QP_REQ_ERR;
-
-			if (qp->ibqp.event_handler)
-				qp->ibqp.event_handler(&ib_event,
-						       qp->ibqp.
-						       qp_context);
-			break;
-		case CCAE_BAD_CLOSE:
-		case CCAE_LLP_CLOSE_COMPLETE:
-		case CCAE_LLP_CONNECTION_RESET:
-		case CCAE_LLP_CONNECTION_LOST:
-			BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b);
-
-			spin_lock_irqsave(&qp->lock, flags);
-			if (qp->cm_id) {
-				qp->cm_id->rem_ref(qp->cm_id);
-				qp->cm_id = NULL;
-			}
-			spin_unlock_irqrestore(&qp->lock, flags);
-			cm_event.event = IW_CM_EVENT_CLOSE;
-			cm_event.status = 0;
-			if (cm_id->event_handler)
-				cm_id->event_handler(cm_id, &cm_event);
-			break;
-		default:
-			BUG_ON(1);
-			pr_debug("%s:%d Unexpected event_id=%d on QP=%p, "
-				"CM_ID=%p\n",
-				__func__, __LINE__,
-				event_id, qp, cm_id);
-			break;
-		}
-		break;
-	}
-
-	case C2_RES_IND_EP:{
-
-		struct c2wr_ae_connection_request *req =
-			&wr->ae.ae_connection_request;
-		struct iw_cm_id *cm_id =
-			(struct iw_cm_id *)resource_user_context;
-
-		pr_debug("C2_RES_IND_EP event_id=%d\n", event_id);
-		if (event_id != CCAE_CONNECTION_REQUEST) {
-			pr_debug("%s: Invalid event_id: %d\n",
-				__func__, event_id);
-			break;
-		}
-		cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
-		cm_event.provider_data = (void*)(unsigned long)req->cr_handle;
-		laddr->sin_addr.s_addr = req->laddr;
-		raddr->sin_addr.s_addr = req->raddr;
-		laddr->sin_port = req->lport;
-		raddr->sin_port = req->rport;
-		cm_event.private_data_len =
-			be32_to_cpu(req->private_data_length);
-		cm_event.private_data = req->private_data;
-		/*
-		 * Until ird/ord negotiation via MPAv2 support is added, send
-		 * max supported values
-		 */
-		cm_event.ird = cm_event.ord = 128;
-
-		if (cm_id->event_handler)
-			cm_id->event_handler(cm_id, &cm_event);
-		break;
-	}
-
-	case C2_RES_IND_CQ:{
-		struct c2_cq *cq =
-		    (struct c2_cq *) resource_user_context;
-
-		pr_debug("IB_EVENT_CQ_ERR\n");
-		ib_event.device = &c2dev->ibdev;
-		ib_event.element.cq = &cq->ibcq;
-		ib_event.event = IB_EVENT_CQ_ERR;
-
-		if (cq->ibcq.event_handler)
-			cq->ibcq.event_handler(&ib_event,
-					       cq->ibcq.cq_context);
-		break;
-	}
-
-	default:
-		printk("Bad resource indicator = %d\n",
-		       resource_indicator);
-		break;
-	}
-
- ignore_it:
-	c2_mq_free(mq);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.h b/drivers/infiniband/hw/amso1100/c2_ae.h
deleted file mode 100644
index 3a065c3..0000000
--- a/drivers/infiniband/hw/amso1100/c2_ae.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_AE_H_
-#define _C2_AE_H_
-
-/*
- * WARNING: If you change this file, also bump C2_IVN_BASE
- * in common/include/clustercore/c2_ivn.h.
- */
-
-/*
- * Asynchronous Event Identifiers
- *
- * These start at 0x80 only so it's obvious from inspection that
- * they are not work-request statuses.  This isn't critical.
- *
- * NOTE: these event id's must fit in eight bits.
- */
-enum c2_event_id {
-	CCAE_REMOTE_SHUTDOWN = 0x80,
-	CCAE_ACTIVE_CONNECT_RESULTS,
-	CCAE_CONNECTION_REQUEST,
-	CCAE_LLP_CLOSE_COMPLETE,
-	CCAE_TERMINATE_MESSAGE_RECEIVED,
-	CCAE_LLP_CONNECTION_RESET,
-	CCAE_LLP_CONNECTION_LOST,
-	CCAE_LLP_SEGMENT_SIZE_INVALID,
-	CCAE_LLP_INVALID_CRC,
-	CCAE_LLP_BAD_FPDU,
-	CCAE_INVALID_DDP_VERSION,
-	CCAE_INVALID_RDMA_VERSION,
-	CCAE_UNEXPECTED_OPCODE,
-	CCAE_INVALID_DDP_QUEUE_NUMBER,
-	CCAE_RDMA_READ_NOT_ENABLED,
-	CCAE_RDMA_WRITE_NOT_ENABLED,
-	CCAE_RDMA_READ_TOO_SMALL,
-	CCAE_NO_L_BIT,
-	CCAE_TAGGED_INVALID_STAG,
-	CCAE_TAGGED_BASE_BOUNDS_VIOLATION,
-	CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION,
-	CCAE_TAGGED_INVALID_PD,
-	CCAE_WRAP_ERROR,
-	CCAE_BAD_CLOSE,
-	CCAE_BAD_LLP_CLOSE,
-	CCAE_INVALID_MSN_RANGE,
-	CCAE_INVALID_MSN_GAP,
-	CCAE_IRRQ_OVERFLOW,
-	CCAE_IRRQ_MSN_GAP,
-	CCAE_IRRQ_MSN_RANGE,
-	CCAE_IRRQ_INVALID_STAG,
-	CCAE_IRRQ_BASE_BOUNDS_VIOLATION,
-	CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION,
-	CCAE_IRRQ_INVALID_PD,
-	CCAE_IRRQ_WRAP_ERROR,
-	CCAE_CQ_SQ_COMPLETION_OVERFLOW,
-	CCAE_CQ_RQ_COMPLETION_ERROR,
-	CCAE_QP_SRQ_WQE_ERROR,
-	CCAE_QP_LOCAL_CATASTROPHIC_ERROR,
-	CCAE_CQ_OVERFLOW,
-	CCAE_CQ_OPERATION_ERROR,
-	CCAE_SRQ_LIMIT_REACHED,
-	CCAE_QP_RQ_LIMIT_REACHED,
-	CCAE_SRQ_CATASTROPHIC_ERROR,
-	CCAE_RNIC_CATASTROPHIC_ERROR
-/* WARNING If you add more id's, make sure their values fit in eight bits. */
-};
-
-/*
- * Resource Indicators and Identifiers
- */
-enum c2_resource_indicator {
-	C2_RES_IND_QP = 1,
-	C2_RES_IND_EP,
-	C2_RES_IND_CQ,
-	C2_RES_IND_SRQ,
-};
-
-#endif /* _C2_AE_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_alloc.c b/drivers/infiniband/hw/amso1100/c2_alloc.c
deleted file mode 100644
index 78d247e..0000000
--- a/drivers/infiniband/hw/amso1100/c2_alloc.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/errno.h>
-#include <linux/bitmap.h>
-
-#include "c2.h"
-
-static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask,
-			       struct sp_chunk **head)
-{
-	int i;
-	struct sp_chunk *new_head;
-	dma_addr_t dma_addr;
-
-	new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE,
-				      &dma_addr, gfp_mask);
-	if (new_head == NULL)
-		return -ENOMEM;
-
-	new_head->dma_addr = dma_addr;
-	dma_unmap_addr_set(new_head, mapping, new_head->dma_addr);
-
-	new_head->next = NULL;
-	new_head->head = 0;
-
-	/* build list where each index is the next free slot */
-	for (i = 0;
-	     i < (PAGE_SIZE - sizeof(struct sp_chunk) -
-		  sizeof(u16)) / sizeof(u16) - 1;
-	     i++) {
-		new_head->shared_ptr[i] = i + 1;
-	}
-	/* terminate list */
-	new_head->shared_ptr[i] = 0xFFFF;
-
-	*head = new_head;
-	return 0;
-}
-
-int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
-		      struct sp_chunk **root)
-{
-	return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root);
-}
-
-void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root)
-{
-	struct sp_chunk *next;
-
-	while (root) {
-		next = root->next;
-		dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root,
-				  dma_unmap_addr(root, mapping));
-		root = next;
-	}
-}
-
-__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
-		      dma_addr_t *dma_addr, gfp_t gfp_mask)
-{
-	u16 mqsp;
-
-	while (head) {
-		mqsp = head->head;
-		if (mqsp != 0xFFFF) {
-			head->head = head->shared_ptr[mqsp];
-			break;
-		} else if (head->next == NULL) {
-			if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) ==
-			    0) {
-				head = head->next;
-				mqsp = head->head;
-				head->head = head->shared_ptr[mqsp];
-				break;
-			} else
-				return NULL;
-		} else
-			head = head->next;
-	}
-	if (head) {
-		*dma_addr = head->dma_addr +
-			    ((unsigned long) &(head->shared_ptr[mqsp]) -
-			     (unsigned long) head);
-		pr_debug("%s addr %p dma_addr %llx\n", __func__,
-			 &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
-		return (__force __be16 *) &(head->shared_ptr[mqsp]);
-	}
-	return NULL;
-}
-
-void c2_free_mqsp(__be16 *mqsp)
-{
-	struct sp_chunk *head;
-	u16 idx;
-
-	/* The chunk containing this ptr begins at the page boundary */
-	head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK);
-
-	/* Link head to new mqsp */
-	*mqsp = (__force __be16) head->head;
-
-	/* Compute the shared_ptr index */
-	idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1;
-	idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1;
-
-	/* Point this index at the head */
-	head->shared_ptr[idx] = head->head;
-
-	/* Point head at this index */
-	head->head = idx;
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_cm.c b/drivers/infiniband/hw/amso1100/c2_cm.c
deleted file mode 100644
index 23bfa94..0000000
--- a/drivers/infiniband/hw/amso1100/c2_cm.c
+++ /dev/null
@@ -1,461 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/slab.h>
-
-#include "c2.h"
-#include "c2_wr.h"
-#include "c2_vq.h"
-#include <rdma/iw_cm.h>
-
-int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-	struct c2_dev *c2dev = to_c2dev(cm_id->device);
-	struct ib_qp *ibqp;
-	struct c2_qp *qp;
-	struct c2wr_qp_connect_req *wr;	/* variable size needs a malloc. */
-	struct c2_vq_req *vq_req;
-	int err;
-	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
-
-	if (cm_id->remote_addr.ss_family != AF_INET)
-		return -ENOSYS;
-
-	ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
-	if (!ibqp)
-		return -EINVAL;
-	qp = to_c2qp(ibqp);
-
-	/* Associate QP <--> CM_ID */
-	cm_id->provider_data = qp;
-	cm_id->add_ref(cm_id);
-	qp->cm_id = cm_id;
-
-	/*
-	 * only support the max private_data length
-	 */
-	if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
-		err = -EINVAL;
-		goto bail0;
-	}
-	/*
-	 * Set the rdma read limits
-	 */
-	err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
-	if (err)
-		goto bail0;
-
-	/*
-	 * Create and send a WR_QP_CONNECT...
-	 */
-	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-	if (!wr) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	c2_wr_set_id(wr, CCWR_QP_CONNECT);
-	wr->hdr.context = 0;
-	wr->rnic_handle = c2dev->adapter_handle;
-	wr->qp_handle = qp->adapter_handle;
-
-	wr->remote_addr = raddr->sin_addr.s_addr;
-	wr->remote_port = raddr->sin_port;
-
-	/*
-	 * Move any private data from the callers's buf into
-	 * the WR.
-	 */
-	if (iw_param->private_data) {
-		wr->private_data_length =
-			cpu_to_be32(iw_param->private_data_len);
-		memcpy(&wr->private_data[0], iw_param->private_data,
-		       iw_param->private_data_len);
-	} else
-		wr->private_data_length = 0;
-
-	/*
-	 * Send WR to adapter.  NOTE: There is no synch reply from
-	 * the adapter.
-	 */
-	err = vq_send_wr(c2dev, (union c2wr *) wr);
-	vq_req_free(c2dev, vq_req);
-
- bail1:
-	kfree(wr);
- bail0:
-	if (err) {
-		/*
-		 * If we fail, release reference on QP and
-		 * disassociate QP from CM_ID
-		 */
-		cm_id->provider_data = NULL;
-		qp->cm_id = NULL;
-		cm_id->rem_ref(cm_id);
-	}
-	return err;
-}
-
-int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog)
-{
-	struct c2_dev *c2dev;
-	struct c2wr_ep_listen_create_req wr;
-	struct c2wr_ep_listen_create_rep *reply;
-	struct c2_vq_req *vq_req;
-	int err;
-	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
-
-	if (cm_id->local_addr.ss_family != AF_INET)
-		return -ENOSYS;
-
-	c2dev = to_c2dev(cm_id->device);
-	if (c2dev == NULL)
-		return -EINVAL;
-
-	/*
-	 * Allocate verbs request.
-	 */
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	/*
-	 * Build the WR
-	 */
-	c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE);
-	wr.hdr.context = (u64) (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.local_addr = laddr->sin_addr.s_addr;
-	wr.local_port = laddr->sin_port;
-	wr.backlog = cpu_to_be32(backlog);
-	wr.user_context = (u64) (unsigned long) cm_id;
-
-	/*
-	 * Reference the request struct.  Dereferenced in the int handler.
-	 */
-	vq_req_get(c2dev, vq_req);
-
-	/*
-	 * Send WR to adapter
-	 */
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	/*
-	 * Wait for reply from adapter
-	 */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail0;
-
-	/*
-	 * Process reply
-	 */
-	reply =
-	    (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg;
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	if ((err = c2_errno(reply)) != 0)
-		goto bail1;
-
-	/*
-	 * Keep the adapter handle. Used in subsequent destroy
-	 */
-	cm_id->provider_data = (void*)(unsigned long) reply->ep_handle;
-
-	/*
-	 * free vq stuff
-	 */
-	vq_repbuf_free(c2dev, reply);
-	vq_req_free(c2dev, vq_req);
-
-	return 0;
-
- bail1:
-	vq_repbuf_free(c2dev, reply);
- bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-
-int c2_llp_service_destroy(struct iw_cm_id *cm_id)
-{
-
-	struct c2_dev *c2dev;
-	struct c2wr_ep_listen_destroy_req wr;
-	struct c2wr_ep_listen_destroy_rep *reply;
-	struct c2_vq_req *vq_req;
-	int err;
-
-	c2dev = to_c2dev(cm_id->device);
-	if (c2dev == NULL)
-		return -EINVAL;
-
-	/*
-	 * Allocate verbs request.
-	 */
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	/*
-	 * Build the WR
-	 */
-	c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.ep_handle = (u32)(unsigned long)cm_id->provider_data;
-
-	/*
-	 * reference the request struct.  dereferenced in the int handler.
-	 */
-	vq_req_get(c2dev, vq_req);
-
-	/*
-	 * Send WR to adapter
-	 */
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	/*
-	 * Wait for reply from adapter
-	 */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail0;
-
-	/*
-	 * Process reply
-	 */
-	reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg;
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-	if ((err = c2_errno(reply)) != 0)
-		goto bail1;
-
- bail1:
-	vq_repbuf_free(c2dev, reply);
- bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-	struct c2_dev *c2dev = to_c2dev(cm_id->device);
-	struct c2_qp *qp;
-	struct ib_qp *ibqp;
-	struct c2wr_cr_accept_req *wr;	/* variable length WR */
-	struct c2_vq_req *vq_req;
-	struct c2wr_cr_accept_rep *reply;	/* VQ Reply msg ptr. */
-	int err;
-
-	ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
-	if (!ibqp)
-		return -EINVAL;
-	qp = to_c2qp(ibqp);
-
-	/* Set the RDMA read limits */
-	err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
-	if (err)
-		goto bail0;
-
-	/* Allocate verbs request. */
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-	vq_req->qp = qp;
-	vq_req->cm_id = cm_id;
-	vq_req->event = IW_CM_EVENT_ESTABLISHED;
-
-	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-	if (!wr) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	/* Build the WR */
-	c2_wr_set_id(wr, CCWR_CR_ACCEPT);
-	wr->hdr.context = (unsigned long) vq_req;
-	wr->rnic_handle = c2dev->adapter_handle;
-	wr->ep_handle = (u32) (unsigned long) cm_id->provider_data;
-	wr->qp_handle = qp->adapter_handle;
-
-	/* Replace the cr_handle with the QP after accept */
-	cm_id->provider_data = qp;
-	cm_id->add_ref(cm_id);
-	qp->cm_id = cm_id;
-
-	cm_id->provider_data = qp;
-
-	/* Validate private_data length */
-	if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
-		err = -EINVAL;
-		goto bail1;
-	}
-
-	if (iw_param->private_data) {
-		wr->private_data_length = cpu_to_be32(iw_param->private_data_len);
-		memcpy(&wr->private_data[0],
-		       iw_param->private_data, iw_param->private_data_len);
-	} else
-		wr->private_data_length = 0;
-
-	/* Reference the request struct.  Dereferenced in the int handler. */
-	vq_req_get(c2dev, vq_req);
-
-	/* Send WR to adapter */
-	err = vq_send_wr(c2dev, (union c2wr *) wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail1;
-	}
-
-	/* Wait for reply from adapter */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail1;
-
-	/* Check that reply is present */
-	reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	err = c2_errno(reply);
-	vq_repbuf_free(c2dev, reply);
-
-	if (!err)
-		c2_set_qp_state(qp, C2_QP_STATE_RTS);
- bail1:
-	kfree(wr);
-	vq_req_free(c2dev, vq_req);
- bail0:
-	if (err) {
-		/*
-		 * If we fail, release reference on QP and
-		 * disassociate QP from CM_ID
-		 */
-		cm_id->provider_data = NULL;
-		qp->cm_id = NULL;
-		cm_id->rem_ref(cm_id);
-	}
-	return err;
-}
-
-int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-	struct c2_dev *c2dev;
-	struct c2wr_cr_reject_req wr;
-	struct c2_vq_req *vq_req;
-	struct c2wr_cr_reject_rep *reply;
-	int err;
-
-	c2dev = to_c2dev(cm_id->device);
-
-	/*
-	 * Allocate verbs request.
-	 */
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	/*
-	 * Build the WR
-	 */
-	c2_wr_set_id(&wr, CCWR_CR_REJECT);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
-
-	/*
-	 * reference the request struct.  dereferenced in the int handler.
-	 */
-	vq_req_get(c2dev, vq_req);
-
-	/*
-	 * Send WR to adapter
-	 */
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	/*
-	 * Wait for reply from adapter
-	 */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail0;
-
-	/*
-	 * Process reply
-	 */
-	reply = (struct c2wr_cr_reject_rep *) (unsigned long)
-		vq_req->reply_msg;
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-	err = c2_errno(reply);
-	/*
-	 * free vq stuff
-	 */
-	vq_repbuf_free(c2dev, reply);
-
- bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/infiniband/hw/amso1100/c2_cq.c
deleted file mode 100644
index 1b63185..0000000
--- a/drivers/infiniband/hw/amso1100/c2_cq.c
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/gfp.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-#include "c2_status.h"
-
-#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1))
-
-static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn)
-{
-	struct c2_cq *cq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&c2dev->lock, flags);
-	cq = c2dev->qptr_array[cqn];
-	if (!cq) {
-		spin_unlock_irqrestore(&c2dev->lock, flags);
-		return NULL;
-	}
-	atomic_inc(&cq->refcount);
-	spin_unlock_irqrestore(&c2dev->lock, flags);
-	return cq;
-}
-
-static void c2_cq_put(struct c2_cq *cq)
-{
-	if (atomic_dec_and_test(&cq->refcount))
-		wake_up(&cq->wait);
-}
-
-void c2_cq_event(struct c2_dev *c2dev, u32 mq_index)
-{
-	struct c2_cq *cq;
-
-	cq = c2_cq_get(c2dev, mq_index);
-	if (!cq) {
-		printk("discarding events on destroyed CQN=%d\n", mq_index);
-		return;
-	}
-
-	(*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
-	c2_cq_put(cq);
-}
-
-void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index)
-{
-	struct c2_cq *cq;
-	struct c2_mq *q;
-
-	cq = c2_cq_get(c2dev, mq_index);
-	if (!cq)
-		return;
-
-	spin_lock_irq(&cq->lock);
-	q = &cq->mq;
-	if (q && !c2_mq_empty(q)) {
-		u16 priv = q->priv;
-		struct c2wr_ce *msg;
-
-		while (priv != be16_to_cpu(*q->shared)) {
-			msg = (struct c2wr_ce *)
-				(q->msg_pool.host + priv * q->msg_size);
-			if (msg->qp_user_context == (u64) (unsigned long) qp) {
-				msg->qp_user_context = (u64) 0;
-			}
-			priv = (priv + 1) % q->q_size;
-		}
-	}
-	spin_unlock_irq(&cq->lock);
-	c2_cq_put(cq);
-}
-
-static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status)
-{
-	switch (status) {
-	case C2_OK:
-		return IB_WC_SUCCESS;
-	case CCERR_FLUSHED:
-		return IB_WC_WR_FLUSH_ERR;
-	case CCERR_BASE_AND_BOUNDS_VIOLATION:
-		return IB_WC_LOC_PROT_ERR;
-	case CCERR_ACCESS_VIOLATION:
-		return IB_WC_LOC_ACCESS_ERR;
-	case CCERR_TOTAL_LENGTH_TOO_BIG:
-		return IB_WC_LOC_LEN_ERR;
-	case CCERR_INVALID_WINDOW:
-		return IB_WC_MW_BIND_ERR;
-	default:
-		return IB_WC_GENERAL_ERR;
-	}
-}
-
-
-static inline int c2_poll_one(struct c2_dev *c2dev,
-			      struct c2_cq *cq, struct ib_wc *entry)
-{
-	struct c2wr_ce *ce;
-	struct c2_qp *qp;
-	int is_recv = 0;
-
-	ce = c2_mq_consume(&cq->mq);
-	if (!ce) {
-		return -EAGAIN;
-	}
-
-	/*
-	 * if the qp returned is null then this qp has already
-	 * been freed and we are unable process the completion.
-	 * try pulling the next message
-	 */
-	while ((qp =
-		(struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) {
-		c2_mq_free(&cq->mq);
-		ce = c2_mq_consume(&cq->mq);
-		if (!ce)
-			return -EAGAIN;
-	}
-
-	entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce));
-	entry->wr_id = ce->hdr.context;
-	entry->qp = &qp->ibqp;
-	entry->wc_flags = 0;
-	entry->slid = 0;
-	entry->sl = 0;
-	entry->src_qp = 0;
-	entry->dlid_path_bits = 0;
-	entry->pkey_index = 0;
-
-	switch (c2_wr_get_id(ce)) {
-	case C2_WR_TYPE_SEND:
-		entry->opcode = IB_WC_SEND;
-		break;
-	case C2_WR_TYPE_RDMA_WRITE:
-		entry->opcode = IB_WC_RDMA_WRITE;
-		break;
-	case C2_WR_TYPE_RDMA_READ:
-		entry->opcode = IB_WC_RDMA_READ;
-		break;
-	case C2_WR_TYPE_BIND_MW:
-		entry->opcode = IB_WC_BIND_MW;
-		break;
-	case C2_WR_TYPE_RECV:
-		entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
-		entry->opcode = IB_WC_RECV;
-		is_recv = 1;
-		break;
-	default:
-		break;
-	}
-
-	/* consume the WQEs */
-	if (is_recv)
-		c2_mq_lconsume(&qp->rq_mq, 1);
-	else
-		c2_mq_lconsume(&qp->sq_mq,
-			       be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1);
-
-	/* free the message */
-	c2_mq_free(&cq->mq);
-
-	return 0;
-}
-
-int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
-{
-	struct c2_dev *c2dev = to_c2dev(ibcq->device);
-	struct c2_cq *cq = to_c2cq(ibcq);
-	unsigned long flags;
-	int npolled, err;
-
-	spin_lock_irqsave(&cq->lock, flags);
-
-	for (npolled = 0; npolled < num_entries; ++npolled) {
-
-		err = c2_poll_one(c2dev, cq, entry + npolled);
-		if (err)
-			break;
-	}
-
-	spin_unlock_irqrestore(&cq->lock, flags);
-
-	return npolled;
-}
-
-int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
-{
-	struct c2_mq_shared __iomem *shared;
-	struct c2_cq *cq;
-	unsigned long flags;
-	int ret = 0;
-
-	cq = to_c2cq(ibcq);
-	shared = cq->mq.peer;
-
-	if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
-		writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type);
-	else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
-		writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type);
-	else
-		return -EINVAL;
-
-	writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed);
-
-	/*
-	 * Now read back shared->armed to make the PCI
-	 * write synchronous.  This is necessary for
-	 * correct cq notification semantics.
-	 */
-	readb(&shared->armed);
-
-	if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
-		spin_lock_irqsave(&cq->lock, flags);
-		ret = !c2_mq_empty(&cq->mq);
-		spin_unlock_irqrestore(&cq->lock, flags);
-	}
-
-	return ret;
-}
-
-static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
-{
-	dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size,
-			  mq->msg_pool.host, dma_unmap_addr(mq, mapping));
-}
-
-static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
-			   size_t q_size, size_t msg_size)
-{
-	u8 *pool_start;
-
-	if (q_size > SIZE_MAX / msg_size)
-		return -EINVAL;
-
-	pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
-					&mq->host_dma, GFP_KERNEL);
-	if (!pool_start)
-		return -ENOMEM;
-
-	c2_mq_rep_init(mq,
-		       0,		/* index (currently unknown) */
-		       q_size,
-		       msg_size,
-		       pool_start,
-		       NULL,	/* peer (currently unknown) */
-		       C2_MQ_HOST_TARGET);
-
-	dma_unmap_addr_set(mq, mapping, mq->host_dma);
-
-	return 0;
-}
-
-int c2_init_cq(struct c2_dev *c2dev, int entries,
-	       struct c2_ucontext *ctx, struct c2_cq *cq)
-{
-	struct c2wr_cq_create_req wr;
-	struct c2wr_cq_create_rep *reply;
-	unsigned long peer_pa;
-	struct c2_vq_req *vq_req;
-	int err;
-
-	might_sleep();
-
-	cq->ibcq.cqe = entries - 1;
-	cq->is_kernel = !ctx;
-
-	/* Allocate a shared pointer */
-	cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-				      &cq->mq.shared_dma, GFP_KERNEL);
-	if (!cq->mq.shared)
-		return -ENOMEM;
-
-	/* Allocate pages for the message pool */
-	err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE);
-	if (err)
-		goto bail0;
-
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	memset(&wr, 0, sizeof(wr));
-	c2_wr_set_id(&wr, CCWR_CQ_CREATE);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.msg_size = cpu_to_be32(cq->mq.msg_size);
-	wr.depth = cpu_to_be32(cq->mq.q_size);
-	wr.shared_ht = cpu_to_be64(cq->mq.shared_dma);
-	wr.msg_pool = cpu_to_be64(cq->mq.host_dma);
-	wr.user_context = (u64) (unsigned long) (cq);
-
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail2;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail2;
-
-	reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail2;
-	}
-
-	if ((err = c2_errno(reply)) != 0)
-		goto bail3;
-
-	cq->adapter_handle = reply->cq_handle;
-	cq->mq.index = be32_to_cpu(reply->mq_index);
-
-	peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared);
-	cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE);
-	if (!cq->mq.peer) {
-		err = -ENOMEM;
-		goto bail3;
-	}
-
-	vq_repbuf_free(c2dev, reply);
-	vq_req_free(c2dev, vq_req);
-
-	spin_lock_init(&cq->lock);
-	atomic_set(&cq->refcount, 1);
-	init_waitqueue_head(&cq->wait);
-
-	/*
-	 * Use the MQ index allocated by the adapter to
-	 * store the CQ in the qptr_array
-	 */
-	cq->cqn = cq->mq.index;
-	c2dev->qptr_array[cq->cqn] = cq;
-
-	return 0;
-
-      bail3:
-	vq_repbuf_free(c2dev, reply);
-      bail2:
-	vq_req_free(c2dev, vq_req);
-      bail1:
-	c2_free_cq_buf(c2dev, &cq->mq);
-      bail0:
-	c2_free_mqsp(cq->mq.shared);
-
-	return err;
-}
-
-void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq)
-{
-	int err;
-	struct c2_vq_req *vq_req;
-	struct c2wr_cq_destroy_req wr;
-	struct c2wr_cq_destroy_rep *reply;
-
-	might_sleep();
-
-	/* Clear CQ from the qptr array */
-	spin_lock_irq(&c2dev->lock);
-	c2dev->qptr_array[cq->mq.index] = NULL;
-	atomic_dec(&cq->refcount);
-	spin_unlock_irq(&c2dev->lock);
-
-	wait_event(cq->wait, !atomic_read(&cq->refcount));
-
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req) {
-		goto bail0;
-	}
-
-	memset(&wr, 0, sizeof(wr));
-	c2_wr_set_id(&wr, CCWR_CQ_DESTROY);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.cq_handle = cq->adapter_handle;
-
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail1;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail1;
-
-	reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg);
-	if (reply)
-		vq_repbuf_free(c2dev, reply);
-      bail1:
-	vq_req_free(c2dev, vq_req);
-      bail0:
-	if (cq->is_kernel) {
-		c2_free_cq_buf(c2dev, &cq->mq);
-	}
-
-	return;
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c
deleted file mode 100644
index 3a17d9b..0000000
--- a/drivers/infiniband/hw/amso1100/c2_intr.c
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include <rdma/iw_cm.h>
-#include "c2_vq.h"
-
-static void handle_mq(struct c2_dev *c2dev, u32 index);
-static void handle_vq(struct c2_dev *c2dev, u32 mq_index);
-
-/*
- * Handle RNIC interrupts
- */
-void c2_rnic_interrupt(struct c2_dev *c2dev)
-{
-	unsigned int mq_index;
-
-	while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) {
-		mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT);
-		if (mq_index & 0x80000000) {
-			break;
-		}
-
-		c2dev->hints_read++;
-		handle_mq(c2dev, mq_index);
-	}
-
-}
-
-/*
- * Top level MQ handler
- */
-static void handle_mq(struct c2_dev *c2dev, u32 mq_index)
-{
-	if (c2dev->qptr_array[mq_index] == NULL) {
-		pr_debug("handle_mq: stray activity for mq_index=%d\n",
-			 mq_index);
-		return;
-	}
-
-	switch (mq_index) {
-	case (0):
-		/*
-		 * An index of 0 in the activity queue
-		 * indicates the req vq now has messages
-		 * available...
-		 *
-		 * Wake up any waiters waiting on req VQ
-		 * message availability.
-		 */
-		wake_up(&c2dev->req_vq_wo);
-		break;
-	case (1):
-		handle_vq(c2dev, mq_index);
-		break;
-	case (2):
-		/* We have to purge the VQ in case there are pending
-		 * accept reply requests that would result in the
-		 * generation of an ESTABLISHED event. If we don't
-		 * generate these first, a CLOSE event could end up
-		 * being delivered before the ESTABLISHED event.
-		 */
-		handle_vq(c2dev, 1);
-
-		c2_ae_event(c2dev, mq_index);
-		break;
-	default:
-		/* There is no event synchronization between CQ events
-		 * and AE or CM events. In fact, CQE could be
-		 * delivered for all of the I/O up to and including the
-		 * FLUSH for a peer disconenct prior to the ESTABLISHED
-		 * event being delivered to the app. The reason for this
-		 * is that CM events are delivered on a thread, while AE
-		 * and CM events are delivered on interrupt context.
-		 */
-		c2_cq_event(c2dev, mq_index);
-		break;
-	}
-
-	return;
-}
-
-/*
- * Handles verbs WR replies.
- */
-static void handle_vq(struct c2_dev *c2dev, u32 mq_index)
-{
-	void *adapter_msg, *reply_msg;
-	struct c2wr_hdr *host_msg;
-	struct c2wr_hdr tmp;
-	struct c2_mq *reply_vq;
-	struct c2_vq_req *req;
-	struct iw_cm_event cm_event;
-	int err;
-
-	reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index];
-
-	/*
-	 * get next msg from mq_index into adapter_msg.
-	 * don't free it yet.
-	 */
-	adapter_msg = c2_mq_consume(reply_vq);
-	if (adapter_msg == NULL) {
-		return;
-	}
-
-	host_msg = vq_repbuf_alloc(c2dev);
-
-	/*
-	 * If we can't get a host buffer, then we'll still
-	 * wakeup the waiter, we just won't give him the msg.
-	 * It is assumed the waiter will deal with this...
-	 */
-	if (!host_msg) {
-		pr_debug("handle_vq: no repbufs!\n");
-
-		/*
-		 * just copy the WR header into a local variable.
-		 * this allows us to still demux on the context
-		 */
-		host_msg = &tmp;
-		memcpy(host_msg, adapter_msg, sizeof(tmp));
-		reply_msg = NULL;
-	} else {
-		memcpy(host_msg, adapter_msg, reply_vq->msg_size);
-		reply_msg = host_msg;
-	}
-
-	/*
-	 * consume the msg from the MQ
-	 */
-	c2_mq_free(reply_vq);
-
-	/*
-	 * wakeup the waiter.
-	 */
-	req = (struct c2_vq_req *) (unsigned long) host_msg->context;
-	if (req == NULL) {
-		/*
-		 * We should never get here, as the adapter should
-		 * never send us a reply that we're not expecting.
-		 */
-		if (reply_msg != NULL)
-			vq_repbuf_free(c2dev, host_msg);
-		pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");
-		return;
-	}
-
-	if (reply_msg)
-		err = c2_errno(reply_msg);
-	else
-		err = -ENOMEM;
-
-	if (!err) switch (req->event) {
-	case IW_CM_EVENT_ESTABLISHED:
-		c2_set_qp_state(req->qp,
-				C2_QP_STATE_RTS);
-		/*
-		 * Until ird/ord negotiation via MPAv2 support is added, send
-		 * max supported values
-		 */
-		cm_event.ird = cm_event.ord = 128;
-	case IW_CM_EVENT_CLOSE:
-
-		/*
-		 * Move the QP to RTS if this is
-		 * the established event
-		 */
-		cm_event.event = req->event;
-		cm_event.status = 0;
-		cm_event.local_addr = req->cm_id->local_addr;
-		cm_event.remote_addr = req->cm_id->remote_addr;
-		cm_event.private_data = NULL;
-		cm_event.private_data_len = 0;
-		req->cm_id->event_handler(req->cm_id, &cm_event);
-		break;
-	default:
-		break;
-	}
-
-	req->reply_msg = (u64) (unsigned long) (reply_msg);
-	atomic_set(&req->reply_ready, 1);
-	wake_up(&req->wait_object);
-
-	/*
-	 * If the request was cancelled, then this put will
-	 * free the vq_req memory...and reply_msg!!!
-	 */
-	vq_req_put(c2dev, req);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_mm.c b/drivers/infiniband/hw/amso1100/c2_mm.c
deleted file mode 100644
index 119c4f3..0000000
--- a/drivers/infiniband/hw/amso1100/c2_mm.c
+++ /dev/null
@@ -1,377 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/slab.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-
-#define PBL_VIRT 1
-#define PBL_PHYS 2
-
-/*
- * Send all the PBL messages to convey the remainder of the PBL
- * Wait for the adapter's reply on the last one.
- * This is indicated by setting the MEM_PBL_COMPLETE in the flags.
- *
- * NOTE:  vq_req is _not_ freed by this function.  The VQ Host
- *	  Reply buffer _is_ freed by this function.
- */
-static int
-send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index,
-		  unsigned long va, u32 pbl_depth,
-		  struct c2_vq_req *vq_req, int pbl_type)
-{
-	u32 pbe_count;		/* amt that fits in a PBL msg */
-	u32 count;		/* amt in this PBL MSG. */
-	struct c2wr_nsmr_pbl_req *wr;	/* PBL WR ptr */
-	struct c2wr_nsmr_pbl_rep *reply;	/* reply ptr */
- 	int err, pbl_virt, pbl_index, i;
-
-	switch (pbl_type) {
-	case PBL_VIRT:
-		pbl_virt = 1;
-		break;
-	case PBL_PHYS:
-		pbl_virt = 0;
-		break;
-	default:
-		return -EINVAL;
-		break;
-	}
-
-	pbe_count = (c2dev->req_vq.msg_size -
-		     sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64);
-	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-	if (!wr) {
-		return -ENOMEM;
-	}
-	c2_wr_set_id(wr, CCWR_NSMR_PBL);
-
-	/*
-	 * Only the last PBL message will generate a reply from the verbs,
-	 * so we set the context to 0 indicating there is no kernel verbs
-	 * handler blocked awaiting this reply.
-	 */
-	wr->hdr.context = 0;
-	wr->rnic_handle = c2dev->adapter_handle;
-	wr->stag_index = stag_index;	/* already swapped */
-	wr->flags = 0;
-	pbl_index = 0;
-	while (pbl_depth) {
-		count = min(pbe_count, pbl_depth);
-		wr->addrs_length = cpu_to_be32(count);
-
-		/*
-		 *  If this is the last message, then reference the
-		 *  vq request struct cuz we're gonna wait for a reply.
-		 *  also make this PBL msg as the last one.
-		 */
-		if (count == pbl_depth) {
-			/*
-			 * reference the request struct.  dereferenced in the
-			 * int handler.
-			 */
-			vq_req_get(c2dev, vq_req);
-			wr->flags = cpu_to_be32(MEM_PBL_COMPLETE);
-
-			/*
-			 * This is the last PBL message.
-			 * Set the context to our VQ Request Object so we can
-			 * wait for the reply.
-			 */
-			wr->hdr.context = (unsigned long) vq_req;
-		}
-
-		/*
-		 * If pbl_virt is set then va is a virtual address
-		 * that describes a virtually contiguous memory
-		 * allocation. The wr needs the start of each virtual page
-		 * to be converted to the corresponding physical address
-		 * of the page. If pbl_virt is not set then va is an array
-		 * of physical addresses and there is no conversion to do.
-		 * Just fill in the wr with what is in the array.
-		 */
-		for (i = 0; i < count; i++) {
-			if (pbl_virt) {
-				va += PAGE_SIZE;
-			} else {
- 				wr->paddrs[i] =
-				    cpu_to_be64(((u64 *)va)[pbl_index + i]);
-			}
-		}
-
-		/*
-		 * Send WR to adapter
-		 */
-		err = vq_send_wr(c2dev, (union c2wr *) wr);
-		if (err) {
-			if (count <= pbe_count) {
-				vq_req_put(c2dev, vq_req);
-			}
-			goto bail0;
-		}
-		pbl_depth -= count;
-		pbl_index += count;
-	}
-
-	/*
-	 *  Now wait for the reply...
-	 */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err) {
-		goto bail0;
-	}
-
-	/*
-	 * Process reply
-	 */
-	reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg;
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	err = c2_errno(reply);
-
-	vq_repbuf_free(c2dev, reply);
-      bail0:
-	kfree(wr);
-	return err;
-}
-
-#define C2_PBL_MAX_DEPTH 131072
-int
-c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
- 			   int page_size, int pbl_depth, u32 length,
- 			   u32 offset, u64 *va, enum c2_acf acf,
-			   struct c2_mr *mr)
-{
-	struct c2_vq_req *vq_req;
-	struct c2wr_nsmr_register_req *wr;
-	struct c2wr_nsmr_register_rep *reply;
-	u16 flags;
-	int i, pbe_count, count;
-	int err;
-
-	if (!va || !length || !addr_list || !pbl_depth)
-		return -EINTR;
-
-	/*
-	 * Verify PBL depth is within rnic max
-	 */
-	if (pbl_depth > C2_PBL_MAX_DEPTH) {
-		return -EINTR;
-	}
-
-	/*
-	 * allocate verbs request object
-	 */
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-	if (!wr) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	/*
-	 * build the WR
-	 */
-	c2_wr_set_id(wr, CCWR_NSMR_REGISTER);
-	wr->hdr.context = (unsigned long) vq_req;
-	wr->rnic_handle = c2dev->adapter_handle;
-
-	flags = (acf | MEM_VA_BASED | MEM_REMOTE);
-
-	/*
-	 * compute how many pbes can fit in the message
-	 */
-	pbe_count = (c2dev->req_vq.msg_size -
-		     sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64);
-
-	if (pbl_depth <= pbe_count) {
-		flags |= MEM_PBL_COMPLETE;
-	}
-	wr->flags = cpu_to_be16(flags);
-	wr->stag_key = 0;	//stag_key;
-	wr->va = cpu_to_be64(*va);
-	wr->pd_id = mr->pd->pd_id;
-	wr->pbe_size = cpu_to_be32(page_size);
-	wr->length = cpu_to_be32(length);
-	wr->pbl_depth = cpu_to_be32(pbl_depth);
-	wr->fbo = cpu_to_be32(offset);
-	count = min(pbl_depth, pbe_count);
-	wr->addrs_length = cpu_to_be32(count);
-
-	/*
-	 * fill out the PBL for this message
-	 */
-	for (i = 0; i < count; i++) {
-		wr->paddrs[i] = cpu_to_be64(addr_list[i]);
-	}
-
-	/*
-	 * regerence the request struct
-	 */
-	vq_req_get(c2dev, vq_req);
-
-	/*
-	 * send the WR to the adapter
-	 */
-	err = vq_send_wr(c2dev, (union c2wr *) wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail1;
-	}
-
-	/*
-	 * wait for reply from adapter
-	 */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err) {
-		goto bail1;
-	}
-
-	/*
-	 * process reply
-	 */
-	reply =
-	    (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-	if ((err = c2_errno(reply))) {
-		goto bail2;
-	}
-	//*p_pb_entries = be32_to_cpu(reply->pbl_depth);
-	mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index);
-	vq_repbuf_free(c2dev, reply);
-
-	/*
-	 * if there are still more PBEs we need to send them to
-	 * the adapter and wait for a reply on the final one.
-	 * reuse vq_req for this purpose.
-	 */
-	pbl_depth -= count;
-	if (pbl_depth) {
-
-		vq_req->reply_msg = (unsigned long) NULL;
-		atomic_set(&vq_req->reply_ready, 0);
-		err = send_pbl_messages(c2dev,
-					cpu_to_be32(mr->ibmr.lkey),
-					(unsigned long) &addr_list[i],
-					pbl_depth, vq_req, PBL_PHYS);
-		if (err) {
-			goto bail1;
-		}
-	}
-
-	vq_req_free(c2dev, vq_req);
-	kfree(wr);
-
-	return err;
-
-      bail2:
-	vq_repbuf_free(c2dev, reply);
-      bail1:
-	kfree(wr);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index)
-{
-	struct c2_vq_req *vq_req;	/* verbs request object */
-	struct c2wr_stag_dealloc_req wr;	/* work request */
-	struct c2wr_stag_dealloc_rep *reply;	/* WR reply  */
-	int err;
-
-
-	/*
-	 * allocate verbs request object
-	 */
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req) {
-		return -ENOMEM;
-	}
-
-	/*
-	 * Build the WR
-	 */
-	c2_wr_set_id(&wr, CCWR_STAG_DEALLOC);
-	wr.hdr.context = (u64) (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.stag_index = cpu_to_be32(stag_index);
-
-	/*
-	 * reference the request struct.  dereferenced in the int handler.
-	 */
-	vq_req_get(c2dev, vq_req);
-
-	/*
-	 * Send WR to adapter
-	 */
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	/*
-	 * Wait for reply from adapter
-	 */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err) {
-		goto bail0;
-	}
-
-	/*
-	 * Process reply
-	 */
-	reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg;
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	err = c2_errno(reply);
-
-	vq_repbuf_free(c2dev, reply);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.c b/drivers/infiniband/hw/amso1100/c2_mq.c
deleted file mode 100644
index 0cddc49..0000000
--- a/drivers/infiniband/hw/amso1100/c2_mq.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include "c2_mq.h"
-
-void *c2_mq_alloc(struct c2_mq *q)
-{
-	BUG_ON(q->magic != C2_MQ_MAGIC);
-	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-	if (c2_mq_full(q)) {
-		return NULL;
-	} else {
-#ifdef DEBUG
-		struct c2wr_hdr *m =
-		    (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size);
-#ifdef CCMSGMAGIC
-		BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC));
-		m->magic = cpu_to_be32(CCWR_MAGIC);
-#endif
-		return m;
-#else
-		return q->msg_pool.host + q->priv * q->msg_size;
-#endif
-	}
-}
-
-void c2_mq_produce(struct c2_mq *q)
-{
-	BUG_ON(q->magic != C2_MQ_MAGIC);
-	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-	if (!c2_mq_full(q)) {
-		q->priv = (q->priv + 1) % q->q_size;
-		q->hint_count++;
-		/* Update peer's offset. */
-		__raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
-	}
-}
-
-void *c2_mq_consume(struct c2_mq *q)
-{
-	BUG_ON(q->magic != C2_MQ_MAGIC);
-	BUG_ON(q->type != C2_MQ_HOST_TARGET);
-
-	if (c2_mq_empty(q)) {
-		return NULL;
-	} else {
-#ifdef DEBUG
-		struct c2wr_hdr *m = (struct c2wr_hdr *)
-		    (q->msg_pool.host + q->priv * q->msg_size);
-#ifdef CCMSGMAGIC
-		BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC));
-#endif
-		return m;
-#else
-		return q->msg_pool.host + q->priv * q->msg_size;
-#endif
-	}
-}
-
-void c2_mq_free(struct c2_mq *q)
-{
-	BUG_ON(q->magic != C2_MQ_MAGIC);
-	BUG_ON(q->type != C2_MQ_HOST_TARGET);
-
-	if (!c2_mq_empty(q)) {
-
-#ifdef CCMSGMAGIC
-		{
-			struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *)
-			    (q->msg_pool.adapter + q->priv * q->msg_size);
-			__raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic);
-		}
-#endif
-		q->priv = (q->priv + 1) % q->q_size;
-		/* Update peer's offset. */
-		__raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
-	}
-}
-
-
-void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count)
-{
-	BUG_ON(q->magic != C2_MQ_MAGIC);
-	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-	while (wqe_count--) {
-		BUG_ON(c2_mq_empty(q));
-		*q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size);
-	}
-}
-
-#if 0
-u32 c2_mq_count(struct c2_mq *q)
-{
-	s32 count;
-
-	if (q->type == C2_MQ_HOST_TARGET)
-		count = be16_to_cpu(*q->shared) - q->priv;
-	else
-		count = q->priv - be16_to_cpu(*q->shared);
-
-	if (count < 0)
-		count += q->q_size;
-
-	return (u32) count;
-}
-#endif  /*  0  */
-
-void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-		    u8 __iomem *pool_start, u16 __iomem *peer, u32 type)
-{
-	BUG_ON(!q->shared);
-
-	/* This code assumes the byte swapping has already been done! */
-	q->index = index;
-	q->q_size = q_size;
-	q->msg_size = msg_size;
-	q->msg_pool.adapter = pool_start;
-	q->peer = (struct c2_mq_shared __iomem *) peer;
-	q->magic = C2_MQ_MAGIC;
-	q->type = type;
-	q->priv = 0;
-	q->hint_count = 0;
-	return;
-}
-void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-		    u8 *pool_start, u16 __iomem *peer, u32 type)
-{
-	BUG_ON(!q->shared);
-
-	/* This code assumes the byte swapping has already been done! */
-	q->index = index;
-	q->q_size = q_size;
-	q->msg_size = msg_size;
-	q->msg_pool.host = pool_start;
-	q->peer = (struct c2_mq_shared __iomem *) peer;
-	q->magic = C2_MQ_MAGIC;
-	q->type = type;
-	q->priv = 0;
-	q->hint_count = 0;
-	return;
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.h b/drivers/infiniband/hw/amso1100/c2_mq.h
deleted file mode 100644
index fc1b9a7..0000000
--- a/drivers/infiniband/hw/amso1100/c2_mq.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _C2_MQ_H_
-#define _C2_MQ_H_
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include "c2_wr.h"
-
-enum c2_shared_regs {
-
-	C2_SHARED_ARMED = 0x10,
-	C2_SHARED_NOTIFY = 0x18,
-	C2_SHARED_SHARED = 0x40,
-};
-
-struct c2_mq_shared {
-	u16 unused1;
-	u8 armed;
-	u8 notification_type;
-	u32 unused2;
-	u16 shared;
-	/* Pad to 64 bytes. */
-	u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)];
-};
-
-enum c2_mq_type {
-	C2_MQ_HOST_TARGET = 1,
-	C2_MQ_ADAPTER_TARGET = 2,
-};
-
-/*
- * c2_mq_t is for kernel-mode MQs like the VQs Cand the AEQ.
- * c2_user_mq_t (which is the same format) is for user-mode MQs...
- */
-#define C2_MQ_MAGIC 0x4d512020	/* 'MQ  ' */
-struct c2_mq {
-	u32 magic;
-	union {
-		u8 *host;
-		u8 __iomem *adapter;
-	} msg_pool;
-	dma_addr_t host_dma;
-	DEFINE_DMA_UNMAP_ADDR(mapping);
-	u16 hint_count;
-	u16 priv;
-	struct c2_mq_shared __iomem *peer;
-	__be16 *shared;
-	dma_addr_t shared_dma;
-	u32 q_size;
-	u32 msg_size;
-	u32 index;
-	enum c2_mq_type type;
-};
-
-static __inline__ int c2_mq_empty(struct c2_mq *q)
-{
-	return q->priv == be16_to_cpu(*q->shared);
-}
-
-static __inline__ int c2_mq_full(struct c2_mq *q)
-{
-	return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size;
-}
-
-extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count);
-extern void *c2_mq_alloc(struct c2_mq *q);
-extern void c2_mq_produce(struct c2_mq *q);
-extern void *c2_mq_consume(struct c2_mq *q);
-extern void c2_mq_free(struct c2_mq *q);
-extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-		       u8 __iomem *pool_start, u16 __iomem *peer, u32 type);
-extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-			   u8 *pool_start, u16 __iomem *peer, u32 type);
-
-#endif				/* _C2_MQ_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_pd.c b/drivers/infiniband/hw/amso1100/c2_pd.c
deleted file mode 100644
index f3e81dc..0000000
--- a/drivers/infiniband/hw/amso1100/c2_pd.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems.  All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/errno.h>
-
-#include "c2.h"
-#include "c2_provider.h"
-
-int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd)
-{
-	u32 obj;
-	int ret = 0;
-
-	spin_lock(&c2dev->pd_table.lock);
-	obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max,
-				 c2dev->pd_table.last);
-	if (obj >= c2dev->pd_table.max)
-		obj = find_first_zero_bit(c2dev->pd_table.table,
-					  c2dev->pd_table.max);
-	if (obj < c2dev->pd_table.max) {
-		pd->pd_id = obj;
-		__set_bit(obj, c2dev->pd_table.table);
-		c2dev->pd_table.last = obj+1;
-		if (c2dev->pd_table.last >= c2dev->pd_table.max)
-			c2dev->pd_table.last = 0;
-	} else
-		ret = -ENOMEM;
-	spin_unlock(&c2dev->pd_table.lock);
-	return ret;
-}
-
-void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd)
-{
-	spin_lock(&c2dev->pd_table.lock);
-	__clear_bit(pd->pd_id, c2dev->pd_table.table);
-	spin_unlock(&c2dev->pd_table.lock);
-}
-
-int c2_init_pd_table(struct c2_dev *c2dev)
-{
-
-	c2dev->pd_table.last = 0;
-	c2dev->pd_table.max = c2dev->props.max_pd;
-	spin_lock_init(&c2dev->pd_table.lock);
-	c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) *
-					sizeof(long), GFP_KERNEL);
-	if (!c2dev->pd_table.table)
-		return -ENOMEM;
-	bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd);
-	return 0;
-}
-
-void c2_cleanup_pd_table(struct c2_dev *c2dev)
-{
-	kfree(c2dev->pd_table.table);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
deleted file mode 100644
index 25c3f00..0000000
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ /dev/null
@@ -1,912 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/if_arp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include <rdma/ib_smi.h>
-#include <rdma/ib_umem.h>
-#include <rdma/ib_user_verbs.h>
-#include "c2.h"
-#include "c2_provider.h"
-#include "c2_user.h"
-
-static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-			   struct ib_udata *uhw)
-{
-	struct c2_dev *c2dev = to_c2dev(ibdev);
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	if (uhw->inlen || uhw->outlen)
-		return -EINVAL;
-
-	*props = c2dev->props;
-	return 0;
-}
-
-static int c2_query_port(struct ib_device *ibdev,
-			 u8 port, struct ib_port_attr *props)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	props->max_mtu = IB_MTU_4096;
-	props->lid = 0;
-	props->lmc = 0;
-	props->sm_lid = 0;
-	props->sm_sl = 0;
-	props->state = IB_PORT_ACTIVE;
-	props->phys_state = 0;
-	props->port_cap_flags =
-	    IB_PORT_CM_SUP |
-	    IB_PORT_REINIT_SUP |
-	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
-	props->gid_tbl_len = 1;
-	props->pkey_tbl_len = 1;
-	props->qkey_viol_cntr = 0;
-	props->active_width = 1;
-	props->active_speed = IB_SPEED_SDR;
-
-	return 0;
-}
-
-static int c2_query_pkey(struct ib_device *ibdev,
-			 u8 port, u16 index, u16 * pkey)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	*pkey = 0;
-	return 0;
-}
-
-static int c2_query_gid(struct ib_device *ibdev, u8 port,
-			int index, union ib_gid *gid)
-{
-	struct c2_dev *c2dev = to_c2dev(ibdev);
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
-	memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6);
-
-	return 0;
-}
-
-/* Allocate the user context data structure. This keeps track
- * of all objects associated with a particular user-mode client.
- */
-static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev,
-					     struct ib_udata *udata)
-{
-	struct c2_ucontext *context;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	context = kmalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
-	return &context->ibucontext;
-}
-
-static int c2_dealloc_ucontext(struct ib_ucontext *context)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	kfree(context);
-	return 0;
-}
-
-static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return -ENOSYS;
-}
-
-static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev,
-				 struct ib_ucontext *context,
-				 struct ib_udata *udata)
-{
-	struct c2_pd *pd;
-	int err;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
-	err = c2_pd_alloc(to_c2dev(ibdev), !context, pd);
-	if (err) {
-		kfree(pd);
-		return ERR_PTR(err);
-	}
-
-	if (context) {
-		if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) {
-			c2_pd_free(to_c2dev(ibdev), pd);
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
-		}
-	}
-
-	return &pd->ibpd;
-}
-
-static int c2_dealloc_pd(struct ib_pd *pd)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	c2_pd_free(to_c2dev(pd->device), to_c2pd(pd));
-	kfree(pd);
-
-	return 0;
-}
-
-static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return ERR_PTR(-ENOSYS);
-}
-
-static int c2_ah_destroy(struct ib_ah *ah)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return -ENOSYS;
-}
-
-static void c2_add_ref(struct ib_qp *ibqp)
-{
-	struct c2_qp *qp;
-	BUG_ON(!ibqp);
-	qp = to_c2qp(ibqp);
-	atomic_inc(&qp->refcount);
-}
-
-static void c2_rem_ref(struct ib_qp *ibqp)
-{
-	struct c2_qp *qp;
-	BUG_ON(!ibqp);
-	qp = to_c2qp(ibqp);
-	if (atomic_dec_and_test(&qp->refcount))
-		wake_up(&qp->wait);
-}
-
-struct ib_qp *c2_get_qp(struct ib_device *device, int qpn)
-{
-	struct c2_dev* c2dev = to_c2dev(device);
-	struct c2_qp *qp;
-
-	qp = c2_find_qpn(c2dev, qpn);
-	pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n",
-		__func__, qp, qpn, device,
-		(qp?atomic_read(&qp->refcount):0));
-
-	return (qp?&qp->ibqp:NULL);
-}
-
-static struct ib_qp *c2_create_qp(struct ib_pd *pd,
-				  struct ib_qp_init_attr *init_attr,
-				  struct ib_udata *udata)
-{
-	struct c2_qp *qp;
-	int err;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	if (init_attr->create_flags)
-		return ERR_PTR(-EINVAL);
-
-	switch (init_attr->qp_type) {
-	case IB_QPT_RC:
-		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-		if (!qp) {
-			pr_debug("%s: Unable to allocate QP\n", __func__);
-			return ERR_PTR(-ENOMEM);
-		}
-		spin_lock_init(&qp->lock);
-		if (pd->uobject) {
-			/* userspace specific */
-		}
-
-		err = c2_alloc_qp(to_c2dev(pd->device),
-				  to_c2pd(pd), init_attr, qp);
-
-		if (err && pd->uobject) {
-			/* userspace specific */
-		}
-
-		break;
-	default:
-		pr_debug("%s: Invalid QP type: %d\n", __func__,
-			init_attr->qp_type);
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (err) {
-		kfree(qp);
-		return ERR_PTR(err);
-	}
-
-	return &qp->ibqp;
-}
-
-static int c2_destroy_qp(struct ib_qp *ib_qp)
-{
-	struct c2_qp *qp = to_c2qp(ib_qp);
-
-	pr_debug("%s:%u qp=%p,qp->state=%d\n",
-		__func__, __LINE__, ib_qp, qp->state);
-	c2_free_qp(to_c2dev(ib_qp->device), qp);
-	kfree(qp);
-	return 0;
-}
-
-static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
-				  const struct ib_cq_init_attr *attr,
-				  struct ib_ucontext *context,
-				  struct ib_udata *udata)
-{
-	int entries = attr->cqe;
-	struct c2_cq *cq;
-	int err;
-
-	if (attr->flags)
-		return ERR_PTR(-EINVAL);
-
-	cq = kmalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq) {
-		pr_debug("%s: Unable to allocate CQ\n", __func__);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq);
-	if (err) {
-		pr_debug("%s: error initializing CQ\n", __func__);
-		kfree(cq);
-		return ERR_PTR(err);
-	}
-
-	return &cq->ibcq;
-}
-
-static int c2_destroy_cq(struct ib_cq *ib_cq)
-{
-	struct c2_cq *cq = to_c2cq(ib_cq);
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	c2_free_cq(to_c2dev(ib_cq->device), cq);
-	kfree(cq);
-
-	return 0;
-}
-
-static inline u32 c2_convert_access(int acc)
-{
-	return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) |
-	    (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) |
-	    (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) |
-	    C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
-}
-
-static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
-				    struct ib_phys_buf *buffer_list,
-				    int num_phys_buf, int acc, u64 * iova_start)
-{
-	struct c2_mr *mr;
-	u64 *page_list;
-	u32 total_len;
-	int err, i, j, k, page_shift, pbl_depth;
-
-	pbl_depth = 0;
-	total_len = 0;
-
-	page_shift = PAGE_SHIFT;
-	/*
-	 * If there is only 1 buffer we assume this could
-	 * be a map of all phy mem...use a 32k page_shift.
-	 */
-	if (num_phys_buf == 1)
-		page_shift += 3;
-
-	for (i = 0; i < num_phys_buf; i++) {
-
-		if (buffer_list[i].addr & ~PAGE_MASK) {
-			pr_debug("Unaligned Memory Buffer: 0x%x\n",
-				(unsigned int) buffer_list[i].addr);
-			return ERR_PTR(-EINVAL);
-		}
-
-		if (!buffer_list[i].size) {
-			pr_debug("Invalid Buffer Size\n");
-			return ERR_PTR(-EINVAL);
-		}
-
-		total_len += buffer_list[i].size;
-		pbl_depth += ALIGN(buffer_list[i].size,
-				   (1 << page_shift)) >> page_shift;
-	}
-
-	page_list = vmalloc(sizeof(u64) * pbl_depth);
-	if (!page_list) {
-		pr_debug("couldn't vmalloc page_list of size %zd\n",
-			(sizeof(u64) * pbl_depth));
-		return ERR_PTR(-ENOMEM);
-	}
-
-	for (i = 0, j = 0; i < num_phys_buf; i++) {
-
-		int naddrs;
-
- 		naddrs = ALIGN(buffer_list[i].size,
-			       (1 << page_shift)) >> page_shift;
-		for (k = 0; k < naddrs; k++)
-			page_list[j++] = (buffer_list[i].addr +
-						     (k << page_shift));
-	}
-
-	mr = kmalloc(sizeof(*mr), GFP_KERNEL);
-	if (!mr) {
-		vfree(page_list);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	mr->pd = to_c2pd(ib_pd);
-	mr->umem = NULL;
-	pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
-		"*iova_start %llx, first pa %llx, last pa %llx\n",
-		__func__, page_shift, pbl_depth, total_len,
-		(unsigned long long) *iova_start,
-	       	(unsigned long long) page_list[0],
-	       	(unsigned long long) page_list[pbl_depth-1]);
-  	err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
- 					 (1 << page_shift), pbl_depth,
-					 total_len, 0, iova_start,
-					 c2_convert_access(acc), mr);
-	vfree(page_list);
-	if (err) {
-		kfree(mr);
-		return ERR_PTR(err);
-	}
-
-	return &mr->ibmr;
-}
-
-static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
-{
-	struct ib_phys_buf bl;
-	u64 kva = 0;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	/* AMSO1100 limit */
-	bl.size = 0xffffffff;
-	bl.addr = 0;
-	return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
-}
-
-static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-				    u64 virt, int acc, struct ib_udata *udata)
-{
-	u64 *pages;
-	u64 kva = 0;
-	int shift, n, len;
-	int i, k, entry;
-	int err = 0;
-	struct scatterlist *sg;
-	struct c2_pd *c2pd = to_c2pd(pd);
-	struct c2_mr *c2mr;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
-	if (!c2mr)
-		return ERR_PTR(-ENOMEM);
-	c2mr->pd = c2pd;
-
-	c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
-	if (IS_ERR(c2mr->umem)) {
-		err = PTR_ERR(c2mr->umem);
-		kfree(c2mr);
-		return ERR_PTR(err);
-	}
-
-	shift = ffs(c2mr->umem->page_size) - 1;
-	n = c2mr->umem->nmap;
-
-	pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
-	if (!pages) {
-		err = -ENOMEM;
-		goto err;
-	}
-
-	i = 0;
-	for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) {
-		len = sg_dma_len(sg) >> shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] =
-				sg_dma_address(sg) +
-				(c2mr->umem->page_size * k);
-		}
-	}
-
-	kva = virt;
-  	err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
-					 pages,
-					 c2mr->umem->page_size,
-					 i,
-					 length,
-					 ib_umem_offset(c2mr->umem),
-					 &kva,
-					 c2_convert_access(acc),
-					 c2mr);
-	kfree(pages);
-	if (err)
-		goto err;
-	return &c2mr->ibmr;
-
-err:
-	ib_umem_release(c2mr->umem);
-	kfree(c2mr);
-	return ERR_PTR(err);
-}
-
-static int c2_dereg_mr(struct ib_mr *ib_mr)
-{
-	struct c2_mr *mr = to_c2mr(ib_mr);
-	int err;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
-	if (err)
-		pr_debug("c2_stag_dealloc failed: %d\n", err);
-	else {
-		if (mr->umem)
-			ib_umem_release(mr->umem);
-		kfree(mr);
-	}
-
-	return err;
-}
-
-static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
-			char *buf)
-{
-	struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return sprintf(buf, "%x\n", c2dev->props.hw_ver);
-}
-
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
-			   char *buf)
-{
-	struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return sprintf(buf, "%x.%x.%x\n",
-		       (int) (c2dev->props.fw_ver >> 32),
-		       (int) (c2dev->props.fw_ver >> 16) & 0xffff,
-		       (int) (c2dev->props.fw_ver & 0xffff));
-}
-
-static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
-			char *buf)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return sprintf(buf, "AMSO1100\n");
-}
-
-static ssize_t show_board(struct device *dev, struct device_attribute *attr,
-			  char *buf)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID");
-}
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
-
-static struct device_attribute *c2_dev_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_fw_ver,
-	&dev_attr_hca_type,
-	&dev_attr_board_id
-};
-
-static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-			int attr_mask, struct ib_udata *udata)
-{
-	int err;
-
-	err =
-	    c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr,
-			 attr_mask);
-
-	return err;
-}
-
-static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return -ENOSYS;
-}
-
-static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return -ENOSYS;
-}
-
-static int c2_process_mad(struct ib_device *ibdev,
-			  int mad_flags,
-			  u8 port_num,
-			  const struct ib_wc *in_wc,
-			  const struct ib_grh *in_grh,
-			  const struct ib_mad_hdr *in_mad,
-			  size_t in_mad_size,
-			  struct ib_mad_hdr *out_mad,
-			  size_t *out_mad_size,
-			  u16 *out_mad_pkey_index)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	return -ENOSYS;
-}
-
-static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	/* Request a connection */
-	return c2_llp_connect(cm_id, iw_param);
-}
-
-static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	/* Accept the new connection */
-	return c2_llp_accept(cm_id, iw_param);
-}
-
-static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-	int err;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	err = c2_llp_reject(cm_id, pdata, pdata_len);
-	return err;
-}
-
-static int c2_service_create(struct iw_cm_id *cm_id, int backlog)
-{
-	int err;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	err = c2_llp_service_create(cm_id, backlog);
-	pr_debug("%s:%u err=%d\n",
-		__func__, __LINE__,
-		err);
-	return err;
-}
-
-static int c2_service_destroy(struct iw_cm_id *cm_id)
-{
-	int err;
-	pr_debug("%s:%u\n", __func__, __LINE__);
-
-	err = c2_llp_service_destroy(cm_id);
-
-	return err;
-}
-
-static int c2_pseudo_up(struct net_device *netdev)
-{
-	struct in_device *ind;
-	struct c2_dev *c2dev = netdev->ml_priv;
-
-	ind = in_dev_get(netdev);
-	if (!ind)
-		return 0;
-
-	pr_debug("adding...\n");
-	for_ifa(ind) {
-#ifdef DEBUG
-		u8 *ip = (u8 *) & ifa->ifa_address;
-
-		pr_debug("%s: %d.%d.%d.%d\n",
-		       ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
-#endif
-		c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
-	}
-	endfor_ifa(ind);
-	in_dev_put(ind);
-
-	return 0;
-}
-
-static int c2_pseudo_down(struct net_device *netdev)
-{
-	struct in_device *ind;
-	struct c2_dev *c2dev = netdev->ml_priv;
-
-	ind = in_dev_get(netdev);
-	if (!ind)
-		return 0;
-
-	pr_debug("deleting...\n");
-	for_ifa(ind) {
-#ifdef DEBUG
-		u8 *ip = (u8 *) & ifa->ifa_address;
-
-		pr_debug("%s: %d.%d.%d.%d\n",
-		       ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
-#endif
-		c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
-	}
-	endfor_ifa(ind);
-	in_dev_put(ind);
-
-	return 0;
-}
-
-static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
-{
-	kfree_skb(skb);
-	return NETDEV_TX_OK;
-}
-
-static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu)
-{
-	if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
-		return -EINVAL;
-
-	netdev->mtu = new_mtu;
-
-	/* TODO: Tell rnic about new rmda interface mtu */
-	return 0;
-}
-
-static const struct net_device_ops c2_pseudo_netdev_ops = {
-	.ndo_open 		= c2_pseudo_up,
-	.ndo_stop 		= c2_pseudo_down,
-	.ndo_start_xmit 	= c2_pseudo_xmit_frame,
-	.ndo_change_mtu 	= c2_pseudo_change_mtu,
-	.ndo_validate_addr	= eth_validate_addr,
-};
-
-static void setup(struct net_device *netdev)
-{
-	netdev->netdev_ops = &c2_pseudo_netdev_ops;
-
-	netdev->watchdog_timeo = 0;
-	netdev->type = ARPHRD_ETHER;
-	netdev->mtu = 1500;
-	netdev->hard_header_len = ETH_HLEN;
-	netdev->addr_len = ETH_ALEN;
-	netdev->tx_queue_len = 0;
-	netdev->flags |= IFF_NOARP;
-}
-
-static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
-{
-	char name[IFNAMSIZ];
-	struct net_device *netdev;
-
-	/* change ethxxx to iwxxx */
-	strcpy(name, "iw");
-	strcat(name, &c2dev->netdev->name[3]);
-	netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
-	if (!netdev) {
-		printk(KERN_ERR PFX "%s -  etherdev alloc failed",
-			__func__);
-		return NULL;
-	}
-
-	netdev->ml_priv = c2dev;
-
-	SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
-
-	memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6);
-
-	/* Print out the MAC address */
-	pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr);
-
-#if 0
-	/* Disable network packets */
-	netif_stop_queue(netdev);
-#endif
-	return netdev;
-}
-
-static int c2_port_immutable(struct ib_device *ibdev, u8 port_num,
-			     struct ib_port_immutable *immutable)
-{
-	struct ib_port_attr attr;
-	int err;
-
-	err = c2_query_port(ibdev, port_num, &attr);
-	if (err)
-		return err;
-
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
-	immutable->gid_tbl_len = attr.gid_tbl_len;
-	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
-
-	return 0;
-}
-
-int c2_register_device(struct c2_dev *dev)
-{
-	int ret = -ENOMEM;
-	int i;
-
-	/* Register pseudo network device */
-	dev->pseudo_netdev = c2_pseudo_netdev_init(dev);
-	if (!dev->pseudo_netdev)
-		goto out;
-
-	ret = register_netdev(dev->pseudo_netdev);
-	if (ret)
-		goto out_free_netdev;
-
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX);
-	dev->ibdev.owner = THIS_MODULE;
-	dev->ibdev.uverbs_cmd_mask =
-	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
-	    (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
-	    (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
-	    (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
-	    (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
-	    (1ull << IB_USER_VERBS_CMD_REG_MR) |
-	    (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
-	    (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-	    (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
-	    (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
-	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
-	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
-	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
-	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
-	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
-	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
-	    (1ull << IB_USER_VERBS_CMD_POST_RECV);
-
-	dev->ibdev.node_type = RDMA_NODE_RNIC;
-	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
-	memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6);
-	dev->ibdev.phys_port_cnt = 1;
-	dev->ibdev.num_comp_vectors = 1;
-	dev->ibdev.dma_device = &dev->pcidev->dev;
-	dev->ibdev.query_device = c2_query_device;
-	dev->ibdev.query_port = c2_query_port;
-	dev->ibdev.query_pkey = c2_query_pkey;
-	dev->ibdev.query_gid = c2_query_gid;
-	dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
-	dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext;
-	dev->ibdev.mmap = c2_mmap_uar;
-	dev->ibdev.alloc_pd = c2_alloc_pd;
-	dev->ibdev.dealloc_pd = c2_dealloc_pd;
-	dev->ibdev.create_ah = c2_ah_create;
-	dev->ibdev.destroy_ah = c2_ah_destroy;
-	dev->ibdev.create_qp = c2_create_qp;
-	dev->ibdev.modify_qp = c2_modify_qp;
-	dev->ibdev.destroy_qp = c2_destroy_qp;
-	dev->ibdev.create_cq = c2_create_cq;
-	dev->ibdev.destroy_cq = c2_destroy_cq;
-	dev->ibdev.poll_cq = c2_poll_cq;
-	dev->ibdev.get_dma_mr = c2_get_dma_mr;
-	dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
-	dev->ibdev.reg_user_mr = c2_reg_user_mr;
-	dev->ibdev.dereg_mr = c2_dereg_mr;
-	dev->ibdev.get_port_immutable = c2_port_immutable;
-
-	dev->ibdev.alloc_fmr = NULL;
-	dev->ibdev.unmap_fmr = NULL;
-	dev->ibdev.dealloc_fmr = NULL;
-	dev->ibdev.map_phys_fmr = NULL;
-
-	dev->ibdev.attach_mcast = c2_multicast_attach;
-	dev->ibdev.detach_mcast = c2_multicast_detach;
-	dev->ibdev.process_mad = c2_process_mad;
-
-	dev->ibdev.req_notify_cq = c2_arm_cq;
-	dev->ibdev.post_send = c2_post_send;
-	dev->ibdev.post_recv = c2_post_receive;
-
-	dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
-	if (dev->ibdev.iwcm == NULL) {
-		ret = -ENOMEM;
-		goto out_unregister_netdev;
-	}
-	dev->ibdev.iwcm->add_ref = c2_add_ref;
-	dev->ibdev.iwcm->rem_ref = c2_rem_ref;
-	dev->ibdev.iwcm->get_qp = c2_get_qp;
-	dev->ibdev.iwcm->connect = c2_connect;
-	dev->ibdev.iwcm->accept = c2_accept;
-	dev->ibdev.iwcm->reject = c2_reject;
-	dev->ibdev.iwcm->create_listen = c2_service_create;
-	dev->ibdev.iwcm->destroy_listen = c2_service_destroy;
-
-	ret = ib_register_device(&dev->ibdev, NULL);
-	if (ret)
-		goto out_free_iwcm;
-
-	for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) {
-		ret = device_create_file(&dev->ibdev.dev,
-					       c2_dev_attributes[i]);
-		if (ret)
-			goto out_unregister_ibdev;
-	}
-	goto out;
-
-out_unregister_ibdev:
-	ib_unregister_device(&dev->ibdev);
-out_free_iwcm:
-	kfree(dev->ibdev.iwcm);
-out_unregister_netdev:
-	unregister_netdev(dev->pseudo_netdev);
-out_free_netdev:
-	free_netdev(dev->pseudo_netdev);
-out:
-	pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret);
-	return ret;
-}
-
-void c2_unregister_device(struct c2_dev *dev)
-{
-	pr_debug("%s:%u\n", __func__, __LINE__);
-	unregister_netdev(dev->pseudo_netdev);
-	free_netdev(dev->pseudo_netdev);
-	ib_unregister_device(&dev->ibdev);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.h b/drivers/infiniband/hw/amso1100/c2_provider.h
deleted file mode 100644
index bf18998..0000000
--- a/drivers/infiniband/hw/amso1100/c2_provider.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef C2_PROVIDER_H
-#define C2_PROVIDER_H
-#include <linux/inetdevice.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_pack.h>
-
-#include "c2_mq.h"
-#include <rdma/iw_cm.h>
-
-#define C2_MPT_FLAG_ATOMIC        (1 << 14)
-#define C2_MPT_FLAG_REMOTE_WRITE  (1 << 13)
-#define C2_MPT_FLAG_REMOTE_READ   (1 << 12)
-#define C2_MPT_FLAG_LOCAL_WRITE   (1 << 11)
-#define C2_MPT_FLAG_LOCAL_READ    (1 << 10)
-
-struct c2_buf_list {
-	void *buf;
-	DEFINE_DMA_UNMAP_ADDR(mapping);
-};
-
-
-/* The user context keeps track of objects allocated for a
- * particular user-mode client. */
-struct c2_ucontext {
-	struct ib_ucontext ibucontext;
-};
-
-struct c2_mtt;
-
-/* All objects associated with a PD are kept in the
- * associated user context if present.
- */
-struct c2_pd {
-	struct ib_pd ibpd;
-	u32 pd_id;
-};
-
-struct c2_mr {
-	struct ib_mr ibmr;
-	struct c2_pd *pd;
-	struct ib_umem *umem;
-};
-
-struct c2_av;
-
-enum c2_ah_type {
-	C2_AH_ON_HCA,
-	C2_AH_PCI_POOL,
-	C2_AH_KMALLOC
-};
-
-struct c2_ah {
-	struct ib_ah ibah;
-};
-
-struct c2_cq {
-	struct ib_cq ibcq;
-	spinlock_t lock;
-	atomic_t refcount;
-	int cqn;
-	int is_kernel;
-	wait_queue_head_t wait;
-
-	u32 adapter_handle;
-	struct c2_mq mq;
-};
-
-struct c2_wq {
-	spinlock_t lock;
-};
-struct iw_cm_id;
-struct c2_qp {
-	struct ib_qp ibqp;
-	struct iw_cm_id *cm_id;
-	spinlock_t lock;
-	atomic_t refcount;
-	wait_queue_head_t wait;
-	int qpn;
-
-	u32 adapter_handle;
-	u32 send_sgl_depth;
-	u32 recv_sgl_depth;
-	u32 rdma_write_sgl_depth;
-	u8 state;
-
-	struct c2_mq sq_mq;
-	struct c2_mq rq_mq;
-};
-
-struct c2_cr_query_attrs {
-	u32 local_addr;
-	u32 remote_addr;
-	u16 local_port;
-	u16 remote_port;
-};
-
-static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd)
-{
-	return container_of(ibpd, struct c2_pd, ibpd);
-}
-
-static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext)
-{
-	return container_of(ibucontext, struct c2_ucontext, ibucontext);
-}
-
-static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr)
-{
-	return container_of(ibmr, struct c2_mr, ibmr);
-}
-
-
-static inline struct c2_ah *to_c2ah(struct ib_ah *ibah)
-{
-	return container_of(ibah, struct c2_ah, ibah);
-}
-
-static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq)
-{
-	return container_of(ibcq, struct c2_cq, ibcq);
-}
-
-static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp)
-{
-	return container_of(ibqp, struct c2_qp, ibqp);
-}
-
-static inline int is_rnic_addr(struct net_device *netdev, u32 addr)
-{
-	struct in_device *ind;
-	int ret = 0;
-
-	ind = in_dev_get(netdev);
-	if (!ind)
-		return 0;
-
-	for_ifa(ind) {
-		if (ifa->ifa_address == addr) {
-			ret = 1;
-			break;
-		}
-	}
-	endfor_ifa(ind);
-	in_dev_put(ind);
-	return ret;
-}
-#endif				/* C2_PROVIDER_H */
diff --git a/drivers/infiniband/hw/amso1100/c2_qp.c b/drivers/infiniband/hw/amso1100/c2_qp.c
deleted file mode 100644
index 86708de..0000000
--- a/drivers/infiniband/hw/amso1100/c2_qp.c
+++ /dev/null
@@ -1,1024 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/gfp.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-#include "c2_status.h"
-
-#define C2_MAX_ORD_PER_QP 128
-#define C2_MAX_IRD_PER_QP 128
-
-#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
-#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
-#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
-
-#define NO_SUPPORT -1
-static const u8 c2_opcode[] = {
-	[IB_WR_SEND] = C2_WR_TYPE_SEND,
-	[IB_WR_SEND_WITH_IMM] = NO_SUPPORT,
-	[IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE,
-	[IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT,
-	[IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ,
-	[IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT,
-	[IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT,
-};
-
-static int to_c2_state(enum ib_qp_state ib_state)
-{
-	switch (ib_state) {
-	case IB_QPS_RESET:
-		return C2_QP_STATE_IDLE;
-	case IB_QPS_RTS:
-		return C2_QP_STATE_RTS;
-	case IB_QPS_SQD:
-		return C2_QP_STATE_CLOSING;
-	case IB_QPS_SQE:
-		return C2_QP_STATE_CLOSING;
-	case IB_QPS_ERR:
-		return C2_QP_STATE_ERROR;
-	default:
-		return -1;
-	}
-}
-
-static int to_ib_state(enum c2_qp_state c2_state)
-{
-	switch (c2_state) {
-	case C2_QP_STATE_IDLE:
-		return IB_QPS_RESET;
-	case C2_QP_STATE_CONNECTING:
-		return IB_QPS_RTR;
-	case C2_QP_STATE_RTS:
-		return IB_QPS_RTS;
-	case C2_QP_STATE_CLOSING:
-		return IB_QPS_SQD;
-	case C2_QP_STATE_ERROR:
-		return IB_QPS_ERR;
-	case C2_QP_STATE_TERMINATE:
-		return IB_QPS_SQE;
-	default:
-		return -1;
-	}
-}
-
-static const char *to_ib_state_str(int ib_state)
-{
-	static const char *state_str[] = {
-		"IB_QPS_RESET",
-		"IB_QPS_INIT",
-		"IB_QPS_RTR",
-		"IB_QPS_RTS",
-		"IB_QPS_SQD",
-		"IB_QPS_SQE",
-		"IB_QPS_ERR"
-	};
-	if (ib_state < IB_QPS_RESET ||
-	    ib_state > IB_QPS_ERR)
-		return "<invalid IB QP state>";
-
-	ib_state -= IB_QPS_RESET;
-	return state_str[ib_state];
-}
-
-void c2_set_qp_state(struct c2_qp *qp, int c2_state)
-{
-	int new_state = to_ib_state(c2_state);
-
-	pr_debug("%s: qp[%p] state modify %s --> %s\n",
-	       __func__,
-		qp,
-		to_ib_state_str(qp->state),
-		to_ib_state_str(new_state));
-	qp->state = new_state;
-}
-
-#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
-
-int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
-		 struct ib_qp_attr *attr, int attr_mask)
-{
-	struct c2wr_qp_modify_req wr;
-	struct c2wr_qp_modify_rep *reply;
-	struct c2_vq_req *vq_req;
-	unsigned long flags;
-	u8 next_state;
-	int err;
-
-	pr_debug("%s:%d qp=%p, %s --> %s\n",
-		__func__, __LINE__,
-		qp,
-		to_ib_state_str(qp->state),
-		to_ib_state_str(attr->qp_state));
-
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	c2_wr_set_id(&wr, CCWR_QP_MODIFY);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.qp_handle = qp->adapter_handle;
-	wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-	wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-	wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-	wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-
-	if (attr_mask & IB_QP_STATE) {
-		/* Ensure the state is valid */
-		if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) {
-			err = -EINVAL;
-			goto bail0;
-		}
-
-		wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state));
-
-		if (attr->qp_state == IB_QPS_ERR) {
-			spin_lock_irqsave(&qp->lock, flags);
-			if (qp->cm_id && qp->state == IB_QPS_RTS) {
-				pr_debug("Generating CLOSE event for QP-->ERR, "
-					"qp=%p, cm_id=%p\n",qp,qp->cm_id);
-				/* Generate an CLOSE event */
-				vq_req->cm_id = qp->cm_id;
-				vq_req->event = IW_CM_EVENT_CLOSE;
-			}
-			spin_unlock_irqrestore(&qp->lock, flags);
-		}
-		next_state =  attr->qp_state;
-
-	} else if (attr_mask & IB_QP_CUR_STATE) {
-
-		if (attr->cur_qp_state != IB_QPS_RTR &&
-		    attr->cur_qp_state != IB_QPS_RTS &&
-		    attr->cur_qp_state != IB_QPS_SQD &&
-		    attr->cur_qp_state != IB_QPS_SQE) {
-			err = -EINVAL;
-			goto bail0;
-		} else
-			wr.next_qp_state =
-			    cpu_to_be32(to_c2_state(attr->cur_qp_state));
-
-		next_state = attr->cur_qp_state;
-
-	} else {
-		err = 0;
-		goto bail0;
-	}
-
-	/* reference the request struct */
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail0;
-
-	reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg;
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	err = c2_errno(reply);
-	if (!err)
-		qp->state = next_state;
-#ifdef DEBUG
-	else
-		pr_debug("%s: c2_errno=%d\n", __func__, err);
-#endif
-	/*
-	 * If we're going to error and generating the event here, then
-	 * we need to remove the reference because there will be no
-	 * close event generated by the adapter
-	*/
-	spin_lock_irqsave(&qp->lock, flags);
-	if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) {
-		qp->cm_id->rem_ref(qp->cm_id);
-		qp->cm_id = NULL;
-	}
-	spin_unlock_irqrestore(&qp->lock, flags);
-
-	vq_repbuf_free(c2dev, reply);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-
-	pr_debug("%s:%d qp=%p, cur_state=%s\n",
-		__func__, __LINE__,
-		qp,
-		to_ib_state_str(qp->state));
-	return err;
-}
-
-int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
-			  int ord, int ird)
-{
-	struct c2wr_qp_modify_req wr;
-	struct c2wr_qp_modify_rep *reply;
-	struct c2_vq_req *vq_req;
-	int err;
-
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	c2_wr_set_id(&wr, CCWR_QP_MODIFY);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.qp_handle = qp->adapter_handle;
-	wr.ord = cpu_to_be32(ord);
-	wr.ird = cpu_to_be32(ird);
-	wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-	wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-	wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-
-	/* reference the request struct */
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail0;
-
-	reply = (struct c2wr_qp_modify_rep *) (unsigned long)
-		vq_req->reply_msg;
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	err = c2_errno(reply);
-	vq_repbuf_free(c2dev, reply);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-	struct c2_vq_req *vq_req;
-	struct c2wr_qp_destroy_req wr;
-	struct c2wr_qp_destroy_rep *reply;
-	unsigned long flags;
-	int err;
-
-	/*
-	 * Allocate a verb request message
-	 */
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req) {
-		return -ENOMEM;
-	}
-
-	/*
-	 * Initialize the WR
-	 */
-	c2_wr_set_id(&wr, CCWR_QP_DESTROY);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.qp_handle = qp->adapter_handle;
-
-	/*
-	 * reference the request struct.  dereferenced in the int handler.
-	 */
-	vq_req_get(c2dev, vq_req);
-
-	spin_lock_irqsave(&qp->lock, flags);
-	if (qp->cm_id && qp->state == IB_QPS_RTS) {
-		pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, "
-			"qp=%p, cm_id=%p\n",qp,qp->cm_id);
-		/* Generate an CLOSE event */
-		vq_req->qp = qp;
-		vq_req->cm_id = qp->cm_id;
-		vq_req->event = IW_CM_EVENT_CLOSE;
-	}
-	spin_unlock_irqrestore(&qp->lock, flags);
-
-	/*
-	 * Send WR to adapter
-	 */
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	/*
-	 * Wait for reply from adapter
-	 */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err) {
-		goto bail0;
-	}
-
-	/*
-	 * Process reply
-	 */
-	reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	spin_lock_irqsave(&qp->lock, flags);
-	if (qp->cm_id) {
-		qp->cm_id->rem_ref(qp->cm_id);
-		qp->cm_id = NULL;
-	}
-	spin_unlock_irqrestore(&qp->lock, flags);
-
-	vq_repbuf_free(c2dev, reply);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-	int ret;
-
-	idr_preload(GFP_KERNEL);
-	spin_lock_irq(&c2dev->qp_table.lock);
-
-	ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT);
-	if (ret >= 0)
-		qp->qpn = ret;
-
-	spin_unlock_irq(&c2dev->qp_table.lock);
-	idr_preload_end();
-	return ret < 0 ? ret : 0;
-}
-
-static void c2_free_qpn(struct c2_dev *c2dev, int qpn)
-{
-	spin_lock_irq(&c2dev->qp_table.lock);
-	idr_remove(&c2dev->qp_table.idr, qpn);
-	spin_unlock_irq(&c2dev->qp_table.lock);
-}
-
-struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn)
-{
-	unsigned long flags;
-	struct c2_qp *qp;
-
-	spin_lock_irqsave(&c2dev->qp_table.lock, flags);
-	qp = idr_find(&c2dev->qp_table.idr, qpn);
-	spin_unlock_irqrestore(&c2dev->qp_table.lock, flags);
-	return qp;
-}
-
-int c2_alloc_qp(struct c2_dev *c2dev,
-		struct c2_pd *pd,
-		struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp)
-{
-	struct c2wr_qp_create_req wr;
-	struct c2wr_qp_create_rep *reply;
-	struct c2_vq_req *vq_req;
-	struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq);
-	struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq);
-	unsigned long peer_pa;
-	u32 q_size, msg_size, mmap_size;
-	void __iomem *mmap;
-	int err;
-
-	err = c2_alloc_qpn(c2dev, qp);
-	if (err)
-		return err;
-	qp->ibqp.qp_num = qp->qpn;
-	qp->ibqp.qp_type = IB_QPT_RC;
-
-	/* Allocate the SQ and RQ shared pointers */
-	qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-					 &qp->sq_mq.shared_dma, GFP_KERNEL);
-	if (!qp->sq_mq.shared) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-					 &qp->rq_mq.shared_dma, GFP_KERNEL);
-	if (!qp->rq_mq.shared) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	/* Allocate the verbs request */
-	vq_req = vq_req_alloc(c2dev);
-	if (vq_req == NULL) {
-		err = -ENOMEM;
-		goto bail2;
-	}
-
-	/* Initialize the work request */
-	memset(&wr, 0, sizeof(wr));
-	c2_wr_set_id(&wr, CCWR_QP_CREATE);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-	wr.sq_cq_handle = send_cq->adapter_handle;
-	wr.rq_cq_handle = recv_cq->adapter_handle;
-	wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1);
-	wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1);
-	wr.srq_handle = 0;
-	wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND |
-			       QP_ZERO_STAG | QP_RDMA_READ_RESPONSE);
-	wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
-	wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge);
-	wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
-	wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma);
-	wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma);
-	wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP);
-	wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP);
-	wr.pd_id = pd->pd_id;
-	wr.user_context = (unsigned long) qp;
-
-	vq_req_get(c2dev, vq_req);
-
-	/* Send the WR to the adapter */
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail3;
-	}
-
-	/* Wait for the verb reply  */
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err) {
-		goto bail3;
-	}
-
-	/* Process the reply */
-	reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail3;
-	}
-
-	if ((err = c2_wr_get_result(reply)) != 0) {
-		goto bail4;
-	}
-
-	/* Fill in the kernel QP struct */
-	atomic_set(&qp->refcount, 1);
-	qp->adapter_handle = reply->qp_handle;
-	qp->state = IB_QPS_RESET;
-	qp->send_sgl_depth = qp_attrs->cap.max_send_sge;
-	qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge;
-	qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge;
-	init_waitqueue_head(&qp->wait);
-
-	/* Initialize the SQ MQ */
-	q_size = be32_to_cpu(reply->sq_depth);
-	msg_size = be32_to_cpu(reply->sq_msg_size);
-	peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start);
-	mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
-	mmap = ioremap_nocache(peer_pa, mmap_size);
-	if (!mmap) {
-		err = -ENOMEM;
-		goto bail5;
-	}
-
-	c2_mq_req_init(&qp->sq_mq,
-		       be32_to_cpu(reply->sq_mq_index),
-		       q_size,
-		       msg_size,
-		       mmap + sizeof(struct c2_mq_shared),	/* pool start */
-		       mmap,				/* peer */
-		       C2_MQ_ADAPTER_TARGET);
-
-	/* Initialize the RQ mq */
-	q_size = be32_to_cpu(reply->rq_depth);
-	msg_size = be32_to_cpu(reply->rq_msg_size);
-	peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start);
-	mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
-	mmap = ioremap_nocache(peer_pa, mmap_size);
-	if (!mmap) {
-		err = -ENOMEM;
-		goto bail6;
-	}
-
-	c2_mq_req_init(&qp->rq_mq,
-		       be32_to_cpu(reply->rq_mq_index),
-		       q_size,
-		       msg_size,
-		       mmap + sizeof(struct c2_mq_shared),	/* pool start */
-		       mmap,				/* peer */
-		       C2_MQ_ADAPTER_TARGET);
-
-	vq_repbuf_free(c2dev, reply);
-	vq_req_free(c2dev, vq_req);
-
-	return 0;
-
-      bail6:
-	iounmap(qp->sq_mq.peer);
-      bail5:
-	destroy_qp(c2dev, qp);
-      bail4:
-	vq_repbuf_free(c2dev, reply);
-      bail3:
-	vq_req_free(c2dev, vq_req);
-      bail2:
-	c2_free_mqsp(qp->rq_mq.shared);
-      bail1:
-	c2_free_mqsp(qp->sq_mq.shared);
-      bail0:
-	c2_free_qpn(c2dev, qp->qpn);
-	return err;
-}
-
-static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
-{
-	if (send_cq == recv_cq)
-		spin_lock_irq(&send_cq->lock);
-	else if (send_cq > recv_cq) {
-		spin_lock_irq(&send_cq->lock);
-		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
-	} else {
-		spin_lock_irq(&recv_cq->lock);
-		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
-	}
-}
-
-static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
-{
-	if (send_cq == recv_cq)
-		spin_unlock_irq(&send_cq->lock);
-	else if (send_cq > recv_cq) {
-		spin_unlock(&recv_cq->lock);
-		spin_unlock_irq(&send_cq->lock);
-	} else {
-		spin_unlock(&send_cq->lock);
-		spin_unlock_irq(&recv_cq->lock);
-	}
-}
-
-void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-	struct c2_cq *send_cq;
-	struct c2_cq *recv_cq;
-
-	send_cq = to_c2cq(qp->ibqp.send_cq);
-	recv_cq = to_c2cq(qp->ibqp.recv_cq);
-
-	/*
-	 * Lock CQs here, so that CQ polling code can do QP lookup
-	 * without taking a lock.
-	 */
-	c2_lock_cqs(send_cq, recv_cq);
-	c2_free_qpn(c2dev, qp->qpn);
-	c2_unlock_cqs(send_cq, recv_cq);
-
-	/*
-	 * Destroy qp in the rnic...
-	 */
-	destroy_qp(c2dev, qp);
-
-	/*
-	 * Mark any unreaped CQEs as null and void.
-	 */
-	c2_cq_clean(c2dev, qp, send_cq->cqn);
-	if (send_cq != recv_cq)
-		c2_cq_clean(c2dev, qp, recv_cq->cqn);
-	/*
-	 * Unmap the MQs and return the shared pointers
-	 * to the message pool.
-	 */
-	iounmap(qp->sq_mq.peer);
-	iounmap(qp->rq_mq.peer);
-	c2_free_mqsp(qp->sq_mq.shared);
-	c2_free_mqsp(qp->rq_mq.shared);
-
-	atomic_dec(&qp->refcount);
-	wait_event(qp->wait, !atomic_read(&qp->refcount));
-}
-
-/*
- * Function: move_sgl
- *
- * Description:
- * Move an SGL from the user's work request struct into a CCIL Work Request
- * message, swapping to WR byte order and ensure the total length doesn't
- * overflow.
- *
- * IN:
- * dst		- ptr to CCIL Work Request message SGL memory.
- * src		- ptr to the consumers SGL memory.
- *
- * OUT: none
- *
- * Return:
- * CCIL status codes.
- */
-static int
-move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len,
-	 u8 * actual_count)
-{
-	u32 tot = 0;		/* running total */
-	u8 acount = 0;		/* running total non-0 len sge's */
-
-	while (count > 0) {
-		/*
-		 * If the addition of this SGE causes the
-		 * total SGL length to exceed 2^32-1, then
-		 * fail-n-bail.
-		 *
-		 * If the current total plus the next element length
-		 * wraps, then it will go negative and be less than the
-		 * current total...
-		 */
-		if ((tot + src->length) < tot) {
-			return -EINVAL;
-		}
-		/*
-		 * Bug: 1456 (as well as 1498 & 1643)
-		 * Skip over any sge's supplied with len=0
-		 */
-		if (src->length) {
-			tot += src->length;
-			dst->stag = cpu_to_be32(src->lkey);
-			dst->to = cpu_to_be64(src->addr);
-			dst->length = cpu_to_be32(src->length);
-			dst++;
-			acount++;
-		}
-		src++;
-		count--;
-	}
-
-	if (acount == 0) {
-		/*
-		 * Bug: 1476 (as well as 1498, 1456 and 1643)
-		 * Setup the SGL in the WR to make it easier for the RNIC.
-		 * This way, the FW doesn't have to deal with special cases.
-		 * Setting length=0 should be sufficient.
-		 */
-		dst->stag = 0;
-		dst->to = 0;
-		dst->length = 0;
-	}
-
-	*p_len = tot;
-	*actual_count = acount;
-	return 0;
-}
-
-/*
- * Function: c2_activity (private function)
- *
- * Description:
- * Post an mq index to the host->adapter activity fifo.
- *
- * IN:
- * c2dev	- ptr to c2dev structure
- * mq_index	- mq index to post
- * shared	- value most recently written to shared
- *
- * OUT:
- *
- * Return:
- * none
- */
-static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared)
-{
-	/*
-	 * First read the register to see if the FIFO is full, and if so,
-	 * spin until it's not.  This isn't perfect -- there is no
-	 * synchronization among the clients of the register, but in
-	 * practice it prevents multiple CPU from hammering the bus
-	 * with PCI RETRY. Note that when this does happen, the card
-	 * cannot get on the bus and the card and system hang in a
-	 * deadlock -- thus the need for this code. [TOT]
-	 */
-	while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000)
-		udelay(10);
-
-	__raw_writel(C2_HINT_MAKE(mq_index, shared),
-		     c2dev->regs + PCI_BAR0_ADAPTER_HINT);
-}
-
-/*
- * Function: qp_wr_post
- *
- * Description:
- * This in-line function allocates a MQ msg, then moves the host-copy of
- * the completed WR into msg.  Then it posts the message.
- *
- * IN:
- * q		- ptr to user MQ.
- * wr		- ptr to host-copy of the WR.
- * qp		- ptr to user qp
- * size		- Number of bytes to post.  Assumed to be divisible by 4.
- *
- * OUT: none
- *
- * Return:
- * CCIL status codes.
- */
-static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size)
-{
-	union c2wr *msg;
-
-	msg = c2_mq_alloc(q);
-	if (msg == NULL) {
-		return -EINVAL;
-	}
-#ifdef CCMSGMAGIC
-	((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC);
-#endif
-
-	/*
-	 * Since all header fields in the WR are the same as the
-	 * CQE, set the following so the adapter need not.
-	 */
-	c2_wr_set_result(wr, CCERR_PENDING);
-
-	/*
-	 * Copy the wr down to the adapter
-	 */
-	memcpy((void *) msg, (void *) wr, size);
-
-	c2_mq_produce(q);
-	return 0;
-}
-
-
-int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
-		 struct ib_send_wr **bad_wr)
-{
-	struct c2_dev *c2dev = to_c2dev(ibqp->device);
-	struct c2_qp *qp = to_c2qp(ibqp);
-	union c2wr wr;
-	unsigned long lock_flags;
-	int err = 0;
-
-	u32 flags;
-	u32 tot_len;
-	u8 actual_sge_count;
-	u32 msg_size;
-
-	if (qp->state > IB_QPS_RTS) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	while (ib_wr) {
-
-		flags = 0;
-		wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
-		if (ib_wr->send_flags & IB_SEND_SIGNALED) {
-			flags |= SQ_SIGNALED;
-		}
-
-		switch (ib_wr->opcode) {
-		case IB_WR_SEND:
-		case IB_WR_SEND_WITH_INV:
-			if (ib_wr->opcode == IB_WR_SEND) {
-				if (ib_wr->send_flags & IB_SEND_SOLICITED)
-					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE);
-				else
-					c2_wr_set_id(&wr, C2_WR_TYPE_SEND);
-				wr.sqwr.send.remote_stag = 0;
-			} else {
-				if (ib_wr->send_flags & IB_SEND_SOLICITED)
-					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV);
-				else
-					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV);
-				wr.sqwr.send.remote_stag =
-					cpu_to_be32(ib_wr->ex.invalidate_rkey);
-			}
-
-			msg_size = sizeof(struct c2wr_send_req) +
-				sizeof(struct c2_data_addr) * ib_wr->num_sge;
-			if (ib_wr->num_sge > qp->send_sgl_depth) {
-				err = -EINVAL;
-				break;
-			}
-			if (ib_wr->send_flags & IB_SEND_FENCE) {
-				flags |= SQ_READ_FENCE;
-			}
-			err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data),
-				       ib_wr->sg_list,
-				       ib_wr->num_sge,
-				       &tot_len, &actual_sge_count);
-			wr.sqwr.send.sge_len = cpu_to_be32(tot_len);
-			c2_wr_set_sge_count(&wr, actual_sge_count);
-			break;
-		case IB_WR_RDMA_WRITE:
-			c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE);
-			msg_size = sizeof(struct c2wr_rdma_write_req) +
-			    (sizeof(struct c2_data_addr) * ib_wr->num_sge);
-			if (ib_wr->num_sge > qp->rdma_write_sgl_depth) {
-				err = -EINVAL;
-				break;
-			}
-			if (ib_wr->send_flags & IB_SEND_FENCE) {
-				flags |= SQ_READ_FENCE;
-			}
-			wr.sqwr.rdma_write.remote_stag =
-			    cpu_to_be32(ib_wr->wr.rdma.rkey);
-			wr.sqwr.rdma_write.remote_to =
-			    cpu_to_be64(ib_wr->wr.rdma.remote_addr);
-			err = move_sgl((struct c2_data_addr *)
-				       & (wr.sqwr.rdma_write.data),
-				       ib_wr->sg_list,
-				       ib_wr->num_sge,
-				       &tot_len, &actual_sge_count);
-			wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len);
-			c2_wr_set_sge_count(&wr, actual_sge_count);
-			break;
-		case IB_WR_RDMA_READ:
-			c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ);
-			msg_size = sizeof(struct c2wr_rdma_read_req);
-
-			/* IWarp only suppots 1 sge for RDMA reads */
-			if (ib_wr->num_sge > 1) {
-				err = -EINVAL;
-				break;
-			}
-
-			/*
-			 * Move the local and remote stag/to/len into the WR.
-			 */
-			wr.sqwr.rdma_read.local_stag =
-			    cpu_to_be32(ib_wr->sg_list->lkey);
-			wr.sqwr.rdma_read.local_to =
-			    cpu_to_be64(ib_wr->sg_list->addr);
-			wr.sqwr.rdma_read.remote_stag =
-			    cpu_to_be32(ib_wr->wr.rdma.rkey);
-			wr.sqwr.rdma_read.remote_to =
-			    cpu_to_be64(ib_wr->wr.rdma.remote_addr);
-			wr.sqwr.rdma_read.length =
-			    cpu_to_be32(ib_wr->sg_list->length);
-			break;
-		default:
-			/* error */
-			msg_size = 0;
-			err = -EINVAL;
-			break;
-		}
-
-		/*
-		 * If we had an error on the last wr build, then
-		 * break out.  Possible errors include bogus WR
-		 * type, and a bogus SGL length...
-		 */
-		if (err) {
-			break;
-		}
-
-		/*
-		 * Store flags
-		 */
-		c2_wr_set_flags(&wr, flags);
-
-		/*
-		 * Post the puppy!
-		 */
-		spin_lock_irqsave(&qp->lock, lock_flags);
-		err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size);
-		if (err) {
-			spin_unlock_irqrestore(&qp->lock, lock_flags);
-			break;
-		}
-
-		/*
-		 * Enqueue mq index to activity FIFO.
-		 */
-		c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count);
-		spin_unlock_irqrestore(&qp->lock, lock_flags);
-
-		ib_wr = ib_wr->next;
-	}
-
-out:
-	if (err)
-		*bad_wr = ib_wr;
-	return err;
-}
-
-int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
-		    struct ib_recv_wr **bad_wr)
-{
-	struct c2_dev *c2dev = to_c2dev(ibqp->device);
-	struct c2_qp *qp = to_c2qp(ibqp);
-	union c2wr wr;
-	unsigned long lock_flags;
-	int err = 0;
-
-	if (qp->state > IB_QPS_RTS) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	/*
-	 * Try and post each work request
-	 */
-	while (ib_wr) {
-		u32 tot_len;
-		u8 actual_sge_count;
-
-		if (ib_wr->num_sge > qp->recv_sgl_depth) {
-			err = -EINVAL;
-			break;
-		}
-
-		/*
-		 * Create local host-copy of the WR
-		 */
-		wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
-		c2_wr_set_id(&wr, CCWR_RECV);
-		c2_wr_set_flags(&wr, 0);
-
-		/* sge_count is limited to eight bits. */
-		BUG_ON(ib_wr->num_sge >= 256);
-		err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data),
-			       ib_wr->sg_list,
-			       ib_wr->num_sge, &tot_len, &actual_sge_count);
-		c2_wr_set_sge_count(&wr, actual_sge_count);
-
-		/*
-		 * If we had an error on the last wr build, then
-		 * break out.  Possible errors include bogus WR
-		 * type, and a bogus SGL length...
-		 */
-		if (err) {
-			break;
-		}
-
-		spin_lock_irqsave(&qp->lock, lock_flags);
-		err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size);
-		if (err) {
-			spin_unlock_irqrestore(&qp->lock, lock_flags);
-			break;
-		}
-
-		/*
-		 * Enqueue mq index to activity FIFO
-		 */
-		c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count);
-		spin_unlock_irqrestore(&qp->lock, lock_flags);
-
-		ib_wr = ib_wr->next;
-	}
-
-out:
-	if (err)
-		*bad_wr = ib_wr;
-	return err;
-}
-
-void c2_init_qp_table(struct c2_dev *c2dev)
-{
-	spin_lock_init(&c2dev->qp_table.lock);
-	idr_init(&c2dev->qp_table.idr);
-}
-
-void c2_cleanup_qp_table(struct c2_dev *c2dev)
-{
-	idr_destroy(&c2dev->qp_table.idr);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c
deleted file mode 100644
index d2a6d96..0000000
--- a/drivers/infiniband/hw/amso1100/c2_rnic.c
+++ /dev/null
@@ -1,655 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/inet.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <linux/route.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-#include <rdma/ib_smi.h>
-#include "c2.h"
-#include "c2_vq.h"
-
-/* Device capabilities */
-#define C2_MIN_PAGESIZE  1024
-
-#define C2_MAX_MRS       32768
-#define C2_MAX_QPS       16000
-#define C2_MAX_WQE_SZ    256
-#define C2_MAX_QP_WR     ((128*1024)/C2_MAX_WQE_SZ)
-#define C2_MAX_SGES      4
-#define C2_MAX_SGE_RD    1
-#define C2_MAX_CQS       32768
-#define C2_MAX_CQES      4096
-#define C2_MAX_PDS       16384
-
-/*
- * Send the adapter INIT message to the amso1100
- */
-static int c2_adapter_init(struct c2_dev *c2dev)
-{
-	struct c2wr_init_req wr;
-	int err;
-
-	memset(&wr, 0, sizeof(wr));
-	c2_wr_set_id(&wr, CCWR_INIT);
-	wr.hdr.context = 0;
-	wr.hint_count = cpu_to_be64(c2dev->hint_count_dma);
-	wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma);
-	wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma);
-	wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma);
-	wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma);
-	wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma);
-
-	/* Post the init message */
-	err = vq_send_wr(c2dev, (union c2wr *) & wr);
-
-	return err;
-}
-
-/*
- * Send the adapter TERM message to the amso1100
- */
-static void c2_adapter_term(struct c2_dev *c2dev)
-{
-	struct c2wr_init_req wr;
-
-	memset(&wr, 0, sizeof(wr));
-	c2_wr_set_id(&wr, CCWR_TERM);
-	wr.hdr.context = 0;
-
-	/* Post the init message */
-	vq_send_wr(c2dev, (union c2wr *) & wr);
-	c2dev->init = 0;
-
-	return;
-}
-
-/*
- * Query the adapter
- */
-static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props)
-{
-	struct c2_vq_req *vq_req;
-	struct c2wr_rnic_query_req wr;
-	struct c2wr_rnic_query_rep *reply;
-	int err;
-
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	c2_wr_set_id(&wr, CCWR_RNIC_QUERY);
-	wr.hdr.context = (unsigned long) vq_req;
-	wr.rnic_handle = c2dev->adapter_handle;
-
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, (union c2wr *) &wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail1;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail1;
-
-	reply =
-	    (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply)
-		err = -ENOMEM;
-	else
-		err = c2_errno(reply);
-	if (err)
-		goto bail2;
-
-	props->fw_ver =
-		((u64)be32_to_cpu(reply->fw_ver_major) << 32) |
-		((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) |
-		(be32_to_cpu(reply->fw_ver_patch) & 0xFFFF);
-	memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6);
-	props->max_mr_size         = 0xFFFFFFFF;
-	props->page_size_cap       = ~(C2_MIN_PAGESIZE-1);
-	props->vendor_id           = be32_to_cpu(reply->vendor_id);
-	props->vendor_part_id      = be32_to_cpu(reply->part_number);
-	props->hw_ver              = be32_to_cpu(reply->hw_version);
-	props->max_qp              = be32_to_cpu(reply->max_qps);
-	props->max_qp_wr           = be32_to_cpu(reply->max_qp_depth);
-	props->device_cap_flags    = c2dev->device_cap_flags;
-	props->max_sge             = C2_MAX_SGES;
-	props->max_sge_rd          = C2_MAX_SGE_RD;
-	props->max_cq              = be32_to_cpu(reply->max_cqs);
-	props->max_cqe             = be32_to_cpu(reply->max_cq_depth);
-	props->max_mr              = be32_to_cpu(reply->max_mrs);
-	props->max_pd              = be32_to_cpu(reply->max_pds);
-	props->max_qp_rd_atom      = be32_to_cpu(reply->max_qp_ird);
-	props->max_ee_rd_atom      = 0;
-	props->max_res_rd_atom     = be32_to_cpu(reply->max_global_ird);
-	props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord);
-	props->max_ee_init_rd_atom = 0;
-	props->atomic_cap          = IB_ATOMIC_NONE;
-	props->max_ee              = 0;
-	props->max_rdd             = 0;
-	props->max_mw              = be32_to_cpu(reply->max_mws);
-	props->max_raw_ipv6_qp     = 0;
-	props->max_raw_ethy_qp     = 0;
-	props->max_mcast_grp       = 0;
-	props->max_mcast_qp_attach = 0;
-	props->max_total_mcast_qp_attach = 0;
-	props->max_ah              = 0;
-	props->max_fmr             = 0;
-	props->max_map_per_fmr     = 0;
-	props->max_srq             = 0;
-	props->max_srq_wr          = 0;
-	props->max_srq_sge         = 0;
-	props->max_pkeys           = 0;
-	props->local_ca_ack_delay  = 0;
-
- bail2:
-	vq_repbuf_free(c2dev, reply);
-
- bail1:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-/*
- * Add an IP address to the RNIC interface
- */
-int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
-{
-	struct c2_vq_req *vq_req;
-	struct c2wr_rnic_setconfig_req *wr;
-	struct c2wr_rnic_setconfig_rep *reply;
-	struct c2_netaddr netaddr;
-	int err, len;
-
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	len = sizeof(struct c2_netaddr);
-	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-	if (!wr) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
-	wr->hdr.context = (unsigned long) vq_req;
-	wr->rnic_handle = c2dev->adapter_handle;
-	wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
-
-	netaddr.ip_addr = inaddr;
-	netaddr.netmask = inmask;
-	netaddr.mtu = 0;
-
-	memcpy(wr->data, &netaddr, len);
-
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, (union c2wr *) wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail1;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail1;
-
-	reply =
-	    (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	err = c2_errno(reply);
-	vq_repbuf_free(c2dev, reply);
-
-      bail1:
-	kfree(wr);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-/*
- * Delete an IP address from the RNIC interface
- */
-int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
-{
-	struct c2_vq_req *vq_req;
-	struct c2wr_rnic_setconfig_req *wr;
-	struct c2wr_rnic_setconfig_rep *reply;
-	struct c2_netaddr netaddr;
-	int err, len;
-
-	vq_req = vq_req_alloc(c2dev);
-	if (!vq_req)
-		return -ENOMEM;
-
-	len = sizeof(struct c2_netaddr);
-	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-	if (!wr) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
-	wr->hdr.context = (unsigned long) vq_req;
-	wr->rnic_handle = c2dev->adapter_handle;
-	wr->option = cpu_to_be32(C2_CFG_DEL_ADDR);
-
-	netaddr.ip_addr = inaddr;
-	netaddr.netmask = inmask;
-	netaddr.mtu = 0;
-
-	memcpy(wr->data, &netaddr, len);
-
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, (union c2wr *) wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail1;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err)
-		goto bail1;
-
-	reply =
-	    (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	err = c2_errno(reply);
-	vq_repbuf_free(c2dev, reply);
-
-      bail1:
-	kfree(wr);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-/*
- * Open a single RNIC instance to use with all
- * low level openib calls
- */
-static int c2_rnic_open(struct c2_dev *c2dev)
-{
-	struct c2_vq_req *vq_req;
-	union c2wr wr;
-	struct c2wr_rnic_open_rep *reply;
-	int err;
-
-	vq_req = vq_req_alloc(c2dev);
-	if (vq_req == NULL) {
-		return -ENOMEM;
-	}
-
-	memset(&wr, 0, sizeof(wr));
-	c2_wr_set_id(&wr, CCWR_RNIC_OPEN);
-	wr.rnic_open.req.hdr.context = (unsigned long) (vq_req);
-	wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE);
-	wr.rnic_open.req.port_num = cpu_to_be16(0);
-	wr.rnic_open.req.user_context = (unsigned long) c2dev;
-
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, &wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err) {
-		goto bail0;
-	}
-
-	reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	if ((err = c2_errno(reply)) != 0) {
-		goto bail1;
-	}
-
-	c2dev->adapter_handle = reply->rnic_handle;
-
-      bail1:
-	vq_repbuf_free(c2dev, reply);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-/*
- * Close the RNIC instance
- */
-static int c2_rnic_close(struct c2_dev *c2dev)
-{
-	struct c2_vq_req *vq_req;
-	union c2wr wr;
-	struct c2wr_rnic_close_rep *reply;
-	int err;
-
-	vq_req = vq_req_alloc(c2dev);
-	if (vq_req == NULL) {
-		return -ENOMEM;
-	}
-
-	memset(&wr, 0, sizeof(wr));
-	c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
-	wr.rnic_close.req.hdr.context = (unsigned long) vq_req;
-	wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;
-
-	vq_req_get(c2dev, vq_req);
-
-	err = vq_send_wr(c2dev, &wr);
-	if (err) {
-		vq_req_put(c2dev, vq_req);
-		goto bail0;
-	}
-
-	err = vq_wait_for_reply(c2dev, vq_req);
-	if (err) {
-		goto bail0;
-	}
-
-	reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg);
-	if (!reply) {
-		err = -ENOMEM;
-		goto bail0;
-	}
-
-	if ((err = c2_errno(reply)) != 0) {
-		goto bail1;
-	}
-
-	c2dev->adapter_handle = 0;
-
-      bail1:
-	vq_repbuf_free(c2dev, reply);
-      bail0:
-	vq_req_free(c2dev, vq_req);
-	return err;
-}
-
-/*
- * Called by c2_probe to initialize the RNIC. This principally
- * involves initializing the various limits and resource pools that
- * comprise the RNIC instance.
- */
-int c2_rnic_init(struct c2_dev *c2dev)
-{
-	int err;
-	u32 qsize, msgsize;
-	void *q1_pages;
-	void *q2_pages;
-	void __iomem *mmio_regs;
-
-	/* Device capabilities */
-	c2dev->device_cap_flags =
-	    (IB_DEVICE_RESIZE_MAX_WR |
-	     IB_DEVICE_CURR_QP_STATE_MOD |
-	     IB_DEVICE_SYS_IMAGE_GUID |
-	     IB_DEVICE_LOCAL_DMA_LKEY |
-	     IB_DEVICE_MEM_WINDOW);
-
-	/* Allocate the qptr_array */
-	c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *));
-	if (!c2dev->qptr_array) {
-		return -ENOMEM;
-	}
-
-	/* Initialize the qptr_array */
-	c2dev->qptr_array[0] = (void *) &c2dev->req_vq;
-	c2dev->qptr_array[1] = (void *) &c2dev->rep_vq;
-	c2dev->qptr_array[2] = (void *) &c2dev->aeq;
-
-	/* Initialize data structures */
-	init_waitqueue_head(&c2dev->req_vq_wo);
-	spin_lock_init(&c2dev->vqlock);
-	spin_lock_init(&c2dev->lock);
-
-	/* Allocate MQ shared pointer pool for kernel clients. User
-	 * mode client pools are hung off the user context
-	 */
-	err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool);
-	if (err) {
-		goto bail0;
-	}
-
-	/* Allocate shared pointers for Q0, Q1, and Q2 from
-	 * the shared pointer pool.
-	 */
-
-	c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-					     &c2dev->hint_count_dma,
-					     GFP_KERNEL);
-	c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-					     &c2dev->req_vq.shared_dma,
-					     GFP_KERNEL);
-	c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-					     &c2dev->rep_vq.shared_dma,
-					     GFP_KERNEL);
-	c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-					  &c2dev->aeq.shared_dma, GFP_KERNEL);
-	if (!c2dev->hint_count || !c2dev->req_vq.shared ||
-	    !c2dev->rep_vq.shared || !c2dev->aeq.shared) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-
-	mmio_regs = c2dev->kva;
-	/* Initialize the Verbs Request Queue */
-	c2_mq_req_init(&c2dev->req_vq, 0,
-		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)),
-		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)),
-		       mmio_regs +
-		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)),
-		       mmio_regs +
-		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)),
-		       C2_MQ_ADAPTER_TARGET);
-
-	/* Initialize the Verbs Reply Queue */
-	qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE));
-	msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE));
-	q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
-				      &c2dev->rep_vq.host_dma, GFP_KERNEL);
-	if (!q1_pages) {
-		err = -ENOMEM;
-		goto bail1;
-	}
-	dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
-	pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages,
-		 (unsigned long long) c2dev->rep_vq.host_dma);
-	c2_mq_rep_init(&c2dev->rep_vq,
-		   1,
-		   qsize,
-		   msgsize,
-		   q1_pages,
-		   mmio_regs +
-		   be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)),
-		   C2_MQ_HOST_TARGET);
-
-	/* Initialize the Asynchronus Event Queue */
-	qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE));
-	msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE));
-	q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
-				      &c2dev->aeq.host_dma, GFP_KERNEL);
-	if (!q2_pages) {
-		err = -ENOMEM;
-		goto bail2;
-	}
-	dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
-	pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages,
-		 (unsigned long long) c2dev->aeq.host_dma);
-	c2_mq_rep_init(&c2dev->aeq,
-		       2,
-		       qsize,
-		       msgsize,
-		       q2_pages,
-		       mmio_regs +
-		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)),
-		       C2_MQ_HOST_TARGET);
-
-	/* Initialize the verbs request allocator */
-	err = vq_init(c2dev);
-	if (err)
-		goto bail3;
-
-	/* Enable interrupts on the adapter */
-	writel(0, c2dev->regs + C2_IDIS);
-
-	/* create the WR init message */
-	err = c2_adapter_init(c2dev);
-	if (err)
-		goto bail4;
-	c2dev->init++;
-
-	/* open an adapter instance */
-	err = c2_rnic_open(c2dev);
-	if (err)
-		goto bail4;
-
-	/* Initialize cached the adapter limits */
-	err = c2_rnic_query(c2dev, &c2dev->props);
-	if (err)
-		goto bail5;
-
-	/* Initialize the PD pool */
-	err = c2_init_pd_table(c2dev);
-	if (err)
-		goto bail5;
-
-	/* Initialize the QP pool */
-	c2_init_qp_table(c2dev);
-	return 0;
-
-      bail5:
-	c2_rnic_close(c2dev);
-      bail4:
-	vq_term(c2dev);
-      bail3:
-	dma_free_coherent(&c2dev->pcidev->dev,
-			  c2dev->aeq.q_size * c2dev->aeq.msg_size,
-			  q2_pages, dma_unmap_addr(&c2dev->aeq, mapping));
-      bail2:
-	dma_free_coherent(&c2dev->pcidev->dev,
-			  c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
-			  q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping));
-      bail1:
-	c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
-      bail0:
-	vfree(c2dev->qptr_array);
-
-	return err;
-}
-
-/*
- * Called by c2_remove to cleanup the RNIC resources.
- */
-void c2_rnic_term(struct c2_dev *c2dev)
-{
-
-	/* Close the open adapter instance */
-	c2_rnic_close(c2dev);
-
-	/* Send the TERM message to the adapter */
-	c2_adapter_term(c2dev);
-
-	/* Disable interrupts on the adapter */
-	writel(1, c2dev->regs + C2_IDIS);
-
-	/* Free the QP pool */
-	c2_cleanup_qp_table(c2dev);
-
-	/* Free the PD pool */
-	c2_cleanup_pd_table(c2dev);
-
-	/* Free the verbs request allocator */
-	vq_term(c2dev);
-
-	/* Free the asynchronus event queue */
-	dma_free_coherent(&c2dev->pcidev->dev,
-			  c2dev->aeq.q_size * c2dev->aeq.msg_size,
-			  c2dev->aeq.msg_pool.host,
-			  dma_unmap_addr(&c2dev->aeq, mapping));
-
-	/* Free the verbs reply queue */
-	dma_free_coherent(&c2dev->pcidev->dev,
-			  c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
-			  c2dev->rep_vq.msg_pool.host,
-			  dma_unmap_addr(&c2dev->rep_vq, mapping));
-
-	/* Free the MQ shared pointer pool */
-	c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
-
-	/* Free the qptr_array */
-	vfree(c2dev->qptr_array);
-
-	return;
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_status.h b/drivers/infiniband/hw/amso1100/c2_status.h
deleted file mode 100644
index 6ee4aa9..0000000
--- a/drivers/infiniband/hw/amso1100/c2_status.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef	_C2_STATUS_H_
-#define _C2_STATUS_H_
-
-/*
- * Verbs Status Codes
- */
-enum c2_status {
-	C2_OK = 0,		/* This must be zero */
-	CCERR_INSUFFICIENT_RESOURCES = 1,
-	CCERR_INVALID_MODIFIER = 2,
-	CCERR_INVALID_MODE = 3,
-	CCERR_IN_USE = 4,
-	CCERR_INVALID_RNIC = 5,
-	CCERR_INTERRUPTED_OPERATION = 6,
-	CCERR_INVALID_EH = 7,
-	CCERR_INVALID_CQ = 8,
-	CCERR_CQ_EMPTY = 9,
-	CCERR_NOT_IMPLEMENTED = 10,
-	CCERR_CQ_DEPTH_TOO_SMALL = 11,
-	CCERR_PD_IN_USE = 12,
-	CCERR_INVALID_PD = 13,
-	CCERR_INVALID_SRQ = 14,
-	CCERR_INVALID_ADDRESS = 15,
-	CCERR_INVALID_NETMASK = 16,
-	CCERR_INVALID_QP = 17,
-	CCERR_INVALID_QP_STATE = 18,
-	CCERR_TOO_MANY_WRS_POSTED = 19,
-	CCERR_INVALID_WR_TYPE = 20,
-	CCERR_INVALID_SGL_LENGTH = 21,
-	CCERR_INVALID_SQ_DEPTH = 22,
-	CCERR_INVALID_RQ_DEPTH = 23,
-	CCERR_INVALID_ORD = 24,
-	CCERR_INVALID_IRD = 25,
-	CCERR_QP_ATTR_CANNOT_CHANGE = 26,
-	CCERR_INVALID_STAG = 27,
-	CCERR_QP_IN_USE = 28,
-	CCERR_OUTSTANDING_WRS = 29,
-	CCERR_STAG_IN_USE = 30,
-	CCERR_INVALID_STAG_INDEX = 31,
-	CCERR_INVALID_SGL_FORMAT = 32,
-	CCERR_ADAPTER_TIMEOUT = 33,
-	CCERR_INVALID_CQ_DEPTH = 34,
-	CCERR_INVALID_PRIVATE_DATA_LENGTH = 35,
-	CCERR_INVALID_EP = 36,
-	CCERR_MR_IN_USE = CCERR_STAG_IN_USE,
-	CCERR_FLUSHED = 38,
-	CCERR_INVALID_WQE = 39,
-	CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40,
-	CCERR_REMOTE_TERMINATION_ERROR = 41,
-	CCERR_BASE_AND_BOUNDS_VIOLATION = 42,
-	CCERR_ACCESS_VIOLATION = 43,
-	CCERR_INVALID_PD_ID = 44,
-	CCERR_WRAP_ERROR = 45,
-	CCERR_INV_STAG_ACCESS_ERROR = 46,
-	CCERR_ZERO_RDMA_READ_RESOURCES = 47,
-	CCERR_QP_NOT_PRIVILEGED = 48,
-	CCERR_STAG_STATE_NOT_INVALID = 49,
-	CCERR_INVALID_PAGE_SIZE = 50,
-	CCERR_INVALID_BUFFER_SIZE = 51,
-	CCERR_INVALID_PBE = 52,
-	CCERR_INVALID_FBO = 53,
-	CCERR_INVALID_LENGTH = 54,
-	CCERR_INVALID_ACCESS_RIGHTS = 55,
-	CCERR_PBL_TOO_BIG = 56,
-	CCERR_INVALID_VA = 57,
-	CCERR_INVALID_REGION = 58,
-	CCERR_INVALID_WINDOW = 59,
-	CCERR_TOTAL_LENGTH_TOO_BIG = 60,
-	CCERR_INVALID_QP_ID = 61,
-	CCERR_ADDR_IN_USE = 62,
-	CCERR_ADDR_NOT_AVAIL = 63,
-	CCERR_NET_DOWN = 64,
-	CCERR_NET_UNREACHABLE = 65,
-	CCERR_CONN_ABORTED = 66,
-	CCERR_CONN_RESET = 67,
-	CCERR_NO_BUFS = 68,
-	CCERR_CONN_TIMEDOUT = 69,
-	CCERR_CONN_REFUSED = 70,
-	CCERR_HOST_UNREACHABLE = 71,
-	CCERR_INVALID_SEND_SGL_DEPTH = 72,
-	CCERR_INVALID_RECV_SGL_DEPTH = 73,
-	CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74,
-	CCERR_INSUFFICIENT_PRIVILEGES = 75,
-	CCERR_STACK_ERROR = 76,
-	CCERR_INVALID_VERSION = 77,
-	CCERR_INVALID_MTU = 78,
-	CCERR_INVALID_IMAGE = 79,
-	CCERR_PENDING = 98,	/* not an error; user internally by adapter */
-	CCERR_DEFER = 99,	/* not an error; used internally by adapter */
-	CCERR_FAILED_WRITE = 100,
-	CCERR_FAILED_ERASE = 101,
-	CCERR_FAILED_VERIFICATION = 102,
-	CCERR_NOT_FOUND = 103,
-
-};
-
-/*
- * CCAE_ACTIVE_CONNECT_RESULTS status result codes.
- */
-enum c2_connect_status {
-	C2_CONN_STATUS_SUCCESS = C2_OK,
-	C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES,
-	C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT,
-	C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED,
-	C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE,
-	C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE,
-	C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC,
-	C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP,
-	C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE,
-	C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET,
-	C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL,
-};
-
-/*
- * Flash programming status codes.
- */
-enum c2_flash_status {
-	C2_FLASH_STATUS_SUCCESS = 0x0000,
-	C2_FLASH_STATUS_VERIFY_ERR = 0x0002,
-	C2_FLASH_STATUS_IMAGE_ERR = 0x0004,
-	C2_FLASH_STATUS_ECLBS = 0x0400,
-	C2_FLASH_STATUS_PSLBS = 0x0800,
-	C2_FLASH_STATUS_VPENS = 0x1000,
-};
-
-#endif				/* _C2_STATUS_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_user.h b/drivers/infiniband/hw/amso1100/c2_user.h
deleted file mode 100644
index 7e9e7ad..0000000
--- a/drivers/infiniband/hw/amso1100/c2_user.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef C2_USER_H
-#define C2_USER_H
-
-#include <linux/types.h>
-
-/*
- * Make sure that all structs defined in this file remain laid out so
- * that they pack the same way on 32-bit and 64-bit architectures (to
- * avoid incompatibility between 32-bit userspace and 64-bit kernels).
- * In particular do not use pointer types -- pass pointers in __u64
- * instead.
- */
-
-struct c2_alloc_ucontext_resp {
-	__u32 qp_tab_size;
-	__u32 uarc_size;
-};
-
-struct c2_alloc_pd_resp {
-	__u32 pdn;
-	__u32 reserved;
-};
-
-struct c2_create_cq {
-	__u32 lkey;
-	__u32 pdn;
-	__u64 arm_db_page;
-	__u64 set_db_page;
-	__u32 arm_db_index;
-	__u32 set_db_index;
-};
-
-struct c2_create_cq_resp {
-	__u32 cqn;
-	__u32 reserved;
-};
-
-struct c2_create_qp {
-	__u32 lkey;
-	__u32 reserved;
-	__u64 sq_db_page;
-	__u64 rq_db_page;
-	__u32 sq_db_index;
-	__u32 rq_db_index;
-};
-
-#endif				/* C2_USER_H */
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/infiniband/hw/amso1100/c2_vq.c
deleted file mode 100644
index 2ec716f..0000000
--- a/drivers/infiniband/hw/amso1100/c2_vq.c
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-
-#include "c2_vq.h"
-#include "c2_provider.h"
-
-/*
- * Verbs Request Objects:
- *
- * VQ Request Objects are allocated by the kernel verbs handlers.
- * They contain a wait object, a refcnt, an atomic bool indicating that the
- * adapter has replied, and a copy of the verb reply work request.
- * A pointer to the VQ Request Object is passed down in the context
- * field of the work request message, and reflected back by the adapter
- * in the verbs reply message.  The function handle_vq() in the interrupt
- * path will use this pointer to:
- * 	1) append a copy of the verbs reply message
- * 	2) mark that the reply is ready
- * 	3) wake up the kernel verbs handler blocked awaiting the reply.
- *
- *
- * The kernel verbs handlers do a "get" to put a 2nd reference on the
- * VQ Request object.  If the kernel verbs handler exits before the adapter
- * can respond, this extra reference will keep the VQ Request object around
- * until the adapter's reply can be processed.  The reason we need this is
- * because a pointer to this object is stuffed into the context field of
- * the verbs work request message, and reflected back in the reply message.
- * It is used in the interrupt handler (handle_vq()) to wake up the appropriate
- * kernel verb handler that is blocked awaiting the verb reply.
- * So handle_vq() will do a "put" on the object when it's done accessing it.
- * NOTE:  If we guarantee that the kernel verb handler will never bail before
- *        getting the reply, then we don't need these refcnts.
- *
- *
- * VQ Request objects are freed by the kernel verbs handlers only
- * after the verb has been processed, or when the adapter fails and
- * does not reply.
- *
- *
- * Verbs Reply Buffers:
- *
- * VQ Reply bufs are local host memory copies of a
- * outstanding Verb Request reply
- * message.  The are always allocated by the kernel verbs handlers, and _may_ be
- * freed by either the kernel verbs handler -or- the interrupt handler.  The
- * kernel verbs handler _must_ free the repbuf, then free the vq request object
- * in that order.
- */
-
-int vq_init(struct c2_dev *c2dev)
-{
-	sprintf(c2dev->vq_cache_name, "c2-vq:dev%c",
-		(char) ('0' + c2dev->devnum));
-	c2dev->host_msg_cache =
-	    kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
-			      SLAB_HWCACHE_ALIGN, NULL);
-	if (c2dev->host_msg_cache == NULL) {
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-void vq_term(struct c2_dev *c2dev)
-{
-	kmem_cache_destroy(c2dev->host_msg_cache);
-}
-
-/* vq_req_alloc - allocate a VQ Request Object and initialize it.
- * The refcnt is set to 1.
- */
-struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev)
-{
-	struct c2_vq_req *r;
-
-	r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL);
-	if (r) {
-		init_waitqueue_head(&r->wait_object);
-		r->reply_msg = 0;
-		r->event = 0;
-		r->cm_id = NULL;
-		r->qp = NULL;
-		atomic_set(&r->refcnt, 1);
-		atomic_set(&r->reply_ready, 0);
-	}
-	return r;
-}
-
-
-/* vq_req_free - free the VQ Request Object.  It is assumed the verbs handler
- * has already free the VQ Reply Buffer if it existed.
- */
-void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-	r->reply_msg = 0;
-	if (atomic_dec_and_test(&r->refcnt)) {
-		kfree(r);
-	}
-}
-
-/* vq_req_get - reference a VQ Request Object.  Done
- * only in the kernel verbs handlers.
- */
-void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-	atomic_inc(&r->refcnt);
-}
-
-
-/* vq_req_put - dereference and potentially free a VQ Request Object.
- *
- * This is only called by handle_vq() on the
- * interrupt when it is done processing
- * a verb reply message.  If the associated
- * kernel verbs handler has already bailed,
- * then this put will actually free the VQ
- * Request object _and_ the VQ Reply Buffer
- * if it exists.
- */
-void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-	if (atomic_dec_and_test(&r->refcnt)) {
-		if (r->reply_msg != 0)
-			vq_repbuf_free(c2dev,
-				       (void *) (unsigned long) r->reply_msg);
-		kfree(r);
-	}
-}
-
-
-/*
- * vq_repbuf_alloc - allocate a VQ Reply Buffer.
- */
-void *vq_repbuf_alloc(struct c2_dev *c2dev)
-{
-	return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
-}
-
-/*
- * vq_send_wr - post a verbs request message to the Verbs Request Queue.
- * If a message is not available in the MQ, then block until one is available.
- * NOTE: handle_mq() on the interrupt context will wake up threads blocked here.
- * When the adapter drains the Verbs Request Queue,
- * it inserts MQ index 0 in to the
- * adapter->host activity fifo and interrupts the host.
- */
-int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr)
-{
-	void *msg;
-	wait_queue_t __wait;
-
-	/*
-	 * grab adapter vq lock
-	 */
-	spin_lock(&c2dev->vqlock);
-
-	/*
-	 * allocate msg
-	 */
-	msg = c2_mq_alloc(&c2dev->req_vq);
-
-	/*
-	 * If we cannot get a msg, then we'll wait
-	 * When a messages are available, the int handler will wake_up()
-	 * any waiters.
-	 */
-	while (msg == NULL) {
-		pr_debug("%s:%d no available msg in VQ, waiting...\n",
-		       __func__, __LINE__);
-		init_waitqueue_entry(&__wait, current);
-		add_wait_queue(&c2dev->req_vq_wo, &__wait);
-		spin_unlock(&c2dev->vqlock);
-		for (;;) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (!c2_mq_full(&c2dev->req_vq)) {
-				break;
-			}
-			if (!signal_pending(current)) {
-				schedule_timeout(1 * HZ);	/* 1 second... */
-				continue;
-			}
-			set_current_state(TASK_RUNNING);
-			remove_wait_queue(&c2dev->req_vq_wo, &__wait);
-			return -EINTR;
-		}
-		set_current_state(TASK_RUNNING);
-		remove_wait_queue(&c2dev->req_vq_wo, &__wait);
-		spin_lock(&c2dev->vqlock);
-		msg = c2_mq_alloc(&c2dev->req_vq);
-	}
-
-	/*
-	 * copy wr into adapter msg
-	 */
-	memcpy(msg, wr, c2dev->req_vq.msg_size);
-
-	/*
-	 * post msg
-	 */
-	c2_mq_produce(&c2dev->req_vq);
-
-	/*
-	 * release adapter vq lock
-	 */
-	spin_unlock(&c2dev->vqlock);
-	return 0;
-}
-
-
-/*
- * vq_wait_for_reply - block until the adapter posts a Verb Reply Message.
- */
-int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req)
-{
-	if (!wait_event_timeout(req->wait_object,
-				atomic_read(&req->reply_ready),
-				60*HZ))
-		return -ETIMEDOUT;
-
-	return 0;
-}
-
-/*
- * vq_repbuf_free - Free a Verbs Reply Buffer.
- */
-void vq_repbuf_free(struct c2_dev *c2dev, void *reply)
-{
-	kmem_cache_free(c2dev->host_msg_cache, reply);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.h b/drivers/infiniband/hw/amso1100/c2_vq.h
deleted file mode 100644
index 3380562..0000000
--- a/drivers/infiniband/hw/amso1100/c2_vq.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_VQ_H_
-#define _C2_VQ_H_
-#include <linux/sched.h>
-#include "c2.h"
-#include "c2_wr.h"
-#include "c2_provider.h"
-
-struct c2_vq_req {
-	u64 reply_msg;		/* ptr to reply msg */
-	wait_queue_head_t wait_object;	/* wait object for vq reqs */
-	atomic_t reply_ready;	/* set when reply is ready */
-	atomic_t refcnt;	/* used to cancel WRs... */
-	int event;
-	struct iw_cm_id *cm_id;
-	struct c2_qp *qp;
-};
-
-extern int vq_init(struct c2_dev *c2dev);
-extern void vq_term(struct c2_dev *c2dev);
-
-extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev);
-extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req);
-extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req);
-extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req);
-extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr);
-
-extern void *vq_repbuf_alloc(struct c2_dev *c2dev);
-extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply);
-
-extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req);
-#endif				/* _C2_VQ_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_wr.h b/drivers/infiniband/hw/amso1100/c2_wr.h
deleted file mode 100644
index 8d4b4ca..0000000
--- a/drivers/infiniband/hw/amso1100/c2_wr.h
+++ /dev/null
@@ -1,1520 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_WR_H_
-#define _C2_WR_H_
-
-#ifdef CCDEBUG
-#define CCWR_MAGIC		0xb07700b0
-#endif
-
-#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
-
-/* Maximum allowed size in bytes of private_data exchange
- * on connect.
- */
-#define C2_MAX_PRIVATE_DATA_SIZE 200
-
-/*
- * These types are shared among the adapter, host, and CCIL consumer.
- */
-enum c2_cq_notification_type {
-	C2_CQ_NOTIFICATION_TYPE_NONE = 1,
-	C2_CQ_NOTIFICATION_TYPE_NEXT,
-	C2_CQ_NOTIFICATION_TYPE_NEXT_SE
-};
-
-enum c2_setconfig_cmd {
-	C2_CFG_ADD_ADDR = 1,
-	C2_CFG_DEL_ADDR = 2,
-	C2_CFG_ADD_ROUTE = 3,
-	C2_CFG_DEL_ROUTE = 4
-};
-
-enum c2_getconfig_cmd {
-	C2_GETCONFIG_ROUTES = 1,
-	C2_GETCONFIG_ADDRS
-};
-
-/*
- *  CCIL Work Request Identifiers
- */
-enum c2wr_ids {
-	CCWR_RNIC_OPEN = 1,
-	CCWR_RNIC_QUERY,
-	CCWR_RNIC_SETCONFIG,
-	CCWR_RNIC_GETCONFIG,
-	CCWR_RNIC_CLOSE,
-	CCWR_CQ_CREATE,
-	CCWR_CQ_QUERY,
-	CCWR_CQ_MODIFY,
-	CCWR_CQ_DESTROY,
-	CCWR_QP_CONNECT,
-	CCWR_PD_ALLOC,
-	CCWR_PD_DEALLOC,
-	CCWR_SRQ_CREATE,
-	CCWR_SRQ_QUERY,
-	CCWR_SRQ_MODIFY,
-	CCWR_SRQ_DESTROY,
-	CCWR_QP_CREATE,
-	CCWR_QP_QUERY,
-	CCWR_QP_MODIFY,
-	CCWR_QP_DESTROY,
-	CCWR_NSMR_STAG_ALLOC,
-	CCWR_NSMR_REGISTER,
-	CCWR_NSMR_PBL,
-	CCWR_STAG_DEALLOC,
-	CCWR_NSMR_REREGISTER,
-	CCWR_SMR_REGISTER,
-	CCWR_MR_QUERY,
-	CCWR_MW_ALLOC,
-	CCWR_MW_QUERY,
-	CCWR_EP_CREATE,
-	CCWR_EP_GETOPT,
-	CCWR_EP_SETOPT,
-	CCWR_EP_DESTROY,
-	CCWR_EP_BIND,
-	CCWR_EP_CONNECT,
-	CCWR_EP_LISTEN,
-	CCWR_EP_SHUTDOWN,
-	CCWR_EP_LISTEN_CREATE,
-	CCWR_EP_LISTEN_DESTROY,
-	CCWR_EP_QUERY,
-	CCWR_CR_ACCEPT,
-	CCWR_CR_REJECT,
-	CCWR_CONSOLE,
-	CCWR_TERM,
-	CCWR_FLASH_INIT,
-	CCWR_FLASH,
-	CCWR_BUF_ALLOC,
-	CCWR_BUF_FREE,
-	CCWR_FLASH_WRITE,
-	CCWR_INIT,		/* WARNING: Don't move this ever again! */
-
-
-
-	/* Add new IDs here */
-
-
-
-	/*
-	 * WARNING: CCWR_LAST must always be the last verbs id defined!
-	 *          All the preceding IDs are fixed, and must not change.
-	 *          You can add new IDs, but must not remove or reorder
-	 *          any IDs. If you do, YOU will ruin any hope of
-	 *          compatibility between versions.
-	 */
-	CCWR_LAST,
-
-	/*
-	 * Start over at 1 so that arrays indexed by user wr id's
-	 * begin at 1.  This is OK since the verbs and user wr id's
-	 * are always used on disjoint sets of queues.
-	 */
-	/*
-	 * The order of the CCWR_SEND_XX verbs must
-	 * match the order of the RDMA_OPs
-	 */
-	CCWR_SEND = 1,
-	CCWR_SEND_INV,
-	CCWR_SEND_SE,
-	CCWR_SEND_SE_INV,
-	CCWR_RDMA_WRITE,
-	CCWR_RDMA_READ,
-	CCWR_RDMA_READ_INV,
-	CCWR_MW_BIND,
-	CCWR_NSMR_FASTREG,
-	CCWR_STAG_INVALIDATE,
-	CCWR_RECV,
-	CCWR_NOP,
-	CCWR_UNIMPL,
-/* WARNING: This must always be the last user wr id defined! */
-};
-#define RDMA_SEND_OPCODE_FROM_WR_ID(x)   (x+2)
-
-/*
- * SQ/RQ Work Request Types
- */
-enum c2_wr_type {
-	C2_WR_TYPE_SEND = CCWR_SEND,
-	C2_WR_TYPE_SEND_SE = CCWR_SEND_SE,
-	C2_WR_TYPE_SEND_INV = CCWR_SEND_INV,
-	C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV,
-	C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE,
-	C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ,
-	C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV,
-	C2_WR_TYPE_BIND_MW = CCWR_MW_BIND,
-	C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG,
-	C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE,
-	C2_WR_TYPE_RECV = CCWR_RECV,
-	C2_WR_TYPE_NOP = CCWR_NOP,
-};
-
-struct c2_netaddr {
-	__be32 ip_addr;
-	__be32 netmask;
-	u32 mtu;
-};
-
-struct c2_route {
-	u32 ip_addr;		/* 0 indicates the default route */
-	u32 netmask;		/* netmask associated with dst */
-	u32 flags;
-	union {
-		u32 ipaddr;	/* address of the nexthop interface */
-		u8 enaddr[6];
-	} nexthop;
-};
-
-/*
- * A Scatter Gather Entry.
- */
-struct c2_data_addr {
-	__be32 stag;
-	__be32 length;
-	__be64 to;
-};
-
-/*
- * MR and MW flags used by the consumer, RI, and RNIC.
- */
-enum c2_mm_flags {
-	MEM_REMOTE = 0x0001,	/* allow mw binds with remote access. */
-	MEM_VA_BASED = 0x0002,	/* Not Zero-based */
-	MEM_PBL_COMPLETE = 0x0004,	/* PBL array is complete in this msg */
-	MEM_LOCAL_READ = 0x0008,	/* allow local reads */
-	MEM_LOCAL_WRITE = 0x0010,	/* allow local writes */
-	MEM_REMOTE_READ = 0x0020,	/* allow remote reads */
-	MEM_REMOTE_WRITE = 0x0040,	/* allow remote writes */
-	MEM_WINDOW_BIND = 0x0080,	/* binds allowed */
-	MEM_SHARED = 0x0100,	/* set if MR is shared */
-	MEM_STAG_VALID = 0x0200	/* set if STAG is in valid state */
-};
-
-/*
- * CCIL API ACF flags defined in terms of the low level mem flags.
- * This minimizes translation needed in the user API
- */
-enum c2_acf {
-	C2_ACF_LOCAL_READ = MEM_LOCAL_READ,
-	C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE,
-	C2_ACF_REMOTE_READ = MEM_REMOTE_READ,
-	C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE,
-	C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND
-};
-
-/*
- * Image types of objects written to flash
- */
-#define C2_FLASH_IMG_BITFILE 1
-#define C2_FLASH_IMG_OPTION_ROM 2
-#define C2_FLASH_IMG_VPD 3
-
-/*
- *  to fix bug 1815 we define the max size allowable of the
- *  terminate message (per the IETF spec).Refer to the IETF
- *  protocol specification, section 12.1.6, page 64)
- *  The message is prefixed by 20 types of DDP info.
- *
- *  Then the message has 6 bytes for the terminate control
- *  and DDP segment length info plus a DDP header (either
- *  14 or 18 byts) plus 28 bytes for the RDMA header.
- *  Thus the max size in:
- *  20 + (6 + 18 + 28) = 72
- */
-#define C2_MAX_TERMINATE_MESSAGE_SIZE (72)
-
-/*
- * Build String Length.  It must be the same as C2_BUILD_STR_LEN in ccil_api.h
- */
-#define WR_BUILD_STR_LEN 64
-
-/*
- * WARNING:  All of these structs need to align any 64bit types on
- * 64 bit boundaries!  64bit types include u64 and u64.
- */
-
-/*
- * Clustercore Work Request Header.  Be sensitive to field layout
- * and alignment.
- */
-struct c2wr_hdr {
-	/* wqe_count is part of the cqe.  It is put here so the
-	 * adapter can write to it while the wr is pending without
-	 * clobbering part of the wr.  This word need not be dma'd
-	 * from the host to adapter by libccil, but we copy it anyway
-	 * to make the memcpy to the adapter better aligned.
-	 */
-	__be32 wqe_count;
-
-	/* Put these fields next so that later 32- and 64-bit
-	 * quantities are naturally aligned.
-	 */
-	u8 id;
-	u8 result;		/* adapter -> host */
-	u8 sge_count;		/* host -> adapter */
-	u8 flags;		/* host -> adapter */
-
-	u64 context;
-#ifdef CCMSGMAGIC
-	u32 magic;
-	u32 pad;
-#endif
-} __attribute__((packed));
-
-/*
- *------------------------ RNIC ------------------------
- */
-
-/*
- * WR_RNIC_OPEN
- */
-
-/*
- * Flags for the RNIC WRs
- */
-enum c2_rnic_flags {
-	RNIC_IRD_STATIC = 0x0001,
-	RNIC_ORD_STATIC = 0x0002,
-	RNIC_QP_STATIC = 0x0004,
-	RNIC_SRQ_SUPPORTED = 0x0008,
-	RNIC_PBL_BLOCK_MODE = 0x0010,
-	RNIC_SRQ_MODEL_ARRIVAL = 0x0020,
-	RNIC_CQ_OVF_DETECTED = 0x0040,
-	RNIC_PRIV_MODE = 0x0080
-};
-
-struct c2wr_rnic_open_req {
-	struct c2wr_hdr hdr;
-	u64 user_context;
-	__be16 flags;		/* See enum c2_rnic_flags */
-	__be16 port_num;
-} __attribute__((packed));
-
-struct c2wr_rnic_open_rep {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-} __attribute__((packed));
-
-union c2wr_rnic_open {
-	struct c2wr_rnic_open_req req;
-	struct c2wr_rnic_open_rep rep;
-} __attribute__((packed));
-
-struct c2wr_rnic_query_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-} __attribute__((packed));
-
-/*
- * WR_RNIC_QUERY
- */
-struct c2wr_rnic_query_rep {
-	struct c2wr_hdr hdr;
-	u64 user_context;
-	__be32 vendor_id;
-	__be32 part_number;
-	__be32 hw_version;
-	__be32 fw_ver_major;
-	__be32 fw_ver_minor;
-	__be32 fw_ver_patch;
-	char fw_ver_build_str[WR_BUILD_STR_LEN];
-	__be32 max_qps;
-	__be32 max_qp_depth;
-	u32 max_srq_depth;
-	u32 max_send_sgl_depth;
-	u32 max_rdma_sgl_depth;
-	__be32 max_cqs;
-	__be32 max_cq_depth;
-	u32 max_cq_event_handlers;
-	__be32 max_mrs;
-	u32 max_pbl_depth;
-	__be32 max_pds;
-	__be32 max_global_ird;
-	u32 max_global_ord;
-	__be32 max_qp_ird;
-	__be32 max_qp_ord;
-	u32 flags;
-	__be32 max_mws;
-	u32 pbe_range_low;
-	u32 pbe_range_high;
-	u32 max_srqs;
-	u32 page_size;
-} __attribute__((packed));
-
-union c2wr_rnic_query {
-	struct c2wr_rnic_query_req req;
-	struct c2wr_rnic_query_rep rep;
-} __attribute__((packed));
-
-/*
- * WR_RNIC_GETCONFIG
- */
-
-struct c2wr_rnic_getconfig_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 option;		/* see c2_getconfig_cmd_t */
-	u64 reply_buf;
-	u32 reply_buf_len;
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_getconfig_rep {
-	struct c2wr_hdr hdr;
-	u32 option;		/* see c2_getconfig_cmd_t */
-	u32 count_len;		/* length of the number of addresses configured */
-} __attribute__((packed)) ;
-
-union c2wr_rnic_getconfig {
-	struct c2wr_rnic_getconfig_req req;
-	struct c2wr_rnic_getconfig_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * WR_RNIC_SETCONFIG
- */
-struct c2wr_rnic_setconfig_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	__be32 option;		/* See c2_setconfig_cmd_t */
-	/* variable data and pad. See c2_netaddr and c2_route */
-	u8 data[0];
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_setconfig_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_rnic_setconfig {
-	struct c2wr_rnic_setconfig_req req;
-	struct c2wr_rnic_setconfig_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * WR_RNIC_CLOSE
- */
-struct c2wr_rnic_close_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_close_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_rnic_close {
-	struct c2wr_rnic_close_req req;
-	struct c2wr_rnic_close_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ CQ ------------------------
- */
-struct c2wr_cq_create_req {
-	struct c2wr_hdr hdr;
-	__be64 shared_ht;
-	u64 user_context;
-	__be64 msg_pool;
-	u32 rnic_handle;
-	__be32 msg_size;
-	__be32 depth;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_create_rep {
-	struct c2wr_hdr hdr;
-	__be32 mq_index;
-	__be32 adapter_shared;
-	u32 cq_handle;
-} __attribute__((packed)) ;
-
-union c2wr_cq_create {
-	struct c2wr_cq_create_req req;
-	struct c2wr_cq_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_modify_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 cq_handle;
-	u32 new_depth;
-	u64 new_msg_pool;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_modify_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_cq_modify {
-	struct c2wr_cq_modify_req req;
-	struct c2wr_cq_modify_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_destroy_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 cq_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_destroy_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_cq_destroy {
-	struct c2wr_cq_destroy_req req;
-	struct c2wr_cq_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ PD ------------------------
- */
-struct c2wr_pd_alloc_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_alloc_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_pd_alloc {
-	struct c2wr_pd_alloc_req req;
-	struct c2wr_pd_alloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_dealloc_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_dealloc_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_pd_dealloc {
-	struct c2wr_pd_dealloc_req req;
-	struct c2wr_pd_dealloc_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ SRQ ------------------------
- */
-struct c2wr_srq_create_req {
-	struct c2wr_hdr hdr;
-	u64 shared_ht;
-	u64 user_context;
-	u32 rnic_handle;
-	u32 srq_depth;
-	u32 srq_limit;
-	u32 sgl_depth;
-	u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_create_rep {
-	struct c2wr_hdr hdr;
-	u32 srq_depth;
-	u32 sgl_depth;
-	u32 msg_size;
-	u32 mq_index;
-	u32 mq_start;
-	u32 srq_handle;
-} __attribute__((packed)) ;
-
-union c2wr_srq_create {
-	struct c2wr_srq_create_req req;
-	struct c2wr_srq_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_destroy_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 srq_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_destroy_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_srq_destroy {
-	struct c2wr_srq_destroy_req req;
-	struct c2wr_srq_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ QP ------------------------
- */
-enum c2wr_qp_flags {
-	QP_RDMA_READ = 0x00000001,	/* RDMA read enabled? */
-	QP_RDMA_WRITE = 0x00000002,	/* RDMA write enabled? */
-	QP_MW_BIND = 0x00000004,	/* MWs enabled */
-	QP_ZERO_STAG = 0x00000008,	/* enabled? */
-	QP_REMOTE_TERMINATION = 0x00000010,	/* remote end terminated */
-	QP_RDMA_READ_RESPONSE = 0x00000020	/* Remote RDMA read  */
-	    /* enabled? */
-};
-
-struct c2wr_qp_create_req {
-	struct c2wr_hdr hdr;
-	__be64 shared_sq_ht;
-	__be64 shared_rq_ht;
-	u64 user_context;
-	u32 rnic_handle;
-	u32 sq_cq_handle;
-	u32 rq_cq_handle;
-	__be32 sq_depth;
-	__be32 rq_depth;
-	u32 srq_handle;
-	u32 srq_limit;
-	__be32 flags;		/* see enum c2wr_qp_flags */
-	__be32 send_sgl_depth;
-	__be32 recv_sgl_depth;
-	__be32 rdma_write_sgl_depth;
-	__be32 ord;
-	__be32 ird;
-	u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_create_rep {
-	struct c2wr_hdr hdr;
-	__be32 sq_depth;
-	__be32 rq_depth;
-	u32 send_sgl_depth;
-	u32 recv_sgl_depth;
-	u32 rdma_write_sgl_depth;
-	u32 ord;
-	u32 ird;
-	__be32 sq_msg_size;
-	__be32 sq_mq_index;
-	__be32 sq_mq_start;
-	__be32 rq_msg_size;
-	__be32 rq_mq_index;
-	__be32 rq_mq_start;
-	u32 qp_handle;
-} __attribute__((packed)) ;
-
-union c2wr_qp_create {
-	struct c2wr_qp_create_req req;
-	struct c2wr_qp_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_query_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 qp_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_query_rep {
-	struct c2wr_hdr hdr;
-	u64 user_context;
-	u32 rnic_handle;
-	u32 sq_depth;
-	u32 rq_depth;
-	u32 send_sgl_depth;
-	u32 rdma_write_sgl_depth;
-	u32 recv_sgl_depth;
-	u32 ord;
-	u32 ird;
-	u16 qp_state;
-	u16 flags;		/* see c2wr_qp_flags_t */
-	u32 qp_id;
-	u32 local_addr;
-	u32 remote_addr;
-	u16 local_port;
-	u16 remote_port;
-	u32 terminate_msg_length;	/* 0 if not present */
-	u8 data[0];
-	/* Terminate Message in-line here. */
-} __attribute__((packed)) ;
-
-union c2wr_qp_query {
-	struct c2wr_qp_query_req req;
-	struct c2wr_qp_query_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_modify_req {
-	struct c2wr_hdr hdr;
-	u64 stream_msg;
-	u32 stream_msg_length;
-	u32 rnic_handle;
-	u32 qp_handle;
-	__be32 next_qp_state;
-	__be32 ord;
-	__be32 ird;
-	__be32 sq_depth;
-	__be32 rq_depth;
-	u32 llp_ep_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_modify_rep {
-	struct c2wr_hdr hdr;
-	u32 ord;
-	u32 ird;
-	u32 sq_depth;
-	u32 rq_depth;
-	u32 sq_msg_size;
-	u32 sq_mq_index;
-	u32 sq_mq_start;
-	u32 rq_msg_size;
-	u32 rq_mq_index;
-	u32 rq_mq_start;
-} __attribute__((packed)) ;
-
-union c2wr_qp_modify {
-	struct c2wr_qp_modify_req req;
-	struct c2wr_qp_modify_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_destroy_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 qp_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_destroy_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_qp_destroy {
-	struct c2wr_qp_destroy_req req;
-	struct c2wr_qp_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * The CCWR_QP_CONNECT msg is posted on the verbs request queue.  It can
- * only be posted when a QP is in IDLE state.  After the connect request is
- * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state.
- * No synchronous reply from adapter to this WR.  The results of
- * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS
- * See c2wr_ae_active_connect_results_t
- */
-struct c2wr_qp_connect_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 qp_handle;
-	__be32 remote_addr;
-	__be16 remote_port;
-	u16 pad;
-	__be32 private_data_length;
-	u8 private_data[0];	/* Private data in-line. */
-} __attribute__((packed)) ;
-
-struct c2wr_qp_connect {
-	struct c2wr_qp_connect_req req;
-	/* no synchronous reply.         */
-} __attribute__((packed)) ;
-
-
-/*
- *------------------------ MM ------------------------
- */
-
-struct c2wr_nsmr_stag_alloc_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 pbl_depth;
-	u32 pd_id;
-	u32 flags;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_stag_alloc_rep {
-	struct c2wr_hdr hdr;
-	u32 pbl_depth;
-	u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_stag_alloc {
-	struct c2wr_nsmr_stag_alloc_req req;
-	struct c2wr_nsmr_stag_alloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_register_req {
-	struct c2wr_hdr hdr;
-	__be64 va;
-	u32 rnic_handle;
-	__be16 flags;
-	u8 stag_key;
-	u8 pad;
-	u32 pd_id;
-	__be32 pbl_depth;
-	__be32 pbe_size;
-	__be32 fbo;
-	__be32 length;
-	__be32 addrs_length;
-	/* array of paddrs (must be aligned on a 64bit boundary) */
-	__be64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_register_rep {
-	struct c2wr_hdr hdr;
-	u32 pbl_depth;
-	__be32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_register {
-	struct c2wr_nsmr_register_req req;
-	struct c2wr_nsmr_register_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_pbl_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	__be32 flags;
-	__be32 stag_index;
-	__be32 addrs_length;
-	/* array of paddrs (must be aligned on a 64bit boundary) */
-	__be64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_pbl_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_pbl {
-	struct c2wr_nsmr_pbl_req req;
-	struct c2wr_nsmr_pbl_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mr_query_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_mr_query_rep {
-	struct c2wr_hdr hdr;
-	u8 stag_key;
-	u8 pad[3];
-	u32 pd_id;
-	u32 flags;
-	u32 pbl_depth;
-} __attribute__((packed)) ;
-
-union c2wr_mr_query {
-	struct c2wr_mr_query_req req;
-	struct c2wr_mr_query_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_query_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_query_rep {
-	struct c2wr_hdr hdr;
-	u8 stag_key;
-	u8 pad[3];
-	u32 pd_id;
-	u32 flags;
-} __attribute__((packed)) ;
-
-union c2wr_mw_query {
-	struct c2wr_mw_query_req req;
-	struct c2wr_mw_query_rep rep;
-} __attribute__((packed)) ;
-
-
-struct c2wr_stag_dealloc_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	__be32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_stag_dealloc_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_stag_dealloc {
-	struct c2wr_stag_dealloc_req req;
-	struct c2wr_stag_dealloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_reregister_req {
-	struct c2wr_hdr hdr;
-	u64 va;
-	u32 rnic_handle;
-	u16 flags;
-	u8 stag_key;
-	u8 pad;
-	u32 stag_index;
-	u32 pd_id;
-	u32 pbl_depth;
-	u32 pbe_size;
-	u32 fbo;
-	u32 length;
-	u32 addrs_length;
-	u32 pad1;
-	/* array of paddrs (must be aligned on a 64bit boundary) */
-	u64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_reregister_rep {
-	struct c2wr_hdr hdr;
-	u32 pbl_depth;
-	u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_reregister {
-	struct c2wr_nsmr_reregister_req req;
-	struct c2wr_nsmr_reregister_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_smr_register_req {
-	struct c2wr_hdr hdr;
-	u64 va;
-	u32 rnic_handle;
-	u16 flags;
-	u8 stag_key;
-	u8 pad;
-	u32 stag_index;
-	u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_smr_register_rep {
-	struct c2wr_hdr hdr;
-	u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_smr_register {
-	struct c2wr_smr_register_req req;
-	struct c2wr_smr_register_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_alloc_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_alloc_rep {
-	struct c2wr_hdr hdr;
-	u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_mw_alloc {
-	struct c2wr_mw_alloc_req req;
-	struct c2wr_mw_alloc_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ WRs -----------------------
- */
-
-struct c2wr_user_hdr {
-	struct c2wr_hdr hdr;		/* Has status and WR Type */
-} __attribute__((packed)) ;
-
-enum c2_qp_state {
-	C2_QP_STATE_IDLE = 0x01,
-	C2_QP_STATE_CONNECTING = 0x02,
-	C2_QP_STATE_RTS = 0x04,
-	C2_QP_STATE_CLOSING = 0x08,
-	C2_QP_STATE_TERMINATE = 0x10,
-	C2_QP_STATE_ERROR = 0x20,
-};
-
-/* Completion queue entry. */
-struct c2wr_ce {
-	struct c2wr_hdr hdr;		/* Has status and WR Type */
-	u64 qp_user_context;	/* c2_user_qp_t * */
-	u32 qp_state;		/* Current QP State */
-	u32 handle;		/* QPID or EP Handle */
-	__be32 bytes_rcvd;		/* valid for RECV WCs */
-	u32 stag;
-} __attribute__((packed)) ;
-
-
-/*
- * Flags used for all post-sq WRs.  These must fit in the flags
- * field of the struct c2wr_hdr (eight bits).
- */
-enum {
-	SQ_SIGNALED = 0x01,
-	SQ_READ_FENCE = 0x02,
-	SQ_FENCE = 0x04,
-};
-
-/*
- * Common fields for all post-sq WRs.  Namely the standard header and a
- * secondary header with fields common to all post-sq WRs.
- */
-struct c2_sq_hdr {
-	struct c2wr_user_hdr user_hdr;
-} __attribute__((packed));
-
-/*
- * Same as above but for post-rq WRs.
- */
-struct c2_rq_hdr {
-	struct c2wr_user_hdr user_hdr;
-} __attribute__((packed));
-
-/*
- * use the same struct for all sends.
- */
-struct c2wr_send_req {
-	struct c2_sq_hdr sq_hdr;
-	__be32 sge_len;
-	__be32 remote_stag;
-	u8 data[0];		/* SGE array */
-} __attribute__((packed));
-
-union c2wr_send {
-	struct c2wr_send_req req;
-	struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_rdma_write_req {
-	struct c2_sq_hdr sq_hdr;
-	__be64 remote_to;
-	__be32 remote_stag;
-	__be32 sge_len;
-	u8 data[0];		/* SGE array */
-} __attribute__((packed));
-
-union c2wr_rdma_write {
-	struct c2wr_rdma_write_req req;
-	struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_rdma_read_req {
-	struct c2_sq_hdr sq_hdr;
-	__be64 local_to;
-	__be64 remote_to;
-	__be32 local_stag;
-	__be32 remote_stag;
-	__be32 length;
-} __attribute__((packed));
-
-union c2wr_rdma_read {
-	struct c2wr_rdma_read_req req;
-	struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_mw_bind_req {
-	struct c2_sq_hdr sq_hdr;
-	u64 va;
-	u8 stag_key;
-	u8 pad[3];
-	u32 mw_stag_index;
-	u32 mr_stag_index;
-	u32 length;
-	u32 flags;
-} __attribute__((packed));
-
-union c2wr_mw_bind {
-	struct c2wr_mw_bind_req req;
-	struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_nsmr_fastreg_req {
-	struct c2_sq_hdr sq_hdr;
-	u64 va;
-	u8 stag_key;
-	u8 pad[3];
-	u32 stag_index;
-	u32 pbe_size;
-	u32 fbo;
-	u32 length;
-	u32 addrs_length;
-	/* array of paddrs (must be aligned on a 64bit boundary) */
-	u64 paddrs[0];
-} __attribute__((packed));
-
-union c2wr_nsmr_fastreg {
-	struct c2wr_nsmr_fastreg_req req;
-	struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_stag_invalidate_req {
-	struct c2_sq_hdr sq_hdr;
-	u8 stag_key;
-	u8 pad[3];
-	u32 stag_index;
-} __attribute__((packed));
-
-union c2wr_stag_invalidate {
-	struct c2wr_stag_invalidate_req req;
-	struct c2wr_ce rep;
-} __attribute__((packed));
-
-union c2wr_sqwr {
-	struct c2_sq_hdr sq_hdr;
-	struct c2wr_send_req send;
-	struct c2wr_send_req send_se;
-	struct c2wr_send_req send_inv;
-	struct c2wr_send_req send_se_inv;
-	struct c2wr_rdma_write_req rdma_write;
-	struct c2wr_rdma_read_req rdma_read;
-	struct c2wr_mw_bind_req mw_bind;
-	struct c2wr_nsmr_fastreg_req nsmr_fastreg;
-	struct c2wr_stag_invalidate_req stag_inv;
-} __attribute__((packed));
-
-
-/*
- * RQ WRs
- */
-struct c2wr_rqwr {
-	struct c2_rq_hdr rq_hdr;
-	u8 data[0];		/* array of SGEs */
-} __attribute__((packed));
-
-union c2wr_recv {
-	struct c2wr_rqwr req;
-	struct c2wr_ce rep;
-} __attribute__((packed));
-
-/*
- * All AEs start with this header.  Most AEs only need to convey the
- * information in the header.  Some, like LLP connection events, need
- * more info.  The union typdef c2wr_ae_t has all the possible AEs.
- *
- * hdr.context is the user_context from the rnic_open WR.  NULL If this
- * is not affiliated with an rnic
- *
- * hdr.id is the AE identifier (eg;  CCAE_REMOTE_SHUTDOWN,
- * CCAE_LLP_CLOSE_COMPLETE)
- *
- * resource_type is one of:  C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ
- *
- * user_context is the context passed down when the host created the resource.
- */
-struct c2wr_ae_hdr {
-	struct c2wr_hdr hdr;
-	u64 user_context;	/* user context for this res. */
-	__be32 resource_type;	/* see enum c2_resource_indicator */
-	__be32 resource;	/* handle for resource */
-	__be32 qp_state;	/* current QP State */
-} __attribute__((packed));
-
-/*
- * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ,
- * the adapter moves the QP into RTS state
- */
-struct c2wr_ae_active_connect_results {
-	struct c2wr_ae_hdr ae_hdr;
-	__be32 laddr;
-	__be32 raddr;
-	__be16 lport;
-	__be16 rport;
-	__be32 private_data_length;
-	u8 private_data[0];	/* data is in-line in the msg. */
-} __attribute__((packed));
-
-/*
- * When connections are established by the stack (and the private data
- * MPA frame is received), the adapter will generate an event to the host.
- * The details of the connection, any private data, and the new connection
- * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the
- * AE queue:
- */
-struct c2wr_ae_connection_request {
-	struct c2wr_ae_hdr ae_hdr;
-	u32 cr_handle;		/* connreq handle (sock ptr) */
-	__be32 laddr;
-	__be32 raddr;
-	__be16 lport;
-	__be16 rport;
-	__be32 private_data_length;
-	u8 private_data[0];	/* data is in-line in the msg. */
-} __attribute__((packed));
-
-union c2wr_ae {
-	struct c2wr_ae_hdr ae_generic;
-	struct c2wr_ae_active_connect_results ae_active_connect_results;
-	struct c2wr_ae_connection_request ae_connection_request;
-} __attribute__((packed));
-
-struct c2wr_init_req {
-	struct c2wr_hdr hdr;
-	__be64 hint_count;
-	__be64 q0_host_shared;
-	__be64 q1_host_shared;
-	__be64 q1_host_msg_pool;
-	__be64 q2_host_shared;
-	__be64 q2_host_msg_pool;
-} __attribute__((packed));
-
-struct c2wr_init_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_init {
-	struct c2wr_init_req req;
-	struct c2wr_init_rep rep;
-} __attribute__((packed));
-
-/*
- * For upgrading flash.
- */
-
-struct c2wr_flash_init_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-} __attribute__((packed));
-
-struct c2wr_flash_init_rep {
-	struct c2wr_hdr hdr;
-	u32 adapter_flash_buf_offset;
-	u32 adapter_flash_len;
-} __attribute__((packed));
-
-union c2wr_flash_init {
-	struct c2wr_flash_init_req req;
-	struct c2wr_flash_init_rep rep;
-} __attribute__((packed));
-
-struct c2wr_flash_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 len;
-} __attribute__((packed));
-
-struct c2wr_flash_rep {
-	struct c2wr_hdr hdr;
-	u32 status;
-} __attribute__((packed));
-
-union c2wr_flash {
-	struct c2wr_flash_req req;
-	struct c2wr_flash_rep rep;
-} __attribute__((packed));
-
-struct c2wr_buf_alloc_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 size;
-} __attribute__((packed));
-
-struct c2wr_buf_alloc_rep {
-	struct c2wr_hdr hdr;
-	u32 offset;		/* 0 if mem not available */
-	u32 size;		/* 0 if mem not available */
-} __attribute__((packed));
-
-union c2wr_buf_alloc {
-	struct c2wr_buf_alloc_req req;
-	struct c2wr_buf_alloc_rep rep;
-} __attribute__((packed));
-
-struct c2wr_buf_free_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 offset;		/* Must match value from alloc */
-	u32 size;		/* Must match value from alloc */
-} __attribute__((packed));
-
-struct c2wr_buf_free_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_buf_free {
-	struct c2wr_buf_free_req req;
-	struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_flash_write_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 offset;
-	u32 size;
-	u32 type;
-	u32 flags;
-} __attribute__((packed));
-
-struct c2wr_flash_write_rep {
-	struct c2wr_hdr hdr;
-	u32 status;
-} __attribute__((packed));
-
-union c2wr_flash_write {
-	struct c2wr_flash_write_req req;
-	struct c2wr_flash_write_rep rep;
-} __attribute__((packed));
-
-/*
- * Messages for LLP connection setup.
- */
-
-/*
- * Listen Request.  This allocates a listening endpoint to allow passive
- * connection setup.  Newly established LLP connections are passed up
- * via an AE.  See c2wr_ae_connection_request_t
- */
-struct c2wr_ep_listen_create_req {
-	struct c2wr_hdr hdr;
-	u64 user_context;	/* returned in AEs. */
-	u32 rnic_handle;
-	__be32 local_addr;		/* local addr, or 0  */
-	__be16 local_port;		/* 0 means "pick one" */
-	u16 pad;
-	__be32 backlog;		/* tradional tcp listen bl */
-} __attribute__((packed));
-
-struct c2wr_ep_listen_create_rep {
-	struct c2wr_hdr hdr;
-	u32 ep_handle;		/* handle to new listening ep */
-	u16 local_port;		/* resulting port... */
-	u16 pad;
-} __attribute__((packed));
-
-union c2wr_ep_listen_create {
-	struct c2wr_ep_listen_create_req req;
-	struct c2wr_ep_listen_create_rep rep;
-} __attribute__((packed));
-
-struct c2wr_ep_listen_destroy_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 ep_handle;
-} __attribute__((packed));
-
-struct c2wr_ep_listen_destroy_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_ep_listen_destroy {
-	struct c2wr_ep_listen_destroy_req req;
-	struct c2wr_ep_listen_destroy_rep rep;
-} __attribute__((packed));
-
-struct c2wr_ep_query_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 ep_handle;
-} __attribute__((packed));
-
-struct c2wr_ep_query_rep {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 local_addr;
-	u32 remote_addr;
-	u16 local_port;
-	u16 remote_port;
-} __attribute__((packed));
-
-union c2wr_ep_query {
-	struct c2wr_ep_query_req req;
-	struct c2wr_ep_query_rep rep;
-} __attribute__((packed));
-
-
-/*
- * The host passes this down to indicate acceptance of a pending iWARP
- * connection.  The cr_handle was obtained from the CONNECTION_REQUEST
- * AE passed up by the adapter.  See c2wr_ae_connection_request_t.
- */
-struct c2wr_cr_accept_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 qp_handle;		/* QP to bind to this LLP conn */
-	u32 ep_handle;		/* LLP  handle to accept */
-	__be32 private_data_length;
-	u8 private_data[0];	/* data in-line in msg. */
-} __attribute__((packed));
-
-/*
- * adapter sends reply when private data is successfully submitted to
- * the LLP.
- */
-struct c2wr_cr_accept_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_cr_accept {
-	struct c2wr_cr_accept_req req;
-	struct c2wr_cr_accept_rep rep;
-} __attribute__((packed));
-
-/*
- * The host sends this down if a given iWARP connection request was
- * rejected by the consumer.  The cr_handle was obtained from a
- * previous c2wr_ae_connection_request_t AE sent by the adapter.
- */
-struct  c2wr_cr_reject_req {
-	struct c2wr_hdr hdr;
-	u32 rnic_handle;
-	u32 ep_handle;		/* LLP handle to reject */
-} __attribute__((packed));
-
-/*
- * Dunno if this is needed, but we'll add it for now.  The adapter will
- * send the reject_reply after the LLP endpoint has been destroyed.
- */
-struct  c2wr_cr_reject_rep {
-	struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_cr_reject {
-	struct c2wr_cr_reject_req req;
-	struct c2wr_cr_reject_rep rep;
-} __attribute__((packed));
-
-/*
- * console command.  Used to implement a debug console over the verbs
- * request and reply queues.
- */
-
-/*
- * Console request message.  It contains:
- *	- message hdr with id = CCWR_CONSOLE
- *	- the physaddr/len of host memory to be used for the reply.
- *	- the command string.  eg:  "netstat -s" or "zoneinfo"
- */
-struct c2wr_console_req {
-	struct c2wr_hdr hdr;		/* id = CCWR_CONSOLE */
-	u64 reply_buf;		/* pinned host buf for reply */
-	u32 reply_buf_len;	/* length of reply buffer */
-	u8 command[0];		/* NUL terminated ascii string */
-	/* containing the command req */
-} __attribute__((packed));
-
-/*
- * flags used in the console reply.
- */
-enum c2_console_flags {
-	CONS_REPLY_TRUNCATED = 0x00000001	/* reply was truncated */
-} __attribute__((packed));
-
-/*
- * Console reply message.
- * hdr.result contains the c2_status_t error if the reply was _not_ generated,
- * or C2_OK if the reply was generated.
- */
-struct c2wr_console_rep {
-	struct c2wr_hdr hdr;		/* id = CCWR_CONSOLE */
-	u32 flags;
-} __attribute__((packed));
-
-union c2wr_console {
-	struct c2wr_console_req req;
-	struct c2wr_console_rep rep;
-} __attribute__((packed));
-
-
-/*
- * Giant union with all WRs.  Makes life easier...
- */
-union c2wr {
-	struct c2wr_hdr hdr;
-	struct c2wr_user_hdr user_hdr;
-	union c2wr_rnic_open rnic_open;
-	union c2wr_rnic_query rnic_query;
-	union c2wr_rnic_getconfig rnic_getconfig;
-	union c2wr_rnic_setconfig rnic_setconfig;
-	union c2wr_rnic_close rnic_close;
-	union c2wr_cq_create cq_create;
-	union c2wr_cq_modify cq_modify;
-	union c2wr_cq_destroy cq_destroy;
-	union c2wr_pd_alloc pd_alloc;
-	union c2wr_pd_dealloc pd_dealloc;
-	union c2wr_srq_create srq_create;
-	union c2wr_srq_destroy srq_destroy;
-	union c2wr_qp_create qp_create;
-	union c2wr_qp_query qp_query;
-	union c2wr_qp_modify qp_modify;
-	union c2wr_qp_destroy qp_destroy;
-	struct c2wr_qp_connect qp_connect;
-	union c2wr_nsmr_stag_alloc nsmr_stag_alloc;
-	union c2wr_nsmr_register nsmr_register;
-	union c2wr_nsmr_pbl nsmr_pbl;
-	union c2wr_mr_query mr_query;
-	union c2wr_mw_query mw_query;
-	union c2wr_stag_dealloc stag_dealloc;
-	union c2wr_sqwr sqwr;
-	struct c2wr_rqwr rqwr;
-	struct c2wr_ce ce;
-	union c2wr_ae ae;
-	union c2wr_init init;
-	union c2wr_ep_listen_create ep_listen_create;
-	union c2wr_ep_listen_destroy ep_listen_destroy;
-	union c2wr_cr_accept cr_accept;
-	union c2wr_cr_reject cr_reject;
-	union c2wr_console console;
-	union c2wr_flash_init flash_init;
-	union c2wr_flash flash;
-	union c2wr_buf_alloc buf_alloc;
-	union c2wr_buf_free buf_free;
-	union c2wr_flash_write flash_write;
-} __attribute__((packed));
-
-
-/*
- * Accessors for the wr fields that are packed together tightly to
- * reduce the wr message size.  The wr arguments are void* so that
- * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types
- * in the struct c2wr union can be passed in.
- */
-static __inline__ u8 c2_wr_get_id(void *wr)
-{
-	return ((struct c2wr_hdr *) wr)->id;
-}
-static __inline__ void c2_wr_set_id(void *wr, u8 id)
-{
-	((struct c2wr_hdr *) wr)->id = id;
-}
-static __inline__ u8 c2_wr_get_result(void *wr)
-{
-	return ((struct c2wr_hdr *) wr)->result;
-}
-static __inline__ void c2_wr_set_result(void *wr, u8 result)
-{
-	((struct c2wr_hdr *) wr)->result = result;
-}
-static __inline__ u8 c2_wr_get_flags(void *wr)
-{
-	return ((struct c2wr_hdr *) wr)->flags;
-}
-static __inline__ void c2_wr_set_flags(void *wr, u8 flags)
-{
-	((struct c2wr_hdr *) wr)->flags = flags;
-}
-static __inline__ u8 c2_wr_get_sge_count(void *wr)
-{
-	return ((struct c2wr_hdr *) wr)->sge_count;
-}
-static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count)
-{
-	((struct c2wr_hdr *) wr)->sge_count = sge_count;
-}
-static __inline__ __be32 c2_wr_get_wqe_count(void *wr)
-{
-	return ((struct c2wr_hdr *) wr)->wqe_count;
-}
-static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count)
-{
-	((struct c2wr_hdr *) wr)->wqe_count = wqe_count;
-}
-
-#endif				/* _C2_WR_H_ */
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
index e582553..5084088 100644
--- a/drivers/staging/rdma/Kconfig
+++ b/drivers/staging/rdma/Kconfig
@@ -22,6 +22,8 @@ menuconfig STAGING_RDMA
 # Please keep entries in alphabetic order
 if STAGING_RDMA
 
+source "drivers/staging/rdma/amso1100/Kconfig"
+
 source "drivers/staging/rdma/ipath/Kconfig"
 
 endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
index 484dac7..a2a459a 100644
--- a/drivers/staging/rdma/Makefile
+++ b/drivers/staging/rdma/Makefile
@@ -1,2 +1,3 @@
 # Entries for RDMA_STAGING tree
+obj-$(CONFIG_INFINIBAND_AMSO1100)	+= amso1100/
 obj-$(CONFIG_INFINIBAND_IPATH)	+= ipath/
diff --git a/drivers/staging/rdma/amso1100/Kbuild b/drivers/staging/rdma/amso1100/Kbuild
new file mode 100644
index 0000000..950dfab
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/Kbuild
@@ -0,0 +1,6 @@
+ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG
+
+obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o
+
+iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \
+	c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o
diff --git a/drivers/staging/rdma/amso1100/Kconfig b/drivers/staging/rdma/amso1100/Kconfig
new file mode 100644
index 0000000..e6ce5f2
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/Kconfig
@@ -0,0 +1,15 @@
+config INFINIBAND_AMSO1100
+	tristate "Ammasso 1100 HCA support"
+	depends on PCI && INET
+	---help---
+	  This is a low-level driver for the Ammasso 1100 host
+	  channel adapter (HCA).
+
+config INFINIBAND_AMSO1100_DEBUG
+	bool "Verbose debugging output"
+	depends on INFINIBAND_AMSO1100
+	default n
+	---help---
+	  This option causes the amso1100 driver to produce a bunch of
+	  debug messages.  Select this if you are developing the driver
+	  or trying to diagnose a problem.
diff --git a/drivers/staging/rdma/amso1100/TODO b/drivers/staging/rdma/amso1100/TODO
new file mode 100644
index 0000000..18b00a5
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/TODO
@@ -0,0 +1,4 @@
+7/2015
+
+The amso1100 driver has been deprecated and moved to drivers/staging.
+It will be removed in the 4.6 merge window.
diff --git a/drivers/staging/rdma/amso1100/c2.c b/drivers/staging/rdma/amso1100/c2.c
new file mode 100644
index 0000000..766a71c
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2.c
@@ -0,0 +1,1241 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_provider.h"
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
+    | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
+
+static int debug = -1;		/* defaults above */
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+
+static int c2_up(struct net_device *netdev);
+static int c2_down(struct net_device *netdev);
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
+static void c2_tx_interrupt(struct net_device *netdev);
+static void c2_rx_interrupt(struct net_device *netdev);
+static irqreturn_t c2_interrupt(int irq, void *dev_id);
+static void c2_tx_timeout(struct net_device *netdev);
+static int c2_change_mtu(struct net_device *netdev, int new_mtu);
+static void c2_reset(struct c2_port *c2_port);
+
+static struct pci_device_id c2_pci_table[] = {
+	{ PCI_DEVICE(0x18b8, 0xb001) },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, c2_pci_table);
+
+static void c2_print_macaddr(struct net_device *netdev)
+{
+	pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq);
+}
+
+static void c2_set_rxbufsize(struct c2_port *c2_port)
+{
+	struct net_device *netdev = c2_port->netdev;
+
+	if (netdev->mtu > RX_BUF_SIZE)
+		c2_port->rx_buf_size =
+		    netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) +
+		    NET_IP_ALIGN;
+	else
+		c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE;
+}
+
+/*
+ * Allocate TX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr,
+			    dma_addr_t base, void __iomem * mmio_txp_ring)
+{
+	struct c2_tx_desc *tx_desc;
+	struct c2_txp_desc __iomem *txp_desc;
+	struct c2_element *elem;
+	int i;
+
+	tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL);
+	if (!tx_ring->start)
+		return -ENOMEM;
+
+	elem = tx_ring->start;
+	tx_desc = vaddr;
+	txp_desc = mmio_txp_ring;
+	for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) {
+		tx_desc->len = 0;
+		tx_desc->status = 0;
+
+		/* Set TXP_HTXD_UNINIT */
+		__raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+			     (void __iomem *) txp_desc + C2_TXP_ADDR);
+		__raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN);
+		__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+			     (void __iomem *) txp_desc + C2_TXP_FLAGS);
+
+		elem->skb = NULL;
+		elem->ht_desc = tx_desc;
+		elem->hw_desc = txp_desc;
+
+		if (i == tx_ring->count - 1) {
+			elem->next = tx_ring->start;
+			tx_desc->next_offset = base;
+		} else {
+			elem->next = elem + 1;
+			tx_desc->next_offset =
+			    base + (i + 1) * sizeof(*tx_desc);
+		}
+	}
+
+	tx_ring->to_use = tx_ring->to_clean = tx_ring->start;
+
+	return 0;
+}
+
+/*
+ * Allocate RX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr,
+			    dma_addr_t base, void __iomem * mmio_rxp_ring)
+{
+	struct c2_rx_desc *rx_desc;
+	struct c2_rxp_desc __iomem *rxp_desc;
+	struct c2_element *elem;
+	int i;
+
+	rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL);
+	if (!rx_ring->start)
+		return -ENOMEM;
+
+	elem = rx_ring->start;
+	rx_desc = vaddr;
+	rxp_desc = mmio_rxp_ring;
+	for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) {
+		rx_desc->len = 0;
+		rx_desc->status = 0;
+
+		/* Set RXP_HRXD_UNINIT */
+		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK),
+		       (void __iomem *) rxp_desc + C2_RXP_STATUS);
+		__raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT);
+		__raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN);
+		__raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+			     (void __iomem *) rxp_desc + C2_RXP_ADDR);
+		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+			     (void __iomem *) rxp_desc + C2_RXP_FLAGS);
+
+		elem->skb = NULL;
+		elem->ht_desc = rx_desc;
+		elem->hw_desc = rxp_desc;
+
+		if (i == rx_ring->count - 1) {
+			elem->next = rx_ring->start;
+			rx_desc->next_offset = base;
+		} else {
+			elem->next = elem + 1;
+			rx_desc->next_offset =
+			    base + (i + 1) * sizeof(*rx_desc);
+		}
+	}
+
+	rx_ring->to_use = rx_ring->to_clean = rx_ring->start;
+
+	return 0;
+}
+
+/* Setup buffer for receiving */
+static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem)
+{
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_rx_desc *rx_desc = elem->ht_desc;
+	struct sk_buff *skb;
+	dma_addr_t mapaddr;
+	u32 maplen;
+	struct c2_rxp_hdr *rxp_hdr;
+
+	skb = dev_alloc_skb(c2_port->rx_buf_size);
+	if (unlikely(!skb)) {
+		pr_debug("%s: out of memory for receive\n",
+			c2_port->netdev->name);
+		return -ENOMEM;
+	}
+
+	/* Zero out the rxp hdr in the sk_buff */
+	memset(skb->data, 0, sizeof(*rxp_hdr));
+
+	skb->dev = c2_port->netdev;
+
+	maplen = c2_port->rx_buf_size;
+	mapaddr =
+	    pci_map_single(c2dev->pcidev, skb->data, maplen,
+			   PCI_DMA_FROMDEVICE);
+
+	/* Set the sk_buff RXP_header to RXP_HRXD_READY */
+	rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+	rxp_hdr->flags = RXP_HRXD_READY;
+
+	__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+	__raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)),
+		     elem->hw_desc + C2_RXP_LEN);
+	__raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR);
+	__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+		     elem->hw_desc + C2_RXP_FLAGS);
+
+	elem->skb = skb;
+	elem->mapaddr = mapaddr;
+	elem->maplen = maplen;
+	rx_desc->len = maplen;
+
+	return 0;
+}
+
+/*
+ * Allocate buffers for the Rx ring
+ * For receive:  rx_ring.to_clean is next received frame
+ */
+static int c2_rx_fill(struct c2_port *c2_port)
+{
+	struct c2_ring *rx_ring = &c2_port->rx_ring;
+	struct c2_element *elem;
+	int ret = 0;
+
+	elem = rx_ring->start;
+	do {
+		if (c2_rx_alloc(c2_port, elem)) {
+			ret = 1;
+			break;
+		}
+	} while ((elem = elem->next) != rx_ring->start);
+
+	rx_ring->to_clean = rx_ring->start;
+	return ret;
+}
+
+/* Free all buffers in RX ring, assumes receiver stopped */
+static void c2_rx_clean(struct c2_port *c2_port)
+{
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_ring *rx_ring = &c2_port->rx_ring;
+	struct c2_element *elem;
+	struct c2_rx_desc *rx_desc;
+
+	elem = rx_ring->start;
+	do {
+		rx_desc = elem->ht_desc;
+		rx_desc->len = 0;
+
+		__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+		__raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+		__raw_writew(0, elem->hw_desc + C2_RXP_LEN);
+		__raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+			     elem->hw_desc + C2_RXP_ADDR);
+		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+			     elem->hw_desc + C2_RXP_FLAGS);
+
+		if (elem->skb) {
+			pci_unmap_single(c2dev->pcidev, elem->mapaddr,
+					 elem->maplen, PCI_DMA_FROMDEVICE);
+			dev_kfree_skb(elem->skb);
+			elem->skb = NULL;
+		}
+	} while ((elem = elem->next) != rx_ring->start);
+}
+
+static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem)
+{
+	struct c2_tx_desc *tx_desc = elem->ht_desc;
+
+	tx_desc->len = 0;
+
+	pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen,
+			 PCI_DMA_TODEVICE);
+
+	if (elem->skb) {
+		dev_kfree_skb_any(elem->skb);
+		elem->skb = NULL;
+	}
+
+	return 0;
+}
+
+/* Free all buffers in TX ring, assumes transmitter stopped */
+static void c2_tx_clean(struct c2_port *c2_port)
+{
+	struct c2_ring *tx_ring = &c2_port->tx_ring;
+	struct c2_element *elem;
+	struct c2_txp_desc txp_htxd;
+	int retry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+	elem = tx_ring->start;
+
+	do {
+		retry = 0;
+		do {
+			txp_htxd.flags =
+			    readw(elem->hw_desc + C2_TXP_FLAGS);
+
+			if (txp_htxd.flags == TXP_HTXD_READY) {
+				retry = 1;
+				__raw_writew(0,
+					     elem->hw_desc + C2_TXP_LEN);
+				__raw_writeq(0,
+					     elem->hw_desc + C2_TXP_ADDR);
+				__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE),
+					     elem->hw_desc + C2_TXP_FLAGS);
+				c2_port->netdev->stats.tx_dropped++;
+				break;
+			} else {
+				__raw_writew(0,
+					     elem->hw_desc + C2_TXP_LEN);
+				__raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+					     elem->hw_desc + C2_TXP_ADDR);
+				__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+					     elem->hw_desc + C2_TXP_FLAGS);
+			}
+
+			c2_tx_free(c2_port->c2dev, elem);
+
+		} while ((elem = elem->next) != tx_ring->start);
+	} while (retry);
+
+	c2_port->tx_avail = c2_port->tx_ring.count - 1;
+	c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start;
+
+	if (c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+		netif_wake_queue(c2_port->netdev);
+
+	spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+}
+
+/*
+ * Process transmit descriptors marked 'DONE' by the firmware,
+ * freeing up their unneeded sk_buffs.
+ */
+static void c2_tx_interrupt(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_ring *tx_ring = &c2_port->tx_ring;
+	struct c2_element *elem;
+	struct c2_txp_desc txp_htxd;
+
+	spin_lock(&c2_port->tx_lock);
+
+	for (elem = tx_ring->to_clean; elem != tx_ring->to_use;
+	     elem = elem->next) {
+		txp_htxd.flags =
+		    be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS));
+
+		if (txp_htxd.flags != TXP_HTXD_DONE)
+			break;
+
+		if (netif_msg_tx_done(c2_port)) {
+			/* PCI reads are expensive in fast path */
+			txp_htxd.len =
+			    be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN));
+			pr_debug("%s: tx done slot %3Zu status 0x%x len "
+				"%5u bytes\n",
+				netdev->name, elem - tx_ring->start,
+				txp_htxd.flags, txp_htxd.len);
+		}
+
+		c2_tx_free(c2dev, elem);
+		++(c2_port->tx_avail);
+	}
+
+	tx_ring->to_clean = elem;
+
+	if (netif_queue_stopped(netdev)
+	    && c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+		netif_wake_queue(netdev);
+
+	spin_unlock(&c2_port->tx_lock);
+}
+
+static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem)
+{
+	struct c2_rx_desc *rx_desc = elem->ht_desc;
+	struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+
+	if (rxp_hdr->status != RXP_HRXD_OK ||
+	    rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) {
+		pr_debug("BAD RXP_HRXD\n");
+		pr_debug("  rx_desc : %p\n", rx_desc);
+		pr_debug("    index : %Zu\n",
+			elem - c2_port->rx_ring.start);
+		pr_debug("    len   : %u\n", rx_desc->len);
+		pr_debug("  rxp_hdr : %p [PA %p]\n", rxp_hdr,
+			(void *) __pa((unsigned long) rxp_hdr));
+		pr_debug("    flags : 0x%x\n", rxp_hdr->flags);
+		pr_debug("    status: 0x%x\n", rxp_hdr->status);
+		pr_debug("    len   : %u\n", rxp_hdr->len);
+		pr_debug("    rsvd  : 0x%x\n", rxp_hdr->rsvd);
+	}
+
+	/* Setup the skb for reuse since we're dropping this pkt */
+	elem->skb->data = elem->skb->head;
+	skb_reset_tail_pointer(elem->skb);
+
+	/* Zero out the rxp hdr in the sk_buff */
+	memset(elem->skb->data, 0, sizeof(*rxp_hdr));
+
+	/* Write the descriptor to the adapter's rx ring */
+	__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+	__raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+	__raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)),
+		     elem->hw_desc + C2_RXP_LEN);
+	__raw_writeq((__force u64) cpu_to_be64(elem->mapaddr),
+		     elem->hw_desc + C2_RXP_ADDR);
+	__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+		     elem->hw_desc + C2_RXP_FLAGS);
+
+	pr_debug("packet dropped\n");
+	c2_port->netdev->stats.rx_dropped++;
+}
+
+static void c2_rx_interrupt(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_ring *rx_ring = &c2_port->rx_ring;
+	struct c2_element *elem;
+	struct c2_rx_desc *rx_desc;
+	struct c2_rxp_hdr *rxp_hdr;
+	struct sk_buff *skb;
+	dma_addr_t mapaddr;
+	u32 maplen, buflen;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c2dev->lock, flags);
+
+	/* Begin where we left off */
+	rx_ring->to_clean = rx_ring->start + c2dev->cur_rx;
+
+	for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean;
+	     elem = elem->next) {
+		rx_desc = elem->ht_desc;
+		mapaddr = elem->mapaddr;
+		maplen = elem->maplen;
+		skb = elem->skb;
+		rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+
+		if (rxp_hdr->flags != RXP_HRXD_DONE)
+			break;
+		buflen = rxp_hdr->len;
+
+		/* Sanity check the RXP header */
+		if (rxp_hdr->status != RXP_HRXD_OK ||
+		    buflen > (rx_desc->len - sizeof(*rxp_hdr))) {
+			c2_rx_error(c2_port, elem);
+			continue;
+		}
+
+		/*
+		 * Allocate and map a new skb for replenishing the host
+		 * RX desc
+		 */
+		if (c2_rx_alloc(c2_port, elem)) {
+			c2_rx_error(c2_port, elem);
+			continue;
+		}
+
+		/* Unmap the old skb */
+		pci_unmap_single(c2dev->pcidev, mapaddr, maplen,
+				 PCI_DMA_FROMDEVICE);
+
+		prefetch(skb->data);
+
+		/*
+		 * Skip past the leading 8 bytes comprising of the
+		 * "struct c2_rxp_hdr", prepended by the adapter
+		 * to the usual Ethernet header ("struct ethhdr"),
+		 * to the start of the raw Ethernet packet.
+		 *
+		 * Fix up the various fields in the sk_buff before
+		 * passing it up to netif_rx(). The transfer size
+		 * (in bytes) specified by the adapter len field of
+		 * the "struct rxp_hdr_t" does NOT include the
+		 * "sizeof(struct c2_rxp_hdr)".
+		 */
+		skb->data += sizeof(*rxp_hdr);
+		skb_set_tail_pointer(skb, buflen);
+		skb->len = buflen;
+		skb->protocol = eth_type_trans(skb, netdev);
+
+		netif_rx(skb);
+
+		netdev->stats.rx_packets++;
+		netdev->stats.rx_bytes += buflen;
+	}
+
+	/* Save where we left off */
+	rx_ring->to_clean = elem;
+	c2dev->cur_rx = elem - rx_ring->start;
+	C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+	spin_unlock_irqrestore(&c2dev->lock, flags);
+}
+
+/*
+ * Handle netisr0 TX & RX interrupts.
+ */
+static irqreturn_t c2_interrupt(int irq, void *dev_id)
+{
+	unsigned int netisr0, dmaisr;
+	int handled = 0;
+	struct c2_dev *c2dev = (struct c2_dev *) dev_id;
+
+	/* Process CCILNET interrupts */
+	netisr0 = readl(c2dev->regs + C2_NISR0);
+	if (netisr0) {
+
+		/*
+		 * There is an issue with the firmware that always
+		 * provides the status of RX for both TX & RX
+		 * interrupts.  So process both queues here.
+		 */
+		c2_rx_interrupt(c2dev->netdev);
+		c2_tx_interrupt(c2dev->netdev);
+
+		/* Clear the interrupt */
+		writel(netisr0, c2dev->regs + C2_NISR0);
+		handled++;
+	}
+
+	/* Process RNIC interrupts */
+	dmaisr = readl(c2dev->regs + C2_DISR);
+	if (dmaisr) {
+		writel(dmaisr, c2dev->regs + C2_DISR);
+		c2_rnic_interrupt(c2dev);
+		handled++;
+	}
+
+	if (handled) {
+		return IRQ_HANDLED;
+	} else {
+		return IRQ_NONE;
+	}
+}
+
+static int c2_up(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_element *elem;
+	struct c2_rxp_hdr *rxp_hdr;
+	struct in_device *in_dev;
+	size_t rx_size, tx_size;
+	int ret, i;
+	unsigned int netimr0;
+
+	if (netif_msg_ifup(c2_port))
+		pr_debug("%s: enabling interface\n", netdev->name);
+
+	/* Set the Rx buffer size based on MTU */
+	c2_set_rxbufsize(c2_port);
+
+	/* Allocate DMA'able memory for Tx/Rx host descriptor rings */
+	rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc);
+	tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc);
+
+	c2_port->mem_size = tx_size + rx_size;
+	c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size,
+					     &c2_port->dma);
+	if (c2_port->mem == NULL) {
+		pr_debug("Unable to allocate memory for "
+			"host descriptor rings\n");
+		return -ENOMEM;
+	}
+
+	/* Create the Rx host descriptor ring */
+	if ((ret =
+	     c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma,
+			      c2dev->mmio_rxp_ring))) {
+		pr_debug("Unable to create RX ring\n");
+		goto bail0;
+	}
+
+	/* Allocate Rx buffers for the host descriptor ring */
+	if (c2_rx_fill(c2_port)) {
+		pr_debug("Unable to fill RX ring\n");
+		goto bail1;
+	}
+
+	/* Create the Tx host descriptor ring */
+	if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size,
+				    c2_port->dma + rx_size,
+				    c2dev->mmio_txp_ring))) {
+		pr_debug("Unable to create TX ring\n");
+		goto bail1;
+	}
+
+	/* Set the TX pointer to where we left off */
+	c2_port->tx_avail = c2_port->tx_ring.count - 1;
+	c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean =
+	    c2_port->tx_ring.start + c2dev->cur_tx;
+
+	/* missing: Initialize MAC */
+
+	BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean);
+
+	/* Reset the adapter, ensures the driver is in sync with the RXP */
+	c2_reset(c2_port);
+
+	/* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */
+	for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count;
+	     i++, elem++) {
+		rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+		rxp_hdr->flags = 0;
+		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+			     elem->hw_desc + C2_RXP_FLAGS);
+	}
+
+	/* Enable network packets */
+	netif_start_queue(netdev);
+
+	/* Enable IRQ */
+	writel(0, c2dev->regs + C2_IDIS);
+	netimr0 = readl(c2dev->regs + C2_NIMR0);
+	netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT);
+	writel(netimr0, c2dev->regs + C2_NIMR0);
+
+	/* Tell the stack to ignore arp requests for ipaddrs bound to
+	 * other interfaces.  This is needed to prevent the host stack
+	 * from responding to arp requests to the ipaddr bound on the
+	 * rdma interface.
+	 */
+	in_dev = in_dev_get(netdev);
+	IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
+	in_dev_put(in_dev);
+
+	return 0;
+
+      bail1:
+	c2_rx_clean(c2_port);
+	kfree(c2_port->rx_ring.start);
+
+      bail0:
+	pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+			    c2_port->dma);
+
+	return ret;
+}
+
+static int c2_down(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+
+	if (netif_msg_ifdown(c2_port))
+		pr_debug("%s: disabling interface\n",
+			netdev->name);
+
+	/* Wait for all the queued packets to get sent */
+	c2_tx_interrupt(netdev);
+
+	/* Disable network packets */
+	netif_stop_queue(netdev);
+
+	/* Disable IRQs by clearing the interrupt mask */
+	writel(1, c2dev->regs + C2_IDIS);
+	writel(0, c2dev->regs + C2_NIMR0);
+
+	/* missing: Stop transmitter */
+
+	/* missing: Stop receiver */
+
+	/* Reset the adapter, ensures the driver is in sync with the RXP */
+	c2_reset(c2_port);
+
+	/* missing: Turn off LEDs here */
+
+	/* Free all buffers in the host descriptor rings */
+	c2_tx_clean(c2_port);
+	c2_rx_clean(c2_port);
+
+	/* Free the host descriptor rings */
+	kfree(c2_port->rx_ring.start);
+	kfree(c2_port->tx_ring.start);
+	pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+			    c2_port->dma);
+
+	return 0;
+}
+
+static void c2_reset(struct c2_port *c2_port)
+{
+	struct c2_dev *c2dev = c2_port->c2dev;
+	unsigned int cur_rx = c2dev->cur_rx;
+
+	/* Tell the hardware to quiesce */
+	C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI);
+
+	/*
+	 * The hardware will reset the C2_PCI_HRX_QUI bit once
+	 * the RXP is quiesced.  Wait 2 seconds for this.
+	 */
+	ssleep(2);
+
+	cur_rx = C2_GET_CUR_RX(c2dev);
+
+	if (cur_rx & C2_PCI_HRX_QUI)
+		pr_debug("c2_reset: failed to quiesce the hardware!\n");
+
+	cur_rx &= ~C2_PCI_HRX_QUI;
+
+	c2dev->cur_rx = cur_rx;
+
+	pr_debug("Current RX: %u\n", c2dev->cur_rx);
+}
+
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_ring *tx_ring = &c2_port->tx_ring;
+	struct c2_element *elem;
+	dma_addr_t mapaddr;
+	u32 maplen;
+	unsigned long flags;
+	unsigned int i;
+
+	spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+	if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) {
+		netif_stop_queue(netdev);
+		spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+		pr_debug("%s: Tx ring full when queue awake!\n",
+			netdev->name);
+		return NETDEV_TX_BUSY;
+	}
+
+	maplen = skb_headlen(skb);
+	mapaddr =
+	    pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE);
+
+	elem = tx_ring->to_use;
+	elem->skb = skb;
+	elem->mapaddr = mapaddr;
+	elem->maplen = maplen;
+
+	/* Tell HW to xmit */
+	__raw_writeq((__force u64) cpu_to_be64(mapaddr),
+		     elem->hw_desc + C2_TXP_ADDR);
+	__raw_writew((__force u16) cpu_to_be16(maplen),
+		     elem->hw_desc + C2_TXP_LEN);
+	__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+		     elem->hw_desc + C2_TXP_FLAGS);
+
+	netdev->stats.tx_packets++;
+	netdev->stats.tx_bytes += maplen;
+
+	/* Loop thru additional data fragments and queue them */
+	if (skb_shinfo(skb)->nr_frags) {
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			maplen = skb_frag_size(frag);
+			mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag,
+						   0, maplen, DMA_TO_DEVICE);
+			elem = elem->next;
+			elem->skb = NULL;
+			elem->mapaddr = mapaddr;
+			elem->maplen = maplen;
+
+			/* Tell HW to xmit */
+			__raw_writeq((__force u64) cpu_to_be64(mapaddr),
+				     elem->hw_desc + C2_TXP_ADDR);
+			__raw_writew((__force u16) cpu_to_be16(maplen),
+				     elem->hw_desc + C2_TXP_LEN);
+			__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+				     elem->hw_desc + C2_TXP_FLAGS);
+
+			netdev->stats.tx_packets++;
+			netdev->stats.tx_bytes += maplen;
+		}
+	}
+
+	tx_ring->to_use = elem->next;
+	c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1);
+
+	if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) {
+		netif_stop_queue(netdev);
+		if (netif_msg_tx_queued(c2_port))
+			pr_debug("%s: transmit queue full\n",
+				netdev->name);
+	}
+
+	spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+	netdev->trans_start = jiffies;
+
+	return NETDEV_TX_OK;
+}
+
+static void c2_tx_timeout(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+
+	if (netif_msg_timer(c2_port))
+		pr_debug("%s: tx timeout\n", netdev->name);
+
+	c2_tx_clean(c2_port);
+}
+
+static int c2_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	int ret = 0;
+
+	if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+		return -EINVAL;
+
+	netdev->mtu = new_mtu;
+
+	if (netif_running(netdev)) {
+		c2_down(netdev);
+
+		c2_up(netdev);
+	}
+
+	return ret;
+}
+
+static const struct net_device_ops c2_netdev = {
+	.ndo_open 		= c2_up,
+	.ndo_stop 		= c2_down,
+	.ndo_start_xmit		= c2_xmit_frame,
+	.ndo_tx_timeout		= c2_tx_timeout,
+	.ndo_change_mtu		= c2_change_mtu,
+	.ndo_set_mac_address 	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+/* Initialize network device */
+static struct net_device *c2_devinit(struct c2_dev *c2dev,
+				     void __iomem * mmio_addr)
+{
+	struct c2_port *c2_port = NULL;
+	struct net_device *netdev = alloc_etherdev(sizeof(*c2_port));
+
+	if (!netdev) {
+		pr_debug("c2_port etherdev alloc failed");
+		return NULL;
+	}
+
+	SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+	netdev->netdev_ops = &c2_netdev;
+	netdev->watchdog_timeo = C2_TX_TIMEOUT;
+	netdev->irq = c2dev->pcidev->irq;
+
+	c2_port = netdev_priv(netdev);
+	c2_port->netdev = netdev;
+	c2_port->c2dev = c2dev;
+	c2_port->msg_enable = netif_msg_init(debug, default_msg);
+	c2_port->tx_ring.count = C2_NUM_TX_DESC;
+	c2_port->rx_ring.count = C2_NUM_RX_DESC;
+
+	spin_lock_init(&c2_port->tx_lock);
+
+	/* Copy our 48-bit ethernet hardware address */
+	memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6);
+
+	/* Validate the MAC address */
+	if (!is_valid_ether_addr(netdev->dev_addr)) {
+		pr_debug("Invalid MAC Address\n");
+		c2_print_macaddr(netdev);
+		free_netdev(netdev);
+		return NULL;
+	}
+
+	c2dev->netdev = netdev;
+
+	return netdev;
+}
+
+static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
+{
+	int ret = 0, i;
+	unsigned long reg0_start, reg0_flags, reg0_len;
+	unsigned long reg2_start, reg2_flags, reg2_len;
+	unsigned long reg4_start, reg4_flags, reg4_len;
+	unsigned kva_map_size;
+	struct net_device *netdev = NULL;
+	struct c2_dev *c2dev = NULL;
+	void __iomem *mmio_regs = NULL;
+
+	printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n",
+		DRV_VERSION);
+
+	/* Enable PCI device */
+	ret = pci_enable_device(pcidev);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: Unable to enable PCI device\n",
+			pci_name(pcidev));
+		goto bail0;
+	}
+
+	reg0_start = pci_resource_start(pcidev, BAR_0);
+	reg0_len = pci_resource_len(pcidev, BAR_0);
+	reg0_flags = pci_resource_flags(pcidev, BAR_0);
+
+	reg2_start = pci_resource_start(pcidev, BAR_2);
+	reg2_len = pci_resource_len(pcidev, BAR_2);
+	reg2_flags = pci_resource_flags(pcidev, BAR_2);
+
+	reg4_start = pci_resource_start(pcidev, BAR_4);
+	reg4_len = pci_resource_len(pcidev, BAR_4);
+	reg4_flags = pci_resource_flags(pcidev, BAR_4);
+
+	pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len);
+	pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len);
+	pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len);
+
+	/* Make sure PCI base addr are MMIO */
+	if (!(reg0_flags & IORESOURCE_MEM) ||
+	    !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) {
+		printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
+		ret = -ENODEV;
+		goto bail1;
+	}
+
+	/* Check for weird/broken PCI region reporting */
+	if ((reg0_len < C2_REG0_SIZE) ||
+	    (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) {
+		printk(KERN_ERR PFX "Invalid PCI region sizes\n");
+		ret = -ENODEV;
+		goto bail1;
+	}
+
+	/* Reserve PCI I/O and memory resources */
+	ret = pci_request_regions(pcidev, DRV_NAME);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: Unable to request regions\n",
+			pci_name(pcidev));
+		goto bail1;
+	}
+
+	if ((sizeof(dma_addr_t) > 4)) {
+		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
+		if (ret < 0) {
+			printk(KERN_ERR PFX "64b DMA configuration failed\n");
+			goto bail2;
+		}
+	} else {
+		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+		if (ret < 0) {
+			printk(KERN_ERR PFX "32b DMA configuration failed\n");
+			goto bail2;
+		}
+	}
+
+	/* Enables bus-mastering on the device */
+	pci_set_master(pcidev);
+
+	/* Remap the adapter PCI registers in BAR4 */
+	mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+				    sizeof(struct c2_adapter_pci_regs));
+	if (!mmio_regs) {
+		printk(KERN_ERR PFX
+			"Unable to remap adapter PCI registers in BAR4\n");
+		ret = -EIO;
+		goto bail2;
+	}
+
+	/* Validate PCI regs magic */
+	for (i = 0; i < sizeof(c2_magic); i++) {
+		if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) {
+			printk(KERN_ERR PFX "Downlevel Firmware boot loader "
+				"[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash "
+			       "utility to update your boot loader\n",
+				i + 1, sizeof(c2_magic),
+				readb(mmio_regs + C2_REGS_MAGIC + i),
+				c2_magic[i]);
+			printk(KERN_ERR PFX "Adapter not claimed\n");
+			iounmap(mmio_regs);
+			ret = -EIO;
+			goto bail2;
+		}
+	}
+
+	/* Validate the adapter version */
+	if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) {
+		printk(KERN_ERR PFX "Version mismatch "
+			"[fw=%u, c2=%u], Adapter not claimed\n",
+			be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)),
+			C2_VERSION);
+		ret = -EINVAL;
+		iounmap(mmio_regs);
+		goto bail2;
+	}
+
+	/* Validate the adapter IVN */
+	if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) {
+		printk(KERN_ERR PFX "Downlevel FIrmware level. You should be using "
+		       "the OpenIB device support kit. "
+		       "[fw=0x%x, c2=0x%x], Adapter not claimed\n",
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)),
+		       C2_IVN);
+		ret = -EINVAL;
+		iounmap(mmio_regs);
+		goto bail2;
+	}
+
+	/* Allocate hardware structure */
+	c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev));
+	if (!c2dev) {
+		printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n",
+			pci_name(pcidev));
+		ret = -ENOMEM;
+		iounmap(mmio_regs);
+		goto bail2;
+	}
+
+	memset(c2dev, 0, sizeof(*c2dev));
+	spin_lock_init(&c2dev->lock);
+	c2dev->pcidev = pcidev;
+	c2dev->cur_tx = 0;
+
+	/* Get the last RX index */
+	c2dev->cur_rx =
+	    (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) -
+	     0xffffc000) / sizeof(struct c2_rxp_desc);
+
+	/* Request an interrupt line for the driver */
+	ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
+			pci_name(pcidev), pcidev->irq);
+		iounmap(mmio_regs);
+		goto bail3;
+	}
+
+	/* Set driver specific data */
+	pci_set_drvdata(pcidev, c2dev);
+
+	/* Initialize network device */
+	if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) {
+		ret = -ENOMEM;
+		iounmap(mmio_regs);
+		goto bail4;
+	}
+
+	/* Save off the actual size prior to unmapping mmio_regs */
+	kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE));
+
+	/* Unmap the adapter PCI registers in BAR4 */
+	iounmap(mmio_regs);
+
+	/* Register network device */
+	ret = register_netdev(netdev);
+	if (ret) {
+		printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n",
+			ret);
+		goto bail5;
+	}
+
+	/* Disable network packets */
+	netif_stop_queue(netdev);
+
+	/* Remap the adapter HRXDQ PA space to kernel VA space */
+	c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET,
+					       C2_RXP_HRXDQ_SIZE);
+	if (!c2dev->mmio_rxp_ring) {
+		printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n");
+		ret = -EIO;
+		goto bail6;
+	}
+
+	/* Remap the adapter HTXDQ PA space to kernel VA space */
+	c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET,
+					       C2_TXP_HTXDQ_SIZE);
+	if (!c2dev->mmio_txp_ring) {
+		printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n");
+		ret = -EIO;
+		goto bail7;
+	}
+
+	/* Save off the current RX index in the last 4 bytes of the TXP Ring */
+	C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+	/* Remap the PCI registers in adapter BAR0 to kernel VA space */
+	c2dev->regs = ioremap_nocache(reg0_start, reg0_len);
+	if (!c2dev->regs) {
+		printk(KERN_ERR PFX "Unable to remap BAR0\n");
+		ret = -EIO;
+		goto bail8;
+	}
+
+	/* Remap the PCI registers in adapter BAR4 to kernel VA space */
+	c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET;
+	c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+				     kva_map_size);
+	if (!c2dev->kva) {
+		printk(KERN_ERR PFX "Unable to remap BAR4\n");
+		ret = -EIO;
+		goto bail9;
+	}
+
+	/* Print out the MAC address */
+	c2_print_macaddr(netdev);
+
+	ret = c2_rnic_init(c2dev);
+	if (ret) {
+		printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret);
+		goto bail10;
+	}
+
+	ret = c2_register_device(c2dev);
+	if (ret)
+		goto bail10;
+
+	return 0;
+
+ bail10:
+	iounmap(c2dev->kva);
+
+ bail9:
+	iounmap(c2dev->regs);
+
+ bail8:
+	iounmap(c2dev->mmio_txp_ring);
+
+ bail7:
+	iounmap(c2dev->mmio_rxp_ring);
+
+ bail6:
+	unregister_netdev(netdev);
+
+ bail5:
+	free_netdev(netdev);
+
+ bail4:
+	free_irq(pcidev->irq, c2dev);
+
+ bail3:
+	ib_dealloc_device(&c2dev->ibdev);
+
+ bail2:
+	pci_release_regions(pcidev);
+
+ bail1:
+	pci_disable_device(pcidev);
+
+ bail0:
+	return ret;
+}
+
+static void c2_remove(struct pci_dev *pcidev)
+{
+	struct c2_dev *c2dev = pci_get_drvdata(pcidev);
+	struct net_device *netdev = c2dev->netdev;
+
+	/* Unregister with OpenIB */
+	c2_unregister_device(c2dev);
+
+	/* Clean up the RNIC resources */
+	c2_rnic_term(c2dev);
+
+	/* Remove network device from the kernel */
+	unregister_netdev(netdev);
+
+	/* Free network device */
+	free_netdev(netdev);
+
+	/* Free the interrupt line */
+	free_irq(pcidev->irq, c2dev);
+
+	/* missing: Turn LEDs off here */
+
+	/* Unmap adapter PA space */
+	iounmap(c2dev->kva);
+	iounmap(c2dev->regs);
+	iounmap(c2dev->mmio_txp_ring);
+	iounmap(c2dev->mmio_rxp_ring);
+
+	/* Free the hardware structure */
+	ib_dealloc_device(&c2dev->ibdev);
+
+	/* Release reserved PCI I/O and memory resources */
+	pci_release_regions(pcidev);
+
+	/* Disable PCI device */
+	pci_disable_device(pcidev);
+
+	/* Clear driver specific data */
+	pci_set_drvdata(pcidev, NULL);
+}
+
+static struct pci_driver c2_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = c2_pci_table,
+	.probe = c2_probe,
+	.remove = c2_remove,
+};
+
+module_pci_driver(c2_pci_driver);
diff --git a/drivers/staging/rdma/amso1100/c2.h b/drivers/staging/rdma/amso1100/c2.h
new file mode 100644
index 0000000..d619d73
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2.h
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __C2_H
+#define __C2_H
+
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+
+#include "c2_provider.h"
+#include "c2_mq.h"
+#include "c2_status.h"
+
+#define DRV_NAME     "c2"
+#define DRV_VERSION  "1.1"
+#define PFX          DRV_NAME ": "
+
+#define BAR_0                0
+#define BAR_2                2
+#define BAR_4                4
+
+#define RX_BUF_SIZE         (1536 + 8)
+#define ETH_JUMBO_MTU        9000
+#define C2_MAGIC            "CEPHEUS"
+#define C2_VERSION           4
+#define C2_IVN              (18 & 0x7fffffff)
+
+#define C2_REG0_SIZE        (16 * 1024)
+#define C2_REG2_SIZE        (2 * 1024 * 1024)
+#define C2_REG4_SIZE        (256 * 1024 * 1024)
+#define C2_NUM_TX_DESC       341
+#define C2_NUM_RX_DESC       256
+#define C2_PCI_REGS_OFFSET  (0x10000)
+#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2))
+#define C2_RXP_HRXDQ_SIZE   (4096)
+#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE)
+#define C2_TXP_HTXDQ_SIZE   (4096)
+#define C2_TX_TIMEOUT	    (6*HZ)
+
+/* CEPHEUS */
+static const u8 c2_magic[] = {
+	0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53
+};
+
+enum adapter_pci_regs {
+	C2_REGS_MAGIC = 0x0000,
+	C2_REGS_VERS = 0x0008,
+	C2_REGS_IVN = 0x000C,
+	C2_REGS_PCI_WINSIZE = 0x0010,
+	C2_REGS_Q0_QSIZE = 0x0014,
+	C2_REGS_Q0_MSGSIZE = 0x0018,
+	C2_REGS_Q0_POOLSTART = 0x001C,
+	C2_REGS_Q0_SHARED = 0x0020,
+	C2_REGS_Q1_QSIZE = 0x0024,
+	C2_REGS_Q1_MSGSIZE = 0x0028,
+	C2_REGS_Q1_SHARED = 0x0030,
+	C2_REGS_Q2_QSIZE = 0x0034,
+	C2_REGS_Q2_MSGSIZE = 0x0038,
+	C2_REGS_Q2_SHARED = 0x0040,
+	C2_REGS_ENADDR = 0x004C,
+	C2_REGS_RDMA_ENADDR = 0x0054,
+	C2_REGS_HRX_CUR = 0x006C,
+};
+
+struct c2_adapter_pci_regs {
+	char reg_magic[8];
+	u32 version;
+	u32 ivn;
+	u32 pci_window_size;
+	u32 q0_q_size;
+	u32 q0_msg_size;
+	u32 q0_pool_start;
+	u32 q0_shared;
+	u32 q1_q_size;
+	u32 q1_msg_size;
+	u32 q1_pool_start;
+	u32 q1_shared;
+	u32 q2_q_size;
+	u32 q2_msg_size;
+	u32 q2_pool_start;
+	u32 q2_shared;
+	u32 log_start;
+	u32 log_size;
+	u8 host_enaddr[8];
+	u8 rdma_enaddr[8];
+	u32 crash_entry;
+	u32 crash_ready[2];
+	u32 fw_txd_cur;
+	u32 fw_hrxd_cur;
+	u32 fw_rxd_cur;
+};
+
+enum pci_regs {
+	C2_HISR = 0x0000,
+	C2_DISR = 0x0004,
+	C2_HIMR = 0x0008,
+	C2_DIMR = 0x000C,
+	C2_NISR0 = 0x0010,
+	C2_NISR1 = 0x0014,
+	C2_NIMR0 = 0x0018,
+	C2_NIMR1 = 0x001C,
+	C2_IDIS = 0x0020,
+};
+
+enum {
+	C2_PCI_HRX_INT = 1 << 8,
+	C2_PCI_HTX_INT = 1 << 17,
+	C2_PCI_HRX_QUI = 1 << 31,
+};
+
+/*
+ * Cepheus registers in BAR0.
+ */
+struct c2_pci_regs {
+	u32 hostisr;
+	u32 dmaisr;
+	u32 hostimr;
+	u32 dmaimr;
+	u32 netisr0;
+	u32 netisr1;
+	u32 netimr0;
+	u32 netimr1;
+	u32 int_disable;
+};
+
+/* TXP flags */
+enum c2_txp_flags {
+	TXP_HTXD_DONE = 0,
+	TXP_HTXD_READY = 1 << 0,
+	TXP_HTXD_UNINIT = 1 << 1,
+};
+
+/* RXP flags */
+enum c2_rxp_flags {
+	RXP_HRXD_UNINIT = 0,
+	RXP_HRXD_READY = 1 << 0,
+	RXP_HRXD_DONE = 1 << 1,
+};
+
+/* RXP status */
+enum c2_rxp_status {
+	RXP_HRXD_ZERO = 0,
+	RXP_HRXD_OK = 1 << 0,
+	RXP_HRXD_BUF_OV = 1 << 1,
+};
+
+/* TXP descriptor fields */
+enum txp_desc {
+	C2_TXP_FLAGS = 0x0000,
+	C2_TXP_LEN = 0x0002,
+	C2_TXP_ADDR = 0x0004,
+};
+
+/* RXP descriptor fields */
+enum rxp_desc {
+	C2_RXP_FLAGS = 0x0000,
+	C2_RXP_STATUS = 0x0002,
+	C2_RXP_COUNT = 0x0004,
+	C2_RXP_LEN = 0x0006,
+	C2_RXP_ADDR = 0x0008,
+};
+
+struct c2_txp_desc {
+	u16 flags;
+	u16 len;
+	u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_desc {
+	u16 flags;
+	u16 status;
+	u16 count;
+	u16 len;
+	u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_hdr {
+	u16 flags;
+	u16 status;
+	u16 len;
+	u16 rsvd;
+} __attribute__ ((packed));
+
+struct c2_tx_desc {
+	u32 len;
+	u32 status;
+	dma_addr_t next_offset;
+};
+
+struct c2_rx_desc {
+	u32 len;
+	u32 status;
+	dma_addr_t next_offset;
+};
+
+struct c2_alloc {
+	u32 last;
+	u32 max;
+	spinlock_t lock;
+	unsigned long *table;
+};
+
+struct c2_array {
+	struct {
+		void **page;
+		int used;
+	} *page_list;
+};
+
+/*
+ * The MQ shared pointer pool is organized as a linked list of
+ * chunks. Each chunk contains a linked list of free shared pointers
+ * that can be allocated to a given user mode client.
+ *
+ */
+struct sp_chunk {
+	struct sp_chunk *next;
+	dma_addr_t dma_addr;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	u16 head;
+	u16 shared_ptr[0];
+};
+
+struct c2_pd_table {
+	u32 last;
+	u32 max;
+	spinlock_t lock;
+	unsigned long *table;
+};
+
+struct c2_qp_table {
+	struct idr idr;
+	spinlock_t lock;
+};
+
+struct c2_element {
+	struct c2_element *next;
+	void *ht_desc;		/* host     descriptor */
+	void __iomem *hw_desc;	/* hardware descriptor */
+	struct sk_buff *skb;
+	dma_addr_t mapaddr;
+	u32 maplen;
+};
+
+struct c2_ring {
+	struct c2_element *to_clean;
+	struct c2_element *to_use;
+	struct c2_element *start;
+	unsigned long count;
+};
+
+struct c2_dev {
+	struct ib_device ibdev;
+	void __iomem *regs;
+	void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */
+	void __iomem *mmio_rxp_ring;
+	spinlock_t lock;
+	struct pci_dev *pcidev;
+	struct net_device *netdev;
+	struct net_device *pseudo_netdev;
+	unsigned int cur_tx;
+	unsigned int cur_rx;
+	u32 adapter_handle;
+	int device_cap_flags;
+	void __iomem *kva;	/* KVA device memory */
+	unsigned long pa;	/* PA device memory */
+	void **qptr_array;
+
+	struct kmem_cache *host_msg_cache;
+
+	struct list_head cca_link;		/* adapter list */
+	struct list_head eh_wakeup_list;	/* event wakeup list */
+	wait_queue_head_t req_vq_wo;
+
+	/* Cached RNIC properties */
+	struct ib_device_attr props;
+
+	struct c2_pd_table pd_table;
+	struct c2_qp_table qp_table;
+	int ports;		/* num of GigE ports */
+	int devnum;
+	spinlock_t vqlock;	/* sync vbs req MQ */
+
+	/* Verbs Queues */
+	struct c2_mq req_vq;	/* Verbs Request MQ */
+	struct c2_mq rep_vq;	/* Verbs Reply MQ */
+	struct c2_mq aeq;	/* Async Events MQ */
+
+	/* Kernel client MQs */
+	struct sp_chunk *kern_mqsp_pool;
+
+	/* Device updates these values when posting messages to a host
+	 * target queue */
+	u16 req_vq_shared;
+	u16 rep_vq_shared;
+	u16 aeq_shared;
+	u16 irq_claimed;
+
+	/*
+	 * Shared host target pages for user-accessible MQs.
+	 */
+	int hthead;		/* index of first free entry */
+	void *htpages;		/* kernel vaddr */
+	int htlen;		/* length of htpages memory */
+	void *htuva;		/* user mapped vaddr */
+	spinlock_t htlock;	/* serialize allocation */
+
+	u64 adapter_hint_uva;	/* access to the activity FIFO */
+
+	//	spinlock_t aeq_lock;
+	//	spinlock_t rnic_lock;
+
+	__be16 *hint_count;
+	dma_addr_t hint_count_dma;
+	u16 hints_read;
+
+	int init;		/* TRUE if it's ready */
+	char ae_cache_name[16];
+	char vq_cache_name[16];
+};
+
+struct c2_port {
+	u32 msg_enable;
+	struct c2_dev *c2dev;
+	struct net_device *netdev;
+
+	spinlock_t tx_lock;
+	u32 tx_avail;
+	struct c2_ring tx_ring;
+	struct c2_ring rx_ring;
+
+	void *mem;		/* PCI memory for host rings */
+	dma_addr_t dma;
+	unsigned long mem_size;
+
+	u32 rx_buf_size;
+};
+
+/*
+ * Activity FIFO registers in BAR0.
+ */
+#define PCI_BAR0_HOST_HINT	0x100
+#define PCI_BAR0_ADAPTER_HINT	0x2000
+
+/*
+ * Ammasso PCI vendor id and Cepheus PCI device id.
+ */
+#define CQ_ARMED 	0x01
+#define CQ_WAIT_FOR_DMA	0x80
+
+/*
+ * The format of a hint is as follows:
+ * Lower 16 bits are the count of hints for the queue.
+ * Next 15 bits are the qp_index
+ * Upper most bit depends on who reads it:
+ *    If read by producer, then it means Full (1) or Not-Full (0)
+ *    If read by consumer, then it means Empty (1) or Not-Empty (0)
+ */
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
+
+
+/*
+ * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t
+ * struct.
+ */
+#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000
+
+#ifndef readq
+static inline u64 readq(const void __iomem * addr)
+{
+	u64 ret = readl(addr + 4);
+	ret <<= 32;
+	ret |= readl(addr);
+
+	return ret;
+}
+#endif
+
+#ifndef writeq
+static inline void __raw_writeq(u64 val, void __iomem * addr)
+{
+	__raw_writel((u32) (val), addr);
+	__raw_writel((u32) (val >> 32), (addr + 4));
+}
+#endif
+
+#define C2_SET_CUR_RX(c2dev, cur_rx) \
+	__raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092)
+
+#define C2_GET_CUR_RX(c2dev) \
+	be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092))
+
+static inline struct c2_dev *to_c2dev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct c2_dev, ibdev);
+}
+
+static inline int c2_errno(void *reply)
+{
+	switch (c2_wr_get_result(reply)) {
+	case C2_OK:
+		return 0;
+	case CCERR_NO_BUFS:
+	case CCERR_INSUFFICIENT_RESOURCES:
+	case CCERR_ZERO_RDMA_READ_RESOURCES:
+		return -ENOMEM;
+	case CCERR_MR_IN_USE:
+	case CCERR_QP_IN_USE:
+		return -EBUSY;
+	case CCERR_ADDR_IN_USE:
+		return -EADDRINUSE;
+	case CCERR_ADDR_NOT_AVAIL:
+		return -EADDRNOTAVAIL;
+	case CCERR_CONN_RESET:
+		return -ECONNRESET;
+	case CCERR_NOT_IMPLEMENTED:
+	case CCERR_INVALID_WQE:
+		return -ENOSYS;
+	case CCERR_QP_NOT_PRIVILEGED:
+		return -EPERM;
+	case CCERR_STACK_ERROR:
+		return -EPROTO;
+	case CCERR_ACCESS_VIOLATION:
+	case CCERR_BASE_AND_BOUNDS_VIOLATION:
+		return -EFAULT;
+	case CCERR_STAG_STATE_NOT_INVALID:
+	case CCERR_INVALID_ADDRESS:
+	case CCERR_INVALID_CQ:
+	case CCERR_INVALID_EP:
+	case CCERR_INVALID_MODIFIER:
+	case CCERR_INVALID_MTU:
+	case CCERR_INVALID_PD_ID:
+	case CCERR_INVALID_QP:
+	case CCERR_INVALID_RNIC:
+	case CCERR_INVALID_STAG:
+		return -EINVAL;
+	default:
+		return -EAGAIN;
+	}
+}
+
+/* Device */
+extern int c2_register_device(struct c2_dev *c2dev);
+extern void c2_unregister_device(struct c2_dev *c2dev);
+extern int c2_rnic_init(struct c2_dev *c2dev);
+extern void c2_rnic_term(struct c2_dev *c2dev);
+extern void c2_rnic_interrupt(struct c2_dev *c2dev);
+extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+
+/* QPs */
+extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd,
+		       struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp);
+extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp);
+extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn);
+extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+			struct ib_qp_attr *attr, int attr_mask);
+extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+				 int ord, int ird);
+extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+			struct ib_send_wr **bad_wr);
+extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+			   struct ib_recv_wr **bad_wr);
+extern void c2_init_qp_table(struct c2_dev *c2dev);
+extern void c2_cleanup_qp_table(struct c2_dev *c2dev);
+extern void c2_set_qp_state(struct c2_qp *, int);
+extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn);
+
+/* PDs */
+extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd);
+extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd);
+extern int c2_init_pd_table(struct c2_dev *c2dev);
+extern void c2_cleanup_pd_table(struct c2_dev *c2dev);
+
+/* CQs */
+extern int c2_init_cq(struct c2_dev *c2dev, int entries,
+		      struct c2_ucontext *ctx, struct c2_cq *cq);
+extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq);
+extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
+extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
+extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+
+/* CM */
+extern int c2_llp_connect(struct iw_cm_id *cm_id,
+			  struct iw_cm_conn_param *iw_param);
+extern int c2_llp_accept(struct iw_cm_id *cm_id,
+			 struct iw_cm_conn_param *iw_param);
+extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata,
+			 u8 pdata_len);
+extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog);
+extern int c2_llp_service_destroy(struct iw_cm_id *cm_id);
+
+/* MM */
+extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+ 				      int page_size, int pbl_depth, u32 length,
+ 				      u32 off, u64 *va, enum c2_acf acf,
+				      struct c2_mr *mr);
+extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index);
+
+/* AE */
+extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index);
+
+/* MQSP Allocator */
+extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+			     struct sp_chunk **root);
+extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root);
+extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+			     dma_addr_t *dma_addr, gfp_t gfp_mask);
+extern void c2_free_mqsp(__be16* mqsp);
+#endif
diff --git a/drivers/staging/rdma/amso1100/c2_ae.c b/drivers/staging/rdma/amso1100/c2_ae.c
new file mode 100644
index 0000000..cedda25
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_ae.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_status.h"
+#include "c2_ae.h"
+
+static int c2_convert_cm_status(u32 c2_status)
+{
+	switch (c2_status) {
+	case C2_CONN_STATUS_SUCCESS:
+		return 0;
+	case C2_CONN_STATUS_REJECTED:
+		return -ENETRESET;
+	case C2_CONN_STATUS_REFUSED:
+		return -ECONNREFUSED;
+	case C2_CONN_STATUS_TIMEDOUT:
+		return -ETIMEDOUT;
+	case C2_CONN_STATUS_NETUNREACH:
+		return -ENETUNREACH;
+	case C2_CONN_STATUS_HOSTUNREACH:
+		return -EHOSTUNREACH;
+	case C2_CONN_STATUS_INVALID_RNIC:
+		return -EINVAL;
+	case C2_CONN_STATUS_INVALID_QP:
+		return -EINVAL;
+	case C2_CONN_STATUS_INVALID_QP_STATE:
+		return -EINVAL;
+	case C2_CONN_STATUS_ADDR_NOT_AVAIL:
+		return -EADDRNOTAVAIL;
+	default:
+		printk(KERN_ERR PFX
+		       "%s - Unable to convert CM status: %d\n",
+		       __func__, c2_status);
+		return -EIO;
+	}
+}
+
+static const char* to_event_str(int event)
+{
+	static const char* event_str[] = {
+		"CCAE_REMOTE_SHUTDOWN",
+		"CCAE_ACTIVE_CONNECT_RESULTS",
+		"CCAE_CONNECTION_REQUEST",
+		"CCAE_LLP_CLOSE_COMPLETE",
+		"CCAE_TERMINATE_MESSAGE_RECEIVED",
+		"CCAE_LLP_CONNECTION_RESET",
+		"CCAE_LLP_CONNECTION_LOST",
+		"CCAE_LLP_SEGMENT_SIZE_INVALID",
+		"CCAE_LLP_INVALID_CRC",
+		"CCAE_LLP_BAD_FPDU",
+		"CCAE_INVALID_DDP_VERSION",
+		"CCAE_INVALID_RDMA_VERSION",
+		"CCAE_UNEXPECTED_OPCODE",
+		"CCAE_INVALID_DDP_QUEUE_NUMBER",
+		"CCAE_RDMA_READ_NOT_ENABLED",
+		"CCAE_RDMA_WRITE_NOT_ENABLED",
+		"CCAE_RDMA_READ_TOO_SMALL",
+		"CCAE_NO_L_BIT",
+		"CCAE_TAGGED_INVALID_STAG",
+		"CCAE_TAGGED_BASE_BOUNDS_VIOLATION",
+		"CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION",
+		"CCAE_TAGGED_INVALID_PD",
+		"CCAE_WRAP_ERROR",
+		"CCAE_BAD_CLOSE",
+		"CCAE_BAD_LLP_CLOSE",
+		"CCAE_INVALID_MSN_RANGE",
+		"CCAE_INVALID_MSN_GAP",
+		"CCAE_IRRQ_OVERFLOW",
+		"CCAE_IRRQ_MSN_GAP",
+		"CCAE_IRRQ_MSN_RANGE",
+		"CCAE_IRRQ_INVALID_STAG",
+		"CCAE_IRRQ_BASE_BOUNDS_VIOLATION",
+		"CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION",
+		"CCAE_IRRQ_INVALID_PD",
+		"CCAE_IRRQ_WRAP_ERROR",
+		"CCAE_CQ_SQ_COMPLETION_OVERFLOW",
+		"CCAE_CQ_RQ_COMPLETION_ERROR",
+		"CCAE_QP_SRQ_WQE_ERROR",
+		"CCAE_QP_LOCAL_CATASTROPHIC_ERROR",
+		"CCAE_CQ_OVERFLOW",
+		"CCAE_CQ_OPERATION_ERROR",
+		"CCAE_SRQ_LIMIT_REACHED",
+		"CCAE_QP_RQ_LIMIT_REACHED",
+		"CCAE_SRQ_CATASTROPHIC_ERROR",
+		"CCAE_RNIC_CATASTROPHIC_ERROR"
+	};
+
+	if (event < CCAE_REMOTE_SHUTDOWN ||
+	    event > CCAE_RNIC_CATASTROPHIC_ERROR)
+		return "<invalid event>";
+
+	event -= CCAE_REMOTE_SHUTDOWN;
+	return event_str[event];
+}
+
+static const char *to_qp_state_str(int state)
+{
+	switch (state) {
+	case C2_QP_STATE_IDLE:
+		return "C2_QP_STATE_IDLE";
+	case C2_QP_STATE_CONNECTING:
+		return "C2_QP_STATE_CONNECTING";
+	case C2_QP_STATE_RTS:
+		return "C2_QP_STATE_RTS";
+	case C2_QP_STATE_CLOSING:
+		return "C2_QP_STATE_CLOSING";
+	case C2_QP_STATE_TERMINATE:
+		return "C2_QP_STATE_TERMINATE";
+	case C2_QP_STATE_ERROR:
+		return "C2_QP_STATE_ERROR";
+	default:
+		return "<invalid QP state>";
+	}
+}
+
+void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
+{
+	struct c2_mq *mq = c2dev->qptr_array[mq_index];
+	union c2wr *wr;
+	void *resource_user_context;
+	struct iw_cm_event cm_event;
+	struct ib_event ib_event;
+	enum c2_resource_indicator resource_indicator;
+	enum c2_event_id event_id;
+	unsigned long flags;
+	int status;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr;
+
+	/*
+	 * retrieve the message
+	 */
+	wr = c2_mq_consume(mq);
+	if (!wr)
+		return;
+
+	memset(&ib_event, 0, sizeof(ib_event));
+	memset(&cm_event, 0, sizeof(cm_event));
+
+	event_id = c2_wr_get_id(wr);
+	resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type);
+	resource_user_context =
+	    (void *) (unsigned long) wr->ae.ae_generic.user_context;
+
+	status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr));
+
+	pr_debug("event received c2_dev=%p, event_id=%d, "
+		"resource_indicator=%d, user_context=%p, status = %d\n",
+		c2dev, event_id, resource_indicator, resource_user_context,
+		status);
+
+	switch (resource_indicator) {
+	case C2_RES_IND_QP:{
+
+		struct c2_qp *qp = (struct c2_qp *)resource_user_context;
+		struct iw_cm_id *cm_id = qp->cm_id;
+		struct c2wr_ae_active_connect_results *res;
+
+		if (!cm_id) {
+			pr_debug("event received, but cm_id is <nul>, qp=%p!\n",
+				qp);
+			goto ignore_it;
+		}
+		pr_debug("%s: event = %s, user_context=%llx, "
+			"resource_type=%x, "
+			"resource=%x, qp_state=%s\n",
+			__func__,
+			to_event_str(event_id),
+			(unsigned long long) wr->ae.ae_generic.user_context,
+			be32_to_cpu(wr->ae.ae_generic.resource_type),
+			be32_to_cpu(wr->ae.ae_generic.resource),
+			to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
+
+		c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state));
+
+		switch (event_id) {
+		case CCAE_ACTIVE_CONNECT_RESULTS:
+			res = &wr->ae.ae_active_connect_results;
+			cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+			laddr->sin_addr.s_addr = res->laddr;
+			raddr->sin_addr.s_addr = res->raddr;
+			laddr->sin_port = res->lport;
+			raddr->sin_port = res->rport;
+			if (status == 0) {
+				cm_event.private_data_len =
+					be32_to_cpu(res->private_data_length);
+				cm_event.private_data = res->private_data;
+			} else {
+				spin_lock_irqsave(&qp->lock, flags);
+				if (qp->cm_id) {
+					qp->cm_id->rem_ref(qp->cm_id);
+					qp->cm_id = NULL;
+				}
+				spin_unlock_irqrestore(&qp->lock, flags);
+				cm_event.private_data_len = 0;
+				cm_event.private_data = NULL;
+			}
+			if (cm_id->event_handler)
+				cm_id->event_handler(cm_id, &cm_event);
+			break;
+		case CCAE_TERMINATE_MESSAGE_RECEIVED:
+		case CCAE_CQ_SQ_COMPLETION_OVERFLOW:
+			ib_event.device = &c2dev->ibdev;
+			ib_event.element.qp = &qp->ibqp;
+			ib_event.event = IB_EVENT_QP_REQ_ERR;
+
+			if (qp->ibqp.event_handler)
+				qp->ibqp.event_handler(&ib_event,
+						       qp->ibqp.
+						       qp_context);
+			break;
+		case CCAE_BAD_CLOSE:
+		case CCAE_LLP_CLOSE_COMPLETE:
+		case CCAE_LLP_CONNECTION_RESET:
+		case CCAE_LLP_CONNECTION_LOST:
+			BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b);
+
+			spin_lock_irqsave(&qp->lock, flags);
+			if (qp->cm_id) {
+				qp->cm_id->rem_ref(qp->cm_id);
+				qp->cm_id = NULL;
+			}
+			spin_unlock_irqrestore(&qp->lock, flags);
+			cm_event.event = IW_CM_EVENT_CLOSE;
+			cm_event.status = 0;
+			if (cm_id->event_handler)
+				cm_id->event_handler(cm_id, &cm_event);
+			break;
+		default:
+			BUG_ON(1);
+			pr_debug("%s:%d Unexpected event_id=%d on QP=%p, "
+				"CM_ID=%p\n",
+				__func__, __LINE__,
+				event_id, qp, cm_id);
+			break;
+		}
+		break;
+	}
+
+	case C2_RES_IND_EP:{
+
+		struct c2wr_ae_connection_request *req =
+			&wr->ae.ae_connection_request;
+		struct iw_cm_id *cm_id =
+			(struct iw_cm_id *)resource_user_context;
+
+		pr_debug("C2_RES_IND_EP event_id=%d\n", event_id);
+		if (event_id != CCAE_CONNECTION_REQUEST) {
+			pr_debug("%s: Invalid event_id: %d\n",
+				__func__, event_id);
+			break;
+		}
+		cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
+		cm_event.provider_data = (void*)(unsigned long)req->cr_handle;
+		laddr->sin_addr.s_addr = req->laddr;
+		raddr->sin_addr.s_addr = req->raddr;
+		laddr->sin_port = req->lport;
+		raddr->sin_port = req->rport;
+		cm_event.private_data_len =
+			be32_to_cpu(req->private_data_length);
+		cm_event.private_data = req->private_data;
+		/*
+		 * Until ird/ord negotiation via MPAv2 support is added, send
+		 * max supported values
+		 */
+		cm_event.ird = cm_event.ord = 128;
+
+		if (cm_id->event_handler)
+			cm_id->event_handler(cm_id, &cm_event);
+		break;
+	}
+
+	case C2_RES_IND_CQ:{
+		struct c2_cq *cq =
+		    (struct c2_cq *) resource_user_context;
+
+		pr_debug("IB_EVENT_CQ_ERR\n");
+		ib_event.device = &c2dev->ibdev;
+		ib_event.element.cq = &cq->ibcq;
+		ib_event.event = IB_EVENT_CQ_ERR;
+
+		if (cq->ibcq.event_handler)
+			cq->ibcq.event_handler(&ib_event,
+					       cq->ibcq.cq_context);
+		break;
+	}
+
+	default:
+		printk("Bad resource indicator = %d\n",
+		       resource_indicator);
+		break;
+	}
+
+ ignore_it:
+	c2_mq_free(mq);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_ae.h b/drivers/staging/rdma/amso1100/c2_ae.h
new file mode 100644
index 0000000..3a065c3
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_ae.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_AE_H_
+#define _C2_AE_H_
+
+/*
+ * WARNING: If you change this file, also bump C2_IVN_BASE
+ * in common/include/clustercore/c2_ivn.h.
+ */
+
+/*
+ * Asynchronous Event Identifiers
+ *
+ * These start at 0x80 only so it's obvious from inspection that
+ * they are not work-request statuses.  This isn't critical.
+ *
+ * NOTE: these event id's must fit in eight bits.
+ */
+enum c2_event_id {
+	CCAE_REMOTE_SHUTDOWN = 0x80,
+	CCAE_ACTIVE_CONNECT_RESULTS,
+	CCAE_CONNECTION_REQUEST,
+	CCAE_LLP_CLOSE_COMPLETE,
+	CCAE_TERMINATE_MESSAGE_RECEIVED,
+	CCAE_LLP_CONNECTION_RESET,
+	CCAE_LLP_CONNECTION_LOST,
+	CCAE_LLP_SEGMENT_SIZE_INVALID,
+	CCAE_LLP_INVALID_CRC,
+	CCAE_LLP_BAD_FPDU,
+	CCAE_INVALID_DDP_VERSION,
+	CCAE_INVALID_RDMA_VERSION,
+	CCAE_UNEXPECTED_OPCODE,
+	CCAE_INVALID_DDP_QUEUE_NUMBER,
+	CCAE_RDMA_READ_NOT_ENABLED,
+	CCAE_RDMA_WRITE_NOT_ENABLED,
+	CCAE_RDMA_READ_TOO_SMALL,
+	CCAE_NO_L_BIT,
+	CCAE_TAGGED_INVALID_STAG,
+	CCAE_TAGGED_BASE_BOUNDS_VIOLATION,
+	CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION,
+	CCAE_TAGGED_INVALID_PD,
+	CCAE_WRAP_ERROR,
+	CCAE_BAD_CLOSE,
+	CCAE_BAD_LLP_CLOSE,
+	CCAE_INVALID_MSN_RANGE,
+	CCAE_INVALID_MSN_GAP,
+	CCAE_IRRQ_OVERFLOW,
+	CCAE_IRRQ_MSN_GAP,
+	CCAE_IRRQ_MSN_RANGE,
+	CCAE_IRRQ_INVALID_STAG,
+	CCAE_IRRQ_BASE_BOUNDS_VIOLATION,
+	CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION,
+	CCAE_IRRQ_INVALID_PD,
+	CCAE_IRRQ_WRAP_ERROR,
+	CCAE_CQ_SQ_COMPLETION_OVERFLOW,
+	CCAE_CQ_RQ_COMPLETION_ERROR,
+	CCAE_QP_SRQ_WQE_ERROR,
+	CCAE_QP_LOCAL_CATASTROPHIC_ERROR,
+	CCAE_CQ_OVERFLOW,
+	CCAE_CQ_OPERATION_ERROR,
+	CCAE_SRQ_LIMIT_REACHED,
+	CCAE_QP_RQ_LIMIT_REACHED,
+	CCAE_SRQ_CATASTROPHIC_ERROR,
+	CCAE_RNIC_CATASTROPHIC_ERROR
+/* WARNING If you add more id's, make sure their values fit in eight bits. */
+};
+
+/*
+ * Resource Indicators and Identifiers
+ */
+enum c2_resource_indicator {
+	C2_RES_IND_QP = 1,
+	C2_RES_IND_EP,
+	C2_RES_IND_CQ,
+	C2_RES_IND_SRQ,
+};
+
+#endif /* _C2_AE_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_alloc.c b/drivers/staging/rdma/amso1100/c2_alloc.c
new file mode 100644
index 0000000..78d247e
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_alloc.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/bitmap.h>
+
+#include "c2.h"
+
+static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask,
+			       struct sp_chunk **head)
+{
+	int i;
+	struct sp_chunk *new_head;
+	dma_addr_t dma_addr;
+
+	new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE,
+				      &dma_addr, gfp_mask);
+	if (new_head == NULL)
+		return -ENOMEM;
+
+	new_head->dma_addr = dma_addr;
+	dma_unmap_addr_set(new_head, mapping, new_head->dma_addr);
+
+	new_head->next = NULL;
+	new_head->head = 0;
+
+	/* build list where each index is the next free slot */
+	for (i = 0;
+	     i < (PAGE_SIZE - sizeof(struct sp_chunk) -
+		  sizeof(u16)) / sizeof(u16) - 1;
+	     i++) {
+		new_head->shared_ptr[i] = i + 1;
+	}
+	/* terminate list */
+	new_head->shared_ptr[i] = 0xFFFF;
+
+	*head = new_head;
+	return 0;
+}
+
+int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+		      struct sp_chunk **root)
+{
+	return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root);
+}
+
+void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root)
+{
+	struct sp_chunk *next;
+
+	while (root) {
+		next = root->next;
+		dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root,
+				  dma_unmap_addr(root, mapping));
+		root = next;
+	}
+}
+
+__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+		      dma_addr_t *dma_addr, gfp_t gfp_mask)
+{
+	u16 mqsp;
+
+	while (head) {
+		mqsp = head->head;
+		if (mqsp != 0xFFFF) {
+			head->head = head->shared_ptr[mqsp];
+			break;
+		} else if (head->next == NULL) {
+			if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) ==
+			    0) {
+				head = head->next;
+				mqsp = head->head;
+				head->head = head->shared_ptr[mqsp];
+				break;
+			} else
+				return NULL;
+		} else
+			head = head->next;
+	}
+	if (head) {
+		*dma_addr = head->dma_addr +
+			    ((unsigned long) &(head->shared_ptr[mqsp]) -
+			     (unsigned long) head);
+		pr_debug("%s addr %p dma_addr %llx\n", __func__,
+			 &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
+		return (__force __be16 *) &(head->shared_ptr[mqsp]);
+	}
+	return NULL;
+}
+
+void c2_free_mqsp(__be16 *mqsp)
+{
+	struct sp_chunk *head;
+	u16 idx;
+
+	/* The chunk containing this ptr begins at the page boundary */
+	head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK);
+
+	/* Link head to new mqsp */
+	*mqsp = (__force __be16) head->head;
+
+	/* Compute the shared_ptr index */
+	idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1;
+	idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1;
+
+	/* Point this index at the head */
+	head->shared_ptr[idx] = head->head;
+
+	/* Point head at this index */
+	head->head = idx;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_cm.c b/drivers/staging/rdma/amso1100/c2_cm.c
new file mode 100644
index 0000000..23bfa94
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_cm.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_vq.h"
+#include <rdma/iw_cm.h>
+
+int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	struct c2_dev *c2dev = to_c2dev(cm_id->device);
+	struct ib_qp *ibqp;
+	struct c2_qp *qp;
+	struct c2wr_qp_connect_req *wr;	/* variable size needs a malloc. */
+	struct c2_vq_req *vq_req;
+	int err;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+	if (cm_id->remote_addr.ss_family != AF_INET)
+		return -ENOSYS;
+
+	ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+	if (!ibqp)
+		return -EINVAL;
+	qp = to_c2qp(ibqp);
+
+	/* Associate QP <--> CM_ID */
+	cm_id->provider_data = qp;
+	cm_id->add_ref(cm_id);
+	qp->cm_id = cm_id;
+
+	/*
+	 * only support the max private_data length
+	 */
+	if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+		err = -EINVAL;
+		goto bail0;
+	}
+	/*
+	 * Set the rdma read limits
+	 */
+	err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+	if (err)
+		goto bail0;
+
+	/*
+	 * Create and send a WR_QP_CONNECT...
+	 */
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	c2_wr_set_id(wr, CCWR_QP_CONNECT);
+	wr->hdr.context = 0;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->qp_handle = qp->adapter_handle;
+
+	wr->remote_addr = raddr->sin_addr.s_addr;
+	wr->remote_port = raddr->sin_port;
+
+	/*
+	 * Move any private data from the callers's buf into
+	 * the WR.
+	 */
+	if (iw_param->private_data) {
+		wr->private_data_length =
+			cpu_to_be32(iw_param->private_data_len);
+		memcpy(&wr->private_data[0], iw_param->private_data,
+		       iw_param->private_data_len);
+	} else
+		wr->private_data_length = 0;
+
+	/*
+	 * Send WR to adapter.  NOTE: There is no synch reply from
+	 * the adapter.
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	vq_req_free(c2dev, vq_req);
+
+ bail1:
+	kfree(wr);
+ bail0:
+	if (err) {
+		/*
+		 * If we fail, release reference on QP and
+		 * disassociate QP from CM_ID
+		 */
+		cm_id->provider_data = NULL;
+		qp->cm_id = NULL;
+		cm_id->rem_ref(cm_id);
+	}
+	return err;
+}
+
+int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+	struct c2_dev *c2dev;
+	struct c2wr_ep_listen_create_req wr;
+	struct c2wr_ep_listen_create_rep *reply;
+	struct c2_vq_req *vq_req;
+	int err;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+
+	if (cm_id->local_addr.ss_family != AF_INET)
+		return -ENOSYS;
+
+	c2dev = to_c2dev(cm_id->device);
+	if (c2dev == NULL)
+		return -EINVAL;
+
+	/*
+	 * Allocate verbs request.
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	/*
+	 * Build the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE);
+	wr.hdr.context = (u64) (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.local_addr = laddr->sin_addr.s_addr;
+	wr.local_port = laddr->sin_port;
+	wr.backlog = cpu_to_be32(backlog);
+	wr.user_context = (u64) (unsigned long) cm_id;
+
+	/*
+	 * Reference the request struct.  Dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	/*
+	 * Process reply
+	 */
+	reply =
+	    (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	if ((err = c2_errno(reply)) != 0)
+		goto bail1;
+
+	/*
+	 * Keep the adapter handle. Used in subsequent destroy
+	 */
+	cm_id->provider_data = (void*)(unsigned long) reply->ep_handle;
+
+	/*
+	 * free vq stuff
+	 */
+	vq_repbuf_free(c2dev, reply);
+	vq_req_free(c2dev, vq_req);
+
+	return 0;
+
+ bail1:
+	vq_repbuf_free(c2dev, reply);
+ bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+
+int c2_llp_service_destroy(struct iw_cm_id *cm_id)
+{
+
+	struct c2_dev *c2dev;
+	struct c2wr_ep_listen_destroy_req wr;
+	struct c2wr_ep_listen_destroy_rep *reply;
+	struct c2_vq_req *vq_req;
+	int err;
+
+	c2dev = to_c2dev(cm_id->device);
+	if (c2dev == NULL)
+		return -EINVAL;
+
+	/*
+	 * Allocate verbs request.
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	/*
+	 * Build the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.ep_handle = (u32)(unsigned long)cm_id->provider_data;
+
+	/*
+	 * reference the request struct.  dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	/*
+	 * Process reply
+	 */
+	reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+	if ((err = c2_errno(reply)) != 0)
+		goto bail1;
+
+ bail1:
+	vq_repbuf_free(c2dev, reply);
+ bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	struct c2_dev *c2dev = to_c2dev(cm_id->device);
+	struct c2_qp *qp;
+	struct ib_qp *ibqp;
+	struct c2wr_cr_accept_req *wr;	/* variable length WR */
+	struct c2_vq_req *vq_req;
+	struct c2wr_cr_accept_rep *reply;	/* VQ Reply msg ptr. */
+	int err;
+
+	ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+	if (!ibqp)
+		return -EINVAL;
+	qp = to_c2qp(ibqp);
+
+	/* Set the RDMA read limits */
+	err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+	if (err)
+		goto bail0;
+
+	/* Allocate verbs request. */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+	vq_req->qp = qp;
+	vq_req->cm_id = cm_id;
+	vq_req->event = IW_CM_EVENT_ESTABLISHED;
+
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	/* Build the WR */
+	c2_wr_set_id(wr, CCWR_CR_ACCEPT);
+	wr->hdr.context = (unsigned long) vq_req;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->ep_handle = (u32) (unsigned long) cm_id->provider_data;
+	wr->qp_handle = qp->adapter_handle;
+
+	/* Replace the cr_handle with the QP after accept */
+	cm_id->provider_data = qp;
+	cm_id->add_ref(cm_id);
+	qp->cm_id = cm_id;
+
+	cm_id->provider_data = qp;
+
+	/* Validate private_data length */
+	if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+		err = -EINVAL;
+		goto bail1;
+	}
+
+	if (iw_param->private_data) {
+		wr->private_data_length = cpu_to_be32(iw_param->private_data_len);
+		memcpy(&wr->private_data[0],
+		       iw_param->private_data, iw_param->private_data_len);
+	} else
+		wr->private_data_length = 0;
+
+	/* Reference the request struct.  Dereferenced in the int handler. */
+	vq_req_get(c2dev, vq_req);
+
+	/* Send WR to adapter */
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	/* Wait for reply from adapter */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	/* Check that reply is present */
+	reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	err = c2_errno(reply);
+	vq_repbuf_free(c2dev, reply);
+
+	if (!err)
+		c2_set_qp_state(qp, C2_QP_STATE_RTS);
+ bail1:
+	kfree(wr);
+	vq_req_free(c2dev, vq_req);
+ bail0:
+	if (err) {
+		/*
+		 * If we fail, release reference on QP and
+		 * disassociate QP from CM_ID
+		 */
+		cm_id->provider_data = NULL;
+		qp->cm_id = NULL;
+		cm_id->rem_ref(cm_id);
+	}
+	return err;
+}
+
+int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	struct c2_dev *c2dev;
+	struct c2wr_cr_reject_req wr;
+	struct c2_vq_req *vq_req;
+	struct c2wr_cr_reject_rep *reply;
+	int err;
+
+	c2dev = to_c2dev(cm_id->device);
+
+	/*
+	 * Allocate verbs request.
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	/*
+	 * Build the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_CR_REJECT);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
+
+	/*
+	 * reference the request struct.  dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	/*
+	 * Process reply
+	 */
+	reply = (struct c2wr_cr_reject_rep *) (unsigned long)
+		vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+	err = c2_errno(reply);
+	/*
+	 * free vq stuff
+	 */
+	vq_repbuf_free(c2dev, reply);
+
+ bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c
new file mode 100644
index 0000000..1b63185
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_cq.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1))
+
+static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn)
+{
+	struct c2_cq *cq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c2dev->lock, flags);
+	cq = c2dev->qptr_array[cqn];
+	if (!cq) {
+		spin_unlock_irqrestore(&c2dev->lock, flags);
+		return NULL;
+	}
+	atomic_inc(&cq->refcount);
+	spin_unlock_irqrestore(&c2dev->lock, flags);
+	return cq;
+}
+
+static void c2_cq_put(struct c2_cq *cq)
+{
+	if (atomic_dec_and_test(&cq->refcount))
+		wake_up(&cq->wait);
+}
+
+void c2_cq_event(struct c2_dev *c2dev, u32 mq_index)
+{
+	struct c2_cq *cq;
+
+	cq = c2_cq_get(c2dev, mq_index);
+	if (!cq) {
+		printk("discarding events on destroyed CQN=%d\n", mq_index);
+		return;
+	}
+
+	(*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
+	c2_cq_put(cq);
+}
+
+void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index)
+{
+	struct c2_cq *cq;
+	struct c2_mq *q;
+
+	cq = c2_cq_get(c2dev, mq_index);
+	if (!cq)
+		return;
+
+	spin_lock_irq(&cq->lock);
+	q = &cq->mq;
+	if (q && !c2_mq_empty(q)) {
+		u16 priv = q->priv;
+		struct c2wr_ce *msg;
+
+		while (priv != be16_to_cpu(*q->shared)) {
+			msg = (struct c2wr_ce *)
+				(q->msg_pool.host + priv * q->msg_size);
+			if (msg->qp_user_context == (u64) (unsigned long) qp) {
+				msg->qp_user_context = (u64) 0;
+			}
+			priv = (priv + 1) % q->q_size;
+		}
+	}
+	spin_unlock_irq(&cq->lock);
+	c2_cq_put(cq);
+}
+
+static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status)
+{
+	switch (status) {
+	case C2_OK:
+		return IB_WC_SUCCESS;
+	case CCERR_FLUSHED:
+		return IB_WC_WR_FLUSH_ERR;
+	case CCERR_BASE_AND_BOUNDS_VIOLATION:
+		return IB_WC_LOC_PROT_ERR;
+	case CCERR_ACCESS_VIOLATION:
+		return IB_WC_LOC_ACCESS_ERR;
+	case CCERR_TOTAL_LENGTH_TOO_BIG:
+		return IB_WC_LOC_LEN_ERR;
+	case CCERR_INVALID_WINDOW:
+		return IB_WC_MW_BIND_ERR;
+	default:
+		return IB_WC_GENERAL_ERR;
+	}
+}
+
+
+static inline int c2_poll_one(struct c2_dev *c2dev,
+			      struct c2_cq *cq, struct ib_wc *entry)
+{
+	struct c2wr_ce *ce;
+	struct c2_qp *qp;
+	int is_recv = 0;
+
+	ce = c2_mq_consume(&cq->mq);
+	if (!ce) {
+		return -EAGAIN;
+	}
+
+	/*
+	 * if the qp returned is null then this qp has already
+	 * been freed and we are unable process the completion.
+	 * try pulling the next message
+	 */
+	while ((qp =
+		(struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) {
+		c2_mq_free(&cq->mq);
+		ce = c2_mq_consume(&cq->mq);
+		if (!ce)
+			return -EAGAIN;
+	}
+
+	entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce));
+	entry->wr_id = ce->hdr.context;
+	entry->qp = &qp->ibqp;
+	entry->wc_flags = 0;
+	entry->slid = 0;
+	entry->sl = 0;
+	entry->src_qp = 0;
+	entry->dlid_path_bits = 0;
+	entry->pkey_index = 0;
+
+	switch (c2_wr_get_id(ce)) {
+	case C2_WR_TYPE_SEND:
+		entry->opcode = IB_WC_SEND;
+		break;
+	case C2_WR_TYPE_RDMA_WRITE:
+		entry->opcode = IB_WC_RDMA_WRITE;
+		break;
+	case C2_WR_TYPE_RDMA_READ:
+		entry->opcode = IB_WC_RDMA_READ;
+		break;
+	case C2_WR_TYPE_BIND_MW:
+		entry->opcode = IB_WC_BIND_MW;
+		break;
+	case C2_WR_TYPE_RECV:
+		entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
+		entry->opcode = IB_WC_RECV;
+		is_recv = 1;
+		break;
+	default:
+		break;
+	}
+
+	/* consume the WQEs */
+	if (is_recv)
+		c2_mq_lconsume(&qp->rq_mq, 1);
+	else
+		c2_mq_lconsume(&qp->sq_mq,
+			       be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1);
+
+	/* free the message */
+	c2_mq_free(&cq->mq);
+
+	return 0;
+}
+
+int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+	struct c2_dev *c2dev = to_c2dev(ibcq->device);
+	struct c2_cq *cq = to_c2cq(ibcq);
+	unsigned long flags;
+	int npolled, err;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+
+		err = c2_poll_one(c2dev, cq, entry + npolled);
+		if (err)
+			break;
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return npolled;
+}
+
+int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+	struct c2_mq_shared __iomem *shared;
+	struct c2_cq *cq;
+	unsigned long flags;
+	int ret = 0;
+
+	cq = to_c2cq(ibcq);
+	shared = cq->mq.peer;
+
+	if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
+		writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type);
+	else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type);
+	else
+		return -EINVAL;
+
+	writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed);
+
+	/*
+	 * Now read back shared->armed to make the PCI
+	 * write synchronous.  This is necessary for
+	 * correct cq notification semantics.
+	 */
+	readb(&shared->armed);
+
+	if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+		spin_lock_irqsave(&cq->lock, flags);
+		ret = !c2_mq_empty(&cq->mq);
+		spin_unlock_irqrestore(&cq->lock, flags);
+	}
+
+	return ret;
+}
+
+static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
+{
+	dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size,
+			  mq->msg_pool.host, dma_unmap_addr(mq, mapping));
+}
+
+static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
+			   size_t q_size, size_t msg_size)
+{
+	u8 *pool_start;
+
+	if (q_size > SIZE_MAX / msg_size)
+		return -EINVAL;
+
+	pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
+					&mq->host_dma, GFP_KERNEL);
+	if (!pool_start)
+		return -ENOMEM;
+
+	c2_mq_rep_init(mq,
+		       0,		/* index (currently unknown) */
+		       q_size,
+		       msg_size,
+		       pool_start,
+		       NULL,	/* peer (currently unknown) */
+		       C2_MQ_HOST_TARGET);
+
+	dma_unmap_addr_set(mq, mapping, mq->host_dma);
+
+	return 0;
+}
+
+int c2_init_cq(struct c2_dev *c2dev, int entries,
+	       struct c2_ucontext *ctx, struct c2_cq *cq)
+{
+	struct c2wr_cq_create_req wr;
+	struct c2wr_cq_create_rep *reply;
+	unsigned long peer_pa;
+	struct c2_vq_req *vq_req;
+	int err;
+
+	might_sleep();
+
+	cq->ibcq.cqe = entries - 1;
+	cq->is_kernel = !ctx;
+
+	/* Allocate a shared pointer */
+	cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+				      &cq->mq.shared_dma, GFP_KERNEL);
+	if (!cq->mq.shared)
+		return -ENOMEM;
+
+	/* Allocate pages for the message pool */
+	err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE);
+	if (err)
+		goto bail0;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_CQ_CREATE);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.msg_size = cpu_to_be32(cq->mq.msg_size);
+	wr.depth = cpu_to_be32(cq->mq.q_size);
+	wr.shared_ht = cpu_to_be64(cq->mq.shared_dma);
+	wr.msg_pool = cpu_to_be64(cq->mq.host_dma);
+	wr.user_context = (u64) (unsigned long) (cq);
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail2;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail2;
+
+	reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail2;
+	}
+
+	if ((err = c2_errno(reply)) != 0)
+		goto bail3;
+
+	cq->adapter_handle = reply->cq_handle;
+	cq->mq.index = be32_to_cpu(reply->mq_index);
+
+	peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared);
+	cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE);
+	if (!cq->mq.peer) {
+		err = -ENOMEM;
+		goto bail3;
+	}
+
+	vq_repbuf_free(c2dev, reply);
+	vq_req_free(c2dev, vq_req);
+
+	spin_lock_init(&cq->lock);
+	atomic_set(&cq->refcount, 1);
+	init_waitqueue_head(&cq->wait);
+
+	/*
+	 * Use the MQ index allocated by the adapter to
+	 * store the CQ in the qptr_array
+	 */
+	cq->cqn = cq->mq.index;
+	c2dev->qptr_array[cq->cqn] = cq;
+
+	return 0;
+
+      bail3:
+	vq_repbuf_free(c2dev, reply);
+      bail2:
+	vq_req_free(c2dev, vq_req);
+      bail1:
+	c2_free_cq_buf(c2dev, &cq->mq);
+      bail0:
+	c2_free_mqsp(cq->mq.shared);
+
+	return err;
+}
+
+void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq)
+{
+	int err;
+	struct c2_vq_req *vq_req;
+	struct c2wr_cq_destroy_req wr;
+	struct c2wr_cq_destroy_rep *reply;
+
+	might_sleep();
+
+	/* Clear CQ from the qptr array */
+	spin_lock_irq(&c2dev->lock);
+	c2dev->qptr_array[cq->mq.index] = NULL;
+	atomic_dec(&cq->refcount);
+	spin_unlock_irq(&c2dev->lock);
+
+	wait_event(cq->wait, !atomic_read(&cq->refcount));
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		goto bail0;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_CQ_DESTROY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.cq_handle = cq->adapter_handle;
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+	if (reply)
+		vq_repbuf_free(c2dev, reply);
+      bail1:
+	vq_req_free(c2dev, vq_req);
+      bail0:
+	if (cq->is_kernel) {
+		c2_free_cq_buf(c2dev, &cq->mq);
+	}
+
+	return;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_intr.c b/drivers/staging/rdma/amso1100/c2_intr.c
new file mode 100644
index 0000000..3a17d9b
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_intr.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_vq.h"
+
+static void handle_mq(struct c2_dev *c2dev, u32 index);
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index);
+
+/*
+ * Handle RNIC interrupts
+ */
+void c2_rnic_interrupt(struct c2_dev *c2dev)
+{
+	unsigned int mq_index;
+
+	while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) {
+		mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT);
+		if (mq_index & 0x80000000) {
+			break;
+		}
+
+		c2dev->hints_read++;
+		handle_mq(c2dev, mq_index);
+	}
+
+}
+
+/*
+ * Top level MQ handler
+ */
+static void handle_mq(struct c2_dev *c2dev, u32 mq_index)
+{
+	if (c2dev->qptr_array[mq_index] == NULL) {
+		pr_debug("handle_mq: stray activity for mq_index=%d\n",
+			 mq_index);
+		return;
+	}
+
+	switch (mq_index) {
+	case (0):
+		/*
+		 * An index of 0 in the activity queue
+		 * indicates the req vq now has messages
+		 * available...
+		 *
+		 * Wake up any waiters waiting on req VQ
+		 * message availability.
+		 */
+		wake_up(&c2dev->req_vq_wo);
+		break;
+	case (1):
+		handle_vq(c2dev, mq_index);
+		break;
+	case (2):
+		/* We have to purge the VQ in case there are pending
+		 * accept reply requests that would result in the
+		 * generation of an ESTABLISHED event. If we don't
+		 * generate these first, a CLOSE event could end up
+		 * being delivered before the ESTABLISHED event.
+		 */
+		handle_vq(c2dev, 1);
+
+		c2_ae_event(c2dev, mq_index);
+		break;
+	default:
+		/* There is no event synchronization between CQ events
+		 * and AE or CM events. In fact, CQE could be
+		 * delivered for all of the I/O up to and including the
+		 * FLUSH for a peer disconenct prior to the ESTABLISHED
+		 * event being delivered to the app. The reason for this
+		 * is that CM events are delivered on a thread, while AE
+		 * and CM events are delivered on interrupt context.
+		 */
+		c2_cq_event(c2dev, mq_index);
+		break;
+	}
+
+	return;
+}
+
+/*
+ * Handles verbs WR replies.
+ */
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index)
+{
+	void *adapter_msg, *reply_msg;
+	struct c2wr_hdr *host_msg;
+	struct c2wr_hdr tmp;
+	struct c2_mq *reply_vq;
+	struct c2_vq_req *req;
+	struct iw_cm_event cm_event;
+	int err;
+
+	reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index];
+
+	/*
+	 * get next msg from mq_index into adapter_msg.
+	 * don't free it yet.
+	 */
+	adapter_msg = c2_mq_consume(reply_vq);
+	if (adapter_msg == NULL) {
+		return;
+	}
+
+	host_msg = vq_repbuf_alloc(c2dev);
+
+	/*
+	 * If we can't get a host buffer, then we'll still
+	 * wakeup the waiter, we just won't give him the msg.
+	 * It is assumed the waiter will deal with this...
+	 */
+	if (!host_msg) {
+		pr_debug("handle_vq: no repbufs!\n");
+
+		/*
+		 * just copy the WR header into a local variable.
+		 * this allows us to still demux on the context
+		 */
+		host_msg = &tmp;
+		memcpy(host_msg, adapter_msg, sizeof(tmp));
+		reply_msg = NULL;
+	} else {
+		memcpy(host_msg, adapter_msg, reply_vq->msg_size);
+		reply_msg = host_msg;
+	}
+
+	/*
+	 * consume the msg from the MQ
+	 */
+	c2_mq_free(reply_vq);
+
+	/*
+	 * wakeup the waiter.
+	 */
+	req = (struct c2_vq_req *) (unsigned long) host_msg->context;
+	if (req == NULL) {
+		/*
+		 * We should never get here, as the adapter should
+		 * never send us a reply that we're not expecting.
+		 */
+		if (reply_msg != NULL)
+			vq_repbuf_free(c2dev, host_msg);
+		pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");
+		return;
+	}
+
+	if (reply_msg)
+		err = c2_errno(reply_msg);
+	else
+		err = -ENOMEM;
+
+	if (!err) switch (req->event) {
+	case IW_CM_EVENT_ESTABLISHED:
+		c2_set_qp_state(req->qp,
+				C2_QP_STATE_RTS);
+		/*
+		 * Until ird/ord negotiation via MPAv2 support is added, send
+		 * max supported values
+		 */
+		cm_event.ird = cm_event.ord = 128;
+	case IW_CM_EVENT_CLOSE:
+
+		/*
+		 * Move the QP to RTS if this is
+		 * the established event
+		 */
+		cm_event.event = req->event;
+		cm_event.status = 0;
+		cm_event.local_addr = req->cm_id->local_addr;
+		cm_event.remote_addr = req->cm_id->remote_addr;
+		cm_event.private_data = NULL;
+		cm_event.private_data_len = 0;
+		req->cm_id->event_handler(req->cm_id, &cm_event);
+		break;
+	default:
+		break;
+	}
+
+	req->reply_msg = (u64) (unsigned long) (reply_msg);
+	atomic_set(&req->reply_ready, 1);
+	wake_up(&req->wait_object);
+
+	/*
+	 * If the request was cancelled, then this put will
+	 * free the vq_req memory...and reply_msg!!!
+	 */
+	vq_req_put(c2dev, req);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_mm.c b/drivers/staging/rdma/amso1100/c2_mm.c
new file mode 100644
index 0000000..119c4f3
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_mm.c
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+
+#define PBL_VIRT 1
+#define PBL_PHYS 2
+
+/*
+ * Send all the PBL messages to convey the remainder of the PBL
+ * Wait for the adapter's reply on the last one.
+ * This is indicated by setting the MEM_PBL_COMPLETE in the flags.
+ *
+ * NOTE:  vq_req is _not_ freed by this function.  The VQ Host
+ *	  Reply buffer _is_ freed by this function.
+ */
+static int
+send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index,
+		  unsigned long va, u32 pbl_depth,
+		  struct c2_vq_req *vq_req, int pbl_type)
+{
+	u32 pbe_count;		/* amt that fits in a PBL msg */
+	u32 count;		/* amt in this PBL MSG. */
+	struct c2wr_nsmr_pbl_req *wr;	/* PBL WR ptr */
+	struct c2wr_nsmr_pbl_rep *reply;	/* reply ptr */
+ 	int err, pbl_virt, pbl_index, i;
+
+	switch (pbl_type) {
+	case PBL_VIRT:
+		pbl_virt = 1;
+		break;
+	case PBL_PHYS:
+		pbl_virt = 0;
+		break;
+	default:
+		return -EINVAL;
+		break;
+	}
+
+	pbe_count = (c2dev->req_vq.msg_size -
+		     sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64);
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		return -ENOMEM;
+	}
+	c2_wr_set_id(wr, CCWR_NSMR_PBL);
+
+	/*
+	 * Only the last PBL message will generate a reply from the verbs,
+	 * so we set the context to 0 indicating there is no kernel verbs
+	 * handler blocked awaiting this reply.
+	 */
+	wr->hdr.context = 0;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->stag_index = stag_index;	/* already swapped */
+	wr->flags = 0;
+	pbl_index = 0;
+	while (pbl_depth) {
+		count = min(pbe_count, pbl_depth);
+		wr->addrs_length = cpu_to_be32(count);
+
+		/*
+		 *  If this is the last message, then reference the
+		 *  vq request struct cuz we're gonna wait for a reply.
+		 *  also make this PBL msg as the last one.
+		 */
+		if (count == pbl_depth) {
+			/*
+			 * reference the request struct.  dereferenced in the
+			 * int handler.
+			 */
+			vq_req_get(c2dev, vq_req);
+			wr->flags = cpu_to_be32(MEM_PBL_COMPLETE);
+
+			/*
+			 * This is the last PBL message.
+			 * Set the context to our VQ Request Object so we can
+			 * wait for the reply.
+			 */
+			wr->hdr.context = (unsigned long) vq_req;
+		}
+
+		/*
+		 * If pbl_virt is set then va is a virtual address
+		 * that describes a virtually contiguous memory
+		 * allocation. The wr needs the start of each virtual page
+		 * to be converted to the corresponding physical address
+		 * of the page. If pbl_virt is not set then va is an array
+		 * of physical addresses and there is no conversion to do.
+		 * Just fill in the wr with what is in the array.
+		 */
+		for (i = 0; i < count; i++) {
+			if (pbl_virt) {
+				va += PAGE_SIZE;
+			} else {
+ 				wr->paddrs[i] =
+				    cpu_to_be64(((u64 *)va)[pbl_index + i]);
+			}
+		}
+
+		/*
+		 * Send WR to adapter
+		 */
+		err = vq_send_wr(c2dev, (union c2wr *) wr);
+		if (err) {
+			if (count <= pbe_count) {
+				vq_req_put(c2dev, vq_req);
+			}
+			goto bail0;
+		}
+		pbl_depth -= count;
+		pbl_index += count;
+	}
+
+	/*
+	 *  Now wait for the reply...
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	/*
+	 * Process reply
+	 */
+	reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	err = c2_errno(reply);
+
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	kfree(wr);
+	return err;
+}
+
+#define C2_PBL_MAX_DEPTH 131072
+int
+c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+ 			   int page_size, int pbl_depth, u32 length,
+ 			   u32 offset, u64 *va, enum c2_acf acf,
+			   struct c2_mr *mr)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_nsmr_register_req *wr;
+	struct c2wr_nsmr_register_rep *reply;
+	u16 flags;
+	int i, pbe_count, count;
+	int err;
+
+	if (!va || !length || !addr_list || !pbl_depth)
+		return -EINTR;
+
+	/*
+	 * Verify PBL depth is within rnic max
+	 */
+	if (pbl_depth > C2_PBL_MAX_DEPTH) {
+		return -EINTR;
+	}
+
+	/*
+	 * allocate verbs request object
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	/*
+	 * build the WR
+	 */
+	c2_wr_set_id(wr, CCWR_NSMR_REGISTER);
+	wr->hdr.context = (unsigned long) vq_req;
+	wr->rnic_handle = c2dev->adapter_handle;
+
+	flags = (acf | MEM_VA_BASED | MEM_REMOTE);
+
+	/*
+	 * compute how many pbes can fit in the message
+	 */
+	pbe_count = (c2dev->req_vq.msg_size -
+		     sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64);
+
+	if (pbl_depth <= pbe_count) {
+		flags |= MEM_PBL_COMPLETE;
+	}
+	wr->flags = cpu_to_be16(flags);
+	wr->stag_key = 0;	//stag_key;
+	wr->va = cpu_to_be64(*va);
+	wr->pd_id = mr->pd->pd_id;
+	wr->pbe_size = cpu_to_be32(page_size);
+	wr->length = cpu_to_be32(length);
+	wr->pbl_depth = cpu_to_be32(pbl_depth);
+	wr->fbo = cpu_to_be32(offset);
+	count = min(pbl_depth, pbe_count);
+	wr->addrs_length = cpu_to_be32(count);
+
+	/*
+	 * fill out the PBL for this message
+	 */
+	for (i = 0; i < count; i++) {
+		wr->paddrs[i] = cpu_to_be64(addr_list[i]);
+	}
+
+	/*
+	 * regerence the request struct
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * send the WR to the adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	/*
+	 * wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail1;
+	}
+
+	/*
+	 * process reply
+	 */
+	reply =
+	    (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+	if ((err = c2_errno(reply))) {
+		goto bail2;
+	}
+	//*p_pb_entries = be32_to_cpu(reply->pbl_depth);
+	mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index);
+	vq_repbuf_free(c2dev, reply);
+
+	/*
+	 * if there are still more PBEs we need to send them to
+	 * the adapter and wait for a reply on the final one.
+	 * reuse vq_req for this purpose.
+	 */
+	pbl_depth -= count;
+	if (pbl_depth) {
+
+		vq_req->reply_msg = (unsigned long) NULL;
+		atomic_set(&vq_req->reply_ready, 0);
+		err = send_pbl_messages(c2dev,
+					cpu_to_be32(mr->ibmr.lkey),
+					(unsigned long) &addr_list[i],
+					pbl_depth, vq_req, PBL_PHYS);
+		if (err) {
+			goto bail1;
+		}
+	}
+
+	vq_req_free(c2dev, vq_req);
+	kfree(wr);
+
+	return err;
+
+      bail2:
+	vq_repbuf_free(c2dev, reply);
+      bail1:
+	kfree(wr);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index)
+{
+	struct c2_vq_req *vq_req;	/* verbs request object */
+	struct c2wr_stag_dealloc_req wr;	/* work request */
+	struct c2wr_stag_dealloc_rep *reply;	/* WR reply  */
+	int err;
+
+
+	/*
+	 * allocate verbs request object
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		return -ENOMEM;
+	}
+
+	/*
+	 * Build the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_STAG_DEALLOC);
+	wr.hdr.context = (u64) (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.stag_index = cpu_to_be32(stag_index);
+
+	/*
+	 * reference the request struct.  dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	/*
+	 * Process reply
+	 */
+	reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	err = c2_errno(reply);
+
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_mq.c b/drivers/staging/rdma/amso1100/c2_mq.c
new file mode 100644
index 0000000..0cddc49
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_mq.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include "c2_mq.h"
+
+void *c2_mq_alloc(struct c2_mq *q)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+	if (c2_mq_full(q)) {
+		return NULL;
+	} else {
+#ifdef DEBUG
+		struct c2wr_hdr *m =
+		    (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+		BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC));
+		m->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+		return m;
+#else
+		return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+	}
+}
+
+void c2_mq_produce(struct c2_mq *q)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+	if (!c2_mq_full(q)) {
+		q->priv = (q->priv + 1) % q->q_size;
+		q->hint_count++;
+		/* Update peer's offset. */
+		__raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+	}
+}
+
+void *c2_mq_consume(struct c2_mq *q)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+	if (c2_mq_empty(q)) {
+		return NULL;
+	} else {
+#ifdef DEBUG
+		struct c2wr_hdr *m = (struct c2wr_hdr *)
+		    (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+		BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC));
+#endif
+		return m;
+#else
+		return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+	}
+}
+
+void c2_mq_free(struct c2_mq *q)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+	if (!c2_mq_empty(q)) {
+
+#ifdef CCMSGMAGIC
+		{
+			struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *)
+			    (q->msg_pool.adapter + q->priv * q->msg_size);
+			__raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic);
+		}
+#endif
+		q->priv = (q->priv + 1) % q->q_size;
+		/* Update peer's offset. */
+		__raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+	}
+}
+
+
+void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+	while (wqe_count--) {
+		BUG_ON(c2_mq_empty(q));
+		*q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size);
+	}
+}
+
+#if 0
+u32 c2_mq_count(struct c2_mq *q)
+{
+	s32 count;
+
+	if (q->type == C2_MQ_HOST_TARGET)
+		count = be16_to_cpu(*q->shared) - q->priv;
+	else
+		count = q->priv - be16_to_cpu(*q->shared);
+
+	if (count < 0)
+		count += q->q_size;
+
+	return (u32) count;
+}
+#endif  /*  0  */
+
+void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+		    u8 __iomem *pool_start, u16 __iomem *peer, u32 type)
+{
+	BUG_ON(!q->shared);
+
+	/* This code assumes the byte swapping has already been done! */
+	q->index = index;
+	q->q_size = q_size;
+	q->msg_size = msg_size;
+	q->msg_pool.adapter = pool_start;
+	q->peer = (struct c2_mq_shared __iomem *) peer;
+	q->magic = C2_MQ_MAGIC;
+	q->type = type;
+	q->priv = 0;
+	q->hint_count = 0;
+	return;
+}
+void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+		    u8 *pool_start, u16 __iomem *peer, u32 type)
+{
+	BUG_ON(!q->shared);
+
+	/* This code assumes the byte swapping has already been done! */
+	q->index = index;
+	q->q_size = q_size;
+	q->msg_size = msg_size;
+	q->msg_pool.host = pool_start;
+	q->peer = (struct c2_mq_shared __iomem *) peer;
+	q->magic = C2_MQ_MAGIC;
+	q->type = type;
+	q->priv = 0;
+	q->hint_count = 0;
+	return;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_mq.h b/drivers/staging/rdma/amso1100/c2_mq.h
new file mode 100644
index 0000000..fc1b9a7
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_mq.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _C2_MQ_H_
+#define _C2_MQ_H_
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include "c2_wr.h"
+
+enum c2_shared_regs {
+
+	C2_SHARED_ARMED = 0x10,
+	C2_SHARED_NOTIFY = 0x18,
+	C2_SHARED_SHARED = 0x40,
+};
+
+struct c2_mq_shared {
+	u16 unused1;
+	u8 armed;
+	u8 notification_type;
+	u32 unused2;
+	u16 shared;
+	/* Pad to 64 bytes. */
+	u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)];
+};
+
+enum c2_mq_type {
+	C2_MQ_HOST_TARGET = 1,
+	C2_MQ_ADAPTER_TARGET = 2,
+};
+
+/*
+ * c2_mq_t is for kernel-mode MQs like the VQs Cand the AEQ.
+ * c2_user_mq_t (which is the same format) is for user-mode MQs...
+ */
+#define C2_MQ_MAGIC 0x4d512020	/* 'MQ  ' */
+struct c2_mq {
+	u32 magic;
+	union {
+		u8 *host;
+		u8 __iomem *adapter;
+	} msg_pool;
+	dma_addr_t host_dma;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	u16 hint_count;
+	u16 priv;
+	struct c2_mq_shared __iomem *peer;
+	__be16 *shared;
+	dma_addr_t shared_dma;
+	u32 q_size;
+	u32 msg_size;
+	u32 index;
+	enum c2_mq_type type;
+};
+
+static __inline__ int c2_mq_empty(struct c2_mq *q)
+{
+	return q->priv == be16_to_cpu(*q->shared);
+}
+
+static __inline__ int c2_mq_full(struct c2_mq *q)
+{
+	return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size;
+}
+
+extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count);
+extern void *c2_mq_alloc(struct c2_mq *q);
+extern void c2_mq_produce(struct c2_mq *q);
+extern void *c2_mq_consume(struct c2_mq *q);
+extern void c2_mq_free(struct c2_mq *q);
+extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+		       u8 __iomem *pool_start, u16 __iomem *peer, u32 type);
+extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+			   u8 *pool_start, u16 __iomem *peer, u32 type);
+
+#endif				/* _C2_MQ_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_pd.c b/drivers/staging/rdma/amso1100/c2_pd.c
new file mode 100644
index 0000000..f3e81dc
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_pd.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "c2.h"
+#include "c2_provider.h"
+
+int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd)
+{
+	u32 obj;
+	int ret = 0;
+
+	spin_lock(&c2dev->pd_table.lock);
+	obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max,
+				 c2dev->pd_table.last);
+	if (obj >= c2dev->pd_table.max)
+		obj = find_first_zero_bit(c2dev->pd_table.table,
+					  c2dev->pd_table.max);
+	if (obj < c2dev->pd_table.max) {
+		pd->pd_id = obj;
+		__set_bit(obj, c2dev->pd_table.table);
+		c2dev->pd_table.last = obj+1;
+		if (c2dev->pd_table.last >= c2dev->pd_table.max)
+			c2dev->pd_table.last = 0;
+	} else
+		ret = -ENOMEM;
+	spin_unlock(&c2dev->pd_table.lock);
+	return ret;
+}
+
+void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd)
+{
+	spin_lock(&c2dev->pd_table.lock);
+	__clear_bit(pd->pd_id, c2dev->pd_table.table);
+	spin_unlock(&c2dev->pd_table.lock);
+}
+
+int c2_init_pd_table(struct c2_dev *c2dev)
+{
+
+	c2dev->pd_table.last = 0;
+	c2dev->pd_table.max = c2dev->props.max_pd;
+	spin_lock_init(&c2dev->pd_table.lock);
+	c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) *
+					sizeof(long), GFP_KERNEL);
+	if (!c2dev->pd_table.table)
+		return -ENOMEM;
+	bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd);
+	return 0;
+}
+
+void c2_cleanup_pd_table(struct c2_dev *c2dev)
+{
+	kfree(c2dev->pd_table.table);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c
new file mode 100644
index 0000000..25c3f00
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_provider.c
@@ -0,0 +1,912 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/if_arp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include "c2.h"
+#include "c2_provider.h"
+#include "c2_user.h"
+
+static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+			   struct ib_udata *uhw)
+{
+	struct c2_dev *c2dev = to_c2dev(ibdev);
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	if (uhw->inlen || uhw->outlen)
+		return -EINVAL;
+
+	*props = c2dev->props;
+	return 0;
+}
+
+static int c2_query_port(struct ib_device *ibdev,
+			 u8 port, struct ib_port_attr *props)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	props->max_mtu = IB_MTU_4096;
+	props->lid = 0;
+	props->lmc = 0;
+	props->sm_lid = 0;
+	props->sm_sl = 0;
+	props->state = IB_PORT_ACTIVE;
+	props->phys_state = 0;
+	props->port_cap_flags =
+	    IB_PORT_CM_SUP |
+	    IB_PORT_REINIT_SUP |
+	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->qkey_viol_cntr = 0;
+	props->active_width = 1;
+	props->active_speed = IB_SPEED_SDR;
+
+	return 0;
+}
+
+static int c2_query_pkey(struct ib_device *ibdev,
+			 u8 port, u16 index, u16 * pkey)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	*pkey = 0;
+	return 0;
+}
+
+static int c2_query_gid(struct ib_device *ibdev, u8 port,
+			int index, union ib_gid *gid)
+{
+	struct c2_dev *c2dev = to_c2dev(ibdev);
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+	memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6);
+
+	return 0;
+}
+
+/* Allocate the user context data structure. This keeps track
+ * of all objects associated with a particular user-mode client.
+ */
+static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev,
+					     struct ib_udata *udata)
+{
+	struct c2_ucontext *context;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	return &context->ibucontext;
+}
+
+static int c2_dealloc_ucontext(struct ib_ucontext *context)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	kfree(context);
+	return 0;
+}
+
+static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev,
+				 struct ib_ucontext *context,
+				 struct ib_udata *udata)
+{
+	struct c2_pd *pd;
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = c2_pd_alloc(to_c2dev(ibdev), !context, pd);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	if (context) {
+		if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) {
+			c2_pd_free(to_c2dev(ibdev), pd);
+			kfree(pd);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+
+	return &pd->ibpd;
+}
+
+static int c2_dealloc_pd(struct ib_pd *pd)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	c2_pd_free(to_c2dev(pd->device), to_c2pd(pd));
+	kfree(pd);
+
+	return 0;
+}
+
+static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return ERR_PTR(-ENOSYS);
+}
+
+static int c2_ah_destroy(struct ib_ah *ah)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static void c2_add_ref(struct ib_qp *ibqp)
+{
+	struct c2_qp *qp;
+	BUG_ON(!ibqp);
+	qp = to_c2qp(ibqp);
+	atomic_inc(&qp->refcount);
+}
+
+static void c2_rem_ref(struct ib_qp *ibqp)
+{
+	struct c2_qp *qp;
+	BUG_ON(!ibqp);
+	qp = to_c2qp(ibqp);
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+struct ib_qp *c2_get_qp(struct ib_device *device, int qpn)
+{
+	struct c2_dev* c2dev = to_c2dev(device);
+	struct c2_qp *qp;
+
+	qp = c2_find_qpn(c2dev, qpn);
+	pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n",
+		__func__, qp, qpn, device,
+		(qp?atomic_read(&qp->refcount):0));
+
+	return (qp?&qp->ibqp:NULL);
+}
+
+static struct ib_qp *c2_create_qp(struct ib_pd *pd,
+				  struct ib_qp_init_attr *init_attr,
+				  struct ib_udata *udata)
+{
+	struct c2_qp *qp;
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	if (init_attr->create_flags)
+		return ERR_PTR(-EINVAL);
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_RC:
+		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+		if (!qp) {
+			pr_debug("%s: Unable to allocate QP\n", __func__);
+			return ERR_PTR(-ENOMEM);
+		}
+		spin_lock_init(&qp->lock);
+		if (pd->uobject) {
+			/* userspace specific */
+		}
+
+		err = c2_alloc_qp(to_c2dev(pd->device),
+				  to_c2pd(pd), init_attr, qp);
+
+		if (err && pd->uobject) {
+			/* userspace specific */
+		}
+
+		break;
+	default:
+		pr_debug("%s: Invalid QP type: %d\n", __func__,
+			init_attr->qp_type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (err) {
+		kfree(qp);
+		return ERR_PTR(err);
+	}
+
+	return &qp->ibqp;
+}
+
+static int c2_destroy_qp(struct ib_qp *ib_qp)
+{
+	struct c2_qp *qp = to_c2qp(ib_qp);
+
+	pr_debug("%s:%u qp=%p,qp->state=%d\n",
+		__func__, __LINE__, ib_qp, qp->state);
+	c2_free_qp(to_c2dev(ib_qp->device), qp);
+	kfree(qp);
+	return 0;
+}
+
+static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
+				  const struct ib_cq_init_attr *attr,
+				  struct ib_ucontext *context,
+				  struct ib_udata *udata)
+{
+	int entries = attr->cqe;
+	struct c2_cq *cq;
+	int err;
+
+	if (attr->flags)
+		return ERR_PTR(-EINVAL);
+
+	cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq) {
+		pr_debug("%s: Unable to allocate CQ\n", __func__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq);
+	if (err) {
+		pr_debug("%s: error initializing CQ\n", __func__);
+		kfree(cq);
+		return ERR_PTR(err);
+	}
+
+	return &cq->ibcq;
+}
+
+static int c2_destroy_cq(struct ib_cq *ib_cq)
+{
+	struct c2_cq *cq = to_c2cq(ib_cq);
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	c2_free_cq(to_c2dev(ib_cq->device), cq);
+	kfree(cq);
+
+	return 0;
+}
+
+static inline u32 c2_convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) |
+	    (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) |
+	    (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) |
+	    C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
+}
+
+static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
+				    struct ib_phys_buf *buffer_list,
+				    int num_phys_buf, int acc, u64 * iova_start)
+{
+	struct c2_mr *mr;
+	u64 *page_list;
+	u32 total_len;
+	int err, i, j, k, page_shift, pbl_depth;
+
+	pbl_depth = 0;
+	total_len = 0;
+
+	page_shift = PAGE_SHIFT;
+	/*
+	 * If there is only 1 buffer we assume this could
+	 * be a map of all phy mem...use a 32k page_shift.
+	 */
+	if (num_phys_buf == 1)
+		page_shift += 3;
+
+	for (i = 0; i < num_phys_buf; i++) {
+
+		if (buffer_list[i].addr & ~PAGE_MASK) {
+			pr_debug("Unaligned Memory Buffer: 0x%x\n",
+				(unsigned int) buffer_list[i].addr);
+			return ERR_PTR(-EINVAL);
+		}
+
+		if (!buffer_list[i].size) {
+			pr_debug("Invalid Buffer Size\n");
+			return ERR_PTR(-EINVAL);
+		}
+
+		total_len += buffer_list[i].size;
+		pbl_depth += ALIGN(buffer_list[i].size,
+				   (1 << page_shift)) >> page_shift;
+	}
+
+	page_list = vmalloc(sizeof(u64) * pbl_depth);
+	if (!page_list) {
+		pr_debug("couldn't vmalloc page_list of size %zd\n",
+			(sizeof(u64) * pbl_depth));
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0, j = 0; i < num_phys_buf; i++) {
+
+		int naddrs;
+
+ 		naddrs = ALIGN(buffer_list[i].size,
+			       (1 << page_shift)) >> page_shift;
+		for (k = 0; k < naddrs; k++)
+			page_list[j++] = (buffer_list[i].addr +
+						     (k << page_shift));
+	}
+
+	mr = kmalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		vfree(page_list);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mr->pd = to_c2pd(ib_pd);
+	mr->umem = NULL;
+	pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
+		"*iova_start %llx, first pa %llx, last pa %llx\n",
+		__func__, page_shift, pbl_depth, total_len,
+		(unsigned long long) *iova_start,
+	       	(unsigned long long) page_list[0],
+	       	(unsigned long long) page_list[pbl_depth-1]);
+  	err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
+ 					 (1 << page_shift), pbl_depth,
+					 total_len, 0, iova_start,
+					 c2_convert_access(acc), mr);
+	vfree(page_list);
+	if (err) {
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	return &mr->ibmr;
+}
+
+static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct ib_phys_buf bl;
+	u64 kva = 0;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	/* AMSO1100 limit */
+	bl.size = 0xffffffff;
+	bl.addr = 0;
+	return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
+}
+
+static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				    u64 virt, int acc, struct ib_udata *udata)
+{
+	u64 *pages;
+	u64 kva = 0;
+	int shift, n, len;
+	int i, k, entry;
+	int err = 0;
+	struct scatterlist *sg;
+	struct c2_pd *c2pd = to_c2pd(pd);
+	struct c2_mr *c2mr;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
+	if (!c2mr)
+		return ERR_PTR(-ENOMEM);
+	c2mr->pd = c2pd;
+
+	c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	if (IS_ERR(c2mr->umem)) {
+		err = PTR_ERR(c2mr->umem);
+		kfree(c2mr);
+		return ERR_PTR(err);
+	}
+
+	shift = ffs(c2mr->umem->page_size) - 1;
+	n = c2mr->umem->nmap;
+
+	pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
+	if (!pages) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	i = 0;
+	for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) {
+		len = sg_dma_len(sg) >> shift;
+		for (k = 0; k < len; ++k) {
+			pages[i++] =
+				sg_dma_address(sg) +
+				(c2mr->umem->page_size * k);
+		}
+	}
+
+	kva = virt;
+  	err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
+					 pages,
+					 c2mr->umem->page_size,
+					 i,
+					 length,
+					 ib_umem_offset(c2mr->umem),
+					 &kva,
+					 c2_convert_access(acc),
+					 c2mr);
+	kfree(pages);
+	if (err)
+		goto err;
+	return &c2mr->ibmr;
+
+err:
+	ib_umem_release(c2mr->umem);
+	kfree(c2mr);
+	return ERR_PTR(err);
+}
+
+static int c2_dereg_mr(struct ib_mr *ib_mr)
+{
+	struct c2_mr *mr = to_c2mr(ib_mr);
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
+	if (err)
+		pr_debug("c2_stag_dealloc failed: %d\n", err);
+	else {
+		if (mr->umem)
+			ib_umem_release(mr->umem);
+		kfree(mr);
+	}
+
+	return err;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return sprintf(buf, "%x\n", c2dev->props.hw_ver);
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return sprintf(buf, "%x.%x.%x\n",
+		       (int) (c2dev->props.fw_ver >> 32),
+		       (int) (c2dev->props.fw_ver >> 16) & 0xffff,
+		       (int) (c2dev->props.fw_ver & 0xffff));
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return sprintf(buf, "AMSO1100\n");
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID");
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *c2_dev_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id
+};
+
+static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+			int attr_mask, struct ib_udata *udata)
+{
+	int err;
+
+	err =
+	    c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr,
+			 attr_mask);
+
+	return err;
+}
+
+static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static int c2_process_mad(struct ib_device *ibdev,
+			  int mad_flags,
+			  u8 port_num,
+			  const struct ib_wc *in_wc,
+			  const struct ib_grh *in_grh,
+			  const struct ib_mad_hdr *in_mad,
+			  size_t in_mad_size,
+			  struct ib_mad_hdr *out_mad,
+			  size_t *out_mad_size,
+			  u16 *out_mad_pkey_index)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	/* Request a connection */
+	return c2_llp_connect(cm_id, iw_param);
+}
+
+static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	/* Accept the new connection */
+	return c2_llp_accept(cm_id, iw_param);
+}
+
+static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	err = c2_llp_reject(cm_id, pdata, pdata_len);
+	return err;
+}
+
+static int c2_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	err = c2_llp_service_create(cm_id, backlog);
+	pr_debug("%s:%u err=%d\n",
+		__func__, __LINE__,
+		err);
+	return err;
+}
+
+static int c2_service_destroy(struct iw_cm_id *cm_id)
+{
+	int err;
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	err = c2_llp_service_destroy(cm_id);
+
+	return err;
+}
+
+static int c2_pseudo_up(struct net_device *netdev)
+{
+	struct in_device *ind;
+	struct c2_dev *c2dev = netdev->ml_priv;
+
+	ind = in_dev_get(netdev);
+	if (!ind)
+		return 0;
+
+	pr_debug("adding...\n");
+	for_ifa(ind) {
+#ifdef DEBUG
+		u8 *ip = (u8 *) & ifa->ifa_address;
+
+		pr_debug("%s: %d.%d.%d.%d\n",
+		       ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+		c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+	}
+	endfor_ifa(ind);
+	in_dev_put(ind);
+
+	return 0;
+}
+
+static int c2_pseudo_down(struct net_device *netdev)
+{
+	struct in_device *ind;
+	struct c2_dev *c2dev = netdev->ml_priv;
+
+	ind = in_dev_get(netdev);
+	if (!ind)
+		return 0;
+
+	pr_debug("deleting...\n");
+	for_ifa(ind) {
+#ifdef DEBUG
+		u8 *ip = (u8 *) & ifa->ifa_address;
+
+		pr_debug("%s: %d.%d.%d.%d\n",
+		       ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+		c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+	}
+	endfor_ifa(ind);
+	in_dev_put(ind);
+
+	return 0;
+}
+
+static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+		return -EINVAL;
+
+	netdev->mtu = new_mtu;
+
+	/* TODO: Tell rnic about new rmda interface mtu */
+	return 0;
+}
+
+static const struct net_device_ops c2_pseudo_netdev_ops = {
+	.ndo_open 		= c2_pseudo_up,
+	.ndo_stop 		= c2_pseudo_down,
+	.ndo_start_xmit 	= c2_pseudo_xmit_frame,
+	.ndo_change_mtu 	= c2_pseudo_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+static void setup(struct net_device *netdev)
+{
+	netdev->netdev_ops = &c2_pseudo_netdev_ops;
+
+	netdev->watchdog_timeo = 0;
+	netdev->type = ARPHRD_ETHER;
+	netdev->mtu = 1500;
+	netdev->hard_header_len = ETH_HLEN;
+	netdev->addr_len = ETH_ALEN;
+	netdev->tx_queue_len = 0;
+	netdev->flags |= IFF_NOARP;
+}
+
+static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
+{
+	char name[IFNAMSIZ];
+	struct net_device *netdev;
+
+	/* change ethxxx to iwxxx */
+	strcpy(name, "iw");
+	strcat(name, &c2dev->netdev->name[3]);
+	netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
+	if (!netdev) {
+		printk(KERN_ERR PFX "%s -  etherdev alloc failed",
+			__func__);
+		return NULL;
+	}
+
+	netdev->ml_priv = c2dev;
+
+	SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+	memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6);
+
+	/* Print out the MAC address */
+	pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr);
+
+#if 0
+	/* Disable network packets */
+	netif_stop_queue(netdev);
+#endif
+	return netdev;
+}
+
+static int c2_port_immutable(struct ib_device *ibdev, u8 port_num,
+			     struct ib_port_immutable *immutable)
+{
+	struct ib_port_attr attr;
+	int err;
+
+	err = c2_query_port(ibdev, port_num, &attr);
+	if (err)
+		return err;
+
+	immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	immutable->gid_tbl_len = attr.gid_tbl_len;
+	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+	return 0;
+}
+
+int c2_register_device(struct c2_dev *dev)
+{
+	int ret = -ENOMEM;
+	int i;
+
+	/* Register pseudo network device */
+	dev->pseudo_netdev = c2_pseudo_netdev_init(dev);
+	if (!dev->pseudo_netdev)
+		goto out;
+
+	ret = register_netdev(dev->pseudo_netdev);
+	if (ret)
+		goto out_free_netdev;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX);
+	dev->ibdev.owner = THIS_MODULE;
+	dev->ibdev.uverbs_cmd_mask =
+	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+	    (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_REG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+	    (1ull << IB_USER_VERBS_CMD_POST_RECV);
+
+	dev->ibdev.node_type = RDMA_NODE_RNIC;
+	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+	memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6);
+	dev->ibdev.phys_port_cnt = 1;
+	dev->ibdev.num_comp_vectors = 1;
+	dev->ibdev.dma_device = &dev->pcidev->dev;
+	dev->ibdev.query_device = c2_query_device;
+	dev->ibdev.query_port = c2_query_port;
+	dev->ibdev.query_pkey = c2_query_pkey;
+	dev->ibdev.query_gid = c2_query_gid;
+	dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
+	dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext;
+	dev->ibdev.mmap = c2_mmap_uar;
+	dev->ibdev.alloc_pd = c2_alloc_pd;
+	dev->ibdev.dealloc_pd = c2_dealloc_pd;
+	dev->ibdev.create_ah = c2_ah_create;
+	dev->ibdev.destroy_ah = c2_ah_destroy;
+	dev->ibdev.create_qp = c2_create_qp;
+	dev->ibdev.modify_qp = c2_modify_qp;
+	dev->ibdev.destroy_qp = c2_destroy_qp;
+	dev->ibdev.create_cq = c2_create_cq;
+	dev->ibdev.destroy_cq = c2_destroy_cq;
+	dev->ibdev.poll_cq = c2_poll_cq;
+	dev->ibdev.get_dma_mr = c2_get_dma_mr;
+	dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
+	dev->ibdev.reg_user_mr = c2_reg_user_mr;
+	dev->ibdev.dereg_mr = c2_dereg_mr;
+	dev->ibdev.get_port_immutable = c2_port_immutable;
+
+	dev->ibdev.alloc_fmr = NULL;
+	dev->ibdev.unmap_fmr = NULL;
+	dev->ibdev.dealloc_fmr = NULL;
+	dev->ibdev.map_phys_fmr = NULL;
+
+	dev->ibdev.attach_mcast = c2_multicast_attach;
+	dev->ibdev.detach_mcast = c2_multicast_detach;
+	dev->ibdev.process_mad = c2_process_mad;
+
+	dev->ibdev.req_notify_cq = c2_arm_cq;
+	dev->ibdev.post_send = c2_post_send;
+	dev->ibdev.post_recv = c2_post_receive;
+
+	dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
+	if (dev->ibdev.iwcm == NULL) {
+		ret = -ENOMEM;
+		goto out_unregister_netdev;
+	}
+	dev->ibdev.iwcm->add_ref = c2_add_ref;
+	dev->ibdev.iwcm->rem_ref = c2_rem_ref;
+	dev->ibdev.iwcm->get_qp = c2_get_qp;
+	dev->ibdev.iwcm->connect = c2_connect;
+	dev->ibdev.iwcm->accept = c2_accept;
+	dev->ibdev.iwcm->reject = c2_reject;
+	dev->ibdev.iwcm->create_listen = c2_service_create;
+	dev->ibdev.iwcm->destroy_listen = c2_service_destroy;
+
+	ret = ib_register_device(&dev->ibdev, NULL);
+	if (ret)
+		goto out_free_iwcm;
+
+	for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) {
+		ret = device_create_file(&dev->ibdev.dev,
+					       c2_dev_attributes[i]);
+		if (ret)
+			goto out_unregister_ibdev;
+	}
+	goto out;
+
+out_unregister_ibdev:
+	ib_unregister_device(&dev->ibdev);
+out_free_iwcm:
+	kfree(dev->ibdev.iwcm);
+out_unregister_netdev:
+	unregister_netdev(dev->pseudo_netdev);
+out_free_netdev:
+	free_netdev(dev->pseudo_netdev);
+out:
+	pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret);
+	return ret;
+}
+
+void c2_unregister_device(struct c2_dev *dev)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	unregister_netdev(dev->pseudo_netdev);
+	free_netdev(dev->pseudo_netdev);
+	ib_unregister_device(&dev->ibdev);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_provider.h b/drivers/staging/rdma/amso1100/c2_provider.h
new file mode 100644
index 0000000..bf18998
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_provider.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_PROVIDER_H
+#define C2_PROVIDER_H
+#include <linux/inetdevice.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#include "c2_mq.h"
+#include <rdma/iw_cm.h>
+
+#define C2_MPT_FLAG_ATOMIC        (1 << 14)
+#define C2_MPT_FLAG_REMOTE_WRITE  (1 << 13)
+#define C2_MPT_FLAG_REMOTE_READ   (1 << 12)
+#define C2_MPT_FLAG_LOCAL_WRITE   (1 << 11)
+#define C2_MPT_FLAG_LOCAL_READ    (1 << 10)
+
+struct c2_buf_list {
+	void *buf;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+};
+
+
+/* The user context keeps track of objects allocated for a
+ * particular user-mode client. */
+struct c2_ucontext {
+	struct ib_ucontext ibucontext;
+};
+
+struct c2_mtt;
+
+/* All objects associated with a PD are kept in the
+ * associated user context if present.
+ */
+struct c2_pd {
+	struct ib_pd ibpd;
+	u32 pd_id;
+};
+
+struct c2_mr {
+	struct ib_mr ibmr;
+	struct c2_pd *pd;
+	struct ib_umem *umem;
+};
+
+struct c2_av;
+
+enum c2_ah_type {
+	C2_AH_ON_HCA,
+	C2_AH_PCI_POOL,
+	C2_AH_KMALLOC
+};
+
+struct c2_ah {
+	struct ib_ah ibah;
+};
+
+struct c2_cq {
+	struct ib_cq ibcq;
+	spinlock_t lock;
+	atomic_t refcount;
+	int cqn;
+	int is_kernel;
+	wait_queue_head_t wait;
+
+	u32 adapter_handle;
+	struct c2_mq mq;
+};
+
+struct c2_wq {
+	spinlock_t lock;
+};
+struct iw_cm_id;
+struct c2_qp {
+	struct ib_qp ibqp;
+	struct iw_cm_id *cm_id;
+	spinlock_t lock;
+	atomic_t refcount;
+	wait_queue_head_t wait;
+	int qpn;
+
+	u32 adapter_handle;
+	u32 send_sgl_depth;
+	u32 recv_sgl_depth;
+	u32 rdma_write_sgl_depth;
+	u8 state;
+
+	struct c2_mq sq_mq;
+	struct c2_mq rq_mq;
+};
+
+struct c2_cr_query_attrs {
+	u32 local_addr;
+	u32 remote_addr;
+	u16 local_port;
+	u16 remote_port;
+};
+
+static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct c2_pd, ibpd);
+}
+
+static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct c2_ucontext, ibucontext);
+}
+
+static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct c2_mr, ibmr);
+}
+
+
+static inline struct c2_ah *to_c2ah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct c2_ah, ibah);
+}
+
+static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct c2_cq, ibcq);
+}
+
+static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct c2_qp, ibqp);
+}
+
+static inline int is_rnic_addr(struct net_device *netdev, u32 addr)
+{
+	struct in_device *ind;
+	int ret = 0;
+
+	ind = in_dev_get(netdev);
+	if (!ind)
+		return 0;
+
+	for_ifa(ind) {
+		if (ifa->ifa_address == addr) {
+			ret = 1;
+			break;
+		}
+	}
+	endfor_ifa(ind);
+	in_dev_put(ind);
+	return ret;
+}
+#endif				/* C2_PROVIDER_H */
diff --git a/drivers/staging/rdma/amso1100/c2_qp.c b/drivers/staging/rdma/amso1100/c2_qp.c
new file mode 100644
index 0000000..86708de
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_qp.c
@@ -0,0 +1,1024 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_MAX_ORD_PER_QP 128
+#define C2_MAX_IRD_PER_QP 128
+
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
+
+#define NO_SUPPORT -1
+static const u8 c2_opcode[] = {
+	[IB_WR_SEND] = C2_WR_TYPE_SEND,
+	[IB_WR_SEND_WITH_IMM] = NO_SUPPORT,
+	[IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT,
+	[IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT,
+};
+
+static int to_c2_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET:
+		return C2_QP_STATE_IDLE;
+	case IB_QPS_RTS:
+		return C2_QP_STATE_RTS;
+	case IB_QPS_SQD:
+		return C2_QP_STATE_CLOSING;
+	case IB_QPS_SQE:
+		return C2_QP_STATE_CLOSING;
+	case IB_QPS_ERR:
+		return C2_QP_STATE_ERROR;
+	default:
+		return -1;
+	}
+}
+
+static int to_ib_state(enum c2_qp_state c2_state)
+{
+	switch (c2_state) {
+	case C2_QP_STATE_IDLE:
+		return IB_QPS_RESET;
+	case C2_QP_STATE_CONNECTING:
+		return IB_QPS_RTR;
+	case C2_QP_STATE_RTS:
+		return IB_QPS_RTS;
+	case C2_QP_STATE_CLOSING:
+		return IB_QPS_SQD;
+	case C2_QP_STATE_ERROR:
+		return IB_QPS_ERR;
+	case C2_QP_STATE_TERMINATE:
+		return IB_QPS_SQE;
+	default:
+		return -1;
+	}
+}
+
+static const char *to_ib_state_str(int ib_state)
+{
+	static const char *state_str[] = {
+		"IB_QPS_RESET",
+		"IB_QPS_INIT",
+		"IB_QPS_RTR",
+		"IB_QPS_RTS",
+		"IB_QPS_SQD",
+		"IB_QPS_SQE",
+		"IB_QPS_ERR"
+	};
+	if (ib_state < IB_QPS_RESET ||
+	    ib_state > IB_QPS_ERR)
+		return "<invalid IB QP state>";
+
+	ib_state -= IB_QPS_RESET;
+	return state_str[ib_state];
+}
+
+void c2_set_qp_state(struct c2_qp *qp, int c2_state)
+{
+	int new_state = to_ib_state(c2_state);
+
+	pr_debug("%s: qp[%p] state modify %s --> %s\n",
+	       __func__,
+		qp,
+		to_ib_state_str(qp->state),
+		to_ib_state_str(new_state));
+	qp->state = new_state;
+}
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+		 struct ib_qp_attr *attr, int attr_mask)
+{
+	struct c2wr_qp_modify_req wr;
+	struct c2wr_qp_modify_rep *reply;
+	struct c2_vq_req *vq_req;
+	unsigned long flags;
+	u8 next_state;
+	int err;
+
+	pr_debug("%s:%d qp=%p, %s --> %s\n",
+		__func__, __LINE__,
+		qp,
+		to_ib_state_str(qp->state),
+		to_ib_state_str(attr->qp_state));
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.qp_handle = qp->adapter_handle;
+	wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+	if (attr_mask & IB_QP_STATE) {
+		/* Ensure the state is valid */
+		if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) {
+			err = -EINVAL;
+			goto bail0;
+		}
+
+		wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state));
+
+		if (attr->qp_state == IB_QPS_ERR) {
+			spin_lock_irqsave(&qp->lock, flags);
+			if (qp->cm_id && qp->state == IB_QPS_RTS) {
+				pr_debug("Generating CLOSE event for QP-->ERR, "
+					"qp=%p, cm_id=%p\n",qp,qp->cm_id);
+				/* Generate an CLOSE event */
+				vq_req->cm_id = qp->cm_id;
+				vq_req->event = IW_CM_EVENT_CLOSE;
+			}
+			spin_unlock_irqrestore(&qp->lock, flags);
+		}
+		next_state =  attr->qp_state;
+
+	} else if (attr_mask & IB_QP_CUR_STATE) {
+
+		if (attr->cur_qp_state != IB_QPS_RTR &&
+		    attr->cur_qp_state != IB_QPS_RTS &&
+		    attr->cur_qp_state != IB_QPS_SQD &&
+		    attr->cur_qp_state != IB_QPS_SQE) {
+			err = -EINVAL;
+			goto bail0;
+		} else
+			wr.next_qp_state =
+			    cpu_to_be32(to_c2_state(attr->cur_qp_state));
+
+		next_state = attr->cur_qp_state;
+
+	} else {
+		err = 0;
+		goto bail0;
+	}
+
+	/* reference the request struct */
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	err = c2_errno(reply);
+	if (!err)
+		qp->state = next_state;
+#ifdef DEBUG
+	else
+		pr_debug("%s: c2_errno=%d\n", __func__, err);
+#endif
+	/*
+	 * If we're going to error and generating the event here, then
+	 * we need to remove the reference because there will be no
+	 * close event generated by the adapter
+	*/
+	spin_lock_irqsave(&qp->lock, flags);
+	if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) {
+		qp->cm_id->rem_ref(qp->cm_id);
+		qp->cm_id = NULL;
+	}
+	spin_unlock_irqrestore(&qp->lock, flags);
+
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+
+	pr_debug("%s:%d qp=%p, cur_state=%s\n",
+		__func__, __LINE__,
+		qp,
+		to_ib_state_str(qp->state));
+	return err;
+}
+
+int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+			  int ord, int ird)
+{
+	struct c2wr_qp_modify_req wr;
+	struct c2wr_qp_modify_rep *reply;
+	struct c2_vq_req *vq_req;
+	int err;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.qp_handle = qp->adapter_handle;
+	wr.ord = cpu_to_be32(ord);
+	wr.ird = cpu_to_be32(ird);
+	wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+	/* reference the request struct */
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	reply = (struct c2wr_qp_modify_rep *) (unsigned long)
+		vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	err = c2_errno(reply);
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_qp_destroy_req wr;
+	struct c2wr_qp_destroy_rep *reply;
+	unsigned long flags;
+	int err;
+
+	/*
+	 * Allocate a verb request message
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		return -ENOMEM;
+	}
+
+	/*
+	 * Initialize the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_QP_DESTROY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.qp_handle = qp->adapter_handle;
+
+	/*
+	 * reference the request struct.  dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	spin_lock_irqsave(&qp->lock, flags);
+	if (qp->cm_id && qp->state == IB_QPS_RTS) {
+		pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, "
+			"qp=%p, cm_id=%p\n",qp,qp->cm_id);
+		/* Generate an CLOSE event */
+		vq_req->qp = qp;
+		vq_req->cm_id = qp->cm_id;
+		vq_req->event = IW_CM_EVENT_CLOSE;
+	}
+	spin_unlock_irqrestore(&qp->lock, flags);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	/*
+	 * Process reply
+	 */
+	reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	spin_lock_irqsave(&qp->lock, flags);
+	if (qp->cm_id) {
+		qp->cm_id->rem_ref(qp->cm_id);
+		qp->cm_id = NULL;
+	}
+	spin_unlock_irqrestore(&qp->lock, flags);
+
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+	int ret;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irq(&c2dev->qp_table.lock);
+
+	ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT);
+	if (ret >= 0)
+		qp->qpn = ret;
+
+	spin_unlock_irq(&c2dev->qp_table.lock);
+	idr_preload_end();
+	return ret < 0 ? ret : 0;
+}
+
+static void c2_free_qpn(struct c2_dev *c2dev, int qpn)
+{
+	spin_lock_irq(&c2dev->qp_table.lock);
+	idr_remove(&c2dev->qp_table.idr, qpn);
+	spin_unlock_irq(&c2dev->qp_table.lock);
+}
+
+struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn)
+{
+	unsigned long flags;
+	struct c2_qp *qp;
+
+	spin_lock_irqsave(&c2dev->qp_table.lock, flags);
+	qp = idr_find(&c2dev->qp_table.idr, qpn);
+	spin_unlock_irqrestore(&c2dev->qp_table.lock, flags);
+	return qp;
+}
+
+int c2_alloc_qp(struct c2_dev *c2dev,
+		struct c2_pd *pd,
+		struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp)
+{
+	struct c2wr_qp_create_req wr;
+	struct c2wr_qp_create_rep *reply;
+	struct c2_vq_req *vq_req;
+	struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq);
+	struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq);
+	unsigned long peer_pa;
+	u32 q_size, msg_size, mmap_size;
+	void __iomem *mmap;
+	int err;
+
+	err = c2_alloc_qpn(c2dev, qp);
+	if (err)
+		return err;
+	qp->ibqp.qp_num = qp->qpn;
+	qp->ibqp.qp_type = IB_QPT_RC;
+
+	/* Allocate the SQ and RQ shared pointers */
+	qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					 &qp->sq_mq.shared_dma, GFP_KERNEL);
+	if (!qp->sq_mq.shared) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					 &qp->rq_mq.shared_dma, GFP_KERNEL);
+	if (!qp->rq_mq.shared) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	/* Allocate the verbs request */
+	vq_req = vq_req_alloc(c2dev);
+	if (vq_req == NULL) {
+		err = -ENOMEM;
+		goto bail2;
+	}
+
+	/* Initialize the work request */
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_QP_CREATE);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.sq_cq_handle = send_cq->adapter_handle;
+	wr.rq_cq_handle = recv_cq->adapter_handle;
+	wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1);
+	wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1);
+	wr.srq_handle = 0;
+	wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND |
+			       QP_ZERO_STAG | QP_RDMA_READ_RESPONSE);
+	wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+	wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge);
+	wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+	wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma);
+	wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma);
+	wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP);
+	wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP);
+	wr.pd_id = pd->pd_id;
+	wr.user_context = (unsigned long) qp;
+
+	vq_req_get(c2dev, vq_req);
+
+	/* Send the WR to the adapter */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail3;
+	}
+
+	/* Wait for the verb reply  */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail3;
+	}
+
+	/* Process the reply */
+	reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail3;
+	}
+
+	if ((err = c2_wr_get_result(reply)) != 0) {
+		goto bail4;
+	}
+
+	/* Fill in the kernel QP struct */
+	atomic_set(&qp->refcount, 1);
+	qp->adapter_handle = reply->qp_handle;
+	qp->state = IB_QPS_RESET;
+	qp->send_sgl_depth = qp_attrs->cap.max_send_sge;
+	qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge;
+	qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge;
+	init_waitqueue_head(&qp->wait);
+
+	/* Initialize the SQ MQ */
+	q_size = be32_to_cpu(reply->sq_depth);
+	msg_size = be32_to_cpu(reply->sq_msg_size);
+	peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start);
+	mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+	mmap = ioremap_nocache(peer_pa, mmap_size);
+	if (!mmap) {
+		err = -ENOMEM;
+		goto bail5;
+	}
+
+	c2_mq_req_init(&qp->sq_mq,
+		       be32_to_cpu(reply->sq_mq_index),
+		       q_size,
+		       msg_size,
+		       mmap + sizeof(struct c2_mq_shared),	/* pool start */
+		       mmap,				/* peer */
+		       C2_MQ_ADAPTER_TARGET);
+
+	/* Initialize the RQ mq */
+	q_size = be32_to_cpu(reply->rq_depth);
+	msg_size = be32_to_cpu(reply->rq_msg_size);
+	peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start);
+	mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+	mmap = ioremap_nocache(peer_pa, mmap_size);
+	if (!mmap) {
+		err = -ENOMEM;
+		goto bail6;
+	}
+
+	c2_mq_req_init(&qp->rq_mq,
+		       be32_to_cpu(reply->rq_mq_index),
+		       q_size,
+		       msg_size,
+		       mmap + sizeof(struct c2_mq_shared),	/* pool start */
+		       mmap,				/* peer */
+		       C2_MQ_ADAPTER_TARGET);
+
+	vq_repbuf_free(c2dev, reply);
+	vq_req_free(c2dev, vq_req);
+
+	return 0;
+
+      bail6:
+	iounmap(qp->sq_mq.peer);
+      bail5:
+	destroy_qp(c2dev, qp);
+      bail4:
+	vq_repbuf_free(c2dev, reply);
+      bail3:
+	vq_req_free(c2dev, vq_req);
+      bail2:
+	c2_free_mqsp(qp->rq_mq.shared);
+      bail1:
+	c2_free_mqsp(qp->sq_mq.shared);
+      bail0:
+	c2_free_qpn(c2dev, qp->qpn);
+	return err;
+}
+
+static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+	if (send_cq == recv_cq)
+		spin_lock_irq(&send_cq->lock);
+	else if (send_cq > recv_cq) {
+		spin_lock_irq(&send_cq->lock);
+		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock_irq(&recv_cq->lock);
+		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+	}
+}
+
+static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+	if (send_cq == recv_cq)
+		spin_unlock_irq(&send_cq->lock);
+	else if (send_cq > recv_cq) {
+		spin_unlock(&recv_cq->lock);
+		spin_unlock_irq(&send_cq->lock);
+	} else {
+		spin_unlock(&send_cq->lock);
+		spin_unlock_irq(&recv_cq->lock);
+	}
+}
+
+void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+	struct c2_cq *send_cq;
+	struct c2_cq *recv_cq;
+
+	send_cq = to_c2cq(qp->ibqp.send_cq);
+	recv_cq = to_c2cq(qp->ibqp.recv_cq);
+
+	/*
+	 * Lock CQs here, so that CQ polling code can do QP lookup
+	 * without taking a lock.
+	 */
+	c2_lock_cqs(send_cq, recv_cq);
+	c2_free_qpn(c2dev, qp->qpn);
+	c2_unlock_cqs(send_cq, recv_cq);
+
+	/*
+	 * Destroy qp in the rnic...
+	 */
+	destroy_qp(c2dev, qp);
+
+	/*
+	 * Mark any unreaped CQEs as null and void.
+	 */
+	c2_cq_clean(c2dev, qp, send_cq->cqn);
+	if (send_cq != recv_cq)
+		c2_cq_clean(c2dev, qp, recv_cq->cqn);
+	/*
+	 * Unmap the MQs and return the shared pointers
+	 * to the message pool.
+	 */
+	iounmap(qp->sq_mq.peer);
+	iounmap(qp->rq_mq.peer);
+	c2_free_mqsp(qp->sq_mq.shared);
+	c2_free_mqsp(qp->rq_mq.shared);
+
+	atomic_dec(&qp->refcount);
+	wait_event(qp->wait, !atomic_read(&qp->refcount));
+}
+
+/*
+ * Function: move_sgl
+ *
+ * Description:
+ * Move an SGL from the user's work request struct into a CCIL Work Request
+ * message, swapping to WR byte order and ensure the total length doesn't
+ * overflow.
+ *
+ * IN:
+ * dst		- ptr to CCIL Work Request message SGL memory.
+ * src		- ptr to the consumers SGL memory.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int
+move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len,
+	 u8 * actual_count)
+{
+	u32 tot = 0;		/* running total */
+	u8 acount = 0;		/* running total non-0 len sge's */
+
+	while (count > 0) {
+		/*
+		 * If the addition of this SGE causes the
+		 * total SGL length to exceed 2^32-1, then
+		 * fail-n-bail.
+		 *
+		 * If the current total plus the next element length
+		 * wraps, then it will go negative and be less than the
+		 * current total...
+		 */
+		if ((tot + src->length) < tot) {
+			return -EINVAL;
+		}
+		/*
+		 * Bug: 1456 (as well as 1498 & 1643)
+		 * Skip over any sge's supplied with len=0
+		 */
+		if (src->length) {
+			tot += src->length;
+			dst->stag = cpu_to_be32(src->lkey);
+			dst->to = cpu_to_be64(src->addr);
+			dst->length = cpu_to_be32(src->length);
+			dst++;
+			acount++;
+		}
+		src++;
+		count--;
+	}
+
+	if (acount == 0) {
+		/*
+		 * Bug: 1476 (as well as 1498, 1456 and 1643)
+		 * Setup the SGL in the WR to make it easier for the RNIC.
+		 * This way, the FW doesn't have to deal with special cases.
+		 * Setting length=0 should be sufficient.
+		 */
+		dst->stag = 0;
+		dst->to = 0;
+		dst->length = 0;
+	}
+
+	*p_len = tot;
+	*actual_count = acount;
+	return 0;
+}
+
+/*
+ * Function: c2_activity (private function)
+ *
+ * Description:
+ * Post an mq index to the host->adapter activity fifo.
+ *
+ * IN:
+ * c2dev	- ptr to c2dev structure
+ * mq_index	- mq index to post
+ * shared	- value most recently written to shared
+ *
+ * OUT:
+ *
+ * Return:
+ * none
+ */
+static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared)
+{
+	/*
+	 * First read the register to see if the FIFO is full, and if so,
+	 * spin until it's not.  This isn't perfect -- there is no
+	 * synchronization among the clients of the register, but in
+	 * practice it prevents multiple CPU from hammering the bus
+	 * with PCI RETRY. Note that when this does happen, the card
+	 * cannot get on the bus and the card and system hang in a
+	 * deadlock -- thus the need for this code. [TOT]
+	 */
+	while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000)
+		udelay(10);
+
+	__raw_writel(C2_HINT_MAKE(mq_index, shared),
+		     c2dev->regs + PCI_BAR0_ADAPTER_HINT);
+}
+
+/*
+ * Function: qp_wr_post
+ *
+ * Description:
+ * This in-line function allocates a MQ msg, then moves the host-copy of
+ * the completed WR into msg.  Then it posts the message.
+ *
+ * IN:
+ * q		- ptr to user MQ.
+ * wr		- ptr to host-copy of the WR.
+ * qp		- ptr to user qp
+ * size		- Number of bytes to post.  Assumed to be divisible by 4.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size)
+{
+	union c2wr *msg;
+
+	msg = c2_mq_alloc(q);
+	if (msg == NULL) {
+		return -EINVAL;
+	}
+#ifdef CCMSGMAGIC
+	((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+
+	/*
+	 * Since all header fields in the WR are the same as the
+	 * CQE, set the following so the adapter need not.
+	 */
+	c2_wr_set_result(wr, CCERR_PENDING);
+
+	/*
+	 * Copy the wr down to the adapter
+	 */
+	memcpy((void *) msg, (void *) wr, size);
+
+	c2_mq_produce(q);
+	return 0;
+}
+
+
+int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+		 struct ib_send_wr **bad_wr)
+{
+	struct c2_dev *c2dev = to_c2dev(ibqp->device);
+	struct c2_qp *qp = to_c2qp(ibqp);
+	union c2wr wr;
+	unsigned long lock_flags;
+	int err = 0;
+
+	u32 flags;
+	u32 tot_len;
+	u8 actual_sge_count;
+	u32 msg_size;
+
+	if (qp->state > IB_QPS_RTS) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	while (ib_wr) {
+
+		flags = 0;
+		wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+		if (ib_wr->send_flags & IB_SEND_SIGNALED) {
+			flags |= SQ_SIGNALED;
+		}
+
+		switch (ib_wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_INV:
+			if (ib_wr->opcode == IB_WR_SEND) {
+				if (ib_wr->send_flags & IB_SEND_SOLICITED)
+					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE);
+				else
+					c2_wr_set_id(&wr, C2_WR_TYPE_SEND);
+				wr.sqwr.send.remote_stag = 0;
+			} else {
+				if (ib_wr->send_flags & IB_SEND_SOLICITED)
+					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV);
+				else
+					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV);
+				wr.sqwr.send.remote_stag =
+					cpu_to_be32(ib_wr->ex.invalidate_rkey);
+			}
+
+			msg_size = sizeof(struct c2wr_send_req) +
+				sizeof(struct c2_data_addr) * ib_wr->num_sge;
+			if (ib_wr->num_sge > qp->send_sgl_depth) {
+				err = -EINVAL;
+				break;
+			}
+			if (ib_wr->send_flags & IB_SEND_FENCE) {
+				flags |= SQ_READ_FENCE;
+			}
+			err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data),
+				       ib_wr->sg_list,
+				       ib_wr->num_sge,
+				       &tot_len, &actual_sge_count);
+			wr.sqwr.send.sge_len = cpu_to_be32(tot_len);
+			c2_wr_set_sge_count(&wr, actual_sge_count);
+			break;
+		case IB_WR_RDMA_WRITE:
+			c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE);
+			msg_size = sizeof(struct c2wr_rdma_write_req) +
+			    (sizeof(struct c2_data_addr) * ib_wr->num_sge);
+			if (ib_wr->num_sge > qp->rdma_write_sgl_depth) {
+				err = -EINVAL;
+				break;
+			}
+			if (ib_wr->send_flags & IB_SEND_FENCE) {
+				flags |= SQ_READ_FENCE;
+			}
+			wr.sqwr.rdma_write.remote_stag =
+			    cpu_to_be32(ib_wr->wr.rdma.rkey);
+			wr.sqwr.rdma_write.remote_to =
+			    cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+			err = move_sgl((struct c2_data_addr *)
+				       & (wr.sqwr.rdma_write.data),
+				       ib_wr->sg_list,
+				       ib_wr->num_sge,
+				       &tot_len, &actual_sge_count);
+			wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len);
+			c2_wr_set_sge_count(&wr, actual_sge_count);
+			break;
+		case IB_WR_RDMA_READ:
+			c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ);
+			msg_size = sizeof(struct c2wr_rdma_read_req);
+
+			/* IWarp only suppots 1 sge for RDMA reads */
+			if (ib_wr->num_sge > 1) {
+				err = -EINVAL;
+				break;
+			}
+
+			/*
+			 * Move the local and remote stag/to/len into the WR.
+			 */
+			wr.sqwr.rdma_read.local_stag =
+			    cpu_to_be32(ib_wr->sg_list->lkey);
+			wr.sqwr.rdma_read.local_to =
+			    cpu_to_be64(ib_wr->sg_list->addr);
+			wr.sqwr.rdma_read.remote_stag =
+			    cpu_to_be32(ib_wr->wr.rdma.rkey);
+			wr.sqwr.rdma_read.remote_to =
+			    cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+			wr.sqwr.rdma_read.length =
+			    cpu_to_be32(ib_wr->sg_list->length);
+			break;
+		default:
+			/* error */
+			msg_size = 0;
+			err = -EINVAL;
+			break;
+		}
+
+		/*
+		 * If we had an error on the last wr build, then
+		 * break out.  Possible errors include bogus WR
+		 * type, and a bogus SGL length...
+		 */
+		if (err) {
+			break;
+		}
+
+		/*
+		 * Store flags
+		 */
+		c2_wr_set_flags(&wr, flags);
+
+		/*
+		 * Post the puppy!
+		 */
+		spin_lock_irqsave(&qp->lock, lock_flags);
+		err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size);
+		if (err) {
+			spin_unlock_irqrestore(&qp->lock, lock_flags);
+			break;
+		}
+
+		/*
+		 * Enqueue mq index to activity FIFO.
+		 */
+		c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count);
+		spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+		ib_wr = ib_wr->next;
+	}
+
+out:
+	if (err)
+		*bad_wr = ib_wr;
+	return err;
+}
+
+int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+		    struct ib_recv_wr **bad_wr)
+{
+	struct c2_dev *c2dev = to_c2dev(ibqp->device);
+	struct c2_qp *qp = to_c2qp(ibqp);
+	union c2wr wr;
+	unsigned long lock_flags;
+	int err = 0;
+
+	if (qp->state > IB_QPS_RTS) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Try and post each work request
+	 */
+	while (ib_wr) {
+		u32 tot_len;
+		u8 actual_sge_count;
+
+		if (ib_wr->num_sge > qp->recv_sgl_depth) {
+			err = -EINVAL;
+			break;
+		}
+
+		/*
+		 * Create local host-copy of the WR
+		 */
+		wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+		c2_wr_set_id(&wr, CCWR_RECV);
+		c2_wr_set_flags(&wr, 0);
+
+		/* sge_count is limited to eight bits. */
+		BUG_ON(ib_wr->num_sge >= 256);
+		err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data),
+			       ib_wr->sg_list,
+			       ib_wr->num_sge, &tot_len, &actual_sge_count);
+		c2_wr_set_sge_count(&wr, actual_sge_count);
+
+		/*
+		 * If we had an error on the last wr build, then
+		 * break out.  Possible errors include bogus WR
+		 * type, and a bogus SGL length...
+		 */
+		if (err) {
+			break;
+		}
+
+		spin_lock_irqsave(&qp->lock, lock_flags);
+		err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size);
+		if (err) {
+			spin_unlock_irqrestore(&qp->lock, lock_flags);
+			break;
+		}
+
+		/*
+		 * Enqueue mq index to activity FIFO
+		 */
+		c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count);
+		spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+		ib_wr = ib_wr->next;
+	}
+
+out:
+	if (err)
+		*bad_wr = ib_wr;
+	return err;
+}
+
+void c2_init_qp_table(struct c2_dev *c2dev)
+{
+	spin_lock_init(&c2dev->qp_table.lock);
+	idr_init(&c2dev->qp_table.idr);
+}
+
+void c2_cleanup_qp_table(struct c2_dev *c2dev)
+{
+	idr_destroy(&c2dev->qp_table.idr);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_rnic.c b/drivers/staging/rdma/amso1100/c2_rnic.c
new file mode 100644
index 0000000..d2a6d96
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_rnic.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <linux/route.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_vq.h"
+
+/* Device capabilities */
+#define C2_MIN_PAGESIZE  1024
+
+#define C2_MAX_MRS       32768
+#define C2_MAX_QPS       16000
+#define C2_MAX_WQE_SZ    256
+#define C2_MAX_QP_WR     ((128*1024)/C2_MAX_WQE_SZ)
+#define C2_MAX_SGES      4
+#define C2_MAX_SGE_RD    1
+#define C2_MAX_CQS       32768
+#define C2_MAX_CQES      4096
+#define C2_MAX_PDS       16384
+
+/*
+ * Send the adapter INIT message to the amso1100
+ */
+static int c2_adapter_init(struct c2_dev *c2dev)
+{
+	struct c2wr_init_req wr;
+	int err;
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_INIT);
+	wr.hdr.context = 0;
+	wr.hint_count = cpu_to_be64(c2dev->hint_count_dma);
+	wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma);
+	wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma);
+	wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma);
+	wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma);
+	wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma);
+
+	/* Post the init message */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+
+	return err;
+}
+
+/*
+ * Send the adapter TERM message to the amso1100
+ */
+static void c2_adapter_term(struct c2_dev *c2dev)
+{
+	struct c2wr_init_req wr;
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_TERM);
+	wr.hdr.context = 0;
+
+	/* Post the init message */
+	vq_send_wr(c2dev, (union c2wr *) & wr);
+	c2dev->init = 0;
+
+	return;
+}
+
+/*
+ * Query the adapter
+ */
+static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_rnic_query_req wr;
+	struct c2wr_rnic_query_rep *reply;
+	int err;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	c2_wr_set_id(&wr, CCWR_RNIC_QUERY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) &wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	reply =
+	    (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply)
+		err = -ENOMEM;
+	else
+		err = c2_errno(reply);
+	if (err)
+		goto bail2;
+
+	props->fw_ver =
+		((u64)be32_to_cpu(reply->fw_ver_major) << 32) |
+		((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) |
+		(be32_to_cpu(reply->fw_ver_patch) & 0xFFFF);
+	memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6);
+	props->max_mr_size         = 0xFFFFFFFF;
+	props->page_size_cap       = ~(C2_MIN_PAGESIZE-1);
+	props->vendor_id           = be32_to_cpu(reply->vendor_id);
+	props->vendor_part_id      = be32_to_cpu(reply->part_number);
+	props->hw_ver              = be32_to_cpu(reply->hw_version);
+	props->max_qp              = be32_to_cpu(reply->max_qps);
+	props->max_qp_wr           = be32_to_cpu(reply->max_qp_depth);
+	props->device_cap_flags    = c2dev->device_cap_flags;
+	props->max_sge             = C2_MAX_SGES;
+	props->max_sge_rd          = C2_MAX_SGE_RD;
+	props->max_cq              = be32_to_cpu(reply->max_cqs);
+	props->max_cqe             = be32_to_cpu(reply->max_cq_depth);
+	props->max_mr              = be32_to_cpu(reply->max_mrs);
+	props->max_pd              = be32_to_cpu(reply->max_pds);
+	props->max_qp_rd_atom      = be32_to_cpu(reply->max_qp_ird);
+	props->max_ee_rd_atom      = 0;
+	props->max_res_rd_atom     = be32_to_cpu(reply->max_global_ird);
+	props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord);
+	props->max_ee_init_rd_atom = 0;
+	props->atomic_cap          = IB_ATOMIC_NONE;
+	props->max_ee              = 0;
+	props->max_rdd             = 0;
+	props->max_mw              = be32_to_cpu(reply->max_mws);
+	props->max_raw_ipv6_qp     = 0;
+	props->max_raw_ethy_qp     = 0;
+	props->max_mcast_grp       = 0;
+	props->max_mcast_qp_attach = 0;
+	props->max_total_mcast_qp_attach = 0;
+	props->max_ah              = 0;
+	props->max_fmr             = 0;
+	props->max_map_per_fmr     = 0;
+	props->max_srq             = 0;
+	props->max_srq_wr          = 0;
+	props->max_srq_sge         = 0;
+	props->max_pkeys           = 0;
+	props->local_ca_ack_delay  = 0;
+
+ bail2:
+	vq_repbuf_free(c2dev, reply);
+
+ bail1:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Add an IP address to the RNIC interface
+ */
+int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_rnic_setconfig_req *wr;
+	struct c2wr_rnic_setconfig_rep *reply;
+	struct c2_netaddr netaddr;
+	int err, len;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	len = sizeof(struct c2_netaddr);
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+	wr->hdr.context = (unsigned long) vq_req;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
+
+	netaddr.ip_addr = inaddr;
+	netaddr.netmask = inmask;
+	netaddr.mtu = 0;
+
+	memcpy(wr->data, &netaddr, len);
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	reply =
+	    (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	err = c2_errno(reply);
+	vq_repbuf_free(c2dev, reply);
+
+      bail1:
+	kfree(wr);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Delete an IP address from the RNIC interface
+ */
+int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_rnic_setconfig_req *wr;
+	struct c2wr_rnic_setconfig_rep *reply;
+	struct c2_netaddr netaddr;
+	int err, len;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	len = sizeof(struct c2_netaddr);
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+	wr->hdr.context = (unsigned long) vq_req;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->option = cpu_to_be32(C2_CFG_DEL_ADDR);
+
+	netaddr.ip_addr = inaddr;
+	netaddr.netmask = inmask;
+	netaddr.mtu = 0;
+
+	memcpy(wr->data, &netaddr, len);
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	reply =
+	    (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	err = c2_errno(reply);
+	vq_repbuf_free(c2dev, reply);
+
+      bail1:
+	kfree(wr);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Open a single RNIC instance to use with all
+ * low level openib calls
+ */
+static int c2_rnic_open(struct c2_dev *c2dev)
+{
+	struct c2_vq_req *vq_req;
+	union c2wr wr;
+	struct c2wr_rnic_open_rep *reply;
+	int err;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (vq_req == NULL) {
+		return -ENOMEM;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_RNIC_OPEN);
+	wr.rnic_open.req.hdr.context = (unsigned long) (vq_req);
+	wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE);
+	wr.rnic_open.req.port_num = cpu_to_be16(0);
+	wr.rnic_open.req.user_context = (unsigned long) c2dev;
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, &wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	if ((err = c2_errno(reply)) != 0) {
+		goto bail1;
+	}
+
+	c2dev->adapter_handle = reply->rnic_handle;
+
+      bail1:
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Close the RNIC instance
+ */
+static int c2_rnic_close(struct c2_dev *c2dev)
+{
+	struct c2_vq_req *vq_req;
+	union c2wr wr;
+	struct c2wr_rnic_close_rep *reply;
+	int err;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (vq_req == NULL) {
+		return -ENOMEM;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
+	wr.rnic_close.req.hdr.context = (unsigned long) vq_req;
+	wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, &wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	if ((err = c2_errno(reply)) != 0) {
+		goto bail1;
+	}
+
+	c2dev->adapter_handle = 0;
+
+      bail1:
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Called by c2_probe to initialize the RNIC. This principally
+ * involves initializing the various limits and resource pools that
+ * comprise the RNIC instance.
+ */
+int c2_rnic_init(struct c2_dev *c2dev)
+{
+	int err;
+	u32 qsize, msgsize;
+	void *q1_pages;
+	void *q2_pages;
+	void __iomem *mmio_regs;
+
+	/* Device capabilities */
+	c2dev->device_cap_flags =
+	    (IB_DEVICE_RESIZE_MAX_WR |
+	     IB_DEVICE_CURR_QP_STATE_MOD |
+	     IB_DEVICE_SYS_IMAGE_GUID |
+	     IB_DEVICE_LOCAL_DMA_LKEY |
+	     IB_DEVICE_MEM_WINDOW);
+
+	/* Allocate the qptr_array */
+	c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *));
+	if (!c2dev->qptr_array) {
+		return -ENOMEM;
+	}
+
+	/* Initialize the qptr_array */
+	c2dev->qptr_array[0] = (void *) &c2dev->req_vq;
+	c2dev->qptr_array[1] = (void *) &c2dev->rep_vq;
+	c2dev->qptr_array[2] = (void *) &c2dev->aeq;
+
+	/* Initialize data structures */
+	init_waitqueue_head(&c2dev->req_vq_wo);
+	spin_lock_init(&c2dev->vqlock);
+	spin_lock_init(&c2dev->lock);
+
+	/* Allocate MQ shared pointer pool for kernel clients. User
+	 * mode client pools are hung off the user context
+	 */
+	err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool);
+	if (err) {
+		goto bail0;
+	}
+
+	/* Allocate shared pointers for Q0, Q1, and Q2 from
+	 * the shared pointer pool.
+	 */
+
+	c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					     &c2dev->hint_count_dma,
+					     GFP_KERNEL);
+	c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					     &c2dev->req_vq.shared_dma,
+					     GFP_KERNEL);
+	c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					     &c2dev->rep_vq.shared_dma,
+					     GFP_KERNEL);
+	c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					  &c2dev->aeq.shared_dma, GFP_KERNEL);
+	if (!c2dev->hint_count || !c2dev->req_vq.shared ||
+	    !c2dev->rep_vq.shared || !c2dev->aeq.shared) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	mmio_regs = c2dev->kva;
+	/* Initialize the Verbs Request Queue */
+	c2_mq_req_init(&c2dev->req_vq, 0,
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)),
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)),
+		       mmio_regs +
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)),
+		       mmio_regs +
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)),
+		       C2_MQ_ADAPTER_TARGET);
+
+	/* Initialize the Verbs Reply Queue */
+	qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE));
+	msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE));
+	q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+				      &c2dev->rep_vq.host_dma, GFP_KERNEL);
+	if (!q1_pages) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+	dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
+	pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages,
+		 (unsigned long long) c2dev->rep_vq.host_dma);
+	c2_mq_rep_init(&c2dev->rep_vq,
+		   1,
+		   qsize,
+		   msgsize,
+		   q1_pages,
+		   mmio_regs +
+		   be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)),
+		   C2_MQ_HOST_TARGET);
+
+	/* Initialize the Asynchronus Event Queue */
+	qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE));
+	msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE));
+	q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+				      &c2dev->aeq.host_dma, GFP_KERNEL);
+	if (!q2_pages) {
+		err = -ENOMEM;
+		goto bail2;
+	}
+	dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
+	pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages,
+		 (unsigned long long) c2dev->aeq.host_dma);
+	c2_mq_rep_init(&c2dev->aeq,
+		       2,
+		       qsize,
+		       msgsize,
+		       q2_pages,
+		       mmio_regs +
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)),
+		       C2_MQ_HOST_TARGET);
+
+	/* Initialize the verbs request allocator */
+	err = vq_init(c2dev);
+	if (err)
+		goto bail3;
+
+	/* Enable interrupts on the adapter */
+	writel(0, c2dev->regs + C2_IDIS);
+
+	/* create the WR init message */
+	err = c2_adapter_init(c2dev);
+	if (err)
+		goto bail4;
+	c2dev->init++;
+
+	/* open an adapter instance */
+	err = c2_rnic_open(c2dev);
+	if (err)
+		goto bail4;
+
+	/* Initialize cached the adapter limits */
+	err = c2_rnic_query(c2dev, &c2dev->props);
+	if (err)
+		goto bail5;
+
+	/* Initialize the PD pool */
+	err = c2_init_pd_table(c2dev);
+	if (err)
+		goto bail5;
+
+	/* Initialize the QP pool */
+	c2_init_qp_table(c2dev);
+	return 0;
+
+      bail5:
+	c2_rnic_close(c2dev);
+      bail4:
+	vq_term(c2dev);
+      bail3:
+	dma_free_coherent(&c2dev->pcidev->dev,
+			  c2dev->aeq.q_size * c2dev->aeq.msg_size,
+			  q2_pages, dma_unmap_addr(&c2dev->aeq, mapping));
+      bail2:
+	dma_free_coherent(&c2dev->pcidev->dev,
+			  c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+			  q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping));
+      bail1:
+	c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+      bail0:
+	vfree(c2dev->qptr_array);
+
+	return err;
+}
+
+/*
+ * Called by c2_remove to cleanup the RNIC resources.
+ */
+void c2_rnic_term(struct c2_dev *c2dev)
+{
+
+	/* Close the open adapter instance */
+	c2_rnic_close(c2dev);
+
+	/* Send the TERM message to the adapter */
+	c2_adapter_term(c2dev);
+
+	/* Disable interrupts on the adapter */
+	writel(1, c2dev->regs + C2_IDIS);
+
+	/* Free the QP pool */
+	c2_cleanup_qp_table(c2dev);
+
+	/* Free the PD pool */
+	c2_cleanup_pd_table(c2dev);
+
+	/* Free the verbs request allocator */
+	vq_term(c2dev);
+
+	/* Free the asynchronus event queue */
+	dma_free_coherent(&c2dev->pcidev->dev,
+			  c2dev->aeq.q_size * c2dev->aeq.msg_size,
+			  c2dev->aeq.msg_pool.host,
+			  dma_unmap_addr(&c2dev->aeq, mapping));
+
+	/* Free the verbs reply queue */
+	dma_free_coherent(&c2dev->pcidev->dev,
+			  c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+			  c2dev->rep_vq.msg_pool.host,
+			  dma_unmap_addr(&c2dev->rep_vq, mapping));
+
+	/* Free the MQ shared pointer pool */
+	c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+
+	/* Free the qptr_array */
+	vfree(c2dev->qptr_array);
+
+	return;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_status.h b/drivers/staging/rdma/amso1100/c2_status.h
new file mode 100644
index 0000000..6ee4aa9
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_status.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef	_C2_STATUS_H_
+#define _C2_STATUS_H_
+
+/*
+ * Verbs Status Codes
+ */
+enum c2_status {
+	C2_OK = 0,		/* This must be zero */
+	CCERR_INSUFFICIENT_RESOURCES = 1,
+	CCERR_INVALID_MODIFIER = 2,
+	CCERR_INVALID_MODE = 3,
+	CCERR_IN_USE = 4,
+	CCERR_INVALID_RNIC = 5,
+	CCERR_INTERRUPTED_OPERATION = 6,
+	CCERR_INVALID_EH = 7,
+	CCERR_INVALID_CQ = 8,
+	CCERR_CQ_EMPTY = 9,
+	CCERR_NOT_IMPLEMENTED = 10,
+	CCERR_CQ_DEPTH_TOO_SMALL = 11,
+	CCERR_PD_IN_USE = 12,
+	CCERR_INVALID_PD = 13,
+	CCERR_INVALID_SRQ = 14,
+	CCERR_INVALID_ADDRESS = 15,
+	CCERR_INVALID_NETMASK = 16,
+	CCERR_INVALID_QP = 17,
+	CCERR_INVALID_QP_STATE = 18,
+	CCERR_TOO_MANY_WRS_POSTED = 19,
+	CCERR_INVALID_WR_TYPE = 20,
+	CCERR_INVALID_SGL_LENGTH = 21,
+	CCERR_INVALID_SQ_DEPTH = 22,
+	CCERR_INVALID_RQ_DEPTH = 23,
+	CCERR_INVALID_ORD = 24,
+	CCERR_INVALID_IRD = 25,
+	CCERR_QP_ATTR_CANNOT_CHANGE = 26,
+	CCERR_INVALID_STAG = 27,
+	CCERR_QP_IN_USE = 28,
+	CCERR_OUTSTANDING_WRS = 29,
+	CCERR_STAG_IN_USE = 30,
+	CCERR_INVALID_STAG_INDEX = 31,
+	CCERR_INVALID_SGL_FORMAT = 32,
+	CCERR_ADAPTER_TIMEOUT = 33,
+	CCERR_INVALID_CQ_DEPTH = 34,
+	CCERR_INVALID_PRIVATE_DATA_LENGTH = 35,
+	CCERR_INVALID_EP = 36,
+	CCERR_MR_IN_USE = CCERR_STAG_IN_USE,
+	CCERR_FLUSHED = 38,
+	CCERR_INVALID_WQE = 39,
+	CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40,
+	CCERR_REMOTE_TERMINATION_ERROR = 41,
+	CCERR_BASE_AND_BOUNDS_VIOLATION = 42,
+	CCERR_ACCESS_VIOLATION = 43,
+	CCERR_INVALID_PD_ID = 44,
+	CCERR_WRAP_ERROR = 45,
+	CCERR_INV_STAG_ACCESS_ERROR = 46,
+	CCERR_ZERO_RDMA_READ_RESOURCES = 47,
+	CCERR_QP_NOT_PRIVILEGED = 48,
+	CCERR_STAG_STATE_NOT_INVALID = 49,
+	CCERR_INVALID_PAGE_SIZE = 50,
+	CCERR_INVALID_BUFFER_SIZE = 51,
+	CCERR_INVALID_PBE = 52,
+	CCERR_INVALID_FBO = 53,
+	CCERR_INVALID_LENGTH = 54,
+	CCERR_INVALID_ACCESS_RIGHTS = 55,
+	CCERR_PBL_TOO_BIG = 56,
+	CCERR_INVALID_VA = 57,
+	CCERR_INVALID_REGION = 58,
+	CCERR_INVALID_WINDOW = 59,
+	CCERR_TOTAL_LENGTH_TOO_BIG = 60,
+	CCERR_INVALID_QP_ID = 61,
+	CCERR_ADDR_IN_USE = 62,
+	CCERR_ADDR_NOT_AVAIL = 63,
+	CCERR_NET_DOWN = 64,
+	CCERR_NET_UNREACHABLE = 65,
+	CCERR_CONN_ABORTED = 66,
+	CCERR_CONN_RESET = 67,
+	CCERR_NO_BUFS = 68,
+	CCERR_CONN_TIMEDOUT = 69,
+	CCERR_CONN_REFUSED = 70,
+	CCERR_HOST_UNREACHABLE = 71,
+	CCERR_INVALID_SEND_SGL_DEPTH = 72,
+	CCERR_INVALID_RECV_SGL_DEPTH = 73,
+	CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74,
+	CCERR_INSUFFICIENT_PRIVILEGES = 75,
+	CCERR_STACK_ERROR = 76,
+	CCERR_INVALID_VERSION = 77,
+	CCERR_INVALID_MTU = 78,
+	CCERR_INVALID_IMAGE = 79,
+	CCERR_PENDING = 98,	/* not an error; user internally by adapter */
+	CCERR_DEFER = 99,	/* not an error; used internally by adapter */
+	CCERR_FAILED_WRITE = 100,
+	CCERR_FAILED_ERASE = 101,
+	CCERR_FAILED_VERIFICATION = 102,
+	CCERR_NOT_FOUND = 103,
+
+};
+
+/*
+ * CCAE_ACTIVE_CONNECT_RESULTS status result codes.
+ */
+enum c2_connect_status {
+	C2_CONN_STATUS_SUCCESS = C2_OK,
+	C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES,
+	C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT,
+	C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED,
+	C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE,
+	C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE,
+	C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC,
+	C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP,
+	C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE,
+	C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET,
+	C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL,
+};
+
+/*
+ * Flash programming status codes.
+ */
+enum c2_flash_status {
+	C2_FLASH_STATUS_SUCCESS = 0x0000,
+	C2_FLASH_STATUS_VERIFY_ERR = 0x0002,
+	C2_FLASH_STATUS_IMAGE_ERR = 0x0004,
+	C2_FLASH_STATUS_ECLBS = 0x0400,
+	C2_FLASH_STATUS_PSLBS = 0x0800,
+	C2_FLASH_STATUS_VPENS = 0x1000,
+};
+
+#endif				/* _C2_STATUS_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_user.h b/drivers/staging/rdma/amso1100/c2_user.h
new file mode 100644
index 0000000..7e9e7ad
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_user.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_USER_H
+#define C2_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct c2_alloc_ucontext_resp {
+	__u32 qp_tab_size;
+	__u32 uarc_size;
+};
+
+struct c2_alloc_pd_resp {
+	__u32 pdn;
+	__u32 reserved;
+};
+
+struct c2_create_cq {
+	__u32 lkey;
+	__u32 pdn;
+	__u64 arm_db_page;
+	__u64 set_db_page;
+	__u32 arm_db_index;
+	__u32 set_db_index;
+};
+
+struct c2_create_cq_resp {
+	__u32 cqn;
+	__u32 reserved;
+};
+
+struct c2_create_qp {
+	__u32 lkey;
+	__u32 reserved;
+	__u64 sq_db_page;
+	__u64 rq_db_page;
+	__u32 sq_db_index;
+	__u32 rq_db_index;
+};
+
+#endif				/* C2_USER_H */
diff --git a/drivers/staging/rdma/amso1100/c2_vq.c b/drivers/staging/rdma/amso1100/c2_vq.c
new file mode 100644
index 0000000..2ec716f
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_vq.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "c2_vq.h"
+#include "c2_provider.h"
+
+/*
+ * Verbs Request Objects:
+ *
+ * VQ Request Objects are allocated by the kernel verbs handlers.
+ * They contain a wait object, a refcnt, an atomic bool indicating that the
+ * adapter has replied, and a copy of the verb reply work request.
+ * A pointer to the VQ Request Object is passed down in the context
+ * field of the work request message, and reflected back by the adapter
+ * in the verbs reply message.  The function handle_vq() in the interrupt
+ * path will use this pointer to:
+ * 	1) append a copy of the verbs reply message
+ * 	2) mark that the reply is ready
+ * 	3) wake up the kernel verbs handler blocked awaiting the reply.
+ *
+ *
+ * The kernel verbs handlers do a "get" to put a 2nd reference on the
+ * VQ Request object.  If the kernel verbs handler exits before the adapter
+ * can respond, this extra reference will keep the VQ Request object around
+ * until the adapter's reply can be processed.  The reason we need this is
+ * because a pointer to this object is stuffed into the context field of
+ * the verbs work request message, and reflected back in the reply message.
+ * It is used in the interrupt handler (handle_vq()) to wake up the appropriate
+ * kernel verb handler that is blocked awaiting the verb reply.
+ * So handle_vq() will do a "put" on the object when it's done accessing it.
+ * NOTE:  If we guarantee that the kernel verb handler will never bail before
+ *        getting the reply, then we don't need these refcnts.
+ *
+ *
+ * VQ Request objects are freed by the kernel verbs handlers only
+ * after the verb has been processed, or when the adapter fails and
+ * does not reply.
+ *
+ *
+ * Verbs Reply Buffers:
+ *
+ * VQ Reply bufs are local host memory copies of a
+ * outstanding Verb Request reply
+ * message.  The are always allocated by the kernel verbs handlers, and _may_ be
+ * freed by either the kernel verbs handler -or- the interrupt handler.  The
+ * kernel verbs handler _must_ free the repbuf, then free the vq request object
+ * in that order.
+ */
+
+int vq_init(struct c2_dev *c2dev)
+{
+	sprintf(c2dev->vq_cache_name, "c2-vq:dev%c",
+		(char) ('0' + c2dev->devnum));
+	c2dev->host_msg_cache =
+	    kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
+			      SLAB_HWCACHE_ALIGN, NULL);
+	if (c2dev->host_msg_cache == NULL) {
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void vq_term(struct c2_dev *c2dev)
+{
+	kmem_cache_destroy(c2dev->host_msg_cache);
+}
+
+/* vq_req_alloc - allocate a VQ Request Object and initialize it.
+ * The refcnt is set to 1.
+ */
+struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev)
+{
+	struct c2_vq_req *r;
+
+	r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL);
+	if (r) {
+		init_waitqueue_head(&r->wait_object);
+		r->reply_msg = 0;
+		r->event = 0;
+		r->cm_id = NULL;
+		r->qp = NULL;
+		atomic_set(&r->refcnt, 1);
+		atomic_set(&r->reply_ready, 0);
+	}
+	return r;
+}
+
+
+/* vq_req_free - free the VQ Request Object.  It is assumed the verbs handler
+ * has already free the VQ Reply Buffer if it existed.
+ */
+void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+	r->reply_msg = 0;
+	if (atomic_dec_and_test(&r->refcnt)) {
+		kfree(r);
+	}
+}
+
+/* vq_req_get - reference a VQ Request Object.  Done
+ * only in the kernel verbs handlers.
+ */
+void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+	atomic_inc(&r->refcnt);
+}
+
+
+/* vq_req_put - dereference and potentially free a VQ Request Object.
+ *
+ * This is only called by handle_vq() on the
+ * interrupt when it is done processing
+ * a verb reply message.  If the associated
+ * kernel verbs handler has already bailed,
+ * then this put will actually free the VQ
+ * Request object _and_ the VQ Reply Buffer
+ * if it exists.
+ */
+void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+	if (atomic_dec_and_test(&r->refcnt)) {
+		if (r->reply_msg != 0)
+			vq_repbuf_free(c2dev,
+				       (void *) (unsigned long) r->reply_msg);
+		kfree(r);
+	}
+}
+
+
+/*
+ * vq_repbuf_alloc - allocate a VQ Reply Buffer.
+ */
+void *vq_repbuf_alloc(struct c2_dev *c2dev)
+{
+	return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
+}
+
+/*
+ * vq_send_wr - post a verbs request message to the Verbs Request Queue.
+ * If a message is not available in the MQ, then block until one is available.
+ * NOTE: handle_mq() on the interrupt context will wake up threads blocked here.
+ * When the adapter drains the Verbs Request Queue,
+ * it inserts MQ index 0 in to the
+ * adapter->host activity fifo and interrupts the host.
+ */
+int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr)
+{
+	void *msg;
+	wait_queue_t __wait;
+
+	/*
+	 * grab adapter vq lock
+	 */
+	spin_lock(&c2dev->vqlock);
+
+	/*
+	 * allocate msg
+	 */
+	msg = c2_mq_alloc(&c2dev->req_vq);
+
+	/*
+	 * If we cannot get a msg, then we'll wait
+	 * When a messages are available, the int handler will wake_up()
+	 * any waiters.
+	 */
+	while (msg == NULL) {
+		pr_debug("%s:%d no available msg in VQ, waiting...\n",
+		       __func__, __LINE__);
+		init_waitqueue_entry(&__wait, current);
+		add_wait_queue(&c2dev->req_vq_wo, &__wait);
+		spin_unlock(&c2dev->vqlock);
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!c2_mq_full(&c2dev->req_vq)) {
+				break;
+			}
+			if (!signal_pending(current)) {
+				schedule_timeout(1 * HZ);	/* 1 second... */
+				continue;
+			}
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+			return -EINTR;
+		}
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+		spin_lock(&c2dev->vqlock);
+		msg = c2_mq_alloc(&c2dev->req_vq);
+	}
+
+	/*
+	 * copy wr into adapter msg
+	 */
+	memcpy(msg, wr, c2dev->req_vq.msg_size);
+
+	/*
+	 * post msg
+	 */
+	c2_mq_produce(&c2dev->req_vq);
+
+	/*
+	 * release adapter vq lock
+	 */
+	spin_unlock(&c2dev->vqlock);
+	return 0;
+}
+
+
+/*
+ * vq_wait_for_reply - block until the adapter posts a Verb Reply Message.
+ */
+int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req)
+{
+	if (!wait_event_timeout(req->wait_object,
+				atomic_read(&req->reply_ready),
+				60*HZ))
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+/*
+ * vq_repbuf_free - Free a Verbs Reply Buffer.
+ */
+void vq_repbuf_free(struct c2_dev *c2dev, void *reply)
+{
+	kmem_cache_free(c2dev->host_msg_cache, reply);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_vq.h b/drivers/staging/rdma/amso1100/c2_vq.h
new file mode 100644
index 0000000..3380562
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_vq.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_VQ_H_
+#define _C2_VQ_H_
+#include <linux/sched.h>
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_provider.h"
+
+struct c2_vq_req {
+	u64 reply_msg;		/* ptr to reply msg */
+	wait_queue_head_t wait_object;	/* wait object for vq reqs */
+	atomic_t reply_ready;	/* set when reply is ready */
+	atomic_t refcnt;	/* used to cancel WRs... */
+	int event;
+	struct iw_cm_id *cm_id;
+	struct c2_qp *qp;
+};
+
+extern int vq_init(struct c2_dev *c2dev);
+extern void vq_term(struct c2_dev *c2dev);
+
+extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev);
+extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr);
+
+extern void *vq_repbuf_alloc(struct c2_dev *c2dev);
+extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply);
+
+extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req);
+#endif				/* _C2_VQ_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_wr.h b/drivers/staging/rdma/amso1100/c2_wr.h
new file mode 100644
index 0000000..8d4b4ca
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_wr.h
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_WR_H_
+#define _C2_WR_H_
+
+#ifdef CCDEBUG
+#define CCWR_MAGIC		0xb07700b0
+#endif
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+/* Maximum allowed size in bytes of private_data exchange
+ * on connect.
+ */
+#define C2_MAX_PRIVATE_DATA_SIZE 200
+
+/*
+ * These types are shared among the adapter, host, and CCIL consumer.
+ */
+enum c2_cq_notification_type {
+	C2_CQ_NOTIFICATION_TYPE_NONE = 1,
+	C2_CQ_NOTIFICATION_TYPE_NEXT,
+	C2_CQ_NOTIFICATION_TYPE_NEXT_SE
+};
+
+enum c2_setconfig_cmd {
+	C2_CFG_ADD_ADDR = 1,
+	C2_CFG_DEL_ADDR = 2,
+	C2_CFG_ADD_ROUTE = 3,
+	C2_CFG_DEL_ROUTE = 4
+};
+
+enum c2_getconfig_cmd {
+	C2_GETCONFIG_ROUTES = 1,
+	C2_GETCONFIG_ADDRS
+};
+
+/*
+ *  CCIL Work Request Identifiers
+ */
+enum c2wr_ids {
+	CCWR_RNIC_OPEN = 1,
+	CCWR_RNIC_QUERY,
+	CCWR_RNIC_SETCONFIG,
+	CCWR_RNIC_GETCONFIG,
+	CCWR_RNIC_CLOSE,
+	CCWR_CQ_CREATE,
+	CCWR_CQ_QUERY,
+	CCWR_CQ_MODIFY,
+	CCWR_CQ_DESTROY,
+	CCWR_QP_CONNECT,
+	CCWR_PD_ALLOC,
+	CCWR_PD_DEALLOC,
+	CCWR_SRQ_CREATE,
+	CCWR_SRQ_QUERY,
+	CCWR_SRQ_MODIFY,
+	CCWR_SRQ_DESTROY,
+	CCWR_QP_CREATE,
+	CCWR_QP_QUERY,
+	CCWR_QP_MODIFY,
+	CCWR_QP_DESTROY,
+	CCWR_NSMR_STAG_ALLOC,
+	CCWR_NSMR_REGISTER,
+	CCWR_NSMR_PBL,
+	CCWR_STAG_DEALLOC,
+	CCWR_NSMR_REREGISTER,
+	CCWR_SMR_REGISTER,
+	CCWR_MR_QUERY,
+	CCWR_MW_ALLOC,
+	CCWR_MW_QUERY,
+	CCWR_EP_CREATE,
+	CCWR_EP_GETOPT,
+	CCWR_EP_SETOPT,
+	CCWR_EP_DESTROY,
+	CCWR_EP_BIND,
+	CCWR_EP_CONNECT,
+	CCWR_EP_LISTEN,
+	CCWR_EP_SHUTDOWN,
+	CCWR_EP_LISTEN_CREATE,
+	CCWR_EP_LISTEN_DESTROY,
+	CCWR_EP_QUERY,
+	CCWR_CR_ACCEPT,
+	CCWR_CR_REJECT,
+	CCWR_CONSOLE,
+	CCWR_TERM,
+	CCWR_FLASH_INIT,
+	CCWR_FLASH,
+	CCWR_BUF_ALLOC,
+	CCWR_BUF_FREE,
+	CCWR_FLASH_WRITE,
+	CCWR_INIT,		/* WARNING: Don't move this ever again! */
+
+
+
+	/* Add new IDs here */
+
+
+
+	/*
+	 * WARNING: CCWR_LAST must always be the last verbs id defined!
+	 *          All the preceding IDs are fixed, and must not change.
+	 *          You can add new IDs, but must not remove or reorder
+	 *          any IDs. If you do, YOU will ruin any hope of
+	 *          compatibility between versions.
+	 */
+	CCWR_LAST,
+
+	/*
+	 * Start over at 1 so that arrays indexed by user wr id's
+	 * begin at 1.  This is OK since the verbs and user wr id's
+	 * are always used on disjoint sets of queues.
+	 */
+	/*
+	 * The order of the CCWR_SEND_XX verbs must
+	 * match the order of the RDMA_OPs
+	 */
+	CCWR_SEND = 1,
+	CCWR_SEND_INV,
+	CCWR_SEND_SE,
+	CCWR_SEND_SE_INV,
+	CCWR_RDMA_WRITE,
+	CCWR_RDMA_READ,
+	CCWR_RDMA_READ_INV,
+	CCWR_MW_BIND,
+	CCWR_NSMR_FASTREG,
+	CCWR_STAG_INVALIDATE,
+	CCWR_RECV,
+	CCWR_NOP,
+	CCWR_UNIMPL,
+/* WARNING: This must always be the last user wr id defined! */
+};
+#define RDMA_SEND_OPCODE_FROM_WR_ID(x)   (x+2)
+
+/*
+ * SQ/RQ Work Request Types
+ */
+enum c2_wr_type {
+	C2_WR_TYPE_SEND = CCWR_SEND,
+	C2_WR_TYPE_SEND_SE = CCWR_SEND_SE,
+	C2_WR_TYPE_SEND_INV = CCWR_SEND_INV,
+	C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV,
+	C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE,
+	C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ,
+	C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV,
+	C2_WR_TYPE_BIND_MW = CCWR_MW_BIND,
+	C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG,
+	C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE,
+	C2_WR_TYPE_RECV = CCWR_RECV,
+	C2_WR_TYPE_NOP = CCWR_NOP,
+};
+
+struct c2_netaddr {
+	__be32 ip_addr;
+	__be32 netmask;
+	u32 mtu;
+};
+
+struct c2_route {
+	u32 ip_addr;		/* 0 indicates the default route */
+	u32 netmask;		/* netmask associated with dst */
+	u32 flags;
+	union {
+		u32 ipaddr;	/* address of the nexthop interface */
+		u8 enaddr[6];
+	} nexthop;
+};
+
+/*
+ * A Scatter Gather Entry.
+ */
+struct c2_data_addr {
+	__be32 stag;
+	__be32 length;
+	__be64 to;
+};
+
+/*
+ * MR and MW flags used by the consumer, RI, and RNIC.
+ */
+enum c2_mm_flags {
+	MEM_REMOTE = 0x0001,	/* allow mw binds with remote access. */
+	MEM_VA_BASED = 0x0002,	/* Not Zero-based */
+	MEM_PBL_COMPLETE = 0x0004,	/* PBL array is complete in this msg */
+	MEM_LOCAL_READ = 0x0008,	/* allow local reads */
+	MEM_LOCAL_WRITE = 0x0010,	/* allow local writes */
+	MEM_REMOTE_READ = 0x0020,	/* allow remote reads */
+	MEM_REMOTE_WRITE = 0x0040,	/* allow remote writes */
+	MEM_WINDOW_BIND = 0x0080,	/* binds allowed */
+	MEM_SHARED = 0x0100,	/* set if MR is shared */
+	MEM_STAG_VALID = 0x0200	/* set if STAG is in valid state */
+};
+
+/*
+ * CCIL API ACF flags defined in terms of the low level mem flags.
+ * This minimizes translation needed in the user API
+ */
+enum c2_acf {
+	C2_ACF_LOCAL_READ = MEM_LOCAL_READ,
+	C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE,
+	C2_ACF_REMOTE_READ = MEM_REMOTE_READ,
+	C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE,
+	C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND
+};
+
+/*
+ * Image types of objects written to flash
+ */
+#define C2_FLASH_IMG_BITFILE 1
+#define C2_FLASH_IMG_OPTION_ROM 2
+#define C2_FLASH_IMG_VPD 3
+
+/*
+ *  to fix bug 1815 we define the max size allowable of the
+ *  terminate message (per the IETF spec).Refer to the IETF
+ *  protocol specification, section 12.1.6, page 64)
+ *  The message is prefixed by 20 types of DDP info.
+ *
+ *  Then the message has 6 bytes for the terminate control
+ *  and DDP segment length info plus a DDP header (either
+ *  14 or 18 byts) plus 28 bytes for the RDMA header.
+ *  Thus the max size in:
+ *  20 + (6 + 18 + 28) = 72
+ */
+#define C2_MAX_TERMINATE_MESSAGE_SIZE (72)
+
+/*
+ * Build String Length.  It must be the same as C2_BUILD_STR_LEN in ccil_api.h
+ */
+#define WR_BUILD_STR_LEN 64
+
+/*
+ * WARNING:  All of these structs need to align any 64bit types on
+ * 64 bit boundaries!  64bit types include u64 and u64.
+ */
+
+/*
+ * Clustercore Work Request Header.  Be sensitive to field layout
+ * and alignment.
+ */
+struct c2wr_hdr {
+	/* wqe_count is part of the cqe.  It is put here so the
+	 * adapter can write to it while the wr is pending without
+	 * clobbering part of the wr.  This word need not be dma'd
+	 * from the host to adapter by libccil, but we copy it anyway
+	 * to make the memcpy to the adapter better aligned.
+	 */
+	__be32 wqe_count;
+
+	/* Put these fields next so that later 32- and 64-bit
+	 * quantities are naturally aligned.
+	 */
+	u8 id;
+	u8 result;		/* adapter -> host */
+	u8 sge_count;		/* host -> adapter */
+	u8 flags;		/* host -> adapter */
+
+	u64 context;
+#ifdef CCMSGMAGIC
+	u32 magic;
+	u32 pad;
+#endif
+} __attribute__((packed));
+
+/*
+ *------------------------ RNIC ------------------------
+ */
+
+/*
+ * WR_RNIC_OPEN
+ */
+
+/*
+ * Flags for the RNIC WRs
+ */
+enum c2_rnic_flags {
+	RNIC_IRD_STATIC = 0x0001,
+	RNIC_ORD_STATIC = 0x0002,
+	RNIC_QP_STATIC = 0x0004,
+	RNIC_SRQ_SUPPORTED = 0x0008,
+	RNIC_PBL_BLOCK_MODE = 0x0010,
+	RNIC_SRQ_MODEL_ARRIVAL = 0x0020,
+	RNIC_CQ_OVF_DETECTED = 0x0040,
+	RNIC_PRIV_MODE = 0x0080
+};
+
+struct c2wr_rnic_open_req {
+	struct c2wr_hdr hdr;
+	u64 user_context;
+	__be16 flags;		/* See enum c2_rnic_flags */
+	__be16 port_num;
+} __attribute__((packed));
+
+struct c2wr_rnic_open_rep {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+} __attribute__((packed));
+
+union c2wr_rnic_open {
+	struct c2wr_rnic_open_req req;
+	struct c2wr_rnic_open_rep rep;
+} __attribute__((packed));
+
+struct c2wr_rnic_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_QUERY
+ */
+struct c2wr_rnic_query_rep {
+	struct c2wr_hdr hdr;
+	u64 user_context;
+	__be32 vendor_id;
+	__be32 part_number;
+	__be32 hw_version;
+	__be32 fw_ver_major;
+	__be32 fw_ver_minor;
+	__be32 fw_ver_patch;
+	char fw_ver_build_str[WR_BUILD_STR_LEN];
+	__be32 max_qps;
+	__be32 max_qp_depth;
+	u32 max_srq_depth;
+	u32 max_send_sgl_depth;
+	u32 max_rdma_sgl_depth;
+	__be32 max_cqs;
+	__be32 max_cq_depth;
+	u32 max_cq_event_handlers;
+	__be32 max_mrs;
+	u32 max_pbl_depth;
+	__be32 max_pds;
+	__be32 max_global_ird;
+	u32 max_global_ord;
+	__be32 max_qp_ird;
+	__be32 max_qp_ord;
+	u32 flags;
+	__be32 max_mws;
+	u32 pbe_range_low;
+	u32 pbe_range_high;
+	u32 max_srqs;
+	u32 page_size;
+} __attribute__((packed));
+
+union c2wr_rnic_query {
+	struct c2wr_rnic_query_req req;
+	struct c2wr_rnic_query_rep rep;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_GETCONFIG
+ */
+
+struct c2wr_rnic_getconfig_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 option;		/* see c2_getconfig_cmd_t */
+	u64 reply_buf;
+	u32 reply_buf_len;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_getconfig_rep {
+	struct c2wr_hdr hdr;
+	u32 option;		/* see c2_getconfig_cmd_t */
+	u32 count_len;		/* length of the number of addresses configured */
+} __attribute__((packed)) ;
+
+union c2wr_rnic_getconfig {
+	struct c2wr_rnic_getconfig_req req;
+	struct c2wr_rnic_getconfig_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * WR_RNIC_SETCONFIG
+ */
+struct c2wr_rnic_setconfig_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	__be32 option;		/* See c2_setconfig_cmd_t */
+	/* variable data and pad. See c2_netaddr and c2_route */
+	u8 data[0];
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_setconfig_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_setconfig {
+	struct c2wr_rnic_setconfig_req req;
+	struct c2wr_rnic_setconfig_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * WR_RNIC_CLOSE
+ */
+struct c2wr_rnic_close_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_close_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_close {
+	struct c2wr_rnic_close_req req;
+	struct c2wr_rnic_close_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ CQ ------------------------
+ */
+struct c2wr_cq_create_req {
+	struct c2wr_hdr hdr;
+	__be64 shared_ht;
+	u64 user_context;
+	__be64 msg_pool;
+	u32 rnic_handle;
+	__be32 msg_size;
+	__be32 depth;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_create_rep {
+	struct c2wr_hdr hdr;
+	__be32 mq_index;
+	__be32 adapter_shared;
+	u32 cq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_cq_create {
+	struct c2wr_cq_create_req req;
+	struct c2wr_cq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 cq_handle;
+	u32 new_depth;
+	u64 new_msg_pool;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_modify {
+	struct c2wr_cq_modify_req req;
+	struct c2wr_cq_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 cq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_destroy {
+	struct c2wr_cq_destroy_req req;
+	struct c2wr_cq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ PD ------------------------
+ */
+struct c2wr_pd_alloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_alloc_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_alloc {
+	struct c2wr_pd_alloc_req req;
+	struct c2wr_pd_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_dealloc {
+	struct c2wr_pd_dealloc_req req;
+	struct c2wr_pd_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ SRQ ------------------------
+ */
+struct c2wr_srq_create_req {
+	struct c2wr_hdr hdr;
+	u64 shared_ht;
+	u64 user_context;
+	u32 rnic_handle;
+	u32 srq_depth;
+	u32 srq_limit;
+	u32 sgl_depth;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_create_rep {
+	struct c2wr_hdr hdr;
+	u32 srq_depth;
+	u32 sgl_depth;
+	u32 msg_size;
+	u32 mq_index;
+	u32 mq_start;
+	u32 srq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_srq_create {
+	struct c2wr_srq_create_req req;
+	struct c2wr_srq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 srq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_srq_destroy {
+	struct c2wr_srq_destroy_req req;
+	struct c2wr_srq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ QP ------------------------
+ */
+enum c2wr_qp_flags {
+	QP_RDMA_READ = 0x00000001,	/* RDMA read enabled? */
+	QP_RDMA_WRITE = 0x00000002,	/* RDMA write enabled? */
+	QP_MW_BIND = 0x00000004,	/* MWs enabled */
+	QP_ZERO_STAG = 0x00000008,	/* enabled? */
+	QP_REMOTE_TERMINATION = 0x00000010,	/* remote end terminated */
+	QP_RDMA_READ_RESPONSE = 0x00000020	/* Remote RDMA read  */
+	    /* enabled? */
+};
+
+struct c2wr_qp_create_req {
+	struct c2wr_hdr hdr;
+	__be64 shared_sq_ht;
+	__be64 shared_rq_ht;
+	u64 user_context;
+	u32 rnic_handle;
+	u32 sq_cq_handle;
+	u32 rq_cq_handle;
+	__be32 sq_depth;
+	__be32 rq_depth;
+	u32 srq_handle;
+	u32 srq_limit;
+	__be32 flags;		/* see enum c2wr_qp_flags */
+	__be32 send_sgl_depth;
+	__be32 recv_sgl_depth;
+	__be32 rdma_write_sgl_depth;
+	__be32 ord;
+	__be32 ird;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_create_rep {
+	struct c2wr_hdr hdr;
+	__be32 sq_depth;
+	__be32 rq_depth;
+	u32 send_sgl_depth;
+	u32 recv_sgl_depth;
+	u32 rdma_write_sgl_depth;
+	u32 ord;
+	u32 ird;
+	__be32 sq_msg_size;
+	__be32 sq_mq_index;
+	__be32 sq_mq_start;
+	__be32 rq_msg_size;
+	__be32 rq_mq_index;
+	__be32 rq_mq_start;
+	u32 qp_handle;
+} __attribute__((packed)) ;
+
+union c2wr_qp_create {
+	struct c2wr_qp_create_req req;
+	struct c2wr_qp_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_rep {
+	struct c2wr_hdr hdr;
+	u64 user_context;
+	u32 rnic_handle;
+	u32 sq_depth;
+	u32 rq_depth;
+	u32 send_sgl_depth;
+	u32 rdma_write_sgl_depth;
+	u32 recv_sgl_depth;
+	u32 ord;
+	u32 ird;
+	u16 qp_state;
+	u16 flags;		/* see c2wr_qp_flags_t */
+	u32 qp_id;
+	u32 local_addr;
+	u32 remote_addr;
+	u16 local_port;
+	u16 remote_port;
+	u32 terminate_msg_length;	/* 0 if not present */
+	u8 data[0];
+	/* Terminate Message in-line here. */
+} __attribute__((packed)) ;
+
+union c2wr_qp_query {
+	struct c2wr_qp_query_req req;
+	struct c2wr_qp_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_req {
+	struct c2wr_hdr hdr;
+	u64 stream_msg;
+	u32 stream_msg_length;
+	u32 rnic_handle;
+	u32 qp_handle;
+	__be32 next_qp_state;
+	__be32 ord;
+	__be32 ird;
+	__be32 sq_depth;
+	__be32 rq_depth;
+	u32 llp_ep_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_rep {
+	struct c2wr_hdr hdr;
+	u32 ord;
+	u32 ird;
+	u32 sq_depth;
+	u32 rq_depth;
+	u32 sq_msg_size;
+	u32 sq_mq_index;
+	u32 sq_mq_start;
+	u32 rq_msg_size;
+	u32 rq_mq_index;
+	u32 rq_mq_start;
+} __attribute__((packed)) ;
+
+union c2wr_qp_modify {
+	struct c2wr_qp_modify_req req;
+	struct c2wr_qp_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_qp_destroy {
+	struct c2wr_qp_destroy_req req;
+	struct c2wr_qp_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * The CCWR_QP_CONNECT msg is posted on the verbs request queue.  It can
+ * only be posted when a QP is in IDLE state.  After the connect request is
+ * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state.
+ * No synchronous reply from adapter to this WR.  The results of
+ * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS
+ * See c2wr_ae_active_connect_results_t
+ */
+struct c2wr_qp_connect_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 qp_handle;
+	__be32 remote_addr;
+	__be16 remote_port;
+	u16 pad;
+	__be32 private_data_length;
+	u8 private_data[0];	/* Private data in-line. */
+} __attribute__((packed)) ;
+
+struct c2wr_qp_connect {
+	struct c2wr_qp_connect_req req;
+	/* no synchronous reply.         */
+} __attribute__((packed)) ;
+
+
+/*
+ *------------------------ MM ------------------------
+ */
+
+struct c2wr_nsmr_stag_alloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 pbl_depth;
+	u32 pd_id;
+	u32 flags;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_stag_alloc_rep {
+	struct c2wr_hdr hdr;
+	u32 pbl_depth;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_stag_alloc {
+	struct c2wr_nsmr_stag_alloc_req req;
+	struct c2wr_nsmr_stag_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_req {
+	struct c2wr_hdr hdr;
+	__be64 va;
+	u32 rnic_handle;
+	__be16 flags;
+	u8 stag_key;
+	u8 pad;
+	u32 pd_id;
+	__be32 pbl_depth;
+	__be32 pbe_size;
+	__be32 fbo;
+	__be32 length;
+	__be32 addrs_length;
+	/* array of paddrs (must be aligned on a 64bit boundary) */
+	__be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_rep {
+	struct c2wr_hdr hdr;
+	u32 pbl_depth;
+	__be32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_register {
+	struct c2wr_nsmr_register_req req;
+	struct c2wr_nsmr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	__be32 flags;
+	__be32 stag_index;
+	__be32 addrs_length;
+	/* array of paddrs (must be aligned on a 64bit boundary) */
+	__be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_pbl {
+	struct c2wr_nsmr_pbl_req req;
+	struct c2wr_nsmr_pbl_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_rep {
+	struct c2wr_hdr hdr;
+	u8 stag_key;
+	u8 pad[3];
+	u32 pd_id;
+	u32 flags;
+	u32 pbl_depth;
+} __attribute__((packed)) ;
+
+union c2wr_mr_query {
+	struct c2wr_mr_query_req req;
+	struct c2wr_mr_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_rep {
+	struct c2wr_hdr hdr;
+	u8 stag_key;
+	u8 pad[3];
+	u32 pd_id;
+	u32 flags;
+} __attribute__((packed)) ;
+
+union c2wr_mw_query {
+	struct c2wr_mw_query_req req;
+	struct c2wr_mw_query_rep rep;
+} __attribute__((packed)) ;
+
+
+struct c2wr_stag_dealloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	__be32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_stag_dealloc_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_stag_dealloc {
+	struct c2wr_stag_dealloc_req req;
+	struct c2wr_stag_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_req {
+	struct c2wr_hdr hdr;
+	u64 va;
+	u32 rnic_handle;
+	u16 flags;
+	u8 stag_key;
+	u8 pad;
+	u32 stag_index;
+	u32 pd_id;
+	u32 pbl_depth;
+	u32 pbe_size;
+	u32 fbo;
+	u32 length;
+	u32 addrs_length;
+	u32 pad1;
+	/* array of paddrs (must be aligned on a 64bit boundary) */
+	u64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_rep {
+	struct c2wr_hdr hdr;
+	u32 pbl_depth;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_reregister {
+	struct c2wr_nsmr_reregister_req req;
+	struct c2wr_nsmr_reregister_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_req {
+	struct c2wr_hdr hdr;
+	u64 va;
+	u32 rnic_handle;
+	u16 flags;
+	u8 stag_key;
+	u8 pad;
+	u32 stag_index;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_rep {
+	struct c2wr_hdr hdr;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_smr_register {
+	struct c2wr_smr_register_req req;
+	struct c2wr_smr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_rep {
+	struct c2wr_hdr hdr;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_mw_alloc {
+	struct c2wr_mw_alloc_req req;
+	struct c2wr_mw_alloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ WRs -----------------------
+ */
+
+struct c2wr_user_hdr {
+	struct c2wr_hdr hdr;		/* Has status and WR Type */
+} __attribute__((packed)) ;
+
+enum c2_qp_state {
+	C2_QP_STATE_IDLE = 0x01,
+	C2_QP_STATE_CONNECTING = 0x02,
+	C2_QP_STATE_RTS = 0x04,
+	C2_QP_STATE_CLOSING = 0x08,
+	C2_QP_STATE_TERMINATE = 0x10,
+	C2_QP_STATE_ERROR = 0x20,
+};
+
+/* Completion queue entry. */
+struct c2wr_ce {
+	struct c2wr_hdr hdr;		/* Has status and WR Type */
+	u64 qp_user_context;	/* c2_user_qp_t * */
+	u32 qp_state;		/* Current QP State */
+	u32 handle;		/* QPID or EP Handle */
+	__be32 bytes_rcvd;		/* valid for RECV WCs */
+	u32 stag;
+} __attribute__((packed)) ;
+
+
+/*
+ * Flags used for all post-sq WRs.  These must fit in the flags
+ * field of the struct c2wr_hdr (eight bits).
+ */
+enum {
+	SQ_SIGNALED = 0x01,
+	SQ_READ_FENCE = 0x02,
+	SQ_FENCE = 0x04,
+};
+
+/*
+ * Common fields for all post-sq WRs.  Namely the standard header and a
+ * secondary header with fields common to all post-sq WRs.
+ */
+struct c2_sq_hdr {
+	struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * Same as above but for post-rq WRs.
+ */
+struct c2_rq_hdr {
+	struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * use the same struct for all sends.
+ */
+struct c2wr_send_req {
+	struct c2_sq_hdr sq_hdr;
+	__be32 sge_len;
+	__be32 remote_stag;
+	u8 data[0];		/* SGE array */
+} __attribute__((packed));
+
+union c2wr_send {
+	struct c2wr_send_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_write_req {
+	struct c2_sq_hdr sq_hdr;
+	__be64 remote_to;
+	__be32 remote_stag;
+	__be32 sge_len;
+	u8 data[0];		/* SGE array */
+} __attribute__((packed));
+
+union c2wr_rdma_write {
+	struct c2wr_rdma_write_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_read_req {
+	struct c2_sq_hdr sq_hdr;
+	__be64 local_to;
+	__be64 remote_to;
+	__be32 local_stag;
+	__be32 remote_stag;
+	__be32 length;
+} __attribute__((packed));
+
+union c2wr_rdma_read {
+	struct c2wr_rdma_read_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_mw_bind_req {
+	struct c2_sq_hdr sq_hdr;
+	u64 va;
+	u8 stag_key;
+	u8 pad[3];
+	u32 mw_stag_index;
+	u32 mr_stag_index;
+	u32 length;
+	u32 flags;
+} __attribute__((packed));
+
+union c2wr_mw_bind {
+	struct c2wr_mw_bind_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_nsmr_fastreg_req {
+	struct c2_sq_hdr sq_hdr;
+	u64 va;
+	u8 stag_key;
+	u8 pad[3];
+	u32 stag_index;
+	u32 pbe_size;
+	u32 fbo;
+	u32 length;
+	u32 addrs_length;
+	/* array of paddrs (must be aligned on a 64bit boundary) */
+	u64 paddrs[0];
+} __attribute__((packed));
+
+union c2wr_nsmr_fastreg {
+	struct c2wr_nsmr_fastreg_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_stag_invalidate_req {
+	struct c2_sq_hdr sq_hdr;
+	u8 stag_key;
+	u8 pad[3];
+	u32 stag_index;
+} __attribute__((packed));
+
+union c2wr_stag_invalidate {
+	struct c2wr_stag_invalidate_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+union c2wr_sqwr {
+	struct c2_sq_hdr sq_hdr;
+	struct c2wr_send_req send;
+	struct c2wr_send_req send_se;
+	struct c2wr_send_req send_inv;
+	struct c2wr_send_req send_se_inv;
+	struct c2wr_rdma_write_req rdma_write;
+	struct c2wr_rdma_read_req rdma_read;
+	struct c2wr_mw_bind_req mw_bind;
+	struct c2wr_nsmr_fastreg_req nsmr_fastreg;
+	struct c2wr_stag_invalidate_req stag_inv;
+} __attribute__((packed));
+
+
+/*
+ * RQ WRs
+ */
+struct c2wr_rqwr {
+	struct c2_rq_hdr rq_hdr;
+	u8 data[0];		/* array of SGEs */
+} __attribute__((packed));
+
+union c2wr_recv {
+	struct c2wr_rqwr req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+/*
+ * All AEs start with this header.  Most AEs only need to convey the
+ * information in the header.  Some, like LLP connection events, need
+ * more info.  The union typdef c2wr_ae_t has all the possible AEs.
+ *
+ * hdr.context is the user_context from the rnic_open WR.  NULL If this
+ * is not affiliated with an rnic
+ *
+ * hdr.id is the AE identifier (eg;  CCAE_REMOTE_SHUTDOWN,
+ * CCAE_LLP_CLOSE_COMPLETE)
+ *
+ * resource_type is one of:  C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ
+ *
+ * user_context is the context passed down when the host created the resource.
+ */
+struct c2wr_ae_hdr {
+	struct c2wr_hdr hdr;
+	u64 user_context;	/* user context for this res. */
+	__be32 resource_type;	/* see enum c2_resource_indicator */
+	__be32 resource;	/* handle for resource */
+	__be32 qp_state;	/* current QP State */
+} __attribute__((packed));
+
+/*
+ * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ,
+ * the adapter moves the QP into RTS state
+ */
+struct c2wr_ae_active_connect_results {
+	struct c2wr_ae_hdr ae_hdr;
+	__be32 laddr;
+	__be32 raddr;
+	__be16 lport;
+	__be16 rport;
+	__be32 private_data_length;
+	u8 private_data[0];	/* data is in-line in the msg. */
+} __attribute__((packed));
+
+/*
+ * When connections are established by the stack (and the private data
+ * MPA frame is received), the adapter will generate an event to the host.
+ * The details of the connection, any private data, and the new connection
+ * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the
+ * AE queue:
+ */
+struct c2wr_ae_connection_request {
+	struct c2wr_ae_hdr ae_hdr;
+	u32 cr_handle;		/* connreq handle (sock ptr) */
+	__be32 laddr;
+	__be32 raddr;
+	__be16 lport;
+	__be16 rport;
+	__be32 private_data_length;
+	u8 private_data[0];	/* data is in-line in the msg. */
+} __attribute__((packed));
+
+union c2wr_ae {
+	struct c2wr_ae_hdr ae_generic;
+	struct c2wr_ae_active_connect_results ae_active_connect_results;
+	struct c2wr_ae_connection_request ae_connection_request;
+} __attribute__((packed));
+
+struct c2wr_init_req {
+	struct c2wr_hdr hdr;
+	__be64 hint_count;
+	__be64 q0_host_shared;
+	__be64 q1_host_shared;
+	__be64 q1_host_msg_pool;
+	__be64 q2_host_shared;
+	__be64 q2_host_msg_pool;
+} __attribute__((packed));
+
+struct c2wr_init_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_init {
+	struct c2wr_init_req req;
+	struct c2wr_init_rep rep;
+} __attribute__((packed));
+
+/*
+ * For upgrading flash.
+ */
+
+struct c2wr_flash_init_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+} __attribute__((packed));
+
+struct c2wr_flash_init_rep {
+	struct c2wr_hdr hdr;
+	u32 adapter_flash_buf_offset;
+	u32 adapter_flash_len;
+} __attribute__((packed));
+
+union c2wr_flash_init {
+	struct c2wr_flash_init_req req;
+	struct c2wr_flash_init_rep rep;
+} __attribute__((packed));
+
+struct c2wr_flash_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 len;
+} __attribute__((packed));
+
+struct c2wr_flash_rep {
+	struct c2wr_hdr hdr;
+	u32 status;
+} __attribute__((packed));
+
+union c2wr_flash {
+	struct c2wr_flash_req req;
+	struct c2wr_flash_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 size;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_rep {
+	struct c2wr_hdr hdr;
+	u32 offset;		/* 0 if mem not available */
+	u32 size;		/* 0 if mem not available */
+} __attribute__((packed));
+
+union c2wr_buf_alloc {
+	struct c2wr_buf_alloc_req req;
+	struct c2wr_buf_alloc_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_free_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 offset;		/* Must match value from alloc */
+	u32 size;		/* Must match value from alloc */
+} __attribute__((packed));
+
+struct c2wr_buf_free_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_buf_free {
+	struct c2wr_buf_free_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_flash_write_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 offset;
+	u32 size;
+	u32 type;
+	u32 flags;
+} __attribute__((packed));
+
+struct c2wr_flash_write_rep {
+	struct c2wr_hdr hdr;
+	u32 status;
+} __attribute__((packed));
+
+union c2wr_flash_write {
+	struct c2wr_flash_write_req req;
+	struct c2wr_flash_write_rep rep;
+} __attribute__((packed));
+
+/*
+ * Messages for LLP connection setup.
+ */
+
+/*
+ * Listen Request.  This allocates a listening endpoint to allow passive
+ * connection setup.  Newly established LLP connections are passed up
+ * via an AE.  See c2wr_ae_connection_request_t
+ */
+struct c2wr_ep_listen_create_req {
+	struct c2wr_hdr hdr;
+	u64 user_context;	/* returned in AEs. */
+	u32 rnic_handle;
+	__be32 local_addr;		/* local addr, or 0  */
+	__be16 local_port;		/* 0 means "pick one" */
+	u16 pad;
+	__be32 backlog;		/* tradional tcp listen bl */
+} __attribute__((packed));
+
+struct c2wr_ep_listen_create_rep {
+	struct c2wr_hdr hdr;
+	u32 ep_handle;		/* handle to new listening ep */
+	u16 local_port;		/* resulting port... */
+	u16 pad;
+} __attribute__((packed));
+
+union c2wr_ep_listen_create {
+	struct c2wr_ep_listen_create_req req;
+	struct c2wr_ep_listen_create_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_ep_listen_destroy {
+	struct c2wr_ep_listen_destroy_req req;
+	struct c2wr_ep_listen_destroy_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_query_rep {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 local_addr;
+	u32 remote_addr;
+	u16 local_port;
+	u16 remote_port;
+} __attribute__((packed));
+
+union c2wr_ep_query {
+	struct c2wr_ep_query_req req;
+	struct c2wr_ep_query_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * The host passes this down to indicate acceptance of a pending iWARP
+ * connection.  The cr_handle was obtained from the CONNECTION_REQUEST
+ * AE passed up by the adapter.  See c2wr_ae_connection_request_t.
+ */
+struct c2wr_cr_accept_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 qp_handle;		/* QP to bind to this LLP conn */
+	u32 ep_handle;		/* LLP  handle to accept */
+	__be32 private_data_length;
+	u8 private_data[0];	/* data in-line in msg. */
+} __attribute__((packed));
+
+/*
+ * adapter sends reply when private data is successfully submitted to
+ * the LLP.
+ */
+struct c2wr_cr_accept_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_accept {
+	struct c2wr_cr_accept_req req;
+	struct c2wr_cr_accept_rep rep;
+} __attribute__((packed));
+
+/*
+ * The host sends this down if a given iWARP connection request was
+ * rejected by the consumer.  The cr_handle was obtained from a
+ * previous c2wr_ae_connection_request_t AE sent by the adapter.
+ */
+struct  c2wr_cr_reject_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 ep_handle;		/* LLP handle to reject */
+} __attribute__((packed));
+
+/*
+ * Dunno if this is needed, but we'll add it for now.  The adapter will
+ * send the reject_reply after the LLP endpoint has been destroyed.
+ */
+struct  c2wr_cr_reject_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_reject {
+	struct c2wr_cr_reject_req req;
+	struct c2wr_cr_reject_rep rep;
+} __attribute__((packed));
+
+/*
+ * console command.  Used to implement a debug console over the verbs
+ * request and reply queues.
+ */
+
+/*
+ * Console request message.  It contains:
+ *	- message hdr with id = CCWR_CONSOLE
+ *	- the physaddr/len of host memory to be used for the reply.
+ *	- the command string.  eg:  "netstat -s" or "zoneinfo"
+ */
+struct c2wr_console_req {
+	struct c2wr_hdr hdr;		/* id = CCWR_CONSOLE */
+	u64 reply_buf;		/* pinned host buf for reply */
+	u32 reply_buf_len;	/* length of reply buffer */
+	u8 command[0];		/* NUL terminated ascii string */
+	/* containing the command req */
+} __attribute__((packed));
+
+/*
+ * flags used in the console reply.
+ */
+enum c2_console_flags {
+	CONS_REPLY_TRUNCATED = 0x00000001	/* reply was truncated */
+} __attribute__((packed));
+
+/*
+ * Console reply message.
+ * hdr.result contains the c2_status_t error if the reply was _not_ generated,
+ * or C2_OK if the reply was generated.
+ */
+struct c2wr_console_rep {
+	struct c2wr_hdr hdr;		/* id = CCWR_CONSOLE */
+	u32 flags;
+} __attribute__((packed));
+
+union c2wr_console {
+	struct c2wr_console_req req;
+	struct c2wr_console_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * Giant union with all WRs.  Makes life easier...
+ */
+union c2wr {
+	struct c2wr_hdr hdr;
+	struct c2wr_user_hdr user_hdr;
+	union c2wr_rnic_open rnic_open;
+	union c2wr_rnic_query rnic_query;
+	union c2wr_rnic_getconfig rnic_getconfig;
+	union c2wr_rnic_setconfig rnic_setconfig;
+	union c2wr_rnic_close rnic_close;
+	union c2wr_cq_create cq_create;
+	union c2wr_cq_modify cq_modify;
+	union c2wr_cq_destroy cq_destroy;
+	union c2wr_pd_alloc pd_alloc;
+	union c2wr_pd_dealloc pd_dealloc;
+	union c2wr_srq_create srq_create;
+	union c2wr_srq_destroy srq_destroy;
+	union c2wr_qp_create qp_create;
+	union c2wr_qp_query qp_query;
+	union c2wr_qp_modify qp_modify;
+	union c2wr_qp_destroy qp_destroy;
+	struct c2wr_qp_connect qp_connect;
+	union c2wr_nsmr_stag_alloc nsmr_stag_alloc;
+	union c2wr_nsmr_register nsmr_register;
+	union c2wr_nsmr_pbl nsmr_pbl;
+	union c2wr_mr_query mr_query;
+	union c2wr_mw_query mw_query;
+	union c2wr_stag_dealloc stag_dealloc;
+	union c2wr_sqwr sqwr;
+	struct c2wr_rqwr rqwr;
+	struct c2wr_ce ce;
+	union c2wr_ae ae;
+	union c2wr_init init;
+	union c2wr_ep_listen_create ep_listen_create;
+	union c2wr_ep_listen_destroy ep_listen_destroy;
+	union c2wr_cr_accept cr_accept;
+	union c2wr_cr_reject cr_reject;
+	union c2wr_console console;
+	union c2wr_flash_init flash_init;
+	union c2wr_flash flash;
+	union c2wr_buf_alloc buf_alloc;
+	union c2wr_buf_free buf_free;
+	union c2wr_flash_write flash_write;
+} __attribute__((packed));
+
+
+/*
+ * Accessors for the wr fields that are packed together tightly to
+ * reduce the wr message size.  The wr arguments are void* so that
+ * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types
+ * in the struct c2wr union can be passed in.
+ */
+static __inline__ u8 c2_wr_get_id(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->id;
+}
+static __inline__ void c2_wr_set_id(void *wr, u8 id)
+{
+	((struct c2wr_hdr *) wr)->id = id;
+}
+static __inline__ u8 c2_wr_get_result(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->result;
+}
+static __inline__ void c2_wr_set_result(void *wr, u8 result)
+{
+	((struct c2wr_hdr *) wr)->result = result;
+}
+static __inline__ u8 c2_wr_get_flags(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->flags;
+}
+static __inline__ void c2_wr_set_flags(void *wr, u8 flags)
+{
+	((struct c2wr_hdr *) wr)->flags = flags;
+}
+static __inline__ u8 c2_wr_get_sge_count(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->sge_count;
+}
+static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count)
+{
+	((struct c2wr_hdr *) wr)->sge_count = sge_count;
+}
+static __inline__ __be32 c2_wr_get_wqe_count(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->wqe_count;
+}
+static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count)
+{
+	((struct c2wr_hdr *) wr)->wqe_count = wqe_count;
+}
+
+#endif				/* _C2_WR_H_ */
-- 
cgit v0.10.2


From d4ab347005fb26f414b98b2c8d5ef6de5778c3dc Mon Sep 17 00:00:00 2001
From: Dennis Dalessandro <dennis.dalessandro@intel.com>
Date: Thu, 30 Jul 2015 15:17:32 -0400
Subject: IB/core: Add core header changes needed for OPA

This patch adds the value of the CNP opcode to the existing list of enumerated
opcodes in ib_pack.h

Add common OPA header definitions for driver
build:
- opa_port_info.h
- opa_smi.h
- hfi1_user.h

Additionally, ib_mad.h, has additional definitions
that are common to ib_drivers including:
- trap support
- cca support

The qib driver has the duplication removed in favor
those in ib_mad.h

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: John, Jubin <jubin.john@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h
index 941d4d5..57e99dc 100644
--- a/drivers/infiniband/hw/qib/qib_mad.h
+++ b/drivers/infiniband/hw/qib/qib_mad.h
@@ -36,148 +36,17 @@
 
 #include <rdma/ib_pma.h>
 
-#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
-#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
-#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
-#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
+#define IB_SMP_UNSUP_VERSION \
+cpu_to_be16(IB_MGMT_MAD_STATUS_BAD_VERSION)
 
-struct ib_node_info {
-	u8 base_version;
-	u8 class_version;
-	u8 node_type;
-	u8 num_ports;
-	__be64 sys_guid;
-	__be64 node_guid;
-	__be64 port_guid;
-	__be16 partition_cap;
-	__be16 device_id;
-	__be32 revision;
-	u8 local_port_num;
-	u8 vendor_id[3];
-} __packed;
-
-struct ib_mad_notice_attr {
-	u8 generic_type;
-	u8 prod_type_msb;
-	__be16 prod_type_lsb;
-	__be16 trap_num;
-	__be16 issuer_lid;
-	__be16 toggle_count;
-
-	union {
-		struct {
-			u8	details[54];
-		} raw_data;
-
-		struct {
-			__be16	reserved;
-			__be16	lid;		/* where violation happened */
-			u8	port_num;	/* where violation happened */
-		} __packed ntc_129_131;
-
-		struct {
-			__be16	reserved;
-			__be16	lid;		/* LID where change occurred */
-			u8	reserved2;
-			u8	local_changes;	/* low bit - local changes */
-			__be32	new_cap_mask;	/* new capability mask */
-			u8	reserved3;
-			u8	change_flags;	/* low 3 bits only */
-		} __packed ntc_144;
-
-		struct {
-			__be16	reserved;
-			__be16	lid;		/* lid where sys guid changed */
-			__be16	reserved2;
-			__be64	new_sys_guid;
-		} __packed ntc_145;
-
-		struct {
-			__be16	reserved;
-			__be16	lid;
-			__be16	dr_slid;
-			u8	method;
-			u8	reserved2;
-			__be16	attr_id;
-			__be32	attr_mod;
-			__be64	mkey;
-			u8	reserved3;
-			u8	dr_trunc_hop;
-			u8	dr_rtn_path[30];
-		} __packed ntc_256;
-
-		struct {
-			__be16		reserved;
-			__be16		lid1;
-			__be16		lid2;
-			__be32		key;
-			__be32		sl_qp1;	/* SL: high 4 bits */
-			__be32		qp2;	/* high 8 bits reserved */
-			union ib_gid	gid1;
-			union ib_gid	gid2;
-		} __packed ntc_257_258;
-
-	} details;
-};
-
-/*
- * Generic trap/notice types
- */
-#define IB_NOTICE_TYPE_FATAL	0x80
-#define IB_NOTICE_TYPE_URGENT	0x81
-#define IB_NOTICE_TYPE_SECURITY	0x82
-#define IB_NOTICE_TYPE_SM	0x83
-#define IB_NOTICE_TYPE_INFO	0x84
+#define IB_SMP_UNSUP_METHOD \
+cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD)
 
-/*
- * Generic trap/notice producers
- */
-#define IB_NOTICE_PROD_CA		cpu_to_be16(1)
-#define IB_NOTICE_PROD_SWITCH		cpu_to_be16(2)
-#define IB_NOTICE_PROD_ROUTER		cpu_to_be16(3)
-#define IB_NOTICE_PROD_CLASS_MGR	cpu_to_be16(4)
+#define IB_SMP_UNSUP_METH_ATTR \
+cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB)
 
-/*
- * Generic trap/notice numbers
- */
-#define IB_NOTICE_TRAP_LLI_THRESH	cpu_to_be16(129)
-#define IB_NOTICE_TRAP_EBO_THRESH	cpu_to_be16(130)
-#define IB_NOTICE_TRAP_FLOW_UPDATE	cpu_to_be16(131)
-#define IB_NOTICE_TRAP_CAP_MASK_CHG	cpu_to_be16(144)
-#define IB_NOTICE_TRAP_SYS_GUID_CHG	cpu_to_be16(145)
-#define IB_NOTICE_TRAP_BAD_MKEY		cpu_to_be16(256)
-#define IB_NOTICE_TRAP_BAD_PKEY		cpu_to_be16(257)
-#define IB_NOTICE_TRAP_BAD_QKEY		cpu_to_be16(258)
-
-/*
- * Repress trap/notice flags
- */
-#define IB_NOTICE_REPRESS_LLI_THRESH	(1 << 0)
-#define IB_NOTICE_REPRESS_EBO_THRESH	(1 << 1)
-#define IB_NOTICE_REPRESS_FLOW_UPDATE	(1 << 2)
-#define IB_NOTICE_REPRESS_CAP_MASK_CHG	(1 << 3)
-#define IB_NOTICE_REPRESS_SYS_GUID_CHG	(1 << 4)
-#define IB_NOTICE_REPRESS_BAD_MKEY	(1 << 5)
-#define IB_NOTICE_REPRESS_BAD_PKEY	(1 << 6)
-#define IB_NOTICE_REPRESS_BAD_QKEY	(1 << 7)
-
-/*
- * Generic trap/notice other local changes flags (trap 144).
- */
-#define IB_NOTICE_TRAP_LSE_CHG		0x04	/* Link Speed Enable changed */
-#define IB_NOTICE_TRAP_LWE_CHG		0x02	/* Link Width Enable changed */
-#define IB_NOTICE_TRAP_NODE_DESC_CHG	0x01
-
-/*
- * Generic trap/notice M_Key volation flags in dr_trunc_hop (trap 256).
- */
-#define IB_NOTICE_TRAP_DR_NOTICE	0x80
-#define IB_NOTICE_TRAP_DR_TRUNC		0x40
-
-struct ib_vl_weight_elem {
-	u8      vl;     /* Only low 4 bits, upper 4 bits reserved */
-	u8      weight;
-};
+#define IB_SMP_INVALID_FIELD \
+cpu_to_be16(IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE)
 
 #define IB_VLARB_LOWPRI_0_31    1
 #define IB_VLARB_LOWPRI_32_63   2
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index c8422d5..d5ac022 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -127,6 +127,60 @@
 #define IB_DEFAULT_PKEY_PARTIAL 0x7FFF
 #define IB_DEFAULT_PKEY_FULL	0xFFFF
 
+/*
+ * Generic trap/notice types
+ */
+#define IB_NOTICE_TYPE_FATAL	0x80
+#define IB_NOTICE_TYPE_URGENT	0x81
+#define IB_NOTICE_TYPE_SECURITY	0x82
+#define IB_NOTICE_TYPE_SM	0x83
+#define IB_NOTICE_TYPE_INFO	0x84
+
+/*
+ * Generic trap/notice producers
+ */
+#define IB_NOTICE_PROD_CA		cpu_to_be16(1)
+#define IB_NOTICE_PROD_SWITCH		cpu_to_be16(2)
+#define IB_NOTICE_PROD_ROUTER		cpu_to_be16(3)
+#define IB_NOTICE_PROD_CLASS_MGR	cpu_to_be16(4)
+
+/*
+ * Generic trap/notice numbers
+ */
+#define IB_NOTICE_TRAP_LLI_THRESH	cpu_to_be16(129)
+#define IB_NOTICE_TRAP_EBO_THRESH	cpu_to_be16(130)
+#define IB_NOTICE_TRAP_FLOW_UPDATE	cpu_to_be16(131)
+#define IB_NOTICE_TRAP_CAP_MASK_CHG	cpu_to_be16(144)
+#define IB_NOTICE_TRAP_SYS_GUID_CHG	cpu_to_be16(145)
+#define IB_NOTICE_TRAP_BAD_MKEY		cpu_to_be16(256)
+#define IB_NOTICE_TRAP_BAD_PKEY		cpu_to_be16(257)
+#define IB_NOTICE_TRAP_BAD_QKEY		cpu_to_be16(258)
+
+/*
+ * Repress trap/notice flags
+ */
+#define IB_NOTICE_REPRESS_LLI_THRESH	(1 << 0)
+#define IB_NOTICE_REPRESS_EBO_THRESH	(1 << 1)
+#define IB_NOTICE_REPRESS_FLOW_UPDATE	(1 << 2)
+#define IB_NOTICE_REPRESS_CAP_MASK_CHG	(1 << 3)
+#define IB_NOTICE_REPRESS_SYS_GUID_CHG	(1 << 4)
+#define IB_NOTICE_REPRESS_BAD_MKEY	(1 << 5)
+#define IB_NOTICE_REPRESS_BAD_PKEY	(1 << 6)
+#define IB_NOTICE_REPRESS_BAD_QKEY	(1 << 7)
+
+/*
+ * Generic trap/notice other local changes flags (trap 144).
+ */
+#define IB_NOTICE_TRAP_LSE_CHG		0x04	/* Link Speed Enable changed */
+#define IB_NOTICE_TRAP_LWE_CHG		0x02	/* Link Width Enable changed */
+#define IB_NOTICE_TRAP_NODE_DESC_CHG	0x01
+
+/*
+ * Generic trap/notice M_Key volation flags in dr_trunc_hop (trap 256).
+ */
+#define IB_NOTICE_TRAP_DR_NOTICE	0x80
+#define IB_NOTICE_TRAP_DR_TRUNC		0x40
+
 enum {
 	IB_MGMT_MAD_HDR = 24,
 	IB_MGMT_MAD_DATA = 232,
@@ -240,6 +294,90 @@ struct ib_class_port_info {
 	__be32			trap_qkey;
 };
 
+struct ib_node_info {
+	u8 base_version;
+	u8 class_version;
+	u8 node_type;
+	u8 num_ports;
+	__be64 sys_guid;
+	__be64 node_guid;
+	__be64 port_guid;
+	__be16 partition_cap;
+	__be16 device_id;
+	__be32 revision;
+	u8 local_port_num;
+	u8 vendor_id[3];
+} __packed;
+
+struct ib_mad_notice_attr {
+	u8 generic_type;
+	u8 prod_type_msb;
+	__be16 prod_type_lsb;
+	__be16 trap_num;
+	__be16 issuer_lid;
+	__be16 toggle_count;
+
+	union {
+		struct {
+			u8	details[54];
+		} raw_data;
+
+		struct {
+			__be16	reserved;
+			__be16	lid;		/* where violation happened */
+			u8	port_num;	/* where violation happened */
+		} __packed ntc_129_131;
+
+		struct {
+			__be16	reserved;
+			__be16	lid;		/* LID where change occurred */
+			u8	reserved2;
+			u8	local_changes;	/* low bit - local changes */
+			__be32	new_cap_mask;	/* new capability mask */
+			u8	reserved3;
+			u8	change_flags;	/* low 3 bits only */
+		} __packed ntc_144;
+
+		struct {
+			__be16	reserved;
+			__be16	lid;		/* lid where sys guid changed */
+			__be16	reserved2;
+			__be64	new_sys_guid;
+		} __packed ntc_145;
+
+		struct {
+			__be16	reserved;
+			__be16	lid;
+			__be16	dr_slid;
+			u8	method;
+			u8	reserved2;
+			__be16	attr_id;
+			__be32	attr_mod;
+			__be64	mkey;
+			u8	reserved3;
+			u8	dr_trunc_hop;
+			u8	dr_rtn_path[30];
+		} __packed ntc_256;
+
+		struct {
+			__be16		reserved;
+			__be16		lid1;
+			__be16		lid2;
+			__be32		key;
+			__be32		sl_qp1;	/* SL: high 4 bits */
+			__be32		qp2;	/* high 8 bits reserved */
+			union ib_gid	gid1;
+			union ib_gid	gid2;
+		} __packed ntc_257_258;
+
+	} details;
+};
+
+struct ib_vl_weight_elem {
+	u8      vl;     /* VL is low 5 bits, upper 3 bits reserved */
+	u8      weight;
+};
+
 /**
  * ib_mad_send_buf - MAD data buffer and work request for sends.
  * @next: A pointer used to chain together MADs for posting.
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index b1f7592..709a533 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -76,6 +76,8 @@ enum {
 	IB_OPCODE_UC                                = 0x20,
 	IB_OPCODE_RD                                = 0x40,
 	IB_OPCODE_UD                                = 0x60,
+	/* per IBTA 3.1 Table 38, A10.3.2 */
+	IB_OPCODE_CNP                               = 0x80,
 
 	/* operations -- just used to define real constants */
 	IB_OPCODE_SEND_FIRST                        = 0x00,
diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h
new file mode 100644
index 0000000..391dae1
--- /dev/null
+++ b/include/rdma/opa_port_info.h
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2014 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(OPA_PORT_INFO_H)
+#define OPA_PORT_INFO_H
+
+/* Temporary until HFI driver is updated */
+#ifndef USE_PI_LED_ENABLE
+#define USE_PI_LED_ENABLE 0
+#endif
+
+#define OPA_PORT_LINK_MODE_NOP	0		/* No change */
+#define OPA_PORT_LINK_MODE_OPA	4		/* Port mode is OPA */
+
+#define OPA_PORT_PACKET_FORMAT_NOP	0		/* No change */
+#define OPA_PORT_PACKET_FORMAT_8B	1		/* Format 8B */
+#define OPA_PORT_PACKET_FORMAT_9B	2		/* Format 9B */
+#define OPA_PORT_PACKET_FORMAT_10B	4		/* Format 10B */
+#define OPA_PORT_PACKET_FORMAT_16B	8		/* Format 16B */
+
+#define OPA_PORT_LTP_CRC_MODE_NONE	0	/* No change */
+#define OPA_PORT_LTP_CRC_MODE_14	1	/* 14-bit LTP CRC mode (optional) */
+#define OPA_PORT_LTP_CRC_MODE_16	2	/* 16-bit LTP CRC mode */
+#define OPA_PORT_LTP_CRC_MODE_48	4	/* 48-bit LTP CRC mode (optional) */
+#define OPA_PORT_LTP_CRC_MODE_PER_LANE  8	/* 12/16-bit per lane LTP CRC mode */
+
+/* Link Down / Neighbor Link Down Reason; indicated as follows: */
+#define OPA_LINKDOWN_REASON_NONE				0	/* No specified reason */
+#define OPA_LINKDOWN_REASON_RCV_ERROR_0				1
+#define OPA_LINKDOWN_REASON_BAD_PKT_LEN				2
+#define OPA_LINKDOWN_REASON_PKT_TOO_LONG			3
+#define OPA_LINKDOWN_REASON_PKT_TOO_SHORT			4
+#define OPA_LINKDOWN_REASON_BAD_SLID				5
+#define OPA_LINKDOWN_REASON_BAD_DLID				6
+#define OPA_LINKDOWN_REASON_BAD_L2				7
+#define OPA_LINKDOWN_REASON_BAD_SC				8
+#define OPA_LINKDOWN_REASON_RCV_ERROR_8				9
+#define OPA_LINKDOWN_REASON_BAD_MID_TAIL			10
+#define OPA_LINKDOWN_REASON_RCV_ERROR_10			11
+#define OPA_LINKDOWN_REASON_PREEMPT_ERROR			12
+#define OPA_LINKDOWN_REASON_PREEMPT_VL15			13
+#define OPA_LINKDOWN_REASON_BAD_VL_MARKER			14
+#define OPA_LINKDOWN_REASON_RCV_ERROR_14			15
+#define OPA_LINKDOWN_REASON_RCV_ERROR_15			16
+#define OPA_LINKDOWN_REASON_BAD_HEAD_DIST			17
+#define OPA_LINKDOWN_REASON_BAD_TAIL_DIST			18
+#define OPA_LINKDOWN_REASON_BAD_CTRL_DIST			19
+#define OPA_LINKDOWN_REASON_BAD_CREDIT_ACK			20
+#define OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER		21
+#define OPA_LINKDOWN_REASON_BAD_PREEMPT				22
+#define OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT			23
+#define OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT		24
+#define OPA_LINKDOWN_REASON_RCV_ERROR_24			25
+#define OPA_LINKDOWN_REASON_RCV_ERROR_25			26
+#define OPA_LINKDOWN_REASON_RCV_ERROR_26			27
+#define OPA_LINKDOWN_REASON_RCV_ERROR_27			28
+#define OPA_LINKDOWN_REASON_RCV_ERROR_28			29
+#define OPA_LINKDOWN_REASON_RCV_ERROR_29			30
+#define OPA_LINKDOWN_REASON_RCV_ERROR_30			31
+#define OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN		32
+#define OPA_LINKDOWN_REASON_UNKNOWN				33
+/* 34 -reserved */
+#define OPA_LINKDOWN_REASON_REBOOT				35
+#define OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN			36
+/* 37-38 reserved */
+#define OPA_LINKDOWN_REASON_FM_BOUNCE				39
+#define OPA_LINKDOWN_REASON_SPEED_POLICY			40
+#define OPA_LINKDOWN_REASON_WIDTH_POLICY			41
+/* 42-48 reserved */
+#define OPA_LINKDOWN_REASON_DISCONNECTED			49
+#define OPA_LINKDOWN_REASONLOCAL_MEDIA_NOT_INSTALLED		50
+#define OPA_LINKDOWN_REASON_NOT_INSTALLED			51
+#define OPA_LINKDOWN_REASON_CHASSIS_CONFIG			52
+/* 53 reserved */
+#define OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED		54
+/* 55 reserved */
+#define OPA_LINKDOWN_REASON_POWER_POLICY			56
+#define OPA_LINKDOWN_REASON_LINKSPEED_POLICY			57
+#define OPA_LINKDOWN_REASON_LINKWIDTH_POLICY			58
+/* 59 reserved */
+#define OPA_LINKDOWN_REASON_SWITCH_MGMT				60
+#define OPA_LINKDOWN_REASON_SMA_DISABLED			61
+/* 62 reserved */
+#define OPA_LINKDOWN_REASON_TRANSIENT				63
+/* 64-255 reserved */
+
+/* OPA Link Init reason; indicated as follows: */
+/* 3-7; 11-15 reserved; 8-15 cleared on Polling->LinkUp */
+#define OPA_LINKINIT_REASON_NOP                 0
+#define OPA_LINKINIT_REASON_LINKUP              (1 << 4)
+#define OPA_LINKINIT_REASON_FLAPPING            (2 << 4)
+#define OPA_LINKINIT_REASON_CLEAR               (8 << 4)
+#define OPA_LINKINIT_OUTSIDE_POLICY             (8 << 4)
+#define OPA_LINKINIT_QUARANTINED                (9 << 4)
+#define OPA_LINKINIT_INSUFIC_CAPABILITY         (10 << 4)
+
+#define OPA_LINK_SPEED_NOP              0x0000  /*  Reserved (1-5 Gbps) */
+#define OPA_LINK_SPEED_12_5G            0x0001  /*  12.5 Gbps */
+#define OPA_LINK_SPEED_25G              0x0002  /*  25.78125?  Gbps (EDR) */
+
+#define OPA_LINK_WIDTH_1X            0x0001
+#define OPA_LINK_WIDTH_2X            0x0002
+#define OPA_LINK_WIDTH_3X            0x0004
+#define OPA_LINK_WIDTH_4X            0x0008
+
+#define OPA_CAP_MASK3_IsSnoopSupported            (1 << 7)
+#define OPA_CAP_MASK3_IsAsyncSC2VLSupported       (1 << 6)
+#define OPA_CAP_MASK3_IsAddrRangeConfigSupported  (1 << 5)
+#define OPA_CAP_MASK3_IsPassThroughSupported      (1 << 4)
+#define OPA_CAP_MASK3_IsSharedSpaceSupported      (1 << 3)
+/* reserved (1 << 2) */
+#define OPA_CAP_MASK3_IsVLMarkerSupported         (1 << 1)
+#define OPA_CAP_MASK3_IsVLrSupported              (1 << 0)
+
+/**
+ * new MTU values
+ */
+enum {
+	OPA_MTU_8192  = 6,
+	OPA_MTU_10240 = 7,
+};
+
+enum {
+	OPA_PORT_PHYS_CONF_DISCONNECTED = 0,
+	OPA_PORT_PHYS_CONF_STANDARD     = 1,
+	OPA_PORT_PHYS_CONF_FIXED        = 2,
+	OPA_PORT_PHYS_CONF_VARIABLE     = 3,
+	OPA_PORT_PHYS_CONF_SI_PHOTO     = 4
+};
+
+enum port_info_field_masks {
+	/* vl.cap */
+	OPA_PI_MASK_VL_CAP                        = 0x1F,
+	/* port_states.ledenable_offlinereason */
+	OPA_PI_MASK_OFFLINE_REASON                = 0x0F,
+	OPA_PI_MASK_LED_ENABLE                    = 0x40,
+	/* port_states.unsleepstate_downdefstate */
+	OPA_PI_MASK_UNSLEEP_STATE                 = 0xF0,
+	OPA_PI_MASK_DOWNDEF_STATE                 = 0x0F,
+	/* port_states.portphysstate_portstate */
+	OPA_PI_MASK_PORT_PHYSICAL_STATE           = 0xF0,
+	OPA_PI_MASK_PORT_STATE                    = 0x0F,
+	/* port_phys_conf */
+	OPA_PI_MASK_PORT_PHYSICAL_CONF            = 0x0F,
+	/* collectivemask_multicastmask */
+	OPA_PI_MASK_COLLECT_MASK                  = 0x38,
+	OPA_PI_MASK_MULTICAST_MASK                = 0x07,
+	/* mkeyprotect_lmc */
+	OPA_PI_MASK_MKEY_PROT_BIT                 = 0xC0,
+	OPA_PI_MASK_LMC                           = 0x0F,
+	/* smsl */
+	OPA_PI_MASK_SMSL                          = 0x1F,
+	/* partenforce_filterraw */
+	/* Filter Raw In/Out bits 1 and 2 were removed */
+	OPA_PI_MASK_LINKINIT_REASON               = 0xF0,
+	OPA_PI_MASK_PARTITION_ENFORCE_IN          = 0x08,
+	OPA_PI_MASK_PARTITION_ENFORCE_OUT         = 0x04,
+	/* operational_vls */
+	OPA_PI_MASK_OPERATIONAL_VL                = 0x1F,
+	/* sa_qp */
+	OPA_PI_MASK_SA_QP                         = 0x00FFFFFF,
+	/* sm_trap_qp */
+	OPA_PI_MASK_SM_TRAP_QP                    = 0x00FFFFFF,
+	/* localphy_overrun_errors */
+	OPA_PI_MASK_LOCAL_PHY_ERRORS              = 0xF0,
+	OPA_PI_MASK_OVERRUN_ERRORS                = 0x0F,
+	/* clientrereg_subnettimeout */
+	OPA_PI_MASK_CLIENT_REREGISTER             = 0x80,
+	OPA_PI_MASK_SUBNET_TIMEOUT                = 0x1F,
+	/* port_link_mode */
+	OPA_PI_MASK_PORT_LINK_SUPPORTED           = (0x001F << 10),
+	OPA_PI_MASK_PORT_LINK_ENABLED             = (0x001F <<  5),
+	OPA_PI_MASK_PORT_LINK_ACTIVE              = (0x001F <<  0),
+	/* port_link_crc_mode */
+	OPA_PI_MASK_PORT_LINK_CRC_SUPPORTED       = 0x0F00,
+	OPA_PI_MASK_PORT_LINK_CRC_ENABLED         = 0x00F0,
+	OPA_PI_MASK_PORT_LINK_CRC_ACTIVE          = 0x000F,
+	/* port_mode */
+	OPA_PI_MASK_PORT_MODE_SECURITY_CHECK      = 0x0001,
+	OPA_PI_MASK_PORT_MODE_16B_TRAP_QUERY      = 0x0002,
+	OPA_PI_MASK_PORT_MODE_PKEY_CONVERT        = 0x0004,
+	OPA_PI_MASK_PORT_MODE_SC2SC_MAPPING       = 0x0008,
+	OPA_PI_MASK_PORT_MODE_VL_MARKER           = 0x0010,
+	OPA_PI_MASK_PORT_PASS_THROUGH             = 0x0020,
+	OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE          = 0x0040,
+	/* flit_control.interleave */
+	OPA_PI_MASK_INTERLEAVE_DIST_SUP           = (0x0003 << 12),
+	OPA_PI_MASK_INTERLEAVE_DIST_ENABLE        = (0x0003 << 10),
+	OPA_PI_MASK_INTERLEAVE_MAX_NEST_TX        = (0x001F <<  5),
+	OPA_PI_MASK_INTERLEAVE_MAX_NEST_RX        = (0x001F <<  0),
+
+	/* port_error_action */
+	OPA_PI_MASK_EX_BUFFER_OVERRUN                  = 0x80000000,
+		/* 7 bits reserved */
+	OPA_PI_MASK_FM_CFG_ERR_EXCEED_MULTICAST_LIMIT  = 0x00800000,
+	OPA_PI_MASK_FM_CFG_BAD_CONTROL_FLIT            = 0x00400000,
+	OPA_PI_MASK_FM_CFG_BAD_PREEMPT                 = 0x00200000,
+	OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER       = 0x00100000,
+	OPA_PI_MASK_FM_CFG_BAD_CRDT_ACK                = 0x00080000,
+	OPA_PI_MASK_FM_CFG_BAD_CTRL_DIST               = 0x00040000,
+	OPA_PI_MASK_FM_CFG_BAD_TAIL_DIST               = 0x00020000,
+	OPA_PI_MASK_FM_CFG_BAD_HEAD_DIST               = 0x00010000,
+		/* 2 bits reserved */
+	OPA_PI_MASK_PORT_RCV_BAD_VL_MARKER             = 0x00002000,
+	OPA_PI_MASK_PORT_RCV_PREEMPT_VL15              = 0x00001000,
+	OPA_PI_MASK_PORT_RCV_PREEMPT_ERROR             = 0x00000800,
+		/* 1 bit reserved */
+	OPA_PI_MASK_PORT_RCV_BAD_MidTail               = 0x00000200,
+		/* 1 bit reserved */
+	OPA_PI_MASK_PORT_RCV_BAD_SC                    = 0x00000080,
+	OPA_PI_MASK_PORT_RCV_BAD_L2                    = 0x00000040,
+	OPA_PI_MASK_PORT_RCV_BAD_DLID                  = 0x00000020,
+	OPA_PI_MASK_PORT_RCV_BAD_SLID                  = 0x00000010,
+	OPA_PI_MASK_PORT_RCV_PKTLEN_TOOSHORT           = 0x00000008,
+	OPA_PI_MASK_PORT_RCV_PKTLEN_TOOLONG            = 0x00000004,
+	OPA_PI_MASK_PORT_RCV_BAD_PKTLEN                = 0x00000002,
+	OPA_PI_MASK_PORT_RCV_BAD_LT                    = 0x00000001,
+
+	/* pass_through.res_drctl */
+	OPA_PI_MASK_PASS_THROUGH_DR_CONTROL       = 0x01,
+
+	/* buffer_units */
+	OPA_PI_MASK_BUF_UNIT_VL15_INIT            = (0x00000FFF  << 11),
+	OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE     = (0x0000001F  <<  6),
+	OPA_PI_MASK_BUF_UNIT_CREDIT_ACK           = (0x00000003  <<  3),
+	OPA_PI_MASK_BUF_UNIT_BUF_ALLOC            = (0x00000003  <<  0),
+
+	/* neigh_mtu.pvlx_to_mtu */
+	OPA_PI_MASK_NEIGH_MTU_PVL0                = 0xF0,
+	OPA_PI_MASK_NEIGH_MTU_PVL1                = 0x0F,
+
+	/* neigh_mtu.vlstall_hoq_life */
+	OPA_PI_MASK_VL_STALL                      = (0x03 << 5),
+	OPA_PI_MASK_HOQ_LIFE                      = (0x1F << 0),
+
+	/* port_neigh_mode */
+	OPA_PI_MASK_NEIGH_MGMT_ALLOWED            = (0x01 << 3),
+	OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS          = (0x01 << 2),
+	OPA_PI_MASK_NEIGH_NODE_TYPE               = (0x03 << 0),
+
+	/* resptime_value */
+	OPA_PI_MASK_RESPONSE_TIME_VALUE           = 0x1F,
+
+	/* mtucap */
+	OPA_PI_MASK_MTU_CAP                       = 0x0F,
+};
+
+#if USE_PI_LED_ENABLE
+struct opa_port_states {
+	u8     reserved;
+	u8     ledenable_offlinereason;   /* 1 res, 1 bit, 6 bits */
+	u8     reserved2;
+	u8     portphysstate_portstate;   /* 4 bits, 4 bits */
+};
+#define PI_LED_ENABLE_SUP 1
+#else
+struct opa_port_states {
+	u8     reserved;
+	u8     offline_reason;            /* 2 res, 6 bits */
+	u8     reserved2;
+	u8     portphysstate_portstate;   /* 4 bits, 4 bits */
+};
+#define PI_LED_ENABLE_SUP 0
+#endif
+
+struct opa_port_state_info {
+	struct opa_port_states port_states;
+	u16 link_width_downgrade_tx_active;
+	u16 link_width_downgrade_rx_active;
+};
+
+struct opa_port_info {
+	__be32 lid;
+	__be32 flow_control_mask;
+
+	struct {
+		u8     res;                       /* was inittype */
+		u8     cap;                       /* 3 res, 5 bits */
+		__be16 high_limit;
+		__be16 preempt_limit;
+		u8     arb_high_cap;
+		u8     arb_low_cap;
+	} vl;
+
+	struct opa_port_states  port_states;
+	u8     port_phys_conf;                    /* 4 res, 4 bits */
+	u8     collectivemask_multicastmask;      /* 2 res, 3, 3 */
+	u8     mkeyprotect_lmc;                   /* 2 bits, 2 res, 4 bits */
+	u8     smsl;                              /* 3 res, 5 bits */
+
+	u8     partenforce_filterraw;             /* bit fields */
+	u8     operational_vls;                    /* 3 res, 5 bits */
+	__be16 pkey_8b;
+	__be16 pkey_10b;
+	__be16 mkey_violations;
+
+	__be16 pkey_violations;
+	__be16 qkey_violations;
+	__be32 sm_trap_qp;                        /* 8 bits, 24 bits */
+
+	__be32 sa_qp;                             /* 8 bits, 24 bits */
+	u8     neigh_port_num;
+	u8     link_down_reason;
+	u8     neigh_link_down_reason;
+	u8     clientrereg_subnettimeout;	  /* 1 bit, 2 bits, 5 */
+
+	struct {
+		__be16 supported;
+		__be16 enabled;
+		__be16 active;
+	} link_speed;
+	struct {
+		__be16 supported;
+		__be16 enabled;
+		__be16 active;
+	} link_width;
+	struct {
+		__be16 supported;
+		__be16 enabled;
+		__be16 tx_active;
+		__be16 rx_active;
+	} link_width_downgrade;
+	__be16 port_link_mode;                  /* 1 res, 5 bits, 5 bits, 5 bits */
+	__be16 port_ltp_crc_mode;               /* 4 res, 4 bits, 4 bits, 4 bits */
+
+	__be16 port_mode;                       /* 9 res, bit fields */
+	struct {
+		__be16 supported;
+		__be16 enabled;
+	} port_packet_format;
+	struct {
+		__be16 interleave;  /* 2 res, 2,2,5,5 */
+		struct {
+			__be16 min_initial;
+			__be16 min_tail;
+			u8     large_pkt_limit;
+			u8     small_pkt_limit;
+			u8     max_small_pkt_limit;
+			u8     preemption_limit;
+		} preemption;
+	} flit_control;
+
+	__be32 reserved4;
+	__be32 port_error_action; /* bit field */
+
+	struct {
+		u8 egress_port;
+		u8 res_drctl;                    /* 7 res, 1 */
+	} pass_through;
+	__be16 mkey_lease_period;
+	__be32 buffer_units;                     /* 9 res, 12, 5, 3, 3 */
+
+	__be32 reserved5;
+	__be32 sm_lid;
+
+	__be64 mkey;
+
+	__be64 subnet_prefix;
+
+	struct {
+		u8 pvlx_to_mtu[OPA_MAX_VLS/2]; /* 4 bits, 4 bits */
+	} neigh_mtu;
+
+	struct {
+		u8 vlstall_hoqlife;             /* 3 bits, 5 bits */
+	} xmit_q[OPA_MAX_VLS];
+
+	struct {
+		u8 addr[16];
+	} ipaddr_ipv6;
+
+	struct {
+		u8 addr[4];
+	} ipaddr_ipv4;
+
+	u32    reserved6;
+	u32    reserved7;
+	u32    reserved8;
+
+	__be64 neigh_node_guid;
+
+	__be32 ib_cap_mask;
+	__be16 reserved9;                    /* was ib_cap_mask2 */
+	__be16 opa_cap_mask;
+
+	__be32 reserved10;                   /* was link_roundtrip_latency */
+	__be16 overall_buffer_space;
+	__be16 reserved11;                   /* was max_credit_hint */
+
+	__be16 diag_code;
+	struct {
+		u8 buffer;
+		u8 wire;
+	} replay_depth;
+	u8     port_neigh_mode;
+	u8     mtucap;                          /* 4 res, 4 bits */
+
+	u8     resptimevalue;		        /* 3 res, 5 bits */
+	u8     local_port_num;
+	u8     reserved12;
+	u8     reserved13;                       /* was guid_cap */
+} __attribute__ ((packed));
+
+#endif /* OPA_PORT_INFO_H */
diff --git a/include/rdma/opa_smi.h b/include/rdma/opa_smi.h
index 29063e8..4a529ef 100644
--- a/include/rdma/opa_smi.h
+++ b/include/rdma/opa_smi.h
@@ -40,6 +40,10 @@
 #define OPA_SMP_DR_DATA_SIZE			1872
 #define OPA_SMP_MAX_PATH_HOPS			64
 
+#define OPA_MAX_VLS				32
+#define OPA_MAX_SLS				32
+#define OPA_MAX_SCS				32
+
 #define OPA_SMI_CLASS_VERSION			0x80
 
 #define OPA_LID_PERMISSIVE			cpu_to_be32(0xFFFFFFFF)
@@ -73,6 +77,49 @@ struct opa_smp {
 } __packed;
 
 
+/* Subnet management attributes */
+/* ... */
+#define OPA_ATTRIB_ID_NODE_DESCRIPTION		cpu_to_be16(0x0010)
+#define OPA_ATTRIB_ID_NODE_INFO			cpu_to_be16(0x0011)
+#define OPA_ATTRIB_ID_PORT_INFO			cpu_to_be16(0x0015)
+#define OPA_ATTRIB_ID_PARTITION_TABLE		cpu_to_be16(0x0016)
+#define OPA_ATTRIB_ID_SL_TO_SC_MAP		cpu_to_be16(0x0017)
+#define OPA_ATTRIB_ID_VL_ARBITRATION		cpu_to_be16(0x0018)
+#define OPA_ATTRIB_ID_SM_INFO			cpu_to_be16(0x0020)
+#define OPA_ATTRIB_ID_CABLE_INFO		cpu_to_be16(0x0032)
+#define OPA_ATTRIB_ID_AGGREGATE			cpu_to_be16(0x0080)
+#define OPA_ATTRIB_ID_SC_TO_SL_MAP		cpu_to_be16(0x0082)
+#define OPA_ATTRIB_ID_SC_TO_VLR_MAP		cpu_to_be16(0x0083)
+#define OPA_ATTRIB_ID_SC_TO_VLT_MAP		cpu_to_be16(0x0084)
+#define OPA_ATTRIB_ID_SC_TO_VLNT_MAP		cpu_to_be16(0x0085)
+/* ... */
+#define OPA_ATTRIB_ID_PORT_STATE_INFO		cpu_to_be16(0x0087)
+/* ... */
+#define OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE	cpu_to_be16(0x008A)
+/* ... */
+
+struct opa_node_description {
+	u8 data[64];
+} __attribute__ ((packed));
+
+struct opa_node_info {
+	u8      base_version;
+	u8      class_version;
+	u8      node_type;
+	u8      num_ports;
+	__be32  reserved;
+	__be64  system_image_guid;
+	__be64  node_guid;
+	__be64  port_guid;
+	__be16  partition_cap;
+	__be16  device_id;
+	__be32  revision;
+	u8      local_port_num;
+	u8      vendor_id[3];   /* network byte order */
+} __attribute__ ((packed));
+
+#define OPA_PARTITION_TABLE_BLK_SIZE 32
+
 static inline u8
 opa_get_smp_direction(struct opa_smp *smp)
 {
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
new file mode 100644
index 0000000..78c442f
--- /dev/null
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -0,0 +1,427 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+#ifndef _LINUX__HFI1_USER_H
+#define _LINUX__HFI1_USER_H
+
+#include <linux/types.h>
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of hfi1_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures change in an incompatible
+ * way. The driver must be the same for initialization to succeed.
+ */
+#define HFI1_USER_SWMAJOR 4
+
+/*
+ * Minor version differences are always compatible
+ * a within a major version, however if user software is larger
+ * than driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define HFI1_USER_SWMINOR 0
+
+/*
+ * Set of HW and driver capability/feature bits.
+ * These bit values are used to configure enabled/disabled HW and
+ * driver features. The same set of bits are communicated to user
+ * space.
+ */
+#define HFI1_CAP_DMA_RTAIL        (1UL <<  0) /* Use DMA'ed RTail value */
+#define HFI1_CAP_SDMA             (1UL <<  1) /* Enable SDMA support */
+#define HFI1_CAP_SDMA_AHG         (1UL <<  2) /* Enable SDMA AHG support */
+#define HFI1_CAP_EXTENDED_PSN     (1UL <<  3) /* Enable Extended PSN support */
+#define HFI1_CAP_HDRSUPP          (1UL <<  4) /* Enable Header Suppression */
+/* 1UL << 5 reserved */
+#define HFI1_CAP_USE_SDMA_HEAD    (1UL <<  6) /* DMA Hdr Q tail vs. use CSR */
+#define HFI1_CAP_MULTI_PKT_EGR    (1UL <<  7) /* Enable multi-packet Egr buffs*/
+#define HFI1_CAP_NODROP_RHQ_FULL  (1UL <<  8) /* Don't drop on Hdr Q full */
+#define HFI1_CAP_NODROP_EGR_FULL  (1UL <<  9) /* Don't drop on EGR buffs full */
+#define HFI1_CAP_TID_UNMAP        (1UL << 10) /* Enable Expected TID caching */
+#define HFI1_CAP_PRINT_UNIMPL     (1UL << 11) /* Show for unimplemented feats */
+#define HFI1_CAP_ALLOW_PERM_JKEY  (1UL << 12) /* Allow use of permissive JKEY */
+#define HFI1_CAP_NO_INTEGRITY     (1UL << 13) /* Enable ctxt integrity checks */
+#define HFI1_CAP_PKEY_CHECK       (1UL << 14) /* Enable ctxt PKey checking */
+#define HFI1_CAP_STATIC_RATE_CTRL (1UL << 15) /* Allow PBC.StaticRateControl */
+#define HFI1_CAP_QSFP_ENABLED     (1UL << 16) /* Enable QSFP check during LNI */
+#define HFI1_CAP_SDMA_HEAD_CHECK  (1UL << 17) /* SDMA head checking */
+#define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */
+
+#define HFI1_RCVHDR_ENTSIZE_2    (1UL << 0)
+#define HFI1_RCVHDR_ENTSIZE_16   (1UL << 1)
+#define HFI1_RCVDHR_ENTSIZE_32   (1UL << 2)
+
+/*
+ * If the unit is specified via open, HFI choice is fixed.  If port is
+ * specified, it's also fixed.  Otherwise we try to spread contexts
+ * across ports and HFIs, using different algorithms.  WITHIN is
+ * the old default, prior to this mechanism.
+ */
+#define HFI1_ALG_ACROSS 0 /* round robin contexts across HFIs, then
+			  * ports; this is the default */
+#define HFI1_ALG_WITHIN 1 /* use all contexts on an HFI (round robin
+			  * active ports within), then next HFI */
+#define HFI1_ALG_COUNT  2 /* number of algorithm choices */
+
+
+/* User commands. */
+#define HFI1_CMD_ASSIGN_CTXT     1	/* allocate HFI and context */
+#define HFI1_CMD_CTXT_INFO       2	/* find out what resources we got */
+#define HFI1_CMD_USER_INFO       3	/* set up userspace */
+#define HFI1_CMD_TID_UPDATE      4	/* update expected TID entries */
+#define HFI1_CMD_TID_FREE        5	/* free expected TID entries */
+#define HFI1_CMD_CREDIT_UPD      6	/* force an update of PIO credit */
+#define HFI1_CMD_SDMA_STATUS_UPD 7       /* force update of SDMA status ring */
+
+#define HFI1_CMD_RECV_CTRL       8	/* control receipt of packets */
+#define HFI1_CMD_POLL_TYPE       9	/* set the kind of polling we want */
+#define HFI1_CMD_ACK_EVENT       10	/* ack & clear user status bits */
+#define HFI1_CMD_SET_PKEY        11      /* set context's pkey */
+#define HFI1_CMD_CTXT_RESET      12      /* reset context's HW send context */
+/* separate EPROM commands from normal PSM commands */
+#define HFI1_CMD_EP_INFO         64      /* read EPROM device ID */
+#define HFI1_CMD_EP_ERASE_CHIP   65      /* erase whole EPROM */
+#define HFI1_CMD_EP_ERASE_P0     66      /* erase EPROM partition 0 */
+#define HFI1_CMD_EP_ERASE_P1     67      /* erase EPROM partition 1 */
+#define HFI1_CMD_EP_READ_P0      68      /* read EPROM partition 0 */
+#define HFI1_CMD_EP_READ_P1      69      /* read EPROM partition 1 */
+#define HFI1_CMD_EP_WRITE_P0     70      /* write EPROM partition 0 */
+#define HFI1_CMD_EP_WRITE_P1     71      /* write EPROM partition 1 */
+
+#define _HFI1_EVENT_FROZEN_BIT       0
+#define _HFI1_EVENT_LINKDOWN_BIT     1
+#define _HFI1_EVENT_LID_CHANGE_BIT   2
+#define _HFI1_EVENT_LMC_CHANGE_BIT   3
+#define _HFI1_EVENT_SL2VL_CHANGE_BIT 4
+#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_SL2VL_CHANGE_BIT
+
+#define HFI1_EVENT_FROZEN                (1UL << _HFI1_EVENT_FROZEN_BIT)
+#define HFI1_EVENT_LINKDOWN_BIT		(1UL << _HFI1_EVENT_LINKDOWN_BIT)
+#define HFI1_EVENT_LID_CHANGE_BIT	(1UL << _HFI1_EVENT_LID_CHANGE_BIT)
+#define HFI1_EVENT_LMC_CHANGE_BIT	(1UL << _HFI1_EVENT_LMC_CHANGE_BIT)
+#define HFI1_EVENT_SL2VL_CHANGE_BIT	(1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT)
+
+/*
+ * These are the status bits readable (in ASCII form, 64bit value)
+ * from the "status" sysfs file.  For binary compatibility, values
+ * must remain as is; removed states can be reused for different
+ * purposes.
+ */
+#define HFI1_STATUS_INITTED       0x1    /* basic initialization done */
+/* Chip has been found and initialized */
+#define HFI1_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define HFI1_STATUS_IB_READY     0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define HFI1_STATUS_IB_CONF      0x80
+/* A Fatal hardware error has occurred. */
+#define HFI1_STATUS_HWERROR     0x200
+
+/*
+ * Number of supported shared contexts.
+ * This is the maximum number of software contexts that can share
+ * a hardware send/receive context.
+ */
+#define HFI1_MAX_SHARED_CTXTS 8
+
+/*
+ * Poll types
+ */
+#define HFI1_POLL_TYPE_ANYRCV     0x0
+#define HFI1_POLL_TYPE_URGENT     0x1
+
+/*
+ * This structure is passed to the driver to tell it where
+ * user code buffers are, sizes, etc.   The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility.  It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct hfi1_user_info {
+	/*
+	 * version of user software, to detect compatibility issues.
+	 * Should be set to HFI1_USER_SWVERSION.
+	 */
+	__u32 userversion;
+	__u16 pad;
+	/* HFI selection algorithm, if unit has not selected */
+	__u16 hfi1_alg;
+	/*
+	 * If two or more processes wish to share a context, each process
+	 * must set the subcontext_cnt and subcontext_id to the same
+	 * values.  The only restriction on the subcontext_id is that
+	 * it be unique for a given node.
+	 */
+	__u16 subctxt_cnt;
+	__u16 subctxt_id;
+	/* 128bit UUID passed in by PSM. */
+	__u8 uuid[16];
+};
+
+struct hfi1_ctxt_info {
+	__u64 runtime_flags;    /* chip/drv runtime flags (HFI1_CAP_*) */
+	__u32 rcvegr_size;      /* size of each eager buffer */
+	__u16 num_active;       /* number of active units */
+	__u16 unit;             /* unit (chip) assigned to caller */
+	__u16 ctxt;             /* ctxt on unit assigned to caller */
+	__u16 subctxt;          /* subctxt on unit assigned to caller */
+	__u16 rcvtids;          /* number of Rcv TIDs for this context */
+	__u16 credits;          /* number of PIO credits for this context */
+	__u16 numa_node;        /* NUMA node of the assigned device */
+	__u16 rec_cpu;          /* cpu # for affinity (0xffff if none) */
+	__u16 send_ctxt;        /* send context in use by this user context */
+	__u16 egrtids;          /* number of RcvArray entries for Eager Rcvs */
+	__u16 rcvhdrq_cnt;      /* number of RcvHdrQ entries */
+	__u16 rcvhdrq_entsize;  /* size (in bytes) for each RcvHdrQ entry */
+	__u16 sdma_ring_size;   /* number of entries in SDMA request ring */
+};
+
+struct hfi1_tid_info {
+	/* virtual address of first page in transfer */
+	__u64 vaddr;
+	/* pointer to tid array. this array is big enough */
+	__u64 tidlist;
+	/* number of tids programmed by this request */
+	__u32 tidcnt;
+	/* length of transfer buffer programmed by this request */
+	__u32 length;
+	/*
+	 * pointer to bitmap of TIDs used for this call;
+	 * checked for being large enough at open
+	 */
+	__u64 tidmap;
+};
+
+struct hfi1_cmd {
+	__u32 type;        /* command type */
+	__u32 len;         /* length of struct pointed to by add */
+	__u64 addr;        /* pointer to user structure */
+};
+
+enum hfi1_sdma_comp_state {
+	FREE = 0,
+	QUEUED,
+	COMPLETE,
+	ERROR
+};
+
+/*
+ * SDMA completion ring entry
+ */
+struct hfi1_sdma_comp_entry {
+	__u32 status;
+	__u32 errcode;
+};
+
+/*
+ * Device status and notifications from driver to user-space.
+ */
+struct hfi1_status {
+	__u64 dev;      /* device/hw status bits */
+	__u64 port;     /* port state and status bits */
+	char freezemsg[0];
+};
+
+/*
+ * This structure is returned by the driver immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explicit pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit * bit kernel requires the user code
+ * to have matching offsets
+ */
+struct hfi1_base_info {
+	/* version of hardware, for feature checking. */
+	__u32 hw_version;
+	/* version of software, for feature checking. */
+	__u32 sw_version;
+	/* Job key */
+	__u16 jkey;
+	__u16 padding1;
+	/*
+	 * The special QP (queue pair) value that identifies PSM
+	 * protocol packet from standard IB packets.
+	 */
+	__u32 bthqp;
+	/* PIO credit return address, */
+	__u64 sc_credits_addr;
+	/*
+	 * Base address of write-only pio buffers for this process.
+	 * Each buffer has sendpio_credits*64 bytes.
+	 */
+	__u64 pio_bufbase_sop;
+	/*
+	 * Base address of write-only pio buffers for this process.
+	 * Each buffer has sendpio_credits*64 bytes.
+	 */
+	__u64 pio_bufbase;
+	/* address where receive buffer queue is mapped into */
+	__u64 rcvhdr_bufbase;
+	/* base address of Eager receive buffers. */
+	__u64 rcvegr_bufbase;
+	/* base address of SDMA completion ring */
+	__u64 sdma_comp_bufbase;
+	/*
+	 * User register base for init code, not to be used directly by
+	 * protocol or applications.  Always maps real chip register space.
+	 * the register addresses are:
+	 * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail,
+	 * ur_rcvtidflow
+	 */
+	__u64 user_regbase;
+	/* notification events */
+	__u64 events_bufbase;
+	/* status page */
+	__u64 status_bufbase;
+	/* rcvhdrtail update */
+	__u64 rcvhdrtail_base;
+	/*
+	 * shared memory pages for subctxts if ctxt is shared; these cover
+	 * all the processes in the group sharing a single context.
+	 * all have enough space for the num_subcontexts value on this job.
+	 */
+	__u64 subctxt_uregbase;
+	__u64 subctxt_rcvegrbuf;
+	__u64 subctxt_rcvhdrbuf;
+};
+
+enum sdma_req_opcode {
+	EXPECTED = 0,
+	EAGER
+};
+
+#define HFI1_SDMA_REQ_VERSION_MASK 0xF
+#define HFI1_SDMA_REQ_VERSION_SHIFT 0x0
+#define HFI1_SDMA_REQ_OPCODE_MASK 0xF
+#define HFI1_SDMA_REQ_OPCODE_SHIFT 0x4
+#define HFI1_SDMA_REQ_IOVCNT_MASK 0xFF
+#define HFI1_SDMA_REQ_IOVCNT_SHIFT 0x8
+
+struct sdma_req_info {
+	/*
+	 * bits 0-3 - version (currently unused)
+	 * bits 4-7 - opcode (enum sdma_req_opcode)
+	 * bits 8-15 - io vector count
+	 */
+	__u16 ctrl;
+	/*
+	 * Number of fragments contained in this request.
+	 * User-space has already computed how many
+	 * fragment-sized packet the user buffer will be
+	 * split into.
+	 */
+	__u16 npkts;
+	/*
+	 * Size of each fragment the user buffer will be
+	 * split into.
+	 */
+	__u16 fragsize;
+	/*
+	 * Index of the slot in the SDMA completion ring
+	 * this request should be using. User-space is
+	 * in charge of managing its own ring.
+	 */
+	__u16 comp_idx;
+} __packed;
+
+/*
+ * SW KDETH header.
+ * swdata is SW defined portion.
+ */
+struct hfi1_kdeth_header {
+	__le32 ver_tid_offset;
+	__le16 jkey;
+	__le16 hcrc;
+	__le32 swdata[7];
+} __packed;
+
+/*
+ * Structure describing the headers that User space uses. The
+ * structure above is a subset of this one.
+ */
+struct hfi1_pkt_header {
+	__le16 pbc[4];
+	__be16 lrh[4];
+	__be32 bth[3];
+	struct hfi1_kdeth_header kdeth;
+} __packed;
+
+
+/*
+ * The list of usermode accessible registers.
+ */
+enum hfi1_ureg {
+	/* (RO)  DMA RcvHdr to be used next. */
+	ur_rcvhdrtail = 0,
+	/* (RW)  RcvHdr entry to be processed next by host. */
+	ur_rcvhdrhead = 1,
+	/* (RO)  Index of next Eager index to use. */
+	ur_rcvegrindextail = 2,
+	/* (RW)  Eager TID to be processed next */
+	ur_rcvegrindexhead = 3,
+	/* (RO)  Receive Eager Offset Tail */
+	ur_rcvegroffsettail = 4,
+	/* For internal use only; max register number. */
+	ur_maxreg,
+	/* (RW)  Receive TID flow table */
+	ur_rcvtidflowtable = 256
+};
+
+#endif /* _LINIUX__HFI1_USER_H */
-- 
cgit v0.10.2


From 7724105686e718ac476a6ad3304fea2fbcfcffde Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Thu, 30 Jul 2015 15:17:43 -0400
Subject: IB/hfi1: add driver files

Signed-off-by: Andrew Friedley <andrew.friedley@intel.com>
Signed-off-by: Arthur Kepner <arthur.kepner@intel.com>
Signed-off-by: Brendan Cunningham <brendan.cunningham@intel.com>
Signed-off-by: Brian Welty <brian.welty@intel.com>
Signed-off-by: Caz Yokoyama <caz.yokoyama@intel.com>
Signed-off-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Easwar Hariharan <easwar.hariharan@intel.com>
Signed-off-by: Harish Chegondi <harish.chegondi@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Jim Snow <jim.m.snow@intel.com>
Signed-off-by: John Gregor <john.a.gregor@intel.com>
Signed-off-by: Jubin John <jubin.john@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Kevin Pine <kevin.pine@intel.com>
Signed-off-by: Kyle Liddell <kyle.liddell@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Ravi Krishnaswamy <ravi.krishnaswamy@intel.com>
Signed-off-by: Sadanand Warrier <sadanand.warrier@intel.com>
Signed-off-by: Sanath Kumar <sanath.s.kumar@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Vlad Danushevsky <vladimir.danusevsky@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt
index ddd519b..9028b02 100644
--- a/Documentation/infiniband/sysfs.txt
+++ b/Documentation/infiniband/sysfs.txt
@@ -64,3 +64,23 @@ MTHCA
     fw_ver   - Firmware version
     hca_type - HCA type: "MT23108", "MT25208 (MT23108 compat mode)",
                or "MT25208"
+
+HFI1
+
+  The hfi1 driver also creates these additional files:
+
+   hw_rev - hardware revision
+   board_id - manufacturing board id
+   tempsense - thermal sense information
+   serial - board serial number
+   nfreectxts - number of free user contexts
+   nctxts - number of allowed contexts (PSM2)
+   chip_reset - diagnostic (root only)
+   boardversion - board version
+   ports/1/
+          CMgtA/
+               cc_settings_bin - CCA tables used by PSM2
+               cc_table_bin
+          sc2v/ - 32 files (0 - 31) used to translate sl->vl
+          sl2sc/ - 32 files (0 - 31) used to translate sl->sc
+          vl2mtu/ - 16 (0 - 15) files used to determine MTU for vl
diff --git a/MAINTAINERS b/MAINTAINERS
index db1a523..2249ac8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9809,6 +9809,12 @@ M:	Arnaud Patard <arnaud.patard@rtp-net.org>
 S:	Odd Fixes
 F:	drivers/staging/xgifb/
 
+HFI1 DRIVER
+M:	Mike Marciniszyn <infinipath@intel.com>
+L:	linux-rdma@vger.kernel.org
+S:	Supported
+F:	drivers/staging/rdma/hfi1
+
 STARFIRE/DURALAN NETWORK DRIVER
 M:	Ion Badulescu <ionut@badula.org>
 S:	Odd Fixes
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
index 5084088..cf5fe9b 100644
--- a/drivers/staging/rdma/Kconfig
+++ b/drivers/staging/rdma/Kconfig
@@ -24,6 +24,8 @@ if STAGING_RDMA
 
 source "drivers/staging/rdma/amso1100/Kconfig"
 
+source "drivers/staging/rdma/hfi1/Kconfig"
+
 source "drivers/staging/rdma/ipath/Kconfig"
 
 endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
index a2a459a..cbd915a 100644
--- a/drivers/staging/rdma/Makefile
+++ b/drivers/staging/rdma/Makefile
@@ -1,3 +1,4 @@
 # Entries for RDMA_STAGING tree
 obj-$(CONFIG_INFINIBAND_AMSO1100)	+= amso1100/
+obj-$(CONFIG_INFINIBAND_HFI1)	+= hfi1/
 obj-$(CONFIG_INFINIBAND_IPATH)	+= ipath/
diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig
new file mode 100644
index 0000000..fd25078
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/Kconfig
@@ -0,0 +1,37 @@
+config INFINIBAND_HFI1
+	tristate "Intel OPA Gen1 support"
+	depends on X86_64
+	default m
+	---help---
+	This is a low-level driver for Intel OPA Gen1 adapter.
+config HFI1_DEBUG_SDMA_ORDER
+	bool "HFI1 SDMA Order debug"
+	depends on INFINIBAND_HFI1
+	default n
+	---help---
+	This is a debug flag to test for out of order
+	sdma completions for unit testing
+config HFI1_VERBS_31BIT_PSN
+	bool "HFI1 enable 31 bit PSN"
+	depends on INFINIBAND_HFI1
+	default y
+	---help---
+	Setting this enables 31 BIT PSN
+	For verbs RC/UC
+config SDMA_VERBOSITY
+	bool "Config SDMA Verbosity"
+	depends on INFINIBAND_HFI1
+	default n
+	---help---
+	This is a configuration flag to enable verbose
+	SDMA debug
+config PRESCAN_RXQ
+	bool "Enable prescanning of the RX queue for ECNs"
+	depends on INFINIBAND_HFI1
+	default n
+	---help---
+	This option toggles the prescanning of the receive queue for
+	Explicit Congestion Notifications. If an ECN is detected, it
+	is processed as quickly as possible, the ECN is toggled off.
+	After the prescanning step, the receive queue is processed as
+	usual.
diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile
new file mode 100644
index 0000000..2e5daa6
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/Makefile
@@ -0,0 +1,19 @@
+#
+# HFI driver
+#
+#
+#
+# Called from the kernel module build system.
+#
+obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
+
+hfi1-y := chip.o cq.o device.o diag.o dma.o driver.o eprom.o file_ops.o firmware.o \
+	init.o intr.o keys.o mad.o mmap.o mr.o pcie.o pio.o pio_copy.o \
+	qp.o qsfp.o rc.o ruc.o sdma.o srq.o sysfs.o trace.o twsi.o \
+	uc.o ud.o user_pages.o user_sdma.o verbs_mcast.o verbs.o
+hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
+
+CFLAGS_trace.o = -I$(src)
+ifdef MVERSION
+CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
+endif
diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO
new file mode 100644
index 0000000..05de0da
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/TODO
@@ -0,0 +1,6 @@
+July, 2015
+
+- Remove unneeded file entries in sysfs
+- Remove software processing of IB protocol and place in library for use
+  by qib, ipath (if still present), hfi1, and eventually soft-roce
+
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
new file mode 100644
index 0000000..654eafe
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -0,0 +1,10798 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the code that is specific to the HFI chip
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "mad.h"
+#include "pio.h"
+#include "sdma.h"
+#include "eprom.h"
+
+#define NUM_IB_PORTS 1
+
+uint kdeth_qp;
+module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
+MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
+
+uint num_vls = HFI1_MAX_VLS_SUPPORTED;
+module_param(num_vls, uint, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+/*
+ * Default time to aggregate two 10K packets from the idle state
+ * (timer not running). The timer starts at the end of the first packet,
+ * so only the time for one 10K packet and header plus a bit extra is needed.
+ * 10 * 1024 + 64 header byte = 10304 byte
+ * 10304 byte / 12.5 GB/s = 824.32ns
+ */
+uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
+module_param(rcv_intr_timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
+
+uint rcv_intr_count = 16; /* same as qib */
+module_param(rcv_intr_count, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
+
+ushort link_crc_mask = SUPPORTED_CRCS;
+module_param(link_crc_mask, ushort, S_IRUGO);
+MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
+
+uint loopback;
+module_param_named(loopback, loopback, uint, S_IRUGO);
+MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable");
+
+/* Other driver tunables */
+uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/
+static ushort crc_14b_sideband = 1;
+static uint use_flr = 1;
+uint quick_linkup; /* skip LNI */
+
+struct flag_table {
+	u64 flag;	/* the flag */
+	char *str;	/* description string */
+	u16 extra;	/* extra information */
+	u16 unused0;
+	u32 unused1;
+};
+
+/* str must be a string constant */
+#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
+#define FLAG_ENTRY0(str, flag) {flag, str, 0}
+
+/* Send Error Consequences */
+#define SEC_WRITE_DROPPED	0x1
+#define SEC_PACKET_DROPPED	0x2
+#define SEC_SC_HALTED		0x4	/* per-context only */
+#define SEC_SPC_FREEZE		0x8	/* per-HFI only */
+
+#define VL15CTXT                  1
+#define MIN_KERNEL_KCTXTS         2
+#define NUM_MAP_REGS             32
+
+/* Bit offset into the GUID which carries HFI id information */
+#define GUID_HFI_INDEX_SHIFT     39
+
+/* extract the emulation revision */
+#define emulator_rev(dd) ((dd)->irev >> 8)
+/* parallel and serial emulation versions are 3 and 4 respectively */
+#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
+#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
+
+/* RSM fields */
+
+/* packet type */
+#define IB_PACKET_TYPE         2ull
+#define QW_SHIFT               6ull
+/* QPN[7..1] */
+#define QPN_WIDTH              7ull
+
+/* LRH.BTH: QW 0, OFFSET 48 - for match */
+#define LRH_BTH_QW             0ull
+#define LRH_BTH_BIT_OFFSET     48ull
+#define LRH_BTH_OFFSET(off)    ((LRH_BTH_QW << QW_SHIFT) | (off))
+#define LRH_BTH_MATCH_OFFSET   LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
+#define LRH_BTH_SELECT
+#define LRH_BTH_MASK           3ull
+#define LRH_BTH_VALUE          2ull
+
+/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
+#define LRH_SC_QW              0ull
+#define LRH_SC_BIT_OFFSET      56ull
+#define LRH_SC_OFFSET(off)     ((LRH_SC_QW << QW_SHIFT) | (off))
+#define LRH_SC_MATCH_OFFSET    LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
+#define LRH_SC_MASK            128ull
+#define LRH_SC_VALUE           0ull
+
+/* SC[n..0] QW 0, OFFSET 60 - for select */
+#define LRH_SC_SELECT_OFFSET  ((LRH_SC_QW << QW_SHIFT) | (60ull))
+
+/* QPN[m+n:1] QW 1, OFFSET 1 */
+#define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
+
+/* defines to build power on SC2VL table */
+#define SC2VL_VAL( \
+	num, \
+	sc0, sc0val, \
+	sc1, sc1val, \
+	sc2, sc2val, \
+	sc3, sc3val, \
+	sc4, sc4val, \
+	sc5, sc5val, \
+	sc6, sc6val, \
+	sc7, sc7val) \
+( \
+	((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
+	((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
+	((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
+	((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
+	((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
+	((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
+	((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
+	((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT)   \
+)
+
+#define DC_SC_VL_VAL( \
+	range, \
+	e0, e0val, \
+	e1, e1val, \
+	e2, e2val, \
+	e3, e3val, \
+	e4, e4val, \
+	e5, e5val, \
+	e6, e6val, \
+	e7, e7val, \
+	e8, e8val, \
+	e9, e9val, \
+	e10, e10val, \
+	e11, e11val, \
+	e12, e12val, \
+	e13, e13val, \
+	e14, e14val, \
+	e15, e15val) \
+( \
+	((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
+	((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
+	((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
+	((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
+	((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
+	((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
+	((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
+	((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
+	((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
+	((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
+	((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
+	((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
+	((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
+	((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
+	((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
+	((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
+)
+
+/* all CceStatus sub-block freeze bits */
+#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
+			| CCE_STATUS_RXE_FROZE_SMASK \
+			| CCE_STATUS_TXE_FROZE_SMASK \
+			| CCE_STATUS_TXE_PIO_FROZE_SMASK)
+/* all CceStatus sub-block TXE pause bits */
+#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
+			| CCE_STATUS_TXE_PAUSED_SMASK \
+			| CCE_STATUS_SDMA_PAUSED_SMASK)
+/* all CceStatus sub-block RXE pause bits */
+#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
+
+/*
+ * CCE Error flags.
+ */
+static struct flag_table cce_err_status_flags[] = {
+/* 0*/	FLAG_ENTRY0("CceCsrParityErr",
+		CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
+/* 1*/	FLAG_ENTRY0("CceCsrReadBadAddrErr",
+		CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
+/* 2*/	FLAG_ENTRY0("CceCsrWriteBadAddrErr",
+		CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
+/* 3*/	FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
+		CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 4*/	FLAG_ENTRY0("CceTrgtAccessErr",
+		CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
+/* 5*/	FLAG_ENTRY0("CceRspdDataParityErr",
+		CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
+/* 6*/	FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
+		CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 7*/	FLAG_ENTRY0("CceCsrCfgBusParityErr",
+		CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
+/* 8*/	FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
+		CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 9*/	FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+	    CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
+/*10*/	FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+	    CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
+/*11*/	FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
+	    CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
+/*12*/	FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
+		CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
+/*13*/	FLAG_ENTRY0("PcicRetryMemCorErr",
+		CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
+/*14*/	FLAG_ENTRY0("PcicRetryMemCorErr",
+		CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
+/*15*/	FLAG_ENTRY0("PcicPostHdQCorErr",
+		CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
+/*16*/	FLAG_ENTRY0("PcicPostHdQCorErr",
+		CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
+/*17*/	FLAG_ENTRY0("PcicPostHdQCorErr",
+		CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
+/*18*/	FLAG_ENTRY0("PcicCplDatQCorErr",
+		CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
+/*19*/	FLAG_ENTRY0("PcicNPostHQParityErr",
+		CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
+/*20*/	FLAG_ENTRY0("PcicNPostDatQParityErr",
+		CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
+/*21*/	FLAG_ENTRY0("PcicRetryMemUncErr",
+		CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
+/*22*/	FLAG_ENTRY0("PcicRetrySotMemUncErr",
+		CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
+/*23*/	FLAG_ENTRY0("PcicPostHdQUncErr",
+		CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
+/*24*/	FLAG_ENTRY0("PcicPostDatQUncErr",
+		CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
+/*25*/	FLAG_ENTRY0("PcicCplHdQUncErr",
+		CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
+/*26*/	FLAG_ENTRY0("PcicCplDatQUncErr",
+		CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
+/*27*/	FLAG_ENTRY0("PcicTransmitFrontParityErr",
+		CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
+/*28*/	FLAG_ENTRY0("PcicTransmitBackParityErr",
+		CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
+/*29*/	FLAG_ENTRY0("PcicReceiveParityErr",
+		CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
+/*30*/	FLAG_ENTRY0("CceTrgtCplTimeoutErr",
+		CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
+/*31*/	FLAG_ENTRY0("LATriggered",
+		CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
+/*32*/	FLAG_ENTRY0("CceSegReadBadAddrErr",
+		CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
+/*33*/	FLAG_ENTRY0("CceSegWriteBadAddrErr",
+		CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
+/*34*/	FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
+		CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
+/*35*/	FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
+		CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
+/*36*/	FLAG_ENTRY0("CceMsixTableCorErr",
+		CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
+/*37*/	FLAG_ENTRY0("CceMsixTableUncErr",
+		CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
+/*38*/	FLAG_ENTRY0("CceIntMapCorErr",
+		CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
+/*39*/	FLAG_ENTRY0("CceIntMapUncErr",
+		CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
+/*40*/	FLAG_ENTRY0("CceMsixCsrParityErr",
+		CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
+/*41-63 reserved*/
+};
+
+/*
+ * Misc Error flags
+ */
+#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
+static struct flag_table misc_err_status_flags[] = {
+/* 0*/	FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
+/* 1*/	FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
+/* 2*/	FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
+/* 3*/	FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
+/* 4*/	FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
+/* 5*/	FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
+/* 6*/	FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
+/* 7*/	FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
+/* 8*/	FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
+/* 9*/	FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
+/*10*/	FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
+/*11*/	FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
+/*12*/	FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
+};
+
+/*
+ * TXE PIO Error flags and consequences
+ */
+static struct flag_table pio_err_status_flags[] = {
+/* 0*/	FLAG_ENTRY("PioWriteBadCtxt",
+	SEC_WRITE_DROPPED,
+	SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
+/* 1*/	FLAG_ENTRY("PioWriteAddrParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
+/* 2*/	FLAG_ENTRY("PioCsrParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
+/* 3*/	FLAG_ENTRY("PioSbMemFifo0",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
+/* 4*/	FLAG_ENTRY("PioSbMemFifo1",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
+/* 5*/	FLAG_ENTRY("PioPccFifoParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
+/* 6*/	FLAG_ENTRY("PioPecFifoParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
+/* 7*/	FLAG_ENTRY("PioSbrdctlCrrelParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
+/* 8*/	FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
+/* 9*/	FLAG_ENTRY("PioPktEvictFifoParityErr",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
+/*10*/	FLAG_ENTRY("PioSmPktResetParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
+/*11*/	FLAG_ENTRY("PioVlLenMemBank0Unc",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
+/*12*/	FLAG_ENTRY("PioVlLenMemBank1Unc",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
+/*13*/	FLAG_ENTRY("PioVlLenMemBank0Cor",
+	0,
+	SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
+/*14*/	FLAG_ENTRY("PioVlLenMemBank1Cor",
+	0,
+	SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
+/*15*/	FLAG_ENTRY("PioCreditRetFifoParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
+/*16*/	FLAG_ENTRY("PioPpmcPblFifo",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
+/*17*/	FLAG_ENTRY("PioInitSmIn",
+	0,
+	SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
+/*18*/	FLAG_ENTRY("PioPktEvictSmOrArbSm",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
+/*19*/	FLAG_ENTRY("PioHostAddrMemUnc",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
+/*20*/	FLAG_ENTRY("PioHostAddrMemCor",
+	0,
+	SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
+/*21*/	FLAG_ENTRY("PioWriteDataParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
+/*22*/	FLAG_ENTRY("PioStateMachine",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
+/*23*/	FLAG_ENTRY("PioWriteQwValidParity",
+	SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
+/*24*/	FLAG_ENTRY("PioBlockQwCountParity",
+	SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
+/*25*/	FLAG_ENTRY("PioVlfVlLenParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
+/*26*/	FLAG_ENTRY("PioVlfSopParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
+/*27*/	FLAG_ENTRY("PioVlFifoParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
+/*28*/	FLAG_ENTRY("PioPpmcBqcMemParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
+/*29*/	FLAG_ENTRY("PioPpmcSopLen",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
+/*30-31 reserved*/
+/*32*/	FLAG_ENTRY("PioCurrentFreeCntParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
+/*33*/	FLAG_ENTRY("PioLastReturnedCntParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
+/*34*/	FLAG_ENTRY("PioPccSopHeadParity",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
+/*35*/	FLAG_ENTRY("PioPecSopHeadParityErr",
+	SEC_SPC_FREEZE,
+	SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
+/*36-63 reserved*/
+};
+
+/* TXE PIO errors that cause an SPC freeze */
+#define ALL_PIO_FREEZE_ERR \
+	(SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+	| SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
+
+/*
+ * TXE SDMA Error flags
+ */
+static struct flag_table sdma_err_status_flags[] = {
+/* 0*/	FLAG_ENTRY0("SDmaRpyTagErr",
+		SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
+/* 1*/	FLAG_ENTRY0("SDmaCsrParityErr",
+		SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
+/* 2*/	FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
+		SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
+/* 3*/	FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
+		SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
+/*04-63 reserved*/
+};
+
+/* TXE SDMA errors that cause an SPC freeze */
+#define ALL_SDMA_FREEZE_ERR  \
+		(SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
+		| SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
+		| SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
+
+/*
+ * TXE Egress Error flags
+ */
+#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
+static struct flag_table egress_err_status_flags[] = {
+/* 0*/	FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
+/* 1*/	FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
+/* 2 reserved */
+/* 3*/	FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
+		SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
+/* 4*/	FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
+/* 5*/	FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
+/* 6 reserved */
+/* 7*/	FLAG_ENTRY0("TxPioLaunchIntfParityErr",
+		SEES(TX_PIO_LAUNCH_INTF_PARITY)),
+/* 8*/	FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
+		SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
+/* 9-10 reserved */
+/*11*/	FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
+		SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
+/*12*/	FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
+/*13*/	FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
+/*14*/	FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
+/*15*/	FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
+/*16*/	FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
+		SEES(TX_SDMA0_DISALLOWED_PACKET)),
+/*17*/	FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
+		SEES(TX_SDMA1_DISALLOWED_PACKET)),
+/*18*/	FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
+		SEES(TX_SDMA2_DISALLOWED_PACKET)),
+/*19*/	FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
+		SEES(TX_SDMA3_DISALLOWED_PACKET)),
+/*20*/	FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
+		SEES(TX_SDMA4_DISALLOWED_PACKET)),
+/*21*/	FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
+		SEES(TX_SDMA5_DISALLOWED_PACKET)),
+/*22*/	FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
+		SEES(TX_SDMA6_DISALLOWED_PACKET)),
+/*23*/	FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
+		SEES(TX_SDMA7_DISALLOWED_PACKET)),
+/*24*/	FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
+		SEES(TX_SDMA8_DISALLOWED_PACKET)),
+/*25*/	FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
+		SEES(TX_SDMA9_DISALLOWED_PACKET)),
+/*26*/	FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
+		SEES(TX_SDMA10_DISALLOWED_PACKET)),
+/*27*/	FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
+		SEES(TX_SDMA11_DISALLOWED_PACKET)),
+/*28*/	FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
+		SEES(TX_SDMA12_DISALLOWED_PACKET)),
+/*29*/	FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
+		SEES(TX_SDMA13_DISALLOWED_PACKET)),
+/*30*/	FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
+		SEES(TX_SDMA14_DISALLOWED_PACKET)),
+/*31*/	FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
+		SEES(TX_SDMA15_DISALLOWED_PACKET)),
+/*32*/	FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
+/*33*/	FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
+/*34*/	FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
+/*35*/	FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
+/*36*/	FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
+/*37*/	FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
+/*38*/	FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
+/*39*/	FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
+/*40*/	FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
+		SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
+/*41*/	FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
+/*42*/	FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
+/*43*/	FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
+/*44*/	FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
+/*45*/	FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
+/*46*/	FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
+/*47*/	FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
+/*48*/	FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
+/*49*/	FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
+/*50*/	FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
+/*51*/	FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
+/*52*/	FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
+/*53*/	FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
+/*54*/	FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
+/*55*/	FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
+/*56*/	FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
+/*57*/	FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
+/*58*/	FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
+/*59*/	FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
+/*60*/	FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
+/*61*/	FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
+/*62*/	FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
+		SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
+/*63*/	FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
+		SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
+};
+
+/*
+ * TXE Egress Error Info flags
+ */
+#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
+static struct flag_table egress_err_info_flags[] = {
+/* 0*/	FLAG_ENTRY0("Reserved", 0ull),
+/* 1*/	FLAG_ENTRY0("VLErr", SEEI(VL)),
+/* 2*/	FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 3*/	FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 4*/	FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
+/* 5*/	FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
+/* 6*/	FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
+/* 7*/	FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
+/* 8*/	FLAG_ENTRY0("RawErr", SEEI(RAW)),
+/* 9*/	FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
+/*10*/	FLAG_ENTRY0("GRHErr", SEEI(GRH)),
+/*11*/	FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
+/*12*/	FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
+/*13*/	FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
+/*14*/	FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
+/*15*/	FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
+/*16*/	FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
+/*17*/	FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
+/*18*/	FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
+/*19*/	FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
+/*20*/	FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
+/*21*/	FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
+};
+
+/* TXE Egress errors that cause an SPC freeze */
+#define ALL_TXE_EGRESS_FREEZE_ERR \
+	(SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
+	| SEES(TX_PIO_LAUNCH_INTF_PARITY) \
+	| SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
+	| SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
+	| SEES(TX_LAUNCH_CSR_PARITY) \
+	| SEES(TX_SBRD_CTL_CSR_PARITY) \
+	| SEES(TX_CONFIG_PARITY) \
+	| SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
+	| SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
+	| SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
+	| SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
+	| SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
+	| SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
+	| SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
+	| SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
+	| SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
+	| SEES(TX_CREDIT_RETURN_PARITY))
+
+/*
+ * TXE Send error flags
+ */
+#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
+static struct flag_table send_err_status_flags[] = {
+/* 0*/	FLAG_ENTRY0("SDmaRpyTagErr", SES(CSR_PARITY)),
+/* 1*/	FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
+/* 2*/	FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
+};
+
+/*
+ * TXE Send Context Error flags and consequences
+ */
+static struct flag_table sc_err_status_flags[] = {
+/* 0*/	FLAG_ENTRY("InconsistentSop",
+		SEC_PACKET_DROPPED | SEC_SC_HALTED,
+		SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
+/* 1*/	FLAG_ENTRY("DisallowedPacket",
+		SEC_PACKET_DROPPED | SEC_SC_HALTED,
+		SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
+/* 2*/	FLAG_ENTRY("WriteCrossesBoundary",
+		SEC_WRITE_DROPPED | SEC_SC_HALTED,
+		SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
+/* 3*/	FLAG_ENTRY("WriteOverflow",
+		SEC_WRITE_DROPPED | SEC_SC_HALTED,
+		SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
+/* 4*/	FLAG_ENTRY("WriteOutOfBounds",
+		SEC_WRITE_DROPPED | SEC_SC_HALTED,
+		SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
+/* 5-63 reserved*/
+};
+
+/*
+ * RXE Receive Error flags
+ */
+#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
+static struct flag_table rxe_err_status_flags[] = {
+/* 0*/	FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
+/* 1*/	FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
+/* 2*/	FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
+/* 3*/	FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
+/* 4*/	FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
+/* 5*/	FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
+/* 6*/	FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
+/* 7*/	FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
+/* 8*/	FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
+/* 9*/	FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
+/*10*/	FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
+/*11*/	FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
+/*12*/	FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
+/*13*/	FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
+/*14*/	FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
+/*15*/	FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
+/*16*/	FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
+		RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
+/*17*/	FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
+/*18*/	FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
+/*19*/	FLAG_ENTRY0("RxRbufBlockListReadUncErr",
+		RXES(RBUF_BLOCK_LIST_READ_UNC)),
+/*20*/	FLAG_ENTRY0("RxRbufBlockListReadCorErr",
+		RXES(RBUF_BLOCK_LIST_READ_COR)),
+/*21*/	FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
+		RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
+/*22*/	FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
+		RXES(RBUF_CSR_QENT_CNT_PARITY)),
+/*23*/	FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
+		RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
+/*24*/	FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
+		RXES(RBUF_CSR_QVLD_BIT_PARITY)),
+/*25*/	FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
+/*26*/	FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
+/*27*/	FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
+		RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
+/*28*/	FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
+/*29*/	FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
+/*30*/	FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
+/*31*/	FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
+/*32*/	FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
+/*33*/	FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
+/*34*/	FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
+/*35*/	FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
+		RXES(RBUF_FL_INITDONE_PARITY)),
+/*36*/	FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
+		RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
+/*37*/	FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
+/*38*/	FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
+/*39*/	FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
+/*40*/	FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
+		RXES(LOOKUP_DES_PART1_UNC_COR)),
+/*41*/	FLAG_ENTRY0("RxLookupDesPart2ParityErr",
+		RXES(LOOKUP_DES_PART2_PARITY)),
+/*42*/	FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
+/*43*/	FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
+/*44*/	FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
+/*45*/	FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
+/*46*/	FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
+/*47*/	FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
+/*48*/	FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
+/*49*/	FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
+/*50*/	FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
+/*51*/	FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
+/*52*/	FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
+/*53*/	FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
+/*54*/	FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
+/*55*/	FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
+/*56*/	FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
+/*57*/	FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
+/*58*/	FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
+/*59*/	FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
+/*60*/	FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
+/*61*/	FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
+/*62*/	FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
+/*63*/	FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
+};
+
+/* RXE errors that will trigger an SPC freeze */
+#define ALL_RXE_FREEZE_ERR  \
+	(RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
+	| RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
+
+#define RXE_FREEZE_ABORT_MASK \
+	(RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
+	RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
+	RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
+
+/*
+ * DCC Error Flags
+ */
+#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
+static struct flag_table dcc_err_flags[] = {
+	FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
+	FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
+	FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
+	FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
+	FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
+	FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
+	FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
+	FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
+	FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
+	FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
+	FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
+	FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
+	FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
+	FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
+	FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
+	FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
+	FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
+	FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
+	FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
+	FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
+	FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
+	FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
+	FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
+	FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
+	FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
+	FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
+	FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
+	FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
+	FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
+	FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
+	FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
+	FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
+	FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
+	FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
+	FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
+	FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
+	FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
+	FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
+	FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
+	FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
+	FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
+	FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
+	FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
+	FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
+	FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
+	FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
+};
+
+/*
+ * LCB error flags
+ */
+#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
+static struct flag_table lcb_err_flags[] = {
+/* 0*/	FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
+/* 1*/	FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
+/* 2*/	FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
+/* 3*/	FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
+		LCBE(ALL_LNS_FAILED_REINIT_TEST)),
+/* 4*/	FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
+/* 5*/	FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
+/* 6*/	FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
+/* 7*/	FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
+/* 8*/	FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
+/* 9*/	FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
+/*10*/	FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
+/*11*/	FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
+/*12*/	FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
+/*13*/	FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
+		LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
+/*14*/	FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
+/*15*/	FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
+/*16*/	FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
+/*17*/	FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
+/*18*/	FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
+/*19*/	FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
+		LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
+/*20*/	FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
+/*21*/	FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
+/*22*/	FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
+/*23*/	FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
+/*24*/	FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
+/*25*/	FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
+/*26*/	FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
+		LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
+/*27*/	FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
+/*28*/	FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
+		LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
+/*29*/	FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
+		LCBE(REDUNDANT_FLIT_PARITY_ERR))
+};
+
+/*
+ * DC8051 Error Flags
+ */
+#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
+static struct flag_table dc8051_err_flags[] = {
+	FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
+	FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
+	FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
+	FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
+	FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
+	FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
+	FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
+	FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
+	FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
+		D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
+	FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
+};
+
+/*
+ * DC8051 Information Error flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
+ */
+static struct flag_table dc8051_info_err_flags[] = {
+	FLAG_ENTRY0("Spico ROM check failed",  SPICO_ROM_FAILED),
+	FLAG_ENTRY0("Unknown frame received",  UNKNOWN_FRAME),
+	FLAG_ENTRY0("Target BER not met",      TARGET_BER_NOT_MET),
+	FLAG_ENTRY0("Serdes internal loopback failure",
+					FAILED_SERDES_INTERNAL_LOOPBACK),
+	FLAG_ENTRY0("Failed SerDes init",      FAILED_SERDES_INIT),
+	FLAG_ENTRY0("Failed LNI(Polling)",     FAILED_LNI_POLLING),
+	FLAG_ENTRY0("Failed LNI(Debounce)",    FAILED_LNI_DEBOUNCE),
+	FLAG_ENTRY0("Failed LNI(EstbComm)",    FAILED_LNI_ESTBCOMM),
+	FLAG_ENTRY0("Failed LNI(OptEq)",       FAILED_LNI_OPTEQ),
+	FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
+	FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
+	FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT)
+};
+
+/*
+ * DC8051 Information Host Information flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
+ */
+static struct flag_table dc8051_info_host_msg_flags[] = {
+	FLAG_ENTRY0("Host request done", 0x0001),
+	FLAG_ENTRY0("BC SMA message", 0x0002),
+	FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
+	FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
+	FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
+	FLAG_ENTRY0("External device config request", 0x0020),
+	FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
+	FLAG_ENTRY0("LinkUp achieved", 0x0080),
+	FLAG_ENTRY0("Link going down", 0x0100),
+};
+
+
+static u32 encoded_size(u32 size);
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+			       u8 *continuous);
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+				  u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+				      u8 *remote_tx_rate, u16 *link_widths);
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+				     u8 *flag_bits, u16 *link_widths);
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+				  u8 *device_rev);
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
+static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
+			    u8 *tx_polarity_inversion,
+			    u8 *rx_polarity_inversion, u8 *max_rate);
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+				unsigned int context, u64 err_status);
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
+static void handle_dcc_err(struct hfi1_devdata *dd,
+			   unsigned int context, u64 err_status);
+static void handle_lcb_err(struct hfi1_devdata *dd,
+			   unsigned int context, u64 err_status);
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void set_partition_keys(struct hfi1_pportdata *);
+static const char *link_state_name(u32 state);
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
+					  u32 state);
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+			   u64 *out_data);
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
+static int thermal_init(struct hfi1_devdata *dd);
+
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+				  int msecs);
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void handle_temp_err(struct hfi1_devdata *);
+static void dc_shutdown(struct hfi1_devdata *);
+static void dc_start(struct hfi1_devdata *);
+
+/*
+ * Error interrupt table entry.  This is used as input to the interrupt
+ * "clear down" routine used for all second tier error interrupt register.
+ * Second tier interrupt registers have a single bit representing them
+ * in the top-level CceIntStatus.
+ */
+struct err_reg_info {
+	u32 status;		/* status CSR offset */
+	u32 clear;		/* clear CSR offset */
+	u32 mask;		/* mask CSR offset */
+	void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
+	const char *desc;
+};
+
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+
+/*
+ * Helpers for building HFI and DC error interrupt table entries.  Different
+ * helpers are needed because of inconsistent register names.
+ */
+#define EE(reg, handler, desc) \
+	{ reg##_STATUS, reg##_CLEAR, reg##_MASK, \
+		handler, desc }
+#define DC_EE1(reg, handler, desc) \
+	{ reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
+#define DC_EE2(reg, handler, desc) \
+	{ reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
+
+/*
+ * Table of the "misc" grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
+/* 0*/	EE(CCE_ERR,		handle_cce_err,    "CceErr"),
+/* 1*/	EE(RCV_ERR,		handle_rxe_err,    "RxeErr"),
+/* 2*/	EE(MISC_ERR,	handle_misc_err,   "MiscErr"),
+/* 3*/	{ 0, 0, 0, NULL }, /* reserved */
+/* 4*/	EE(SEND_PIO_ERR,    handle_pio_err,    "PioErr"),
+/* 5*/	EE(SEND_DMA_ERR,    handle_sdma_err,   "SDmaErr"),
+/* 6*/	EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
+/* 7*/	EE(SEND_ERR,	handle_txe_err,    "TxeErr")
+	/* the rest are reserved */
+};
+
+/*
+ * Index into the Various section of the interrupt sources
+ * corresponding to the Critical Temperature interrupt.
+ */
+#define TCRIT_INT_SOURCE 4
+
+/*
+ * SDMA error interrupt entry - refers to another register containing more
+ * information.
+ */
+static const struct err_reg_info sdma_eng_err =
+	EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
+
+static const struct err_reg_info various_err[NUM_VARIOUS] = {
+/* 0*/	{ 0, 0, 0, NULL }, /* PbcInt */
+/* 1*/	{ 0, 0, 0, NULL }, /* GpioAssertInt */
+/* 2*/	EE(ASIC_QSFP1,	handle_qsfp_int,	"QSFP1"),
+/* 3*/	EE(ASIC_QSFP2,	handle_qsfp_int,	"QSFP2"),
+/* 4*/	{ 0, 0, 0, NULL }, /* TCritInt */
+	/* rest are reserved */
+};
+
+/*
+ * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
+ * register can not be derived from the MTU value because 10K is not
+ * a power of 2. Therefore, we need a constant. Everything else can
+ * be calculated.
+ */
+#define DCC_CFG_PORT_MTU_CAP_10240 7
+
+/*
+ * Table of the DC grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
+/* 0*/	DC_EE1(DCC_ERR,		handle_dcc_err,	       "DCC Err"),
+/* 1*/	DC_EE2(DC_LCB_ERR,	handle_lcb_err,	       "LCB Err"),
+/* 2*/	DC_EE2(DC_DC8051_ERR,	handle_8051_interrupt, "DC8051 Interrupt"),
+/* 3*/	/* dc_lbm_int - special, see is_dc_int() */
+	/* the rest are reserved */
+};
+
+struct cntr_entry {
+	/*
+	 * counter name
+	 */
+	char *name;
+
+	/*
+	 * csr to read for name (if applicable)
+	 */
+	u64 csr;
+
+	/*
+	 * offset into dd or ppd to store the counter's value
+	 */
+	int offset;
+
+	/*
+	 * flags
+	 */
+	u8 flags;
+
+	/*
+	 * accessor for stat element, context either dd or ppd
+	 */
+	u64 (*rw_cntr)(const struct cntr_entry *,
+			       void *context,
+			       int vl,
+			       int mode,
+			       u64 data);
+};
+
+#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
+#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
+
+#define CNTR_ELEM(name, csr, offset, flags, accessor) \
+{ \
+	name, \
+	csr, \
+	offset, \
+	flags, \
+	accessor \
+}
+
+/* 32bit RXE */
+#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  (counter * 8 + RCV_COUNTER_ARRAY32), \
+	  0, flags | CNTR_32BIT, \
+	  port_access_u32_csr)
+
+#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  (counter * 8 + RCV_COUNTER_ARRAY32), \
+	  0, flags | CNTR_32BIT, \
+	  dev_access_u32_csr)
+
+/* 64bit RXE */
+#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  (counter * 8 + RCV_COUNTER_ARRAY64), \
+	  0, flags, \
+	  port_access_u64_csr)
+
+#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  (counter * 8 + RCV_COUNTER_ARRAY64), \
+	  0, flags, \
+	  dev_access_u64_csr)
+
+#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
+#define OVR_ELM(ctx) \
+CNTR_ELEM("RcvHdrOvr" #ctx, \
+	  (RCV_HDR_OVFL_CNT + ctx*0x100), \
+	  0, CNTR_NORMAL, port_access_u64_csr)
+
+/* 32bit TXE */
+#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  (counter * 8 + SEND_COUNTER_ARRAY32), \
+	  0, flags | CNTR_32BIT, \
+	  port_access_u32_csr)
+
+/* 64bit TXE */
+#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  (counter * 8 + SEND_COUNTER_ARRAY64), \
+	  0, flags, \
+	  port_access_u64_csr)
+
+# define TX64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name,\
+	  counter * 8 + SEND_COUNTER_ARRAY64, \
+	  0, \
+	  flags, \
+	  dev_access_u64_csr)
+
+/* CCE */
+#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  (counter * 8 + CCE_COUNTER_ARRAY32), \
+	  0, flags | CNTR_32BIT, \
+	  dev_access_u32_csr)
+
+#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
+	  0, flags | CNTR_32BIT, \
+	  dev_access_u32_csr)
+
+/* DC */
+#define DC_PERF_CNTR(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  counter, \
+	  0, \
+	  flags, \
+	  dev_access_u64_csr)
+
+#define DC_PERF_CNTR_LCB(name, counter, flags) \
+CNTR_ELEM(#name, \
+	  counter, \
+	  0, \
+	  flags, \
+	  dc_access_lcb_cntr)
+
+/* ibp counters */
+#define SW_IBP_CNTR(name, cntr) \
+CNTR_ELEM(#name, \
+	  0, \
+	  0, \
+	  CNTR_SYNTH, \
+	  access_ibp_##cntr)
+
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
+{
+	u64 val;
+
+	if (dd->flags & HFI1_PRESENT) {
+		val = readq((void __iomem *)dd->kregbase + offset);
+		return val;
+	}
+	return -1;
+}
+
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
+{
+	if (dd->flags & HFI1_PRESENT)
+		writeq(value, (void __iomem *)dd->kregbase + offset);
+}
+
+void __iomem *get_csr_addr(
+	struct hfi1_devdata *dd,
+	u32 offset)
+{
+	return (void __iomem *)dd->kregbase + offset;
+}
+
+static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
+				 int mode, u64 value)
+{
+	u64 ret;
+
+
+	if (mode == CNTR_MODE_R) {
+		ret = read_csr(dd, csr);
+	} else if (mode == CNTR_MODE_W) {
+		write_csr(dd, csr, value);
+		ret = value;
+	} else {
+		dd_dev_err(dd, "Invalid cntr register access mode");
+		return 0;
+	}
+
+	hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
+	return ret;
+}
+
+/* Dev Access */
+static u64 dev_access_u32_csr(const struct cntr_entry *entry,
+			    void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+	return read_write_csr(dd, entry->csr, mode, data);
+}
+
+static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
+			    int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+	u64 val = 0;
+	u64 csr = entry->csr;
+
+	if (entry->flags & CNTR_VL) {
+		if (vl == CNTR_INVALID_VL)
+			return 0;
+		csr += 8 * vl;
+	} else {
+		if (vl != CNTR_INVALID_VL)
+			return 0;
+	}
+
+	val = read_write_csr(dd, csr, mode, data);
+	return val;
+}
+
+static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
+			    int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+	u32 csr = entry->csr;
+	int ret = 0;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+	if (mode == CNTR_MODE_R)
+		ret = read_lcb_csr(dd, csr, &data);
+	else if (mode == CNTR_MODE_W)
+		ret = write_lcb_csr(dd, csr, data);
+
+	if (ret) {
+		dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
+		return 0;
+	}
+
+	hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
+	return data;
+}
+
+/* Port Access */
+static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
+			     int vl, int mode, u64 data)
+{
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+	return read_write_csr(ppd->dd, entry->csr, mode, data);
+}
+
+static u64 port_access_u64_csr(const struct cntr_entry *entry,
+			     void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+	u64 val;
+	u64 csr = entry->csr;
+
+	if (entry->flags & CNTR_VL) {
+		if (vl == CNTR_INVALID_VL)
+			return 0;
+		csr += 8 * vl;
+	} else {
+		if (vl != CNTR_INVALID_VL)
+			return 0;
+	}
+	val = read_write_csr(ppd->dd, csr, mode, data);
+	return val;
+}
+
+/* Software defined */
+static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
+				u64 data)
+{
+	u64 ret;
+
+	if (mode == CNTR_MODE_R) {
+		ret = *cntr;
+	} else if (mode == CNTR_MODE_W) {
+		*cntr = data;
+		ret = data;
+	} else {
+		dd_dev_err(dd, "Invalid cntr sw access mode");
+		return 0;
+	}
+
+	hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
+
+	return ret;
+}
+
+static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
+			       int vl, int mode, u64 data)
+{
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+	return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
+}
+
+static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
+			       int vl, int mode, u64 data)
+{
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+	return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
+}
+
+static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
+				    void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+
+	return read_write_sw(ppd->dd, &ppd->port_xmit_discards, mode, data);
+}
+
+static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
+				     void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+
+	return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
+			     mode, data);
+}
+
+static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
+				     void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+
+	return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
+			     mode, data);
+}
+
+u64 get_all_cpu_total(u64 __percpu *cntr)
+{
+	int cpu;
+	u64 counter = 0;
+
+	for_each_possible_cpu(cpu)
+		counter += *per_cpu_ptr(cntr, cpu);
+	return counter;
+}
+
+static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
+			  u64 __percpu *cntr,
+			  int vl, int mode, u64 data)
+{
+
+	u64 ret = 0;
+
+	if (vl != CNTR_INVALID_VL)
+		return 0;
+
+	if (mode == CNTR_MODE_R) {
+		ret = get_all_cpu_total(cntr) - *z_val;
+	} else if (mode == CNTR_MODE_W) {
+		/* A write can only zero the counter */
+		if (data == 0)
+			*z_val = get_all_cpu_total(cntr);
+		else
+			dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
+	} else {
+		dd_dev_err(dd, "Invalid cntr sw cpu access mode");
+		return 0;
+	}
+
+	return ret;
+}
+
+static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
+			      void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+	return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
+			      mode, data);
+}
+
+static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
+			      void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+	return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
+			      mode, data);
+}
+
+static u64 access_sw_pio_wait(const struct cntr_entry *entry,
+			      void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+	return dd->verbs_dev.n_piowait;
+}
+
+static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
+			      void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+	return dd->verbs_dev.n_txwait;
+}
+
+static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
+			       void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+	return dd->verbs_dev.n_kmem_wait;
+}
+
+#define def_access_sw_cpu(cntr) \
+static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,		      \
+			      void *context, int vl, int mode, u64 data)      \
+{									      \
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;	      \
+	return read_write_cpu(ppd->dd, &ppd->ibport_data.z_ ##cntr,	      \
+			      ppd->ibport_data.cntr, vl,		      \
+			      mode, data);				      \
+}
+
+def_access_sw_cpu(rc_acks);
+def_access_sw_cpu(rc_qacks);
+def_access_sw_cpu(rc_delayed_comp);
+
+#define def_access_ibp_counter(cntr) \
+static u64 access_ibp_##cntr(const struct cntr_entry *entry,		      \
+				void *context, int vl, int mode, u64 data)    \
+{									      \
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;	      \
+									      \
+	if (vl != CNTR_INVALID_VL)					      \
+		return 0;						      \
+									      \
+	return read_write_sw(ppd->dd, &ppd->ibport_data.n_ ##cntr,	      \
+			     mode, data);				      \
+}
+
+def_access_ibp_counter(loop_pkts);
+def_access_ibp_counter(rc_resends);
+def_access_ibp_counter(rnr_naks);
+def_access_ibp_counter(other_naks);
+def_access_ibp_counter(rc_timeouts);
+def_access_ibp_counter(pkt_drops);
+def_access_ibp_counter(dmawait);
+def_access_ibp_counter(rc_seqnak);
+def_access_ibp_counter(rc_dupreq);
+def_access_ibp_counter(rdma_seq);
+def_access_ibp_counter(unaligned);
+def_access_ibp_counter(seq_naks);
+
+static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
+[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
+			CNTR_NORMAL),
+[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
+			CNTR_NORMAL),
+[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
+			RCV_TID_FLOW_GEN_MISMATCH_CNT,
+			CNTR_NORMAL),
+[C_RX_CTX_RHQS] = RXE32_DEV_CNTR_ELEM(RxCtxRHQS, RCV_CONTEXT_RHQ_STALL,
+			CNTR_NORMAL),
+[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
+			CNTR_NORMAL),
+[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
+			RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
+[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
+			CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
+[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
+			CNTR_NORMAL),
+[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
+			CNTR_NORMAL),
+[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
+			CNTR_NORMAL),
+[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
+			CNTR_NORMAL),
+[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
+			CNTR_NORMAL),
+[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
+			CNTR_NORMAL),
+[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
+			CCE_RCV_URGENT_INT_CNT,	CNTR_NORMAL),
+[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
+			CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
+[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
+			      CNTR_SYNTH),
+[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
+[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
+				 CNTR_SYNTH),
+[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
+				  CNTR_SYNTH),
+[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
+				  CNTR_SYNTH),
+[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
+				   DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
+[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
+				  DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
+				  CNTR_SYNTH),
+[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
+				DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
+[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
+			       CNTR_SYNTH),
+[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
+			      CNTR_SYNTH),
+[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
+			       CNTR_SYNTH),
+[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
+				 CNTR_SYNTH),
+[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
+				CNTR_SYNTH),
+[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
+				CNTR_SYNTH),
+[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
+			       CNTR_SYNTH),
+[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
+				 CNTR_SYNTH | CNTR_VL),
+[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
+				CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
+				 CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
+				 CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
+			      CNTR_SYNTH),
+[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
+				 CNTR_SYNTH | CNTR_VL),
+[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
+				CNTR_SYNTH),
+[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
+				   CNTR_SYNTH | CNTR_VL),
+[C_DC_TOTAL_CRC] =
+	DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
+			 CNTR_SYNTH),
+[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
+				  CNTR_SYNTH),
+[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
+				  CNTR_SYNTH),
+[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
+				  CNTR_SYNTH),
+[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
+				  CNTR_SYNTH),
+[C_DC_CRC_MULT_LN] =
+	DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
+			 CNTR_SYNTH),
+[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
+				    CNTR_SYNTH),
+[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
+				    CNTR_SYNTH),
+[C_DC_SEQ_CRC_CNT] =
+	DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
+			 CNTR_SYNTH),
+[C_DC_ESC0_ONLY_CNT] =
+	DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
+			 CNTR_SYNTH),
+[C_DC_ESC0_PLUS1_CNT] =
+	DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
+			 CNTR_SYNTH),
+[C_DC_ESC0_PLUS2_CNT] =
+	DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
+			 CNTR_SYNTH),
+[C_DC_REINIT_FROM_PEER_CNT] =
+	DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
+			 CNTR_SYNTH),
+[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
+				  CNTR_SYNTH),
+[C_DC_MISC_FLG_CNT] =
+	DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
+			 CNTR_SYNTH),
+[C_DC_PRF_GOOD_LTP_CNT] =
+	DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
+[C_DC_PRF_ACCEPTED_LTP_CNT] =
+	DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
+			 CNTR_SYNTH),
+[C_DC_PRF_RX_FLIT_CNT] =
+	DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_TX_FLIT_CNT] =
+	DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_CLK_CNTR] =
+	DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
+[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
+	DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
+	DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
+			 CNTR_SYNTH),
+[C_DC_PG_STS_TX_SBE_CNT] =
+	DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_TX_MBE_CNT] =
+	DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
+			 CNTR_SYNTH),
+[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
+			    access_sw_cpu_intr),
+[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
+			    access_sw_cpu_rcv_limit),
+[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
+			    access_sw_vtx_wait),
+[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
+			    access_sw_pio_wait),
+[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
+			    access_sw_kmem_wait),
+};
+
+static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
+[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
+			CNTR_NORMAL),
+[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
+			CNTR_NORMAL),
+[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
+			CNTR_NORMAL),
+[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
+			CNTR_NORMAL),
+[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
+			CNTR_NORMAL),
+[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
+			CNTR_NORMAL),
+[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
+			CNTR_NORMAL),
+[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
+[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
+[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
+[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
+			CNTR_SYNTH | CNTR_VL),
+[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
+			CNTR_SYNTH | CNTR_VL),
+[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
+			CNTR_SYNTH | CNTR_VL),
+[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
+[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
+[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+			access_sw_link_dn_cnt),
+[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+			access_sw_link_up_cnt),
+[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+			access_sw_xmit_discards),
+[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
+			CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
+			access_sw_xmit_discards),
+[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
+			access_xmit_constraint_errs),
+[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
+			access_rcv_constraint_errs),
+[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
+[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
+[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
+[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
+[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
+[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
+[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
+[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
+[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq),
+[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
+[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
+[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
+			       access_sw_cpu_rc_acks),
+[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
+			       access_sw_cpu_rc_qacks),
+[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
+			       access_sw_cpu_rc_delayed_comp),
+[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
+[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
+[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
+[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
+[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
+[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
+[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
+[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
+[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
+[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
+[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
+[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
+[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
+[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
+[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
+[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
+[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
+[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
+[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
+[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
+[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
+[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
+[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
+[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
+[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
+[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
+[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
+[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
+[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
+[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
+[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
+[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
+[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
+[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
+[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
+[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
+[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
+[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
+[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
+[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
+[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
+[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
+[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
+[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
+[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
+[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
+[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
+[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
+[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
+[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
+[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
+[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
+[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
+[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
+[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
+[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
+[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
+[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
+[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
+[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
+[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
+[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
+[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
+[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
+[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
+[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
+[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
+[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
+[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
+[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
+[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
+[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
+[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
+[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
+[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
+[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
+[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
+[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
+[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
+[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
+};
+
+/* ======================================================================== */
+
+/* return true if this is chip revision revision a0 */
+int is_a0(struct hfi1_devdata *dd)
+{
+	return ((dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+			& CCE_REVISION_CHIP_REV_MINOR_MASK) == 0;
+}
+
+/* return true if this is chip revision revision a */
+int is_ax(struct hfi1_devdata *dd)
+{
+	u8 chip_rev_minor =
+		dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+			& CCE_REVISION_CHIP_REV_MINOR_MASK;
+	return (chip_rev_minor & 0xf0) == 0;
+}
+
+/* return true if this is chip revision revision b */
+int is_bx(struct hfi1_devdata *dd)
+{
+	u8 chip_rev_minor =
+		dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+			& CCE_REVISION_CHIP_REV_MINOR_MASK;
+	return !!(chip_rev_minor & 0x10);
+}
+
+/*
+ * Append string s to buffer buf.  Arguments curp and len are the current
+ * position and remaining length, respectively.
+ *
+ * return 0 on success, 1 on out of room
+ */
+static int append_str(char *buf, char **curp, int *lenp, const char *s)
+{
+	char *p = *curp;
+	int len = *lenp;
+	int result = 0; /* success */
+	char c;
+
+	/* add a comma, if first in the buffer */
+	if (p != buf) {
+		if (len == 0) {
+			result = 1; /* out of room */
+			goto done;
+		}
+		*p++ = ',';
+		len--;
+	}
+
+	/* copy the string */
+	while ((c = *s++) != 0) {
+		if (len == 0) {
+			result = 1; /* out of room */
+			goto done;
+		}
+		*p++ = c;
+		len--;
+	}
+
+done:
+	/* write return values */
+	*curp = p;
+	*lenp = len;
+
+	return result;
+}
+
+/*
+ * Using the given flag table, print a comma separated string into
+ * the buffer.  End in '*' if the buffer is too short.
+ */
+static char *flag_string(char *buf, int buf_len, u64 flags,
+				struct flag_table *table, int table_size)
+{
+	char extra[32];
+	char *p = buf;
+	int len = buf_len;
+	int no_room = 0;
+	int i;
+
+	/* make sure there is at least 2 so we can form "*" */
+	if (len < 2)
+		return "";
+
+	len--;	/* leave room for a nul */
+	for (i = 0; i < table_size; i++) {
+		if (flags & table[i].flag) {
+			no_room = append_str(buf, &p, &len, table[i].str);
+			if (no_room)
+				break;
+			flags &= ~table[i].flag;
+		}
+	}
+
+	/* any undocumented bits left? */
+	if (!no_room && flags) {
+		snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
+		no_room = append_str(buf, &p, &len, extra);
+	}
+
+	/* add * if ran out of room */
+	if (no_room) {
+		/* may need to back up to add space for a '*' */
+		if (len == 0)
+			--p;
+		*p++ = '*';
+	}
+
+	/* add final nul - space already allocated above */
+	*p = 0;
+	return buf;
+}
+
+/* first 8 CCE error interrupt source names */
+static const char * const cce_misc_names[] = {
+	"CceErrInt",		/* 0 */
+	"RxeErrInt",		/* 1 */
+	"MiscErrInt",		/* 2 */
+	"Reserved3",		/* 3 */
+	"PioErrInt",		/* 4 */
+	"SDmaErrInt",		/* 5 */
+	"EgressErrInt",		/* 6 */
+	"TxeErrInt"		/* 7 */
+};
+
+/*
+ * Return the miscellaneous error interrupt name.
+ */
+static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
+{
+	if (source < ARRAY_SIZE(cce_misc_names))
+		strncpy(buf, cce_misc_names[source], bsize);
+	else
+		snprintf(buf,
+			bsize,
+			"Reserved%u",
+			source + IS_GENERAL_ERR_START);
+
+	return buf;
+}
+
+/*
+ * Return the SDMA engine error interrupt name.
+ */
+static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
+{
+	snprintf(buf, bsize, "SDmaEngErrInt%u", source);
+	return buf;
+}
+
+/*
+ * Return the send context error interrupt name.
+ */
+static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
+{
+	snprintf(buf, bsize, "SendCtxtErrInt%u", source);
+	return buf;
+}
+
+static const char * const various_names[] = {
+	"PbcInt",
+	"GpioAssertInt",
+	"Qsfp1Int",
+	"Qsfp2Int",
+	"TCritInt"
+};
+
+/*
+ * Return the various interrupt name.
+ */
+static char *is_various_name(char *buf, size_t bsize, unsigned int source)
+{
+	if (source < ARRAY_SIZE(various_names))
+		strncpy(buf, various_names[source], bsize);
+	else
+		snprintf(buf, bsize, "Reserved%u", source+IS_VARIOUS_START);
+	return buf;
+}
+
+/*
+ * Return the DC interrupt name.
+ */
+static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
+{
+	static const char * const dc_int_names[] = {
+		"common",
+		"lcb",
+		"8051",
+		"lbm"	/* local block merge */
+	};
+
+	if (source < ARRAY_SIZE(dc_int_names))
+		snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
+	else
+		snprintf(buf, bsize, "DCInt%u", source);
+	return buf;
+}
+
+static const char * const sdma_int_names[] = {
+	"SDmaInt",
+	"SdmaIdleInt",
+	"SdmaProgressInt",
+};
+
+/*
+ * Return the SDMA engine interrupt name.
+ */
+static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
+{
+	/* what interrupt */
+	unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+	/* which engine */
+	unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+	if (likely(what < 3))
+		snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
+	else
+		snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
+	return buf;
+}
+
+/*
+ * Return the receive available interrupt name.
+ */
+static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
+{
+	snprintf(buf, bsize, "RcvAvailInt%u", source);
+	return buf;
+}
+
+/*
+ * Return the receive urgent interrupt name.
+ */
+static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
+{
+	snprintf(buf, bsize, "RcvUrgentInt%u", source);
+	return buf;
+}
+
+/*
+ * Return the send credit interrupt name.
+ */
+static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
+{
+	snprintf(buf, bsize, "SendCreditInt%u", source);
+	return buf;
+}
+
+/*
+ * Return the reserved interrupt name.
+ */
+static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
+{
+	snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
+	return buf;
+}
+
+static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags,
+			cce_err_status_flags, ARRAY_SIZE(cce_err_status_flags));
+}
+
+static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags,
+			rxe_err_status_flags, ARRAY_SIZE(rxe_err_status_flags));
+}
+
+static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags, misc_err_status_flags,
+			ARRAY_SIZE(misc_err_status_flags));
+}
+
+static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags,
+			pio_err_status_flags, ARRAY_SIZE(pio_err_status_flags));
+}
+
+static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags,
+			sdma_err_status_flags,
+			ARRAY_SIZE(sdma_err_status_flags));
+}
+
+static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags,
+		egress_err_status_flags, ARRAY_SIZE(egress_err_status_flags));
+}
+
+static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags,
+		egress_err_info_flags, ARRAY_SIZE(egress_err_info_flags));
+}
+
+static char *send_err_status_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags,
+			send_err_status_flags,
+			ARRAY_SIZE(send_err_status_flags));
+}
+
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	char buf[96];
+
+	/*
+	 * For most these errors, there is nothing that can be done except
+	 * report or record it.
+	 */
+	dd_dev_info(dd, "CCE Error: %s\n",
+		cce_err_status_string(buf, sizeof(buf), reg));
+
+	if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK)
+			&& is_a0(dd)
+			&& (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
+		/* this error requires a manual drop into SPC freeze mode */
+		/* then a fix up */
+		start_freeze_handling(dd->pport, FREEZE_SELF);
+	}
+}
+
+/*
+ * Check counters for receive errors that do not have an interrupt
+ * associated with them.
+ */
+#define RCVERR_CHECK_TIME 10
+static void update_rcverr_timer(unsigned long opaque)
+{
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+	struct hfi1_pportdata *ppd = dd->pport;
+	u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+
+	if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
+		ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
+		dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+		set_link_down_reason(ppd,
+		  OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
+			OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
+		queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+	}
+	dd->rcv_ovfl_cnt = (u32) cur_ovfl_cnt;
+
+	mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static int init_rcverr(struct hfi1_devdata *dd)
+{
+	init_timer(&dd->rcverr_timer);
+	dd->rcverr_timer.function = update_rcverr_timer;
+	dd->rcverr_timer.data = (unsigned long) dd;
+	/* Assume the hardware counter has been reset */
+	dd->rcv_ovfl_cnt = 0;
+	return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static void free_rcverr(struct hfi1_devdata *dd)
+{
+	if (dd->rcverr_timer.data)
+		del_timer_sync(&dd->rcverr_timer);
+	dd->rcverr_timer.data = 0;
+}
+
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	char buf[96];
+
+	dd_dev_info(dd, "Receive Error: %s\n",
+		rxe_err_status_string(buf, sizeof(buf), reg));
+
+	if (reg & ALL_RXE_FREEZE_ERR) {
+		int flags = 0;
+
+		/*
+		 * Freeze mode recovery is disabled for the errors
+		 * in RXE_FREEZE_ABORT_MASK
+		 */
+		if (is_a0(dd) && (reg & RXE_FREEZE_ABORT_MASK))
+			flags = FREEZE_ABORT;
+
+		start_freeze_handling(dd->pport, flags);
+	}
+}
+
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	char buf[96];
+
+	dd_dev_info(dd, "Misc Error: %s",
+		misc_err_status_string(buf, sizeof(buf), reg));
+}
+
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	char buf[96];
+
+	dd_dev_info(dd, "PIO Error: %s\n",
+		pio_err_status_string(buf, sizeof(buf), reg));
+
+	if (reg & ALL_PIO_FREEZE_ERR)
+		start_freeze_handling(dd->pport, 0);
+}
+
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	char buf[96];
+
+	dd_dev_info(dd, "SDMA Error: %s\n",
+		sdma_err_status_string(buf, sizeof(buf), reg));
+
+	if (reg & ALL_SDMA_FREEZE_ERR)
+		start_freeze_handling(dd->pport, 0);
+}
+
+static void count_port_inactive(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd = dd->pport;
+
+	if (ppd->port_xmit_discards < ~(u64)0)
+		ppd->port_xmit_discards++;
+}
+
+/*
+ * We have had a "disallowed packet" error during egress. Determine the
+ * integrity check which failed, and update relevant error counter, etc.
+ *
+ * Note that the SEND_EGRESS_ERR_INFO register has only a single
+ * bit of state per integrity check, and so we can miss the reason for an
+ * egress error if more than one packet fails the same integrity check
+ * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
+ */
+static void handle_send_egress_err_info(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd = dd->pport;
+	u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
+	u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
+	char buf[96];
+
+	/* clear down all observed info as quickly as possible after read */
+	write_csr(dd, SEND_EGRESS_ERR_INFO, info);
+
+	dd_dev_info(dd,
+		"Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
+		info, egress_err_info_string(buf, sizeof(buf), info), src);
+
+	/* Eventually add other counters for each bit */
+
+	if (info & SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK) {
+		if (ppd->port_xmit_discards < ~(u64)0)
+			ppd->port_xmit_discards++;
+	}
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'port inactive' error?
+ */
+static inline int port_inactive_err(u64 posn)
+{
+	return (posn >= SEES(TX_LINKDOWN) &&
+		posn <= SEES(TX_INCORRECT_LINK_STATE));
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'disallowed packet' error?
+ */
+static inline int disallowed_pkt_err(u64 posn)
+{
+	return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
+		posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
+}
+
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	u64 reg_copy = reg, handled = 0;
+	char buf[96];
+
+	if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
+		start_freeze_handling(dd->pport, 0);
+	if (is_a0(dd) && (reg &
+		    SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK)
+		    && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
+		start_freeze_handling(dd->pport, 0);
+
+	while (reg_copy) {
+		int posn = fls64(reg_copy);
+		/*
+		 * fls64() returns a 1-based offset, but we generally
+		 * want 0-based offsets.
+		 */
+		int shift = posn - 1;
+
+		if (port_inactive_err(shift)) {
+			count_port_inactive(dd);
+			handled |= (1ULL << shift);
+		} else if (disallowed_pkt_err(shift)) {
+			handle_send_egress_err_info(dd);
+			handled |= (1ULL << shift);
+		}
+		clear_bit(shift, (unsigned long *)&reg_copy);
+	}
+
+	reg &= ~handled;
+
+	if (reg)
+		dd_dev_info(dd, "Egress Error: %s\n",
+			egress_err_status_string(buf, sizeof(buf), reg));
+}
+
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	char buf[96];
+
+	dd_dev_info(dd, "Send Error: %s\n",
+		send_err_status_string(buf, sizeof(buf), reg));
+
+}
+
+/*
+ * The maximum number of times the error clear down will loop before
+ * blocking a repeating error.  This value is arbitrary.
+ */
+#define MAX_CLEAR_COUNT 20
+
+/*
+ * Clear and handle an error register.  All error interrupts are funneled
+ * through here to have a central location to correctly handle single-
+ * or multi-shot errors.
+ *
+ * For non per-context registers, call this routine with a context value
+ * of 0 so the per-context offset is zero.
+ *
+ * If the handler loops too many times, assume that something is wrong
+ * and can't be fixed, so mask the error bits.
+ */
+static void interrupt_clear_down(struct hfi1_devdata *dd,
+				 u32 context,
+				 const struct err_reg_info *eri)
+{
+	u64 reg;
+	u32 count;
+
+	/* read in a loop until no more errors are seen */
+	count = 0;
+	while (1) {
+		reg = read_kctxt_csr(dd, context, eri->status);
+		if (reg == 0)
+			break;
+		write_kctxt_csr(dd, context, eri->clear, reg);
+		if (likely(eri->handler))
+			eri->handler(dd, context, reg);
+		count++;
+		if (count > MAX_CLEAR_COUNT) {
+			u64 mask;
+
+			dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
+				eri->desc, reg);
+			/*
+			 * Read-modify-write so any other masked bits
+			 * remain masked.
+			 */
+			mask = read_kctxt_csr(dd, context, eri->mask);
+			mask &= ~reg;
+			write_kctxt_csr(dd, context, eri->mask, mask);
+			break;
+		}
+	}
+}
+
+/*
+ * CCE block "misc" interrupt.  Source is < 16.
+ */
+static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+	const struct err_reg_info *eri = &misc_errs[source];
+
+	if (eri->handler) {
+		interrupt_clear_down(dd, 0, eri);
+	} else {
+		dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
+			source);
+	}
+}
+
+static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags,
+			sc_err_status_flags, ARRAY_SIZE(sc_err_status_flags));
+}
+
+/*
+ * Send context error interrupt.  Source (hw_context) is < 160.
+ *
+ * All send context errors cause the send context to halt.  The normal
+ * clear-down mechanism cannot be used because we cannot clear the
+ * error bits until several other long-running items are done first.
+ * This is OK because with the context halted, nothing else is going
+ * to happen on it anyway.
+ */
+static void is_sendctxt_err_int(struct hfi1_devdata *dd,
+				unsigned int hw_context)
+{
+	struct send_context_info *sci;
+	struct send_context *sc;
+	char flags[96];
+	u64 status;
+	u32 sw_index;
+
+	sw_index = dd->hw_to_sw[hw_context];
+	if (sw_index >= dd->num_send_contexts) {
+		dd_dev_err(dd,
+			"out of range sw index %u for send context %u\n",
+			sw_index, hw_context);
+		return;
+	}
+	sci = &dd->send_contexts[sw_index];
+	sc = sci->sc;
+	if (!sc) {
+		dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
+			sw_index, hw_context);
+		return;
+	}
+
+	/* tell the software that a halt has begun */
+	sc_stop(sc, SCF_HALTED);
+
+	status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
+
+	dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
+		send_context_err_status_string(flags, sizeof(flags), status));
+
+	if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
+		handle_send_egress_err_info(dd);
+
+	/*
+	 * Automatically restart halted kernel contexts out of interrupt
+	 * context.  User contexts must ask the driver to restart the context.
+	 */
+	if (sc->type != SC_USER)
+		queue_work(dd->pport->hfi1_wq, &sc->halt_work);
+}
+
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+				unsigned int source, u64 status)
+{
+	struct sdma_engine *sde;
+
+	sde = &dd->per_sdma[source];
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+		   slashstrip(__FILE__), __LINE__, __func__);
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
+		   sde->this_idx, source, (unsigned long long)status);
+#endif
+	sdma_engine_error(sde, status);
+}
+
+/*
+ * CCE block SDMA error interrupt.  Source is < 16.
+ */
+static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+	struct sdma_engine *sde = &dd->per_sdma[source];
+
+	dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+		   slashstrip(__FILE__), __LINE__, __func__);
+	dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
+		   source);
+	sdma_dumpstate(sde);
+#endif
+	interrupt_clear_down(dd, source, &sdma_eng_err);
+}
+
+/*
+ * CCE block "various" interrupt.  Source is < 8.
+ */
+static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
+{
+	const struct err_reg_info *eri = &various_err[source];
+
+	/*
+	 * TCritInt cannot go through interrupt_clear_down()
+	 * because it is not a second tier interrupt. The handler
+	 * should be called directly.
+	 */
+	if (source == TCRIT_INT_SOURCE)
+		handle_temp_err(dd);
+	else if (eri->handler)
+		interrupt_clear_down(dd, 0, eri);
+	else
+		dd_dev_info(dd,
+			"%s: Unimplemented/reserved interrupt %d\n",
+			__func__, source);
+}
+
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
+{
+	/* source is always zero */
+	struct hfi1_pportdata *ppd = dd->pport;
+	unsigned long flags;
+	u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+
+	if (reg & QSFP_HFI0_MODPRST_N) {
+
+		dd_dev_info(dd, "%s: ModPresent triggered QSFP interrupt\n",
+				__func__);
+
+		if (!qsfp_mod_present(ppd)) {
+			ppd->driver_link_ready = 0;
+			/*
+			 * Cable removed, reset all our information about the
+			 * cache and cable capabilities
+			 */
+
+			spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+			/*
+			 * We don't set cache_refresh_required here as we expect
+			 * an interrupt when a cable is inserted
+			 */
+			ppd->qsfp_info.cache_valid = 0;
+			ppd->qsfp_info.qsfp_interrupt_functional = 0;
+			spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+						flags);
+			write_csr(dd,
+					dd->hfi1_id ?
+						ASIC_QSFP2_INVERT :
+						ASIC_QSFP1_INVERT,
+				qsfp_int_mgmt);
+			if (ppd->host_link_state == HLS_DN_POLL) {
+				/*
+				 * The link is still in POLL. This means
+				 * that the normal link down processing
+				 * will not happen. We have to do it here
+				 * before turning the DC off.
+				 */
+				queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+			}
+		} else {
+			spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+			ppd->qsfp_info.cache_valid = 0;
+			ppd->qsfp_info.cache_refresh_required = 1;
+			spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+						flags);
+
+			qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
+			write_csr(dd,
+					dd->hfi1_id ?
+						ASIC_QSFP2_INVERT :
+						ASIC_QSFP1_INVERT,
+				qsfp_int_mgmt);
+		}
+	}
+
+	if (reg & QSFP_HFI0_INT_N) {
+
+		dd_dev_info(dd, "%s: IntN triggered QSFP interrupt\n",
+				__func__);
+		spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+		ppd->qsfp_info.check_interrupt_flags = 1;
+		ppd->qsfp_info.qsfp_interrupt_functional = 1;
+		spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+	}
+
+	/* Schedule the QSFP work only if there is a cable attached. */
+	if (qsfp_mod_present(ppd))
+		queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
+}
+
+static int request_host_lcb_access(struct hfi1_devdata *dd)
+{
+	int ret;
+
+	ret = do_8051_command(dd, HCMD_MISC,
+		(u64)HCMD_MISC_REQUEST_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
+		NULL);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd, "%s: command failed with error %d\n",
+			__func__, ret);
+	}
+	return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+static int request_8051_lcb_access(struct hfi1_devdata *dd)
+{
+	int ret;
+
+	ret = do_8051_command(dd, HCMD_MISC,
+		(u64)HCMD_MISC_GRANT_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
+		NULL);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd, "%s: command failed with error %d\n",
+			__func__, ret);
+	}
+	return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+/*
+ * Set the LCB selector - allow host access.  The DCC selector always
+ * points to the host.
+ */
+static inline void set_host_lcb_access(struct hfi1_devdata *dd)
+{
+	write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+				DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK
+				| DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
+}
+
+/*
+ * Clear the LCB selector - allow 8051 access.  The DCC selector always
+ * points to the host.
+ */
+static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
+{
+	write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+				DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
+}
+
+/*
+ * Acquire LCB access from the 8051.  If the host already has access,
+ * just increment a counter.  Otherwise, inform the 8051 that the
+ * host is taking access.
+ *
+ * Returns:
+ *	0 on success
+ *	-EBUSY if the 8051 has control and cannot be disturbed
+ *	-errno if unable to acquire access from the 8051
+ */
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+	struct hfi1_pportdata *ppd = dd->pport;
+	int ret = 0;
+
+	/*
+	 * Use the host link state lock so the operation of this routine
+	 * { link state check, selector change, count increment } can occur
+	 * as a unit against a link state change.  Otherwise there is a
+	 * race between the state change and the count increment.
+	 */
+	if (sleep_ok) {
+		mutex_lock(&ppd->hls_lock);
+	} else {
+		while (mutex_trylock(&ppd->hls_lock) == EBUSY)
+			udelay(1);
+	}
+
+	/* this access is valid only when the link is up */
+	if ((ppd->host_link_state & HLS_UP) == 0) {
+		dd_dev_info(dd, "%s: link state %s not up\n",
+			__func__, link_state_name(ppd->host_link_state));
+		ret = -EBUSY;
+		goto done;
+	}
+
+	if (dd->lcb_access_count == 0) {
+		ret = request_host_lcb_access(dd);
+		if (ret) {
+			dd_dev_err(dd,
+				"%s: unable to acquire LCB access, err %d\n",
+				__func__, ret);
+			goto done;
+		}
+		set_host_lcb_access(dd);
+	}
+	dd->lcb_access_count++;
+done:
+	mutex_unlock(&ppd->hls_lock);
+	return ret;
+}
+
+/*
+ * Release LCB access by decrementing the use count.  If the count is moving
+ * from 1 to 0, inform 8051 that it has control back.
+ *
+ * Returns:
+ *	0 on success
+ *	-errno if unable to release access to the 8051
+ */
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+	int ret = 0;
+
+	/*
+	 * Use the host link state lock because the acquire needed it.
+	 * Here, we only need to keep { selector change, count decrement }
+	 * as a unit.
+	 */
+	if (sleep_ok) {
+		mutex_lock(&dd->pport->hls_lock);
+	} else {
+		while (mutex_trylock(&dd->pport->hls_lock) == EBUSY)
+			udelay(1);
+	}
+
+	if (dd->lcb_access_count == 0) {
+		dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
+			__func__);
+		goto done;
+	}
+
+	if (dd->lcb_access_count == 1) {
+		set_8051_lcb_access(dd);
+		ret = request_8051_lcb_access(dd);
+		if (ret) {
+			dd_dev_err(dd,
+				"%s: unable to release LCB access, err %d\n",
+				__func__, ret);
+			/* restore host access if the grant didn't work */
+			set_host_lcb_access(dd);
+			goto done;
+		}
+	}
+	dd->lcb_access_count--;
+done:
+	mutex_unlock(&dd->pport->hls_lock);
+	return ret;
+}
+
+/*
+ * Initialize LCB access variables and state.  Called during driver load,
+ * after most of the initialization is finished.
+ *
+ * The DC default is LCB access on for the host.  The driver defaults to
+ * leaving access to the 8051.  Assign access now - this constrains the call
+ * to this routine to be after all LCB set-up is done.  In particular, after
+ * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
+ */
+static void init_lcb_access(struct hfi1_devdata *dd)
+{
+	dd->lcb_access_count = 0;
+}
+
+/*
+ * Write a response back to a 8051 request.
+ */
+static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
+{
+	write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
+		DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK
+		| (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT
+		| (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+}
+
+/*
+ * Handle requests from the 8051.
+ */
+static void handle_8051_request(struct hfi1_devdata *dd)
+{
+	u64 reg;
+	u16 data;
+	u8 type;
+
+	reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
+	if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
+		return;	/* no request */
+
+	/* zero out COMPLETED so the response is seen */
+	write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
+
+	/* extract request details */
+	type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
+			& DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
+	data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
+			& DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
+
+	switch (type) {
+	case HREQ_LOAD_CONFIG:
+	case HREQ_SAVE_CONFIG:
+	case HREQ_READ_CONFIG:
+	case HREQ_SET_TX_EQ_ABS:
+	case HREQ_SET_TX_EQ_REL:
+	case HREQ_ENABLE:
+		dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
+			type);
+		hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+		break;
+
+	case HREQ_CONFIG_DONE:
+		hreq_response(dd, HREQ_SUCCESS, 0);
+		break;
+
+	case HREQ_INTERFACE_TEST:
+		hreq_response(dd, HREQ_SUCCESS, data);
+		break;
+
+	default:
+		dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
+		hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+		break;
+	}
+}
+
+static void write_global_credit(struct hfi1_devdata *dd,
+				u8 vau, u16 total, u16 shared)
+{
+	write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+		((u64)total
+			<< SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+		| ((u64)shared
+			<< SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+		| ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
+}
+
+/*
+ * Set up initial VL15 credits of the remote.  Assumes the rest of
+ * the CM credit registers are zero from a previous global or credit reset .
+ */
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
+{
+	/* leave shared count at zero for both global and VL15 */
+	write_global_credit(dd, vau, vl15buf, 0);
+
+	/* We may need some credits for another VL when sending packets
+	 * with the snoop interface. Dividing it down the middle for VL15
+	 * and VL0 should suffice.
+	 */
+	if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
+		write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1)
+		    << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+		write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1)
+		    << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
+	} else {
+		write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
+			<< SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+	}
+}
+
+/*
+ * Zero all credit details from the previous connection and
+ * reset the CM manager's internal counters.
+ */
+void reset_link_credits(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* remove all previous VL credit limits */
+	for (i = 0; i < TXE_NUM_DATA_VL; i++)
+		write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+	write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+	write_global_credit(dd, 0, 0, 0);
+	/* reset the CM block */
+	pio_send_control(dd, PSC_CM_RESET);
+}
+
+/* convert a vCU to a CU */
+static u32 vcu_to_cu(u8 vcu)
+{
+	return 1 << vcu;
+}
+
+/* convert a CU to a vCU */
+static u8 cu_to_vcu(u32 cu)
+{
+	return ilog2(cu);
+}
+
+/* convert a vAU to an AU */
+static u32 vau_to_au(u8 vau)
+{
+	return 8 * (1 << vau);
+}
+
+static void set_linkup_defaults(struct hfi1_pportdata *ppd)
+{
+	ppd->sm_trap_qp = 0x0;
+	ppd->sa_qp = 0x1;
+}
+
+/*
+ * Graceful LCB shutdown.  This leaves the LCB FIFOs in reset.
+ */
+static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
+{
+	u64 reg;
+
+	/* clear lcb run: LCB_CFG_RUN.EN = 0 */
+	write_csr(dd, DC_LCB_CFG_RUN, 0);
+	/* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
+	write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
+		1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
+	/* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
+	dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
+	reg = read_csr(dd, DCC_CFG_RESET);
+	write_csr(dd, DCC_CFG_RESET,
+		reg
+		| (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT)
+		| (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
+	(void) read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
+	if (!abort) {
+		udelay(1);    /* must hold for the longer of 16cclks or 20ns */
+		write_csr(dd, DCC_CFG_RESET, reg);
+		write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+	}
+}
+
+/*
+ * This routine should be called after the link has been transitioned to
+ * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
+ * reset).
+ *
+ * The expectation is that the caller of this routine would have taken
+ * care of properly transitioning the link into the correct state.
+ */
+static void dc_shutdown(struct hfi1_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->dc8051_lock, flags);
+	if (dd->dc_shutdown) {
+		spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+		return;
+	}
+	dd->dc_shutdown = 1;
+	spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+	/* Shutdown the LCB */
+	lcb_shutdown(dd, 1);
+	/* Going to OFFLINE would have causes the 8051 to put the
+	 * SerDes into reset already. Just need to shut down the 8051,
+	 * itself. */
+	write_csr(dd, DC_DC8051_CFG_RST, 0x1);
+}
+
+/* Calling this after the DC has been brought out of reset should not
+ * do any damage. */
+static void dc_start(struct hfi1_devdata *dd)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dd->dc8051_lock, flags);
+	if (!dd->dc_shutdown)
+		goto done;
+	spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+	/* Take the 8051 out of reset */
+	write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+	/* Wait until 8051 is ready */
+	ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+	if (ret) {
+		dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+			__func__);
+	}
+	/* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
+	write_csr(dd, DCC_CFG_RESET, 0x10);
+	/* lcb_shutdown() with abort=1 does not restore these */
+	write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+	spin_lock_irqsave(&dd->dc8051_lock, flags);
+	dd->dc_shutdown = 0;
+done:
+	spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+}
+
+/*
+ * These LCB adjustments are for the Aurora SerDes core in the FPGA.
+ */
+static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
+{
+	u64 rx_radr, tx_radr;
+	u32 version;
+
+	if (dd->icode != ICODE_FPGA_EMULATION)
+		return;
+
+	/*
+	 * These LCB defaults on emulator _s are good, nothing to do here:
+	 *	LCB_CFG_TX_FIFOS_RADR
+	 *	LCB_CFG_RX_FIFOS_RADR
+	 *	LCB_CFG_LN_DCLK
+	 *	LCB_CFG_IGNORE_LOST_RCLK
+	 */
+	if (is_emulator_s(dd))
+		return;
+	/* else this is _p */
+
+	version = emulator_rev(dd);
+	if (!is_a0(dd))
+		version = 0x2d;	/* all B0 use 0x2d or higher settings */
+
+	if (version <= 0x12) {
+		/* release 0x12 and below */
+
+		/*
+		 * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
+		 * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
+		 * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
+		 */
+		rx_radr =
+		      0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+		    | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+		    | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+		/*
+		 * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
+		 * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
+		 */
+		tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+	} else if (version <= 0x18) {
+		/* release 0x13 up to 0x18 */
+		/* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+		rx_radr =
+		      0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+		    | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+		    | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+		tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+	} else if (version == 0x19) {
+		/* release 0x19 */
+		/* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
+		rx_radr =
+		      0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+		    | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+		    | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+		tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+	} else if (version == 0x1a) {
+		/* release 0x1a */
+		/* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+		rx_radr =
+		      0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+		    | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+		    | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+		tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+		write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
+	} else {
+		/* release 0x1b and higher */
+		/* LCB_CFG_RX_FIFOS_RADR = 0x877 */
+		rx_radr =
+		      0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+		    | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+		    | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+		tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+	}
+
+	write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
+	/* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
+	write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
+		DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
+	write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
+}
+
+/*
+ * Handle a SMA idle message
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_sma_message(struct work_struct *work)
+{
+	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+							sma_message_work);
+	struct hfi1_devdata *dd = ppd->dd;
+	u64 msg;
+	int ret;
+
+	/* msg is bytes 1-4 of the 40-bit idle message - the command code
+	   is stripped off */
+	ret = read_idle_sma(dd, &msg);
+	if (ret)
+		return;
+	dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
+	/*
+	 * React to the SMA message.  Byte[1] (0 for us) is the command.
+	 */
+	switch (msg & 0xff) {
+	case SMA_IDLE_ARM:
+		/*
+		 * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+		 * State Transitions
+		 *
+		 * Only expected in INIT or ARMED, discard otherwise.
+		 */
+		if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
+			ppd->neighbor_normal = 1;
+		break;
+	case SMA_IDLE_ACTIVE:
+		/*
+		 * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+		 * State Transitions
+		 *
+		 * Can activate the node.  Discard otherwise.
+		 */
+		if (ppd->host_link_state == HLS_UP_ARMED
+					&& ppd->is_active_optimize_enabled) {
+			ppd->neighbor_normal = 1;
+			ret = set_link_state(ppd, HLS_UP_ACTIVE);
+			if (ret)
+				dd_dev_err(
+					dd,
+					"%s: received Active SMA idle message, couldn't set link to Active\n",
+					__func__);
+		}
+		break;
+	default:
+		dd_dev_err(dd,
+			"%s: received unexpected SMA idle message 0x%llx\n",
+			__func__, msg);
+		break;
+	}
+}
+
+static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
+{
+	u64 rcvctrl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->rcvctrl_lock, flags);
+	rcvctrl = read_csr(dd, RCV_CTRL);
+	rcvctrl |= add;
+	rcvctrl &= ~clear;
+	write_csr(dd, RCV_CTRL, rcvctrl);
+	spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
+}
+
+static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
+{
+	adjust_rcvctrl(dd, add, 0);
+}
+
+static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
+{
+	adjust_rcvctrl(dd, 0, clear);
+}
+
+/*
+ * Called from all interrupt handlers to start handling an SPC freeze.
+ */
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	struct send_context *sc;
+	int i;
+
+	if (flags & FREEZE_SELF)
+		write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+
+	/* enter frozen mode */
+	dd->flags |= HFI1_FROZEN;
+
+	/* notify all SDMA engines that they are going into a freeze */
+	sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
+
+	/* do halt pre-handling on all enabled send contexts */
+	for (i = 0; i < dd->num_send_contexts; i++) {
+		sc = dd->send_contexts[i].sc;
+		if (sc && (sc->flags & SCF_ENABLED))
+			sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+	}
+
+	/* Send context are frozen. Notify user space */
+	hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
+
+	if (flags & FREEZE_ABORT) {
+		dd_dev_err(dd,
+			   "Aborted freeze recovery. Please REBOOT system\n");
+		return;
+	}
+	/* queue non-interrupt handler */
+	queue_work(ppd->hfi1_wq, &ppd->freeze_work);
+}
+
+/*
+ * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
+ * depending on the "freeze" parameter.
+ *
+ * No need to return an error if it times out, our only option
+ * is to proceed anyway.
+ */
+static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
+{
+	unsigned long timeout;
+	u64 reg;
+
+	timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
+	while (1) {
+		reg = read_csr(dd, CCE_STATUS);
+		if (freeze) {
+			/* waiting until all indicators are set */
+			if ((reg & ALL_FROZE) == ALL_FROZE)
+				return;	/* all done */
+		} else {
+			/* waiting until all indicators are clear */
+			if ((reg & ALL_FROZE) == 0)
+				return; /* all done */
+		}
+
+		if (time_after(jiffies, timeout)) {
+			dd_dev_err(dd,
+				"Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
+				freeze ? "" : "un",
+				reg & ALL_FROZE,
+				freeze ? ALL_FROZE : 0ull);
+			return;
+		}
+		usleep_range(80, 120);
+	}
+}
+
+/*
+ * Do all freeze handling for the RXE block.
+ */
+static void rxe_freeze(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* disable port */
+	clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+	/* disable all receive contexts */
+	for (i = 0; i < dd->num_rcv_contexts; i++)
+		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
+}
+
+/*
+ * Unfreeze handling for the RXE block - kernel contexts only.
+ * This will also enable the port.  User contexts will do unfreeze
+ * handling on a per-context basis as they call into the driver.
+ *
+ */
+static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* enable all kernel contexts */
+	for (i = 0; i < dd->n_krcv_queues; i++)
+		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, i);
+
+	/* enable port */
+	add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+/*
+ * Non-interrupt SPC freeze handling.
+ *
+ * This is a work-queue function outside of the triggering interrupt.
+ */
+void handle_freeze(struct work_struct *work)
+{
+	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+								freeze_work);
+	struct hfi1_devdata *dd = ppd->dd;
+
+	/* wait for freeze indicators on all affected blocks */
+	dd_dev_info(dd, "Entering SPC freeze\n");
+	wait_for_freeze_status(dd, 1);
+
+	/* SPC is now frozen */
+
+	/* do send PIO freeze steps */
+	pio_freeze(dd);
+
+	/* do send DMA freeze steps */
+	sdma_freeze(dd);
+
+	/* do send egress freeze steps - nothing to do */
+
+	/* do receive freeze steps */
+	rxe_freeze(dd);
+
+	/*
+	 * Unfreeze the hardware - clear the freeze, wait for each
+	 * block's frozen bit to clear, then clear the frozen flag.
+	 */
+	write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+	wait_for_freeze_status(dd, 0);
+
+	if (is_a0(dd)) {
+		write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+		wait_for_freeze_status(dd, 1);
+		write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+		wait_for_freeze_status(dd, 0);
+	}
+
+	/* do send PIO unfreeze steps for kernel contexts */
+	pio_kernel_unfreeze(dd);
+
+	/* do send DMA unfreeze steps */
+	sdma_unfreeze(dd);
+
+	/* do send egress unfreeze steps - nothing to do */
+
+	/* do receive unfreeze steps for kernel contexts */
+	rxe_kernel_unfreeze(dd);
+
+	/*
+	 * The unfreeze procedure touches global device registers when
+	 * it disables and re-enables RXE. Mark the device unfrozen
+	 * after all that is done so other parts of the driver waiting
+	 * for the device to unfreeze don't do things out of order.
+	 *
+	 * The above implies that the meaning of HFI1_FROZEN flag is
+	 * "Device has gone into freeze mode and freeze mode handling
+	 * is still in progress."
+	 *
+	 * The flag will be removed when freeze mode processing has
+	 * completed.
+	 */
+	dd->flags &= ~HFI1_FROZEN;
+	wake_up(&dd->event_queue);
+
+	/* no longer frozen */
+	dd_dev_err(dd, "Exiting SPC freeze\n");
+}
+
+/*
+ * Handle a link up interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_up(struct work_struct *work)
+{
+	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+								link_up_work);
+	set_link_state(ppd, HLS_UP_INIT);
+
+	/* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+	read_ltp_rtt(ppd->dd);
+	/*
+	 * OPA specifies that certain counters are cleared on a transition
+	 * to link up, so do that.
+	 */
+	clear_linkup_counters(ppd->dd);
+	/*
+	 * And (re)set link up default values.
+	 */
+	set_linkup_defaults(ppd);
+
+	/* enforce link speed enabled */
+	if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
+		/* oops - current speed is not enabled, bounce */
+		dd_dev_err(ppd->dd,
+			"Link speed active 0x%x is outside enabled 0x%x, downing link\n",
+			ppd->link_speed_active, ppd->link_speed_enabled);
+		set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
+			OPA_LINKDOWN_REASON_SPEED_POLICY);
+		set_link_state(ppd, HLS_DN_OFFLINE);
+		start_link(ppd);
+	}
+}
+
+/* Several pieces of LNI information were cached for SMA in ppd.
+ * Reset these on link down */
+static void reset_neighbor_info(struct hfi1_pportdata *ppd)
+{
+	ppd->neighbor_guid = 0;
+	ppd->neighbor_port_number = 0;
+	ppd->neighbor_type = 0;
+	ppd->neighbor_fm_security = 0;
+}
+
+/*
+ * Handle a link down interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_down(struct work_struct *work)
+{
+	u8 lcl_reason, neigh_reason = 0;
+	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+								link_down_work);
+
+	/* go offline first, then deal with reasons */
+	set_link_state(ppd, HLS_DN_OFFLINE);
+
+	lcl_reason = 0;
+	read_planned_down_reason_code(ppd->dd, &neigh_reason);
+
+	/*
+	 * If no reason, assume peer-initiated but missed
+	 * LinkGoingDown idle flits.
+	 */
+	if (neigh_reason == 0)
+		lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+
+	set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+
+	reset_neighbor_info(ppd);
+
+	/* disable the port */
+	clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+	/* If there is no cable attached, turn the DC off. Otherwise,
+	 * start the link bring up. */
+	if (!qsfp_mod_present(ppd))
+		dc_shutdown(ppd->dd);
+	else
+		start_link(ppd);
+}
+
+void handle_link_bounce(struct work_struct *work)
+{
+	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+							link_bounce_work);
+
+	/*
+	 * Only do something if the link is currently up.
+	 */
+	if (ppd->host_link_state & HLS_UP) {
+		set_link_state(ppd, HLS_DN_OFFLINE);
+		start_link(ppd);
+	} else {
+		dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
+			__func__, link_state_name(ppd->host_link_state));
+	}
+}
+
+/*
+ * Mask conversion: Capability exchange to Port LTP.  The capability
+ * exchange has an implicit 16b CRC that is mandatory.
+ */
+static int cap_to_port_ltp(int cap)
+{
+	int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */
+
+	if (cap & CAP_CRC_14B)
+		port_ltp |= PORT_LTP_CRC_MODE_14;
+	if (cap & CAP_CRC_48B)
+		port_ltp |= PORT_LTP_CRC_MODE_48;
+	if (cap & CAP_CRC_12B_16B_PER_LANE)
+		port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;
+
+	return port_ltp;
+}
+
+/*
+ * Convert an OPA Port LTP mask to capability mask
+ */
+int port_ltp_to_cap(int port_ltp)
+{
+	int cap_mask = 0;
+
+	if (port_ltp & PORT_LTP_CRC_MODE_14)
+		cap_mask |= CAP_CRC_14B;
+	if (port_ltp & PORT_LTP_CRC_MODE_48)
+		cap_mask |= CAP_CRC_48B;
+	if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
+		cap_mask |= CAP_CRC_12B_16B_PER_LANE;
+
+	return cap_mask;
+}
+
+/*
+ * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
+ */
+static int lcb_to_port_ltp(int lcb_crc)
+{
+	int port_ltp = 0;
+
+	if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
+		port_ltp = PORT_LTP_CRC_MODE_PER_LANE;
+	else if (lcb_crc == LCB_CRC_48B)
+		port_ltp = PORT_LTP_CRC_MODE_48;
+	else if (lcb_crc == LCB_CRC_14B)
+		port_ltp = PORT_LTP_CRC_MODE_14;
+	else
+		port_ltp = PORT_LTP_CRC_MODE_16;
+
+	return port_ltp;
+}
+
+/*
+ * Our neighbor has indicated that we are allowed to act as a fabric
+ * manager, so place the full management partition key in the second
+ * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note
+ * that we should already have the limited management partition key in
+ * array element 1, and also that the port is not yet up when
+ * add_full_mgmt_pkey() is invoked.
+ */
+static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+
+	/* Sanity check - ppd->pkeys[2] should be 0 */
+	if (ppd->pkeys[2] != 0)
+		dd_dev_err(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
+			   __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
+	ppd->pkeys[2] = FULL_MGMT_P_KEY;
+	(void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+}
+
+/*
+ * Convert the given link width to the OPA link width bitmask.
+ */
+static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
+{
+	switch (width) {
+	case 0:
+		/*
+		 * Simulator and quick linkup do not set the width.
+		 * Just set it to 4x without complaint.
+		 */
+		if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
+			return OPA_LINK_WIDTH_4X;
+		return 0; /* no lanes up */
+	case 1: return OPA_LINK_WIDTH_1X;
+	case 2: return OPA_LINK_WIDTH_2X;
+	case 3: return OPA_LINK_WIDTH_3X;
+	default:
+		dd_dev_info(dd, "%s: invalid width %d, using 4\n",
+			__func__, width);
+		/* fall through */
+	case 4: return OPA_LINK_WIDTH_4X;
+	}
+}
+
+/*
+ * Do a population count on the bottom nibble.
+ */
+static const u8 bit_counts[16] = {
+	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
+};
+static inline u8 nibble_to_count(u8 nibble)
+{
+	return bit_counts[nibble & 0xf];
+}
+
+/*
+ * Read the active lane information from the 8051 registers and return
+ * their widths.
+ *
+ * Active lane information is found in these 8051 registers:
+ *	enable_lane_tx
+ *	enable_lane_rx
+ */
+static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
+			    u16 *rx_width)
+{
+	u16 tx, rx;
+	u8 enable_lane_rx;
+	u8 enable_lane_tx;
+	u8 tx_polarity_inversion;
+	u8 rx_polarity_inversion;
+	u8 max_rate;
+
+	/* read the active lanes */
+	read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+				&rx_polarity_inversion, &max_rate);
+	read_local_lni(dd, &enable_lane_rx);
+
+	/* convert to counts */
+	tx = nibble_to_count(enable_lane_tx);
+	rx = nibble_to_count(enable_lane_rx);
+
+	/*
+	 * Set link_speed_active here, overriding what was set in
+	 * handle_verify_cap().  The ASIC 8051 firmware does not correctly
+	 * set the max_rate field in handle_verify_cap until v0.19.
+	 */
+	if ((dd->icode == ICODE_RTL_SILICON)
+				&& (dd->dc8051_ver < dc8051_ver(0, 19))) {
+		/* max_rate: 0 = 12.5G, 1 = 25G */
+		switch (max_rate) {
+		case 0:
+			dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
+			break;
+		default:
+			dd_dev_err(dd,
+				"%s: unexpected max rate %d, using 25Gb\n",
+				__func__, (int)max_rate);
+			/* fall through */
+		case 1:
+			dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+			break;
+		}
+	}
+
+	dd_dev_info(dd,
+		"Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
+		enable_lane_tx, tx, enable_lane_rx, rx);
+	*tx_width = link_width_to_bits(dd, tx);
+	*rx_width = link_width_to_bits(dd, rx);
+}
+
+/*
+ * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
+ * Valid after the end of VerifyCap and during LinkUp.  Does not change
+ * after link up.  I.e. look elsewhere for downgrade information.
+ *
+ * Bits are:
+ *	+ bits [7:4] contain the number of active transmitters
+ *	+ bits [3:0] contain the number of active receivers
+ * These are numbers 1 through 4 and can be different values if the
+ * link is asymmetric.
+ *
+ * verify_cap_local_fm_link_width[0] retains its original value.
+ */
+static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
+			      u16 *rx_width)
+{
+	u16 widths, tx, rx;
+	u8 misc_bits, local_flags;
+	u16 active_tx, active_rx;
+
+	read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
+	tx = widths >> 12;
+	rx = (widths >> 8) & 0xf;
+
+	*tx_width = link_width_to_bits(dd, tx);
+	*rx_width = link_width_to_bits(dd, rx);
+
+	/* print the active widths */
+	get_link_widths(dd, &active_tx, &active_rx);
+}
+
+/*
+ * Set ppd->link_width_active and ppd->link_width_downgrade_active using
+ * hardware information when the link first comes up.
+ *
+ * The link width is not available until after VerifyCap.AllFramesReceived
+ * (the trigger for handle_verify_cap), so this is outside that routine
+ * and should be called when the 8051 signals linkup.
+ */
+void get_linkup_link_widths(struct hfi1_pportdata *ppd)
+{
+	u16 tx_width, rx_width;
+
+	/* get end-of-LNI link widths */
+	get_linkup_widths(ppd->dd, &tx_width, &rx_width);
+
+	/* use tx_width as the link is supposed to be symmetric on link up */
+	ppd->link_width_active = tx_width;
+	/* link width downgrade active (LWD.A) starts out matching LW.A */
+	ppd->link_width_downgrade_tx_active = ppd->link_width_active;
+	ppd->link_width_downgrade_rx_active = ppd->link_width_active;
+	/* per OPA spec, on link up LWD.E resets to LWD.S */
+	ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
+	/* cache the active egress rate (units {10^6 bits/sec]) */
+	ppd->current_egress_rate = active_egress_rate(ppd);
+}
+
+/*
+ * Handle a verify capabilities interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_verify_cap(struct work_struct *work)
+{
+	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+								link_vc_work);
+	struct hfi1_devdata *dd = ppd->dd;
+	u64 reg;
+	u8 power_management;
+	u8 continious;
+	u8 vcu;
+	u8 vau;
+	u8 z;
+	u16 vl15buf;
+	u16 link_widths;
+	u16 crc_mask;
+	u16 crc_val;
+	u16 device_id;
+	u16 active_tx, active_rx;
+	u8 partner_supported_crc;
+	u8 remote_tx_rate;
+	u8 device_rev;
+
+	set_link_state(ppd, HLS_VERIFY_CAP);
+
+	lcb_shutdown(dd, 0);
+	adjust_lcb_for_fpga_serdes(dd);
+
+	/*
+	 * These are now valid:
+	 *	remote VerifyCap fields in the general LNI config
+	 *	CSR DC8051_STS_REMOTE_GUID
+	 *	CSR DC8051_STS_REMOTE_NODE_TYPE
+	 *	CSR DC8051_STS_REMOTE_FM_SECURITY
+	 *	CSR DC8051_STS_REMOTE_PORT_NO
+	 */
+
+	read_vc_remote_phy(dd, &power_management, &continious);
+	read_vc_remote_fabric(
+		dd,
+		&vau,
+		&z,
+		&vcu,
+		&vl15buf,
+		&partner_supported_crc);
+	read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
+	read_remote_device_id(dd, &device_id, &device_rev);
+	/*
+	 * And the 'MgmtAllowed' information, which is exchanged during
+	 * LNI, is also be available at this point.
+	 */
+	read_mgmt_allowed(dd, &ppd->mgmt_allowed);
+	/* print the active widths */
+	get_link_widths(dd, &active_tx, &active_rx);
+	dd_dev_info(dd,
+		"Peer PHY: power management 0x%x, continuous updates 0x%x\n",
+		(int)power_management, (int)continious);
+	dd_dev_info(dd,
+		"Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
+		(int)vau,
+		(int)z,
+		(int)vcu,
+		(int)vl15buf,
+		(int)partner_supported_crc);
+	dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
+		(u32)remote_tx_rate, (u32)link_widths);
+	dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
+		(u32)device_id, (u32)device_rev);
+	/*
+	 * The peer vAU value just read is the peer receiver value.  HFI does
+	 * not support a transmit vAU of 0 (AU == 8).  We advertised that
+	 * with Z=1 in the fabric capabilities sent to the peer.  The peer
+	 * will see our Z=1, and, if it advertised a vAU of 0, will move its
+	 * receive to vAU of 1 (AU == 16).  Do the same here.  We do not care
+	 * about the peer Z value - our sent vAU is 3 (hardwired) and is not
+	 * subject to the Z value exception.
+	 */
+	if (vau == 0)
+		vau = 1;
+	set_up_vl15(dd, vau, vl15buf);
+
+	/* set up the LCB CRC mode */
+	crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
+
+	/* order is important: use the lowest bit in common */
+	if (crc_mask & CAP_CRC_14B)
+		crc_val = LCB_CRC_14B;
+	else if (crc_mask & CAP_CRC_48B)
+		crc_val = LCB_CRC_48B;
+	else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
+		crc_val = LCB_CRC_12B_16B_PER_LANE;
+	else
+		crc_val = LCB_CRC_16B;
+
+	dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
+	write_csr(dd, DC_LCB_CFG_CRC_MODE,
+		  (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
+
+	/* set (14b only) or clear sideband credit */
+	reg = read_csr(dd, SEND_CM_CTRL);
+	if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
+		write_csr(dd, SEND_CM_CTRL,
+			reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+	} else {
+		write_csr(dd, SEND_CM_CTRL,
+			reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+	}
+
+	ppd->link_speed_active = 0;	/* invalid value */
+	if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+		/* remote_tx_rate: 0 = 12.5G, 1 = 25G */
+		switch (remote_tx_rate) {
+		case 0:
+			ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+			break;
+		case 1:
+			ppd->link_speed_active = OPA_LINK_SPEED_25G;
+			break;
+		}
+	} else {
+		/* actual rate is highest bit of the ANDed rates */
+		u8 rate = remote_tx_rate & ppd->local_tx_rate;
+
+		if (rate & 2)
+			ppd->link_speed_active = OPA_LINK_SPEED_25G;
+		else if (rate & 1)
+			ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+	}
+	if (ppd->link_speed_active == 0) {
+		dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
+			__func__, (int)remote_tx_rate);
+		ppd->link_speed_active = OPA_LINK_SPEED_25G;
+	}
+
+	/*
+	 * Cache the values of the supported, enabled, and active
+	 * LTP CRC modes to return in 'portinfo' queries. But the bit
+	 * flags that are returned in the portinfo query differ from
+	 * what's in the link_crc_mask, crc_sizes, and crc_val
+	 * variables. Convert these here.
+	 */
+	ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+		/* supported crc modes */
+	ppd->port_ltp_crc_mode |=
+		cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
+		/* enabled crc modes */
+	ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
+		/* active crc mode */
+
+	/* set up the remote credit return table */
+	assign_remote_cm_au_table(dd, vcu);
+
+	/*
+	 * The LCB is reset on entry to handle_verify_cap(), so this must
+	 * be applied on every link up.
+	 *
+	 * Adjust LCB error kill enable to kill the link if
+	 * these RBUF errors are seen:
+	 *	REPLAY_BUF_MBE_SMASK
+	 *	FLIT_INPUT_BUF_MBE_SMASK
+	 */
+	if (is_a0(dd)) {			/* fixed in B0 */
+		reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
+		reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
+			| DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
+		write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
+	}
+
+	/* pull LCB fifos out of reset - all fifo clocks must be stable */
+	write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+	/* give 8051 access to the LCB CSRs */
+	write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+	set_8051_lcb_access(dd);
+
+	ppd->neighbor_guid =
+		read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+	ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+					DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+	ppd->neighbor_type =
+		read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+		DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+	ppd->neighbor_fm_security =
+		read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
+		DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
+	dd_dev_info(dd,
+		"Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
+		ppd->neighbor_guid, ppd->neighbor_type,
+		ppd->mgmt_allowed, ppd->neighbor_fm_security);
+	if (ppd->mgmt_allowed)
+		add_full_mgmt_pkey(ppd);
+
+	/* tell the 8051 to go to LinkUp */
+	set_link_state(ppd, HLS_GOING_UP);
+}
+
+/*
+ * Apply the link width downgrade enabled policy against the current active
+ * link widths.
+ *
+ * Called when the enabled policy changes or the active link widths change.
+ */
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
+{
+	int skip = 1;
+	int do_bounce = 0;
+	u16 lwde = ppd->link_width_downgrade_enabled;
+	u16 tx, rx;
+
+	mutex_lock(&ppd->hls_lock);
+	/* only apply if the link is up */
+	if (ppd->host_link_state & HLS_UP)
+		skip = 0;
+	mutex_unlock(&ppd->hls_lock);
+	if (skip)
+		return;
+
+	if (refresh_widths) {
+		get_link_widths(ppd->dd, &tx, &rx);
+		ppd->link_width_downgrade_tx_active = tx;
+		ppd->link_width_downgrade_rx_active = rx;
+	}
+
+	if (lwde == 0) {
+		/* downgrade is disabled */
+
+		/* bounce if not at starting active width */
+		if ((ppd->link_width_active !=
+					ppd->link_width_downgrade_tx_active)
+				|| (ppd->link_width_active !=
+					ppd->link_width_downgrade_rx_active)) {
+			dd_dev_err(ppd->dd,
+				"Link downgrade is disabled and link has downgraded, downing link\n");
+			dd_dev_err(ppd->dd,
+				"  original 0x%x, tx active 0x%x, rx active 0x%x\n",
+				ppd->link_width_active,
+				ppd->link_width_downgrade_tx_active,
+				ppd->link_width_downgrade_rx_active);
+			do_bounce = 1;
+		}
+	} else if ((lwde & ppd->link_width_downgrade_tx_active) == 0
+		|| (lwde & ppd->link_width_downgrade_rx_active) == 0) {
+		/* Tx or Rx is outside the enabled policy */
+		dd_dev_err(ppd->dd,
+			"Link is outside of downgrade allowed, downing link\n");
+		dd_dev_err(ppd->dd,
+			"  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
+			lwde,
+			ppd->link_width_downgrade_tx_active,
+			ppd->link_width_downgrade_rx_active);
+		do_bounce = 1;
+	}
+
+	if (do_bounce) {
+		set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
+		  OPA_LINKDOWN_REASON_WIDTH_POLICY);
+		set_link_state(ppd, HLS_DN_OFFLINE);
+		start_link(ppd);
+	}
+}
+
+/*
+ * Handle a link downgrade interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_downgrade(struct work_struct *work)
+{
+	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+							link_downgrade_work);
+
+	dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
+	apply_link_downgrade_policy(ppd, 1);
+}
+
+static char *dcc_err_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags, dcc_err_flags,
+		ARRAY_SIZE(dcc_err_flags));
+}
+
+static char *lcb_err_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags, lcb_err_flags,
+		ARRAY_SIZE(lcb_err_flags));
+}
+
+static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags, dc8051_err_flags,
+		ARRAY_SIZE(dc8051_err_flags));
+}
+
+static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags, dc8051_info_err_flags,
+		ARRAY_SIZE(dc8051_info_err_flags));
+}
+
+static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
+{
+	return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
+		ARRAY_SIZE(dc8051_info_host_msg_flags));
+}
+
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	struct hfi1_pportdata *ppd = dd->pport;
+	u64 info, err, host_msg;
+	int queue_link_down = 0;
+	char buf[96];
+
+	/* look at the flags */
+	if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
+		/* 8051 information set by firmware */
+		/* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
+		info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
+		err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
+			& DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
+		host_msg = (info >>
+			DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
+			& DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
+
+		/*
+		 * Handle error flags.
+		 */
+		if (err & FAILED_LNI) {
+			/*
+			 * LNI error indications are cleared by the 8051
+			 * only when starting polling.  Only pay attention
+			 * to them when in the states that occur during
+			 * LNI.
+			 */
+			if (ppd->host_link_state
+			    & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+				queue_link_down = 1;
+				dd_dev_info(dd, "Link error: %s\n",
+					dc8051_info_err_string(buf,
+						sizeof(buf),
+						err & FAILED_LNI));
+			}
+			err &= ~(u64)FAILED_LNI;
+		}
+		if (err) {
+			/* report remaining errors, but do not do anything */
+			dd_dev_err(dd, "8051 info error: %s\n",
+				dc8051_info_err_string(buf, sizeof(buf), err));
+		}
+
+		/*
+		 * Handle host message flags.
+		 */
+		if (host_msg & HOST_REQ_DONE) {
+			/*
+			 * Presently, the driver does a busy wait for
+			 * host requests to complete.  This is only an
+			 * informational message.
+			 * NOTE: The 8051 clears the host message
+			 * information *on the next 8051 command*.
+			 * Therefore, when linkup is achieved,
+			 * this flag will still be set.
+			 */
+			host_msg &= ~(u64)HOST_REQ_DONE;
+		}
+		if (host_msg & BC_SMA_MSG) {
+			queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
+			host_msg &= ~(u64)BC_SMA_MSG;
+		}
+		if (host_msg & LINKUP_ACHIEVED) {
+			dd_dev_info(dd, "8051: Link up\n");
+			queue_work(ppd->hfi1_wq, &ppd->link_up_work);
+			host_msg &= ~(u64)LINKUP_ACHIEVED;
+		}
+		if (host_msg & EXT_DEVICE_CFG_REQ) {
+			handle_8051_request(dd);
+			host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
+		}
+		if (host_msg & VERIFY_CAP_FRAME) {
+			queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
+			host_msg &= ~(u64)VERIFY_CAP_FRAME;
+		}
+		if (host_msg & LINK_GOING_DOWN) {
+			const char *extra = "";
+			/* no downgrade action needed if going down */
+			if (host_msg & LINK_WIDTH_DOWNGRADED) {
+				host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+				extra = " (ignoring downgrade)";
+			}
+			dd_dev_info(dd, "8051: Link down%s\n", extra);
+			queue_link_down = 1;
+			host_msg &= ~(u64)LINK_GOING_DOWN;
+		}
+		if (host_msg & LINK_WIDTH_DOWNGRADED) {
+			queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
+			host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+		}
+		if (host_msg) {
+			/* report remaining messages, but do not do anything */
+			dd_dev_info(dd, "8051 info host message: %s\n",
+				dc8051_info_host_msg_string(buf, sizeof(buf),
+					host_msg));
+		}
+
+		reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
+	}
+	if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
+		/*
+		 * Lost the 8051 heartbeat.  If this happens, we
+		 * receive constant interrupts about it.  Disable
+		 * the interrupt after the first.
+		 */
+		dd_dev_err(dd, "Lost 8051 heartbeat\n");
+		write_csr(dd, DC_DC8051_ERR_EN,
+			read_csr(dd, DC_DC8051_ERR_EN)
+			  & ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
+
+		reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
+	}
+	if (reg) {
+		/* report the error, but do not do anything */
+		dd_dev_err(dd, "8051 error: %s\n",
+			dc8051_err_string(buf, sizeof(buf), reg));
+	}
+
+	if (queue_link_down) {
+		/* if the link is already going down or disabled, do not
+		 * queue another */
+		if ((ppd->host_link_state
+				    & (HLS_GOING_OFFLINE|HLS_LINK_COOLDOWN))
+				|| ppd->link_enabled == 0) {
+			dd_dev_info(dd, "%s: not queuing link down\n",
+				__func__);
+		} else {
+			queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+		}
+	}
+}
+
+static const char * const fm_config_txt[] = {
+[0] =
+	"BadHeadDist: Distance violation between two head flits",
+[1] =
+	"BadTailDist: Distance violation between two tail flits",
+[2] =
+	"BadCtrlDist: Distance violation between two credit control flits",
+[3] =
+	"BadCrdAck: Credits return for unsupported VL",
+[4] =
+	"UnsupportedVLMarker: Received VL Marker",
+[5] =
+	"BadPreempt: Exceeded the preemption nesting level",
+[6] =
+	"BadControlFlit: Received unsupported control flit",
+/* no 7 */
+[8] =
+	"UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
+};
+
+static const char * const port_rcv_txt[] = {
+[1] =
+	"BadPktLen: Illegal PktLen",
+[2] =
+	"PktLenTooLong: Packet longer than PktLen",
+[3] =
+	"PktLenTooShort: Packet shorter than PktLen",
+[4] =
+	"BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
+[5] =
+	"BadDLID: Illegal DLID (0, doesn't match HFI)",
+[6] =
+	"BadL2: Illegal L2 opcode",
+[7] =
+	"BadSC: Unsupported SC",
+[9] =
+	"BadRC: Illegal RC",
+[11] =
+	"PreemptError: Preempting with same VL",
+[12] =
+	"PreemptVL15: Preempting a VL15 packet",
+};
+
+#define OPA_LDR_FMCONFIG_OFFSET 16
+#define OPA_LDR_PORTRCV_OFFSET 0
+static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	u64 info, hdr0, hdr1;
+	const char *extra;
+	char buf[96];
+	struct hfi1_pportdata *ppd = dd->pport;
+	u8 lcl_reason = 0;
+	int do_bounce = 0;
+
+	if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
+		if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
+			info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
+			dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
+			/* set status bit */
+			dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
+		}
+		reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
+	}
+
+	if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
+		struct hfi1_pportdata *ppd = dd->pport;
+		/* this counter saturates at (2^32) - 1 */
+		if (ppd->link_downed < (u32)UINT_MAX)
+			ppd->link_downed++;
+		reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
+	}
+
+	if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
+		u8 reason_valid = 1;
+
+		info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
+		if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
+			dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
+			/* set status bit */
+			dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
+		}
+		switch (info) {
+		case 0:
+		case 1:
+		case 2:
+		case 3:
+		case 4:
+		case 5:
+		case 6:
+			extra = fm_config_txt[info];
+			break;
+		case 8:
+			extra = fm_config_txt[info];
+			if (ppd->port_error_action &
+			    OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
+				do_bounce = 1;
+				/*
+				 * lcl_reason cannot be derived from info
+				 * for this error
+				 */
+				lcl_reason =
+				  OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
+			}
+			break;
+		default:
+			reason_valid = 0;
+			snprintf(buf, sizeof(buf), "reserved%lld", info);
+			extra = buf;
+			break;
+		}
+
+		if (reason_valid && !do_bounce) {
+			do_bounce = ppd->port_error_action &
+					(1 << (OPA_LDR_FMCONFIG_OFFSET + info));
+			lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
+		}
+
+		/* just report this */
+		dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
+		reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
+	}
+
+	if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
+		u8 reason_valid = 1;
+
+		info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
+		hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
+		hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
+		if (!(dd->err_info_rcvport.status_and_code &
+		      OPA_EI_STATUS_SMASK)) {
+			dd->err_info_rcvport.status_and_code =
+				info & OPA_EI_CODE_SMASK;
+			/* set status bit */
+			dd->err_info_rcvport.status_and_code |=
+				OPA_EI_STATUS_SMASK;
+			/* save first 2 flits in the packet that caused
+			 * the error */
+			 dd->err_info_rcvport.packet_flit1 = hdr0;
+			 dd->err_info_rcvport.packet_flit2 = hdr1;
+		}
+		switch (info) {
+		case 1:
+		case 2:
+		case 3:
+		case 4:
+		case 5:
+		case 6:
+		case 7:
+		case 9:
+		case 11:
+		case 12:
+			extra = port_rcv_txt[info];
+			break;
+		default:
+			reason_valid = 0;
+			snprintf(buf, sizeof(buf), "reserved%lld", info);
+			extra = buf;
+			break;
+		}
+
+		if (reason_valid && !do_bounce) {
+			do_bounce = ppd->port_error_action &
+					(1 << (OPA_LDR_PORTRCV_OFFSET + info));
+			lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
+		}
+
+		/* just report this */
+		dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
+		dd_dev_info(dd, "           hdr0 0x%llx, hdr1 0x%llx\n",
+			hdr0, hdr1);
+
+		reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
+	}
+
+	if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
+		/* informative only */
+		dd_dev_info(dd, "8051 access to LCB blocked\n");
+		reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
+	}
+	if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
+		/* informative only */
+		dd_dev_info(dd, "host access to LCB blocked\n");
+		reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
+	}
+
+	/* report any remaining errors */
+	if (reg)
+		dd_dev_info(dd, "DCC Error: %s\n",
+			dcc_err_string(buf, sizeof(buf), reg));
+
+	if (lcl_reason == 0)
+		lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
+
+	if (do_bounce) {
+		dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+		set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
+		queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+	}
+}
+
+static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+	char buf[96];
+
+	dd_dev_info(dd, "LCB Error: %s\n",
+		lcb_err_string(buf, sizeof(buf), reg));
+}
+
+/*
+ * CCE block DC interrupt.  Source is < 8.
+ */
+static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
+{
+	const struct err_reg_info *eri = &dc_errs[source];
+
+	if (eri->handler) {
+		interrupt_clear_down(dd, 0, eri);
+	} else if (source == 3 /* dc_lbm_int */) {
+		/*
+		 * This indicates that a parity error has occurred on the
+		 * address/control lines presented to the LBM.  The error
+		 * is a single pulse, there is no associated error flag,
+		 * and it is non-maskable.  This is because if a parity
+		 * error occurs on the request the request is dropped.
+		 * This should never occur, but it is nice to know if it
+		 * ever does.
+		 */
+		dd_dev_err(dd, "Parity error in DC LBM block\n");
+	} else {
+		dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
+	}
+}
+
+/*
+ * TX block send credit interrupt.  Source is < 160.
+ */
+static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
+{
+	sc_group_release_update(dd, source);
+}
+
+/*
+ * TX block SDMA interrupt.  Source is < 48.
+ *
+ * SDMA interrupts are grouped by type:
+ *
+ *	 0 -  N-1 = SDma
+ *	 N - 2N-1 = SDmaProgress
+ *	2N - 3N-1 = SDmaIdle
+ */
+static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
+{
+	/* what interrupt */
+	unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+	/* which engine */
+	unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
+		   slashstrip(__FILE__), __LINE__, __func__);
+	sdma_dumpstate(&dd->per_sdma[which]);
+#endif
+
+	if (likely(what < 3 && which < dd->num_sdma)) {
+		sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
+	} else {
+		/* should not happen */
+		dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
+	}
+}
+
+/*
+ * RX block receive available interrupt.  Source is < 160.
+ */
+static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
+{
+	struct hfi1_ctxtdata *rcd;
+	char *err_detail;
+
+	if (likely(source < dd->num_rcv_contexts)) {
+		rcd = dd->rcd[source];
+		if (rcd) {
+			if (source < dd->first_user_ctxt)
+				rcd->do_interrupt(rcd);
+			else
+				handle_user_interrupt(rcd);
+			return;	/* OK */
+		}
+		/* received an interrupt, but no rcd */
+		err_detail = "dataless";
+	} else {
+		/* received an interrupt, but are not using that context */
+		err_detail = "out of range";
+	}
+	dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
+		err_detail, source);
+}
+
+/*
+ * RX block receive urgent interrupt.  Source is < 160.
+ */
+static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
+{
+	struct hfi1_ctxtdata *rcd;
+	char *err_detail;
+
+	if (likely(source < dd->num_rcv_contexts)) {
+		rcd = dd->rcd[source];
+		if (rcd) {
+			/* only pay attention to user urgent interrupts */
+			if (source >= dd->first_user_ctxt)
+				handle_user_interrupt(rcd);
+			return;	/* OK */
+		}
+		/* received an interrupt, but no rcd */
+		err_detail = "dataless";
+	} else {
+		/* received an interrupt, but are not using that context */
+		err_detail = "out of range";
+	}
+	dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
+		err_detail, source);
+}
+
+/*
+ * Reserved range interrupt.  Should not be called in normal operation.
+ */
+static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
+{
+	char name[64];
+
+	dd_dev_err(dd, "unexpected %s interrupt\n",
+				is_reserved_name(name, sizeof(name), source));
+}
+
+static const struct is_table is_table[] = {
+/* start		     end
+				name func		interrupt func */
+{ IS_GENERAL_ERR_START,  IS_GENERAL_ERR_END,
+				is_misc_err_name,	is_misc_err_int },
+{ IS_SDMAENG_ERR_START,  IS_SDMAENG_ERR_END,
+				is_sdma_eng_err_name,	is_sdma_eng_err_int },
+{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
+				is_sendctxt_err_name,	is_sendctxt_err_int },
+{ IS_SDMA_START,	     IS_SDMA_END,
+				is_sdma_eng_name,	is_sdma_eng_int },
+{ IS_VARIOUS_START,	     IS_VARIOUS_END,
+				is_various_name,	is_various_int },
+{ IS_DC_START,	     IS_DC_END,
+				is_dc_name,		is_dc_int },
+{ IS_RCVAVAIL_START,     IS_RCVAVAIL_END,
+				is_rcv_avail_name,	is_rcv_avail_int },
+{ IS_RCVURGENT_START,    IS_RCVURGENT_END,
+				is_rcv_urgent_name,	is_rcv_urgent_int },
+{ IS_SENDCREDIT_START,   IS_SENDCREDIT_END,
+				is_send_credit_name,	is_send_credit_int},
+{ IS_RESERVED_START,     IS_RESERVED_END,
+				is_reserved_name,	is_reserved_int},
+};
+
+/*
+ * Interrupt source interrupt - called when the given source has an interrupt.
+ * Source is a bit index into an array of 64-bit integers.
+ */
+static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
+{
+	const struct is_table *entry;
+
+	/* avoids a double compare by walking the table in-order */
+	for (entry = &is_table[0]; entry->is_name; entry++) {
+		if (source < entry->end) {
+			trace_hfi1_interrupt(dd, entry, source);
+			entry->is_int(dd, source - entry->start);
+			return;
+		}
+	}
+	/* fell off the end */
+	dd_dev_err(dd, "invalid interrupt source %u\n", source);
+}
+
+/*
+ * General interrupt handler.  This is able to correctly handle
+ * all interrupts in case INTx is used.
+ */
+static irqreturn_t general_interrupt(int irq, void *data)
+{
+	struct hfi1_devdata *dd = data;
+	u64 regs[CCE_NUM_INT_CSRS];
+	u32 bit;
+	int i;
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* phase 1: scan and clear all handled interrupts */
+	for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+		if (dd->gi_mask[i] == 0) {
+			regs[i] = 0;	/* used later */
+			continue;
+		}
+		regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
+				dd->gi_mask[i];
+		/* only clear if anything is set */
+		if (regs[i])
+			write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
+	}
+
+	/* phase 2: call the appropriate handler */
+	for_each_set_bit(bit, (unsigned long *)&regs[0],
+						CCE_NUM_INT_CSRS*64) {
+		is_interrupt(dd, bit);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t sdma_interrupt(int irq, void *data)
+{
+	struct sdma_engine *sde = data;
+	struct hfi1_devdata *dd = sde->dd;
+	u64 status;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+		   slashstrip(__FILE__), __LINE__, __func__);
+	sdma_dumpstate(sde);
+#endif
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* This read_csr is really bad in the hot path */
+	status = read_csr(dd,
+			CCE_INT_STATUS + (8*(IS_SDMA_START/64)))
+			& sde->imask;
+	if (likely(status)) {
+		/* clear the interrupt(s) */
+		write_csr(dd,
+			CCE_INT_CLEAR + (8*(IS_SDMA_START/64)),
+			status);
+
+		/* handle the interrupt(s) */
+		sdma_engine_interrupt(sde, status);
+	} else
+		dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
+			sde->this_idx);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * NOTE: this routine expects to be on its own MSI-X interrupt.  If
+ * multiple receive contexts share the same MSI-X interrupt, then this
+ * routine must check for who received it.
+ */
+static irqreturn_t receive_context_interrupt(int irq, void *data)
+{
+	struct hfi1_ctxtdata *rcd = data;
+	struct hfi1_devdata *dd = rcd->dd;
+
+	trace_hfi1_receive_interrupt(dd, rcd->ctxt);
+	this_cpu_inc(*dd->int_counter);
+
+	/* clear the interrupt */
+	write_csr(rcd->dd, CCE_INT_CLEAR + (8*rcd->ireg), rcd->imask);
+
+	/* handle the interrupt */
+	rcd->do_interrupt(rcd);
+
+	return IRQ_HANDLED;
+}
+
+/* ========================================================================= */
+
+u32 read_physical_state(struct hfi1_devdata *dd)
+{
+	u64 reg;
+
+	reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+	return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT)
+				& DC_DC8051_STS_CUR_STATE_PORT_MASK;
+}
+
+static u32 read_logical_state(struct hfi1_devdata *dd)
+{
+	u64 reg;
+
+	reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+	return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT)
+				& DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
+}
+
+static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+	u64 reg;
+
+	reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+	/* clear current state, set new state */
+	reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK;
+	reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
+	write_csr(dd, DCC_CFG_PORT_CONFIG, reg);
+}
+
+/*
+ * Use the 8051 to read a LCB CSR.
+ */
+static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+	u32 regno;
+	int ret;
+
+	if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+		if (acquire_lcb_access(dd, 0) == 0) {
+			*data = read_csr(dd, addr);
+			release_lcb_access(dd, 0);
+			return 0;
+		}
+		return -EBUSY;
+	}
+
+	/* register is an index of LCB registers: (offset - base) / 8 */
+	regno = (addr - DC_LCB_CFG_RUN) >> 3;
+	ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data);
+	if (ret != HCMD_SUCCESS)
+		return -EBUSY;
+	return 0;
+}
+
+/*
+ * Read an LCB CSR.  Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+	struct hfi1_pportdata *ppd = dd->pport;
+
+	/* if up, go through the 8051 for the value */
+	if (ppd->host_link_state & HLS_UP)
+		return read_lcb_via_8051(dd, addr, data);
+	/* if going up or down, no access */
+	if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+		return -EBUSY;
+	/* otherwise, host has access */
+	*data = read_csr(dd, addr);
+	return 0;
+}
+
+/*
+ * Use the 8051 to write a LCB CSR.
+ */
+static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+
+	if (acquire_lcb_access(dd, 0) == 0) {
+		write_csr(dd, addr, data);
+		release_lcb_access(dd, 0);
+		return 0;
+	}
+	return -EBUSY;
+}
+
+/*
+ * Write an LCB CSR.  Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+	struct hfi1_pportdata *ppd = dd->pport;
+
+	/* if up, go through the 8051 for the value */
+	if (ppd->host_link_state & HLS_UP)
+		return write_lcb_via_8051(dd, addr, data);
+	/* if going up or down, no access */
+	if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+		return -EBUSY;
+	/* otherwise, host has access */
+	write_csr(dd, addr, data);
+	return 0;
+}
+
+/*
+ * Returns:
+ *	< 0 = Linux error, not able to get access
+ *	> 0 = 8051 command RETURN_CODE
+ */
+static int do_8051_command(
+	struct hfi1_devdata *dd,
+	u32 type,
+	u64 in_data,
+	u64 *out_data)
+{
+	u64 reg, completed;
+	int return_code;
+	unsigned long flags;
+	unsigned long timeout;
+
+	hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
+
+	/*
+	 * Alternative to holding the lock for a long time:
+	 * - keep busy wait - have other users bounce off
+	 */
+	spin_lock_irqsave(&dd->dc8051_lock, flags);
+
+	/* We can't send any commands to the 8051 if it's in reset */
+	if (dd->dc_shutdown) {
+		return_code = -ENODEV;
+		goto fail;
+	}
+
+	/*
+	 * If an 8051 host command timed out previously, then the 8051 is
+	 * stuck.
+	 *
+	 * On first timeout, attempt to reset and restart the entire DC
+	 * block (including 8051). (Is this too big of a hammer?)
+	 *
+	 * If the 8051 times out a second time, the reset did not bring it
+	 * back to healthy life. In that case, fail any subsequent commands.
+	 */
+	if (dd->dc8051_timed_out) {
+		if (dd->dc8051_timed_out > 1) {
+			dd_dev_err(dd,
+				   "Previous 8051 host command timed out, skipping command %u\n",
+				   type);
+			return_code = -ENXIO;
+			goto fail;
+		}
+		spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+		dc_shutdown(dd);
+		dc_start(dd);
+		spin_lock_irqsave(&dd->dc8051_lock, flags);
+	}
+
+	/*
+	 * If there is no timeout, then the 8051 command interface is
+	 * waiting for a command.
+	 */
+
+	/*
+	 * Do two writes: the first to stabilize the type and req_data, the
+	 * second to activate.
+	 */
+	reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
+			<< DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
+		| (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
+			<< DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
+	write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+	reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
+	write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+
+	/* wait for completion, alternate: interrupt */
+	timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
+	while (1) {
+		reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
+		completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
+		if (completed)
+			break;
+		if (time_after(jiffies, timeout)) {
+			dd->dc8051_timed_out++;
+			dd_dev_err(dd, "8051 host command %u timeout\n", type);
+			if (out_data)
+				*out_data = 0;
+			return_code = -ETIMEDOUT;
+			goto fail;
+		}
+		udelay(2);
+	}
+
+	if (out_data) {
+		*out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
+				& DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
+		if (type == HCMD_READ_LCB_CSR) {
+			/* top 16 bits are in a different register */
+			*out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
+				& DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
+				<< (48
+				    - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
+		}
+	}
+	return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
+				& DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
+	dd->dc8051_timed_out = 0;
+	/*
+	 * Clear command for next user.
+	 */
+	write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
+
+fail:
+	spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+
+	return return_code;
+}
+
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
+{
+	return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
+}
+
+static int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+			    u8 lane_id, u32 config_data)
+{
+	u64 data;
+	int ret;
+
+	data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
+		| (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
+		| (u64)config_data << LOAD_DATA_DATA_SHIFT;
+	ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd,
+			"load 8051 config: field id %d, lane %d, err %d\n",
+			(int)field_id, (int)lane_id, ret);
+	}
+	return ret;
+}
+
+/*
+ * Read the 8051 firmware "registers".  Use the RAM directly.  Always
+ * set the result, even on error.
+ * Return 0 on success, -errno on failure
+ */
+static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
+			    u32 *result)
+{
+	u64 big_data;
+	u32 addr;
+	int ret;
+
+	/* address start depends on the lane_id */
+	if (lane_id < 4)
+		addr = (4 * NUM_GENERAL_FIELDS)
+			+ (lane_id * 4 * NUM_LANE_FIELDS);
+	else
+		addr = 0;
+	addr += field_id * 4;
+
+	/* read is in 8-byte chunks, hardware will truncate the address down */
+	ret = read_8051_data(dd, addr, 8, &big_data);
+
+	if (ret == 0) {
+		/* extract the 4 bytes we want */
+		if (addr & 0x4)
+			*result = (u32)(big_data >> 32);
+		else
+			*result = (u32)big_data;
+	} else {
+		*result = 0;
+		dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
+			__func__, lane_id, field_id);
+	}
+
+	return ret;
+}
+
+static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
+			      u8 continuous)
+{
+	u32 frame;
+
+	frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
+		| power_management << POWER_MANAGEMENT_SHIFT;
+	return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
+				GENERAL_CONFIG, frame);
+}
+
+static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
+				 u16 vl15buf, u8 crc_sizes)
+{
+	u32 frame;
+
+	frame = (u32)vau << VAU_SHIFT
+		| (u32)z << Z_SHIFT
+		| (u32)vcu << VCU_SHIFT
+		| (u32)vl15buf << VL15BUF_SHIFT
+		| (u32)crc_sizes << CRC_SIZES_SHIFT;
+	return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
+				GENERAL_CONFIG, frame);
+}
+
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+				     u8 *flag_bits, u16 *link_widths)
+{
+	u32 frame;
+
+	read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+				&frame);
+	*misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
+	*flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
+	*link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static int write_vc_local_link_width(struct hfi1_devdata *dd,
+				     u8 misc_bits,
+				     u8 flag_bits,
+				     u16 link_widths)
+{
+	u32 frame;
+
+	frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT
+		| (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT
+		| (u32)link_widths << LINK_WIDTH_SHIFT;
+	return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+		     frame);
+}
+
+static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
+				 u8 device_rev)
+{
+	u32 frame;
+
+	frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT)
+		| ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT);
+	return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame);
+}
+
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+				  u8 *device_rev)
+{
+	u32 frame;
+
+	read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame);
+	*device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
+	*device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT)
+			& REMOTE_DEVICE_REV_MASK;
+}
+
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
+{
+	u32 frame;
+
+	read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame);
+	*ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
+	*ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
+}
+
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+			       u8 *continuous)
+{
+	u32 frame;
+
+	read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame);
+	*power_management = (frame >> POWER_MANAGEMENT_SHIFT)
+					& POWER_MANAGEMENT_MASK;
+	*continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT)
+					& CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
+}
+
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+				  u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
+{
+	u32 frame;
+
+	read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame);
+	*vau = (frame >> VAU_SHIFT) & VAU_MASK;
+	*z = (frame >> Z_SHIFT) & Z_MASK;
+	*vcu = (frame >> VCU_SHIFT) & VCU_MASK;
+	*vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
+	*crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
+}
+
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+				      u8 *remote_tx_rate,
+				      u16 *link_widths)
+{
+	u32 frame;
+
+	read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
+				&frame);
+	*remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
+				& REMOTE_TX_RATE_MASK;
+	*link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
+{
+	u32 frame;
+
+	read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame);
+	*enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK;
+}
+
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
+{
+	u32 frame;
+
+	read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame);
+	*mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK;
+}
+
+static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
+{
+	read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls);
+}
+
+static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
+{
+	read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs);
+}
+
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
+{
+	u32 frame;
+	int ret;
+
+	*link_quality = 0;
+	if (dd->pport->host_link_state & HLS_UP) {
+		ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
+					&frame);
+		if (ret == 0)
+			*link_quality = (frame >> LINK_QUALITY_SHIFT)
+						& LINK_QUALITY_MASK;
+	}
+}
+
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
+{
+	u32 frame;
+
+	read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame);
+	*pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
+}
+
+static int read_tx_settings(struct hfi1_devdata *dd,
+			    u8 *enable_lane_tx,
+			    u8 *tx_polarity_inversion,
+			    u8 *rx_polarity_inversion,
+			    u8 *max_rate)
+{
+	u32 frame;
+	int ret;
+
+	ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame);
+	*enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT)
+				& ENABLE_LANE_TX_MASK;
+	*tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT)
+				& TX_POLARITY_INVERSION_MASK;
+	*rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT)
+				& RX_POLARITY_INVERSION_MASK;
+	*max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
+	return ret;
+}
+
+static int write_tx_settings(struct hfi1_devdata *dd,
+			     u8 enable_lane_tx,
+			     u8 tx_polarity_inversion,
+			     u8 rx_polarity_inversion,
+			     u8 max_rate)
+{
+	u32 frame;
+
+	/* no need to mask, all variable sizes match field widths */
+	frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
+		| tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
+		| rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
+		| max_rate << MAX_RATE_SHIFT;
+	return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
+}
+
+static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
+{
+	u32 frame, version, prod_id;
+	int ret, lane;
+
+	/* 4 lanes */
+	for (lane = 0; lane < 4; lane++) {
+		ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
+		if (ret) {
+			dd_dev_err(
+				dd,
+				"Unable to read lane %d firmware details\n",
+				lane);
+			continue;
+		}
+		version = (frame >> SPICO_ROM_VERSION_SHIFT)
+					& SPICO_ROM_VERSION_MASK;
+		prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
+					& SPICO_ROM_PROD_ID_MASK;
+		dd_dev_info(dd,
+			"Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
+			lane, version, prod_id);
+	}
+}
+
+/*
+ * Read an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
+{
+	int ret;
+
+	ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG,
+		type, data_out);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd, "read idle message: type %d, err %d\n",
+			(u32)type, ret);
+		return -EINVAL;
+	}
+	dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
+	/* return only the payload as we already know the type */
+	*data_out >>= IDLE_PAYLOAD_SHIFT;
+	return 0;
+}
+
+/*
+ * Read an idle SMA message.  To be done in response to a notification from
+ * the 8051.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
+{
+	return read_idle_message(dd,
+			(u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, data);
+}
+
+/*
+ * Send an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int send_idle_message(struct hfi1_devdata *dd, u64 data)
+{
+	int ret;
+
+	dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
+	ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
+			data, ret);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Send an idle SMA message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+int send_idle_sma(struct hfi1_devdata *dd, u64 message)
+{
+	u64 data;
+
+	data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT)
+		| ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
+	return send_idle_message(dd, data);
+}
+
+/*
+ * Initialize the LCB then do a quick link up.  This may or may not be
+ * in loopback.
+ *
+ * return 0 on success, -errno on error
+ */
+static int do_quick_linkup(struct hfi1_devdata *dd)
+{
+	u64 reg;
+	unsigned long timeout;
+	int ret;
+
+	lcb_shutdown(dd, 0);
+
+	if (loopback) {
+		/* LCB_CFG_LOOPBACK.VAL = 2 */
+		/* LCB_CFG_LANE_WIDTH.VAL = 0 */
+		write_csr(dd, DC_LCB_CFG_LOOPBACK,
+			IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
+		write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
+	}
+
+	/* start the LCBs */
+	/* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
+	write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+	/* simulator only loopback steps */
+	if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+		/* LCB_CFG_RUN.EN = 1 */
+		write_csr(dd, DC_LCB_CFG_RUN,
+			1ull << DC_LCB_CFG_RUN_EN_SHIFT);
+
+		/* watch LCB_STS_LINK_TRANSFER_ACTIVE */
+		timeout = jiffies + msecs_to_jiffies(10);
+		while (1) {
+			reg = read_csr(dd,
+				DC_LCB_STS_LINK_TRANSFER_ACTIVE);
+			if (reg)
+				break;
+			if (time_after(jiffies, timeout)) {
+				dd_dev_err(dd,
+					"timeout waiting for LINK_TRANSFER_ACTIVE\n");
+				return -ETIMEDOUT;
+			}
+			udelay(2);
+		}
+
+		write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
+			1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
+	}
+
+	if (!loopback) {
+		/*
+		 * When doing quick linkup and not in loopback, both
+		 * sides must be done with LCB set-up before either
+		 * starts the quick linkup.  Put a delay here so that
+		 * both sides can be started and have a chance to be
+		 * done with LCB set up before resuming.
+		 */
+		dd_dev_err(dd,
+			"Pausing for peer to be finished with LCB set up\n");
+		msleep(5000);
+		dd_dev_err(dd,
+			"Continuing with quick linkup\n");
+	}
+
+	write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+	set_8051_lcb_access(dd);
+
+	/*
+	 * State "quick" LinkUp request sets the physical link state to
+	 * LinkUp without a verify capability sequence.
+	 * This state is in simulator v37 and later.
+	 */
+	ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd,
+			"%s: set physical link state to quick LinkUp failed with return %d\n",
+			__func__, ret);
+
+		set_host_lcb_access(dd);
+		write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+
+		if (ret >= 0)
+			ret = -EINVAL;
+		return ret;
+	}
+
+	return 0; /* success */
+}
+
+/*
+ * Set the SerDes to internal loopback mode.
+ * Returns 0 on success, -errno on error.
+ */
+static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
+{
+	int ret;
+
+	ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
+	if (ret == HCMD_SUCCESS)
+		return 0;
+	dd_dev_err(dd,
+		"Set physical link state to SerDes Loopback failed with return %d\n",
+		ret);
+	if (ret >= 0)
+		ret = -EINVAL;
+	return ret;
+}
+
+/*
+ * Do all special steps to set up loopback.
+ */
+static int init_loopback(struct hfi1_devdata *dd)
+{
+	dd_dev_info(dd, "Entering loopback mode\n");
+
+	/* all loopbacks should disable self GUID check */
+	write_csr(dd, DC_DC8051_CFG_MODE,
+		(read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
+
+	/*
+	 * The simulator has only one loopback option - LCB.  Switch
+	 * to that option, which includes quick link up.
+	 *
+	 * Accept all valid loopback values.
+	 */
+	if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+		&& (loopback == LOOPBACK_SERDES
+			|| loopback == LOOPBACK_LCB
+			|| loopback == LOOPBACK_CABLE)) {
+		loopback = LOOPBACK_LCB;
+		quick_linkup = 1;
+		return 0;
+	}
+
+	/* handle serdes loopback */
+	if (loopback == LOOPBACK_SERDES) {
+		/* internal serdes loopack needs quick linkup on RTL */
+		if (dd->icode == ICODE_RTL_SILICON)
+			quick_linkup = 1;
+		return set_serdes_loopback_mode(dd);
+	}
+
+	/* LCB loopback - handled at poll time */
+	if (loopback == LOOPBACK_LCB) {
+		quick_linkup = 1; /* LCB is always quick linkup */
+
+		/* not supported in emulation due to emulation RTL changes */
+		if (dd->icode == ICODE_FPGA_EMULATION) {
+			dd_dev_err(dd,
+				"LCB loopback not supported in emulation\n");
+			return -EINVAL;
+		}
+		return 0;
+	}
+
+	/* external cable loopback requires no extra steps */
+	if (loopback == LOOPBACK_CABLE)
+		return 0;
+
+	dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
+	return -EINVAL;
+}
+
+/*
+ * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
+ * used in the Verify Capability link width attribute.
+ */
+static u16 opa_to_vc_link_widths(u16 opa_widths)
+{
+	int i;
+	u16 result = 0;
+
+	static const struct link_bits {
+		u16 from;
+		u16 to;
+	} opa_link_xlate[] = {
+		{ OPA_LINK_WIDTH_1X, 1 << (1-1)  },
+		{ OPA_LINK_WIDTH_2X, 1 << (2-1)  },
+		{ OPA_LINK_WIDTH_3X, 1 << (3-1)  },
+		{ OPA_LINK_WIDTH_4X, 1 << (4-1)  },
+	};
+
+	for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
+		if (opa_widths & opa_link_xlate[i].from)
+			result |= opa_link_xlate[i].to;
+	}
+	return result;
+}
+
+/*
+ * Set link attributes before moving to polling.
+ */
+static int set_local_link_attributes(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u8 enable_lane_tx;
+	u8 tx_polarity_inversion;
+	u8 rx_polarity_inversion;
+	int ret;
+
+	/* reset our fabric serdes to clear any lingering problems */
+	fabric_serdes_reset(dd);
+
+	/* set the local tx rate - need to read-modify-write */
+	ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+		&rx_polarity_inversion, &ppd->local_tx_rate);
+	if (ret)
+		goto set_local_link_attributes_fail;
+
+	if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+		/* set the tx rate to the fastest enabled */
+		if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+			ppd->local_tx_rate = 1;
+		else
+			ppd->local_tx_rate = 0;
+	} else {
+		/* set the tx rate to all enabled */
+		ppd->local_tx_rate = 0;
+		if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+			ppd->local_tx_rate |= 2;
+		if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
+			ppd->local_tx_rate |= 1;
+	}
+	ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
+		     rx_polarity_inversion, ppd->local_tx_rate);
+	if (ret != HCMD_SUCCESS)
+		goto set_local_link_attributes_fail;
+
+	/*
+	 * DC supports continuous updates.
+	 */
+	ret = write_vc_local_phy(dd, 0 /* no power management */,
+				     1 /* continuous updates */);
+	if (ret != HCMD_SUCCESS)
+		goto set_local_link_attributes_fail;
+
+	/* z=1 in the next call: AU of 0 is not supported by the hardware */
+	ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
+				    ppd->port_crc_mode_enabled);
+	if (ret != HCMD_SUCCESS)
+		goto set_local_link_attributes_fail;
+
+	ret = write_vc_local_link_width(dd, 0, 0,
+		     opa_to_vc_link_widths(ppd->link_width_enabled));
+	if (ret != HCMD_SUCCESS)
+		goto set_local_link_attributes_fail;
+
+	/* let peer know who we are */
+	ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
+	if (ret == HCMD_SUCCESS)
+		return 0;
+
+set_local_link_attributes_fail:
+	dd_dev_err(dd,
+		"Failed to set local link attributes, return 0x%x\n",
+		ret);
+	return ret;
+}
+
+/*
+ * Call this to start the link.  Schedule a retry if the cable is not
+ * present or if unable to start polling.  Do not do anything if the
+ * link is disabled.  Returns 0 if link is disabled or moved to polling
+ */
+int start_link(struct hfi1_pportdata *ppd)
+{
+	if (!ppd->link_enabled) {
+		dd_dev_info(ppd->dd,
+			"%s: stopping link start because link is disabled\n",
+			__func__);
+		return 0;
+	}
+	if (!ppd->driver_link_ready) {
+		dd_dev_info(ppd->dd,
+			"%s: stopping link start because driver is not ready\n",
+			__func__);
+		return 0;
+	}
+
+	if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
+			loopback == LOOPBACK_LCB ||
+			ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+		return set_link_state(ppd, HLS_DN_POLL);
+
+	dd_dev_info(ppd->dd,
+		"%s: stopping link start because no cable is present\n",
+		__func__);
+	return -EAGAIN;
+}
+
+static void reset_qsfp(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u64 mask, qsfp_mask;
+
+	mask = (u64)QSFP_HFI0_RESET_N;
+	qsfp_mask = read_csr(dd,
+		dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
+	qsfp_mask |= mask;
+	write_csr(dd,
+		dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE,
+		qsfp_mask);
+
+	qsfp_mask = read_csr(dd,
+		dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
+	qsfp_mask &= ~mask;
+	write_csr(dd,
+		dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
+		qsfp_mask);
+
+	udelay(10);
+
+	qsfp_mask |= mask;
+	write_csr(dd,
+		dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
+		qsfp_mask);
+}
+
+static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
+					u8 *qsfp_interrupt_status)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+
+	if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
+		(qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
+		dd_dev_info(dd,
+			"%s: QSFP cable on fire\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
+		(qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
+		dd_dev_info(dd,
+			"%s: QSFP cable temperature too low\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
+		(qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
+		dd_dev_info(dd,
+			"%s: QSFP supply voltage too high\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
+		(qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
+		dd_dev_info(dd,
+			"%s: QSFP supply voltage too low\n",
+			__func__);
+
+	/* Byte 2 is vendor specific */
+
+	if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
+		(qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable RX channel 1/2 power too high\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
+		(qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable RX channel 1/2 power too low\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
+		(qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable RX channel 3/4 power too high\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
+		(qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable RX channel 3/4 power too low\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
+		(qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable TX channel 1/2 bias too high\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
+		(qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable TX channel 1/2 bias too low\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
+		(qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable TX channel 3/4 bias too high\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
+		(qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable TX channel 3/4 bias too low\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
+		(qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable TX channel 1/2 power too high\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
+		(qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable TX channel 1/2 power too low\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
+		(qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable TX channel 3/4 power too high\n",
+			__func__);
+
+	if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
+		(qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
+		dd_dev_info(dd,
+			"%s: Cable TX channel 3/4 power too low\n",
+			__func__);
+
+	/* Bytes 9-10 and 11-12 are reserved */
+	/* Bytes 13-15 are vendor specific */
+
+	return 0;
+}
+
+static int do_pre_lni_host_behaviors(struct hfi1_pportdata *ppd)
+{
+	refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+
+	return 0;
+}
+
+static int do_qsfp_intr_fallback(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u8 qsfp_interrupt_status = 0;
+
+	if (qsfp_read(ppd, dd->hfi1_id, 2, &qsfp_interrupt_status, 1)
+		!= 1) {
+		dd_dev_info(dd,
+			"%s: Failed to read status of QSFP module\n",
+			__func__);
+		return -EIO;
+	}
+
+	/* We don't care about alarms & warnings with a non-functional INT_N */
+	if (!(qsfp_interrupt_status & QSFP_DATA_NOT_READY))
+		do_pre_lni_host_behaviors(ppd);
+
+	return 0;
+}
+
+/* This routine will only be scheduled if the QSFP module is present */
+static void qsfp_event(struct work_struct *work)
+{
+	struct qsfp_data *qd;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_devdata *dd;
+
+	qd = container_of(work, struct qsfp_data, qsfp_work);
+	ppd = qd->ppd;
+	dd = ppd->dd;
+
+	/* Sanity check */
+	if (!qsfp_mod_present(ppd))
+		return;
+
+	/*
+	 * Turn DC back on after cables has been
+	 * re-inserted. Up until now, the DC has been in
+	 * reset to save power.
+	 */
+	dc_start(dd);
+
+	if (qd->cache_refresh_required) {
+		msleep(3000);
+		reset_qsfp(ppd);
+
+		/* Check for QSFP interrupt after t_init (SFF 8679)
+		 * + extra
+		 */
+		msleep(3000);
+		if (!qd->qsfp_interrupt_functional) {
+			if (do_qsfp_intr_fallback(ppd) < 0)
+				dd_dev_info(dd, "%s: QSFP fallback failed\n",
+					__func__);
+			ppd->driver_link_ready = 1;
+			start_link(ppd);
+		}
+	}
+
+	if (qd->check_interrupt_flags) {
+		u8 qsfp_interrupt_status[16] = {0,};
+
+		if (qsfp_read(ppd, dd->hfi1_id, 6,
+			      &qsfp_interrupt_status[0], 16) != 16) {
+			dd_dev_info(dd,
+				"%s: Failed to read status of QSFP module\n",
+				__func__);
+		} else {
+			unsigned long flags;
+			u8 data_status;
+
+			spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+			ppd->qsfp_info.check_interrupt_flags = 0;
+			spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+								flags);
+
+			if (qsfp_read(ppd, dd->hfi1_id, 2, &data_status, 1)
+				 != 1) {
+				dd_dev_info(dd,
+				"%s: Failed to read status of QSFP module\n",
+					__func__);
+			}
+			if (!(data_status & QSFP_DATA_NOT_READY)) {
+				do_pre_lni_host_behaviors(ppd);
+				start_link(ppd);
+			} else
+				handle_qsfp_error_conditions(ppd,
+						qsfp_interrupt_status);
+		}
+	}
+}
+
+void init_qsfp(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u64 qsfp_mask;
+
+	if (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
+			ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
+			!HFI1_CAP_IS_KSET(QSFP_ENABLED)) {
+		ppd->driver_link_ready = 1;
+		return;
+	}
+
+	ppd->qsfp_info.ppd = ppd;
+	INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
+
+	qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+	/* Clear current status to avoid spurious interrupts */
+	write_csr(dd,
+			dd->hfi1_id ?
+				ASIC_QSFP2_CLEAR :
+				ASIC_QSFP1_CLEAR,
+		qsfp_mask);
+
+	/* Handle active low nature of INT_N and MODPRST_N pins */
+	if (qsfp_mod_present(ppd))
+		qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
+	write_csr(dd,
+		  dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
+		  qsfp_mask);
+
+	/* Allow only INT_N and MODPRST_N to trigger QSFP interrupts */
+	qsfp_mask |= (u64)QSFP_HFI0_MODPRST_N;
+	write_csr(dd,
+		dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+		qsfp_mask);
+
+	if (qsfp_mod_present(ppd)) {
+		msleep(3000);
+		reset_qsfp(ppd);
+
+		/* Check for QSFP interrupt after t_init (SFF 8679)
+		 * + extra
+		 */
+		msleep(3000);
+		if (!ppd->qsfp_info.qsfp_interrupt_functional) {
+			if (do_qsfp_intr_fallback(ppd) < 0)
+				dd_dev_info(dd,
+					"%s: QSFP fallback failed\n",
+					__func__);
+			ppd->driver_link_ready = 1;
+		}
+	}
+}
+
+int bringup_serdes(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u64 guid;
+	int ret;
+
+	if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
+		add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
+
+	guid = ppd->guid;
+	if (!guid) {
+		if (dd->base_guid)
+			guid = dd->base_guid + ppd->port - 1;
+		ppd->guid = guid;
+	}
+
+	/* the link defaults to enabled */
+	ppd->link_enabled = 1;
+	/* Set linkinit_reason on power up per OPA spec */
+	ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
+
+	if (loopback) {
+		ret = init_loopback(dd);
+		if (ret < 0)
+			return ret;
+	}
+
+	return start_link(ppd);
+}
+
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+
+	/*
+	 * Shut down the link and keep it down.   First turn off that the
+	 * driver wants to allow the link to be up (driver_link_ready).
+	 * Then make sure the link is not automatically restarted
+	 * (link_enabled).  Cancel any pending restart.  And finally
+	 * go offline.
+	 */
+	ppd->driver_link_ready = 0;
+	ppd->link_enabled = 0;
+
+	set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
+	  OPA_LINKDOWN_REASON_SMA_DISABLED);
+	set_link_state(ppd, HLS_DN_OFFLINE);
+
+	/* disable the port */
+	clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+static inline int init_cpu_counters(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd;
+	int i;
+
+	ppd = (struct hfi1_pportdata *)(dd + 1);
+	for (i = 0; i < dd->num_pports; i++, ppd++) {
+		ppd->ibport_data.rc_acks = NULL;
+		ppd->ibport_data.rc_qacks = NULL;
+		ppd->ibport_data.rc_acks = alloc_percpu(u64);
+		ppd->ibport_data.rc_qacks = alloc_percpu(u64);
+		ppd->ibport_data.rc_delayed_comp = alloc_percpu(u64);
+		if ((ppd->ibport_data.rc_acks == NULL) ||
+		    (ppd->ibport_data.rc_delayed_comp == NULL) ||
+		    (ppd->ibport_data.rc_qacks == NULL))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static const char * const pt_names[] = {
+	"expected",
+	"eager",
+	"invalid"
+};
+
+static const char *pt_name(u32 type)
+{
+	return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
+}
+
+/*
+ * index is the index into the receive array
+ */
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+		  u32 type, unsigned long pa, u16 order)
+{
+	u64 reg;
+	void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
+			      (dd->kregbase + RCV_ARRAY));
+
+	if (!(dd->flags & HFI1_PRESENT))
+		goto done;
+
+	if (type == PT_INVALID) {
+		pa = 0;
+	} else if (type > PT_INVALID) {
+		dd_dev_err(dd,
+			"unexpected receive array type %u for index %u, not handled\n",
+			type, index);
+		goto done;
+	}
+
+	hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
+		  pt_name(type), index, pa, (unsigned long)order);
+
+#define RT_ADDR_SHIFT 12	/* 4KB kernel address boundary */
+	reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
+		| (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
+		| ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
+					<< RCV_ARRAY_RT_ADDR_SHIFT;
+	writeq(reg, base + (index * 8));
+
+	if (type == PT_EAGER)
+		/*
+		 * Eager entries are written one-by-one so we have to push them
+		 * after we write the entry.
+		 */
+		flush_wc();
+done:
+	return;
+}
+
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_devdata *dd = rcd->dd;
+	u32 i;
+
+	/* this could be optimized */
+	for (i = rcd->eager_base; i < rcd->eager_base +
+		     rcd->egrbufs.alloced; i++)
+		hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+
+	for (i = rcd->expected_base;
+			i < rcd->expected_base + rcd->expected_count; i++)
+		hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+}
+
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+			struct hfi1_ctxt_info *kinfo)
+{
+	kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
+		HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
+	return 0;
+}
+
+struct hfi1_message_header *hfi1_get_msgheader(
+				struct hfi1_devdata *dd, __le32 *rhf_addr)
+{
+	u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
+
+	return (struct hfi1_message_header *)
+		(rhf_addr - dd->rhf_offset + offset);
+}
+
+static const char * const ib_cfg_name_strings[] = {
+	"HFI1_IB_CFG_LIDLMC",
+	"HFI1_IB_CFG_LWID_DG_ENB",
+	"HFI1_IB_CFG_LWID_ENB",
+	"HFI1_IB_CFG_LWID",
+	"HFI1_IB_CFG_SPD_ENB",
+	"HFI1_IB_CFG_SPD",
+	"HFI1_IB_CFG_RXPOL_ENB",
+	"HFI1_IB_CFG_LREV_ENB",
+	"HFI1_IB_CFG_LINKLATENCY",
+	"HFI1_IB_CFG_HRTBT",
+	"HFI1_IB_CFG_OP_VLS",
+	"HFI1_IB_CFG_VL_HIGH_CAP",
+	"HFI1_IB_CFG_VL_LOW_CAP",
+	"HFI1_IB_CFG_OVERRUN_THRESH",
+	"HFI1_IB_CFG_PHYERR_THRESH",
+	"HFI1_IB_CFG_LINKDEFAULT",
+	"HFI1_IB_CFG_PKEYS",
+	"HFI1_IB_CFG_MTU",
+	"HFI1_IB_CFG_LSTATE",
+	"HFI1_IB_CFG_VL_HIGH_LIMIT",
+	"HFI1_IB_CFG_PMA_TICKS",
+	"HFI1_IB_CFG_PORT"
+};
+
+static const char *ib_cfg_name(int which)
+{
+	if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings))
+		return "invalid";
+	return ib_cfg_name_strings[which];
+}
+
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	int val = 0;
+
+	switch (which) {
+	case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
+		val = ppd->link_width_enabled;
+		break;
+	case HFI1_IB_CFG_LWID: /* currently active Link-width */
+		val = ppd->link_width_active;
+		break;
+	case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+		val = ppd->link_speed_enabled;
+		break;
+	case HFI1_IB_CFG_SPD: /* current Link speed */
+		val = ppd->link_speed_active;
+		break;
+
+	case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
+	case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
+	case HFI1_IB_CFG_LINKLATENCY:
+		goto unimplemented;
+
+	case HFI1_IB_CFG_OP_VLS:
+		val = ppd->vls_operational;
+		break;
+	case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
+		val = VL_ARB_HIGH_PRIO_TABLE_SIZE;
+		break;
+	case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
+		val = VL_ARB_LOW_PRIO_TABLE_SIZE;
+		break;
+	case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+		val = ppd->overrun_threshold;
+		break;
+	case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+		val = ppd->phy_error_threshold;
+		break;
+	case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+		val = dd->link_default;
+		break;
+
+	case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
+	case HFI1_IB_CFG_PMA_TICKS:
+	default:
+unimplemented:
+		if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+			dd_dev_info(
+				dd,
+				"%s: which %s: not implemented\n",
+				__func__,
+				ib_cfg_name(which));
+		break;
+	}
+
+	return val;
+}
+
+/*
+ * The largest MAD packet size.
+ */
+#define MAX_MAD_PACKET 2048
+
+/*
+ * Return the maximum header bytes that can go on the _wire_
+ * for this device. This count includes the ICRC which is
+ * not part of the packet held in memory but it is appended
+ * by the HW.
+ * This is dependent on the device's receive header entry size.
+ * HFI allows this to be set per-receive context, but the
+ * driver presently enforces a global value.
+ */
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
+{
+	/*
+	 * The maximum non-payload (MTU) bytes in LRH.PktLen are
+	 * the Receive Header Entry Size minus the PBC (or RHF) size
+	 * plus one DW for the ICRC appended by HW.
+	 *
+	 * dd->rcd[0].rcvhdrqentsize is in DW.
+	 * We use rcd[0] as all context will have the same value. Also,
+	 * the first kernel context would have been allocated by now so
+	 * we are guaranteed a valid value.
+	 */
+	return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
+}
+
+/*
+ * Set Send Length
+ * @ppd - per port data
+ *
+ * Set the MTU by limiting how many DWs may be sent.  The SendLenCheck*
+ * registers compare against LRH.PktLen, so use the max bytes included
+ * in the LRH.
+ *
+ * This routine changes all VL values except VL15, which it maintains at
+ * the same value.
+ */
+static void set_send_length(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u32 max_hb = lrh_max_header_bytes(dd), maxvlmtu = 0, dcmtu;
+	u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
+			      & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
+		SEND_LEN_CHECK1_LEN_VL15_SHIFT;
+	int i;
+
+	for (i = 0; i < ppd->vls_supported; i++) {
+		if (dd->vld[i].mtu > maxvlmtu)
+			maxvlmtu = dd->vld[i].mtu;
+		if (i <= 3)
+			len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
+				 & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
+				((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
+		else
+			len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
+				 & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
+				((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
+	}
+	write_csr(dd, SEND_LEN_CHECK0, len1);
+	write_csr(dd, SEND_LEN_CHECK1, len2);
+	/* adjust kernel credit return thresholds based on new MTUs */
+	/* all kernel receive contexts have the same hdrqentsize */
+	for (i = 0; i < ppd->vls_supported; i++) {
+		sc_set_cr_threshold(dd->vld[i].sc,
+			sc_mtu_to_threshold(dd->vld[i].sc, dd->vld[i].mtu,
+				dd->rcd[0]->rcvhdrqentsize));
+	}
+	sc_set_cr_threshold(dd->vld[15].sc,
+		sc_mtu_to_threshold(dd->vld[15].sc, dd->vld[15].mtu,
+			dd->rcd[0]->rcvhdrqentsize));
+
+	/* Adjust maximum MTU for the port in DC */
+	dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
+		(ilog2(maxvlmtu >> 8) + 1);
+	len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
+	len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
+	len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
+		DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
+	write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
+}
+
+static void set_lidlmc(struct hfi1_pportdata *ppd)
+{
+	int i;
+	u64 sreg = 0;
+	struct hfi1_devdata *dd = ppd->dd;
+	u32 mask = ~((1U << ppd->lmc) - 1);
+	u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
+
+	if (dd->hfi1_snoop.mode_flag)
+		dd_dev_info(dd, "Set lid/lmc while snooping");
+
+	c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
+		| DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
+	c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
+			<< DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT)|
+	      ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
+			<< DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
+	write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
+
+	/*
+	 * Iterate over all the send contexts and set their SLID check
+	 */
+	sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
+			SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
+	       (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
+			SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
+
+	for (i = 0; i < dd->chip_send_contexts; i++) {
+		hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
+			  i, (u32)sreg);
+		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
+	}
+
+	/* Now we have to do the same thing for the sdma engines */
+	sdma_update_lmc(dd, mask, ppd->lid);
+}
+
+static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
+{
+	unsigned long timeout;
+	u32 curr_state;
+
+	timeout = jiffies + msecs_to_jiffies(msecs);
+	while (1) {
+		curr_state = read_physical_state(dd);
+		if (curr_state == state)
+			break;
+		if (time_after(jiffies, timeout)) {
+			dd_dev_err(dd,
+				"timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+				state, curr_state);
+			return -ETIMEDOUT;
+		}
+		usleep_range(1950, 2050); /* sleep 2ms-ish */
+	}
+
+	return 0;
+}
+
+/*
+ * Helper for set_link_state().  Do not call except from that routine.
+ * Expects ppd->hls_mutex to be held.
+ *
+ * @rem_reason value to be sent to the neighbor
+ *
+ * LinkDownReasons only set if transition succeeds.
+ */
+static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u32 pstate, previous_state;
+	u32 last_local_state;
+	u32 last_remote_state;
+	int ret;
+	int do_transition;
+	int do_wait;
+
+	previous_state = ppd->host_link_state;
+	ppd->host_link_state = HLS_GOING_OFFLINE;
+	pstate = read_physical_state(dd);
+	if (pstate == PLS_OFFLINE) {
+		do_transition = 0;	/* in right state */
+		do_wait = 0;		/* ...no need to wait */
+	} else if ((pstate & 0xff) == PLS_OFFLINE) {
+		do_transition = 0;	/* in an offline transient state */
+		do_wait = 1;		/* ...wait for it to settle */
+	} else {
+		do_transition = 1;	/* need to move to offline */
+		do_wait = 1;		/* ...will need to wait */
+	}
+
+	if (do_transition) {
+		ret = set_physical_link_state(dd,
+			PLS_OFFLINE | (rem_reason << 8));
+
+		if (ret != HCMD_SUCCESS) {
+			dd_dev_err(dd,
+				"Failed to transition to Offline link state, return %d\n",
+				ret);
+			return -EINVAL;
+		}
+		if (ppd->offline_disabled_reason == OPA_LINKDOWN_REASON_NONE)
+			ppd->offline_disabled_reason =
+			OPA_LINKDOWN_REASON_TRANSIENT;
+	}
+
+	if (do_wait) {
+		/* it can take a while for the link to go down */
+		ret = wait_phy_linkstate(dd, PLS_OFFLINE, 5000);
+		if (ret < 0)
+			return ret;
+	}
+
+	/* make sure the logical state is also down */
+	wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
+
+	/*
+	 * Now in charge of LCB - must be after the physical state is
+	 * offline.quiet and before host_link_state is changed.
+	 */
+	set_host_lcb_access(dd);
+	write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+	ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
+
+	/*
+	 * The LNI has a mandatory wait time after the physical state
+	 * moves to Offline.Quiet.  The wait time may be different
+	 * depending on how the link went down.  The 8051 firmware
+	 * will observe the needed wait time and only move to ready
+	 * when that is completed.  The largest of the quiet timeouts
+	 * is 2.5s, so wait that long and then a bit more.
+	 */
+	ret = wait_fm_ready(dd, 3000);
+	if (ret) {
+		dd_dev_err(dd,
+			"After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
+		/* state is really offline, so make it so */
+		ppd->host_link_state = HLS_DN_OFFLINE;
+		return ret;
+	}
+
+	/*
+	 * The state is now offline and the 8051 is ready to accept host
+	 * requests.
+	 *	- change our state
+	 *	- notify others if we were previously in a linkup state
+	 */
+	ppd->host_link_state = HLS_DN_OFFLINE;
+	if (previous_state & HLS_UP) {
+		/* went down while link was up */
+		handle_linkup_change(dd, 0);
+	} else if (previous_state
+			& (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+		/* went down while attempting link up */
+		/* byte 1 of last_*_state is the failure reason */
+		read_last_local_state(dd, &last_local_state);
+		read_last_remote_state(dd, &last_remote_state);
+		dd_dev_err(dd,
+			"LNI failure last states: local 0x%08x, remote 0x%08x\n",
+			last_local_state, last_remote_state);
+	}
+
+	/* the active link width (downgrade) is 0 on link down */
+	ppd->link_width_active = 0;
+	ppd->link_width_downgrade_tx_active = 0;
+	ppd->link_width_downgrade_rx_active = 0;
+	ppd->current_egress_rate = 0;
+	return 0;
+}
+
+/* return the link state name */
+static const char *link_state_name(u32 state)
+{
+	const char *name;
+	int n = ilog2(state);
+	static const char * const names[] = {
+		[__HLS_UP_INIT_BP]	 = "INIT",
+		[__HLS_UP_ARMED_BP]	 = "ARMED",
+		[__HLS_UP_ACTIVE_BP]	 = "ACTIVE",
+		[__HLS_DN_DOWNDEF_BP]	 = "DOWNDEF",
+		[__HLS_DN_POLL_BP]	 = "POLL",
+		[__HLS_DN_DISABLE_BP]	 = "DISABLE",
+		[__HLS_DN_OFFLINE_BP]	 = "OFFLINE",
+		[__HLS_VERIFY_CAP_BP]	 = "VERIFY_CAP",
+		[__HLS_GOING_UP_BP]	 = "GOING_UP",
+		[__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
+		[__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
+	};
+
+	name = n < ARRAY_SIZE(names) ? names[n] : NULL;
+	return name ? name : "unknown";
+}
+
+/* return the link state reason name */
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
+{
+	if (state == HLS_UP_INIT) {
+		switch (ppd->linkinit_reason) {
+		case OPA_LINKINIT_REASON_LINKUP:
+			return "(LINKUP)";
+		case OPA_LINKINIT_REASON_FLAPPING:
+			return "(FLAPPING)";
+		case OPA_LINKINIT_OUTSIDE_POLICY:
+			return "(OUTSIDE_POLICY)";
+		case OPA_LINKINIT_QUARANTINED:
+			return "(QUARANTINED)";
+		case OPA_LINKINIT_INSUFIC_CAPABILITY:
+			return "(INSUFIC_CAPABILITY)";
+		default:
+			break;
+		}
+	}
+	return "";
+}
+
+/*
+ * driver_physical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
+ * Return -1 (converted to a u32) to indicate error.
+ */
+u32 driver_physical_state(struct hfi1_pportdata *ppd)
+{
+	switch (ppd->host_link_state) {
+	case HLS_UP_INIT:
+	case HLS_UP_ARMED:
+	case HLS_UP_ACTIVE:
+		return IB_PORTPHYSSTATE_LINKUP;
+	case HLS_DN_POLL:
+		return IB_PORTPHYSSTATE_POLLING;
+	case HLS_DN_DISABLE:
+		return IB_PORTPHYSSTATE_DISABLED;
+	case HLS_DN_OFFLINE:
+		return OPA_PORTPHYSSTATE_OFFLINE;
+	case HLS_VERIFY_CAP:
+		return IB_PORTPHYSSTATE_POLLING;
+	case HLS_GOING_UP:
+		return IB_PORTPHYSSTATE_POLLING;
+	case HLS_GOING_OFFLINE:
+		return OPA_PORTPHYSSTATE_OFFLINE;
+	case HLS_LINK_COOLDOWN:
+		return OPA_PORTPHYSSTATE_OFFLINE;
+	case HLS_DN_DOWNDEF:
+	default:
+		dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+			   ppd->host_link_state);
+		return  -1;
+	}
+}
+
+/*
+ * driver_logical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
+ * (converted to a u32) to indicate error.
+ */
+u32 driver_logical_state(struct hfi1_pportdata *ppd)
+{
+	if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP))
+		return IB_PORT_DOWN;
+
+	switch (ppd->host_link_state & HLS_UP) {
+	case HLS_UP_INIT:
+		return IB_PORT_INIT;
+	case HLS_UP_ARMED:
+		return IB_PORT_ARMED;
+	case HLS_UP_ACTIVE:
+		return IB_PORT_ACTIVE;
+	default:
+		dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+			   ppd->host_link_state);
+	return -1;
+	}
+}
+
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+			  u8 neigh_reason, u8 rem_reason)
+{
+	if (ppd->local_link_down_reason.latest == 0 &&
+	    ppd->neigh_link_down_reason.latest == 0) {
+		ppd->local_link_down_reason.latest = lcl_reason;
+		ppd->neigh_link_down_reason.latest = neigh_reason;
+		ppd->remote_link_down_reason = rem_reason;
+	}
+}
+
+/*
+ * Change the physical and/or logical link state.
+ *
+ * Do not call this routine while inside an interrupt.  It contains
+ * calls to routines that can take multiple seconds to finish.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int set_link_state(struct hfi1_pportdata *ppd, u32 state)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	struct ib_event event = {.device = NULL};
+	int ret1, ret = 0;
+	int was_up, is_down;
+	int orig_new_state, poll_bounce;
+
+	mutex_lock(&ppd->hls_lock);
+
+	orig_new_state = state;
+	if (state == HLS_DN_DOWNDEF)
+		state = dd->link_default;
+
+	/* interpret poll -> poll as a link bounce */
+	poll_bounce = ppd->host_link_state == HLS_DN_POLL
+				&& state == HLS_DN_POLL;
+
+	dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
+		link_state_name(ppd->host_link_state),
+		link_state_name(orig_new_state),
+		poll_bounce ? "(bounce) " : "",
+		link_state_reason_name(ppd, state));
+
+	was_up = !!(ppd->host_link_state & HLS_UP);
+
+	/*
+	 * If we're going to a (HLS_*) link state that implies the logical
+	 * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
+	 * reset is_sm_config_started to 0.
+	 */
+	if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
+		ppd->is_sm_config_started = 0;
+
+	/*
+	 * Do nothing if the states match.  Let a poll to poll link bounce
+	 * go through.
+	 */
+	if (ppd->host_link_state == state && !poll_bounce)
+		goto done;
+
+	switch (state) {
+	case HLS_UP_INIT:
+		if (ppd->host_link_state == HLS_DN_POLL && (quick_linkup
+			    || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
+			/*
+			 * Quick link up jumps from polling to here.
+			 *
+			 * Whether in normal or loopback mode, the
+			 * simulator jumps from polling to link up.
+			 * Accept that here.
+			 */
+			/* OK */;
+		} else if (ppd->host_link_state != HLS_GOING_UP) {
+			goto unexpected;
+		}
+
+		ppd->host_link_state = HLS_UP_INIT;
+		ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
+		if (ret) {
+			/* logical state didn't change, stay at going_up */
+			ppd->host_link_state = HLS_GOING_UP;
+			dd_dev_err(dd,
+				"%s: logical state did not change to INIT\n",
+				__func__);
+		} else {
+			/* clear old transient LINKINIT_REASON code */
+			if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
+				ppd->linkinit_reason =
+					OPA_LINKINIT_REASON_LINKUP;
+
+			/* enable the port */
+			add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+			handle_linkup_change(dd, 1);
+		}
+		break;
+	case HLS_UP_ARMED:
+		if (ppd->host_link_state != HLS_UP_INIT)
+			goto unexpected;
+
+		ppd->host_link_state = HLS_UP_ARMED;
+		set_logical_state(dd, LSTATE_ARMED);
+		ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
+		if (ret) {
+			/* logical state didn't change, stay at init */
+			ppd->host_link_state = HLS_UP_INIT;
+			dd_dev_err(dd,
+				"%s: logical state did not change to ARMED\n",
+				__func__);
+		}
+		/*
+		 * The simulator does not currently implement SMA messages,
+		 * so neighbor_normal is not set.  Set it here when we first
+		 * move to Armed.
+		 */
+		if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+			ppd->neighbor_normal = 1;
+		break;
+	case HLS_UP_ACTIVE:
+		if (ppd->host_link_state != HLS_UP_ARMED)
+			goto unexpected;
+
+		ppd->host_link_state = HLS_UP_ACTIVE;
+		set_logical_state(dd, LSTATE_ACTIVE);
+		ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
+		if (ret) {
+			/* logical state didn't change, stay at armed */
+			ppd->host_link_state = HLS_UP_ARMED;
+			dd_dev_err(dd,
+				"%s: logical state did not change to ACTIVE\n",
+				__func__);
+		} else {
+
+			/* tell all engines to go running */
+			sdma_all_running(dd);
+
+			/* Signal the IB layer that the port has went active */
+			event.device = &dd->verbs_dev.ibdev;
+			event.element.port_num = ppd->port;
+			event.event = IB_EVENT_PORT_ACTIVE;
+		}
+		break;
+	case HLS_DN_POLL:
+		if ((ppd->host_link_state == HLS_DN_DISABLE ||
+		     ppd->host_link_state == HLS_DN_OFFLINE) &&
+		    dd->dc_shutdown)
+			dc_start(dd);
+		/* Hand LED control to the DC */
+		write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+
+		if (ppd->host_link_state != HLS_DN_OFFLINE) {
+			u8 tmp = ppd->link_enabled;
+
+			ret = goto_offline(ppd, ppd->remote_link_down_reason);
+			if (ret) {
+				ppd->link_enabled = tmp;
+				break;
+			}
+			ppd->remote_link_down_reason = 0;
+
+			if (ppd->driver_link_ready)
+				ppd->link_enabled = 1;
+		}
+
+		ret = set_local_link_attributes(ppd);
+		if (ret)
+			break;
+
+		ppd->port_error_action = 0;
+		ppd->host_link_state = HLS_DN_POLL;
+
+		if (quick_linkup) {
+			/* quick linkup does not go into polling */
+			ret = do_quick_linkup(dd);
+		} else {
+			ret1 = set_physical_link_state(dd, PLS_POLLING);
+			if (ret1 != HCMD_SUCCESS) {
+				dd_dev_err(dd,
+					"Failed to transition to Polling link state, return 0x%x\n",
+					ret1);
+				ret = -EINVAL;
+			}
+		}
+		ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
+		/*
+		 * If an error occurred above, go back to offline.  The
+		 * caller may reschedule another attempt.
+		 */
+		if (ret)
+			goto_offline(ppd, 0);
+		break;
+	case HLS_DN_DISABLE:
+		/* link is disabled */
+		ppd->link_enabled = 0;
+
+		/* allow any state to transition to disabled */
+
+		/* must transition to offline first */
+		if (ppd->host_link_state != HLS_DN_OFFLINE) {
+			ret = goto_offline(ppd, ppd->remote_link_down_reason);
+			if (ret)
+				break;
+			ppd->remote_link_down_reason = 0;
+		}
+
+		ret1 = set_physical_link_state(dd, PLS_DISABLED);
+		if (ret1 != HCMD_SUCCESS) {
+			dd_dev_err(dd,
+				"Failed to transition to Disabled link state, return 0x%x\n",
+				ret1);
+			ret = -EINVAL;
+			break;
+		}
+		ppd->host_link_state = HLS_DN_DISABLE;
+		dc_shutdown(dd);
+		break;
+	case HLS_DN_OFFLINE:
+		if (ppd->host_link_state == HLS_DN_DISABLE)
+			dc_start(dd);
+
+		/* allow any state to transition to offline */
+		ret = goto_offline(ppd, ppd->remote_link_down_reason);
+		if (!ret)
+			ppd->remote_link_down_reason = 0;
+		break;
+	case HLS_VERIFY_CAP:
+		if (ppd->host_link_state != HLS_DN_POLL)
+			goto unexpected;
+		ppd->host_link_state = HLS_VERIFY_CAP;
+		break;
+	case HLS_GOING_UP:
+		if (ppd->host_link_state != HLS_VERIFY_CAP)
+			goto unexpected;
+
+		ret1 = set_physical_link_state(dd, PLS_LINKUP);
+		if (ret1 != HCMD_SUCCESS) {
+			dd_dev_err(dd,
+				"Failed to transition to link up state, return 0x%x\n",
+				ret1);
+			ret = -EINVAL;
+			break;
+		}
+		ppd->host_link_state = HLS_GOING_UP;
+		break;
+
+	case HLS_GOING_OFFLINE:		/* transient within goto_offline() */
+	case HLS_LINK_COOLDOWN:		/* transient within goto_offline() */
+	default:
+		dd_dev_info(dd, "%s: state 0x%x: not supported\n",
+			__func__, state);
+		ret = -EINVAL;
+		break;
+	}
+
+	is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
+			HLS_DN_DISABLE | HLS_DN_OFFLINE));
+
+	if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
+	    ppd->neigh_link_down_reason.sma == 0) {
+		ppd->local_link_down_reason.sma =
+		  ppd->local_link_down_reason.latest;
+		ppd->neigh_link_down_reason.sma =
+		  ppd->neigh_link_down_reason.latest;
+	}
+
+	goto done;
+
+unexpected:
+	dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
+		__func__, link_state_name(ppd->host_link_state),
+		link_state_name(state));
+	ret = -EINVAL;
+
+done:
+	mutex_unlock(&ppd->hls_lock);
+
+	if (event.device)
+		ib_dispatch_event(&event);
+
+	return ret;
+}
+
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
+{
+	u64 reg;
+	int ret = 0;
+
+	switch (which) {
+	case HFI1_IB_CFG_LIDLMC:
+		set_lidlmc(ppd);
+		break;
+	case HFI1_IB_CFG_VL_HIGH_LIMIT:
+		/*
+		 * The VL Arbitrator high limit is sent in units of 4k
+		 * bytes, while HFI stores it in units of 64 bytes.
+		 */
+		val *= 4096/64;
+		reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
+			<< SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
+		write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
+		break;
+	case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+		/* HFI only supports POLL as the default link down state */
+		if (val != HLS_DN_POLL)
+			ret = -EINVAL;
+		break;
+	case HFI1_IB_CFG_OP_VLS:
+		if (ppd->vls_operational != val) {
+			ppd->vls_operational = val;
+			if (!ppd->port)
+				ret = -EINVAL;
+			else
+				ret = sdma_map_init(
+					ppd->dd,
+					ppd->port - 1,
+					val,
+					NULL);
+		}
+		break;
+	/*
+	 * For link width, link width downgrade, and speed enable, always AND
+	 * the setting with what is actually supported.  This has two benefits.
+	 * First, enabled can't have unsupported values, no matter what the
+	 * SM or FM might want.  Second, the ALL_SUPPORTED wildcards that mean
+	 * "fill in with your supported value" have all the bits in the
+	 * field set, so simply ANDing with supported has the desired result.
+	 */
+	case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
+		ppd->link_width_enabled = val & ppd->link_width_supported;
+		break;
+	case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
+		ppd->link_width_downgrade_enabled =
+				val & ppd->link_width_downgrade_supported;
+		break;
+	case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+		ppd->link_speed_enabled = val & ppd->link_speed_supported;
+		break;
+	case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+		/*
+		 * HFI does not follow IB specs, save this value
+		 * so we can report it, if asked.
+		 */
+		ppd->overrun_threshold = val;
+		break;
+	case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+		/*
+		 * HFI does not follow IB specs, save this value
+		 * so we can report it, if asked.
+		 */
+		ppd->phy_error_threshold = val;
+		break;
+
+	case HFI1_IB_CFG_MTU:
+		set_send_length(ppd);
+		break;
+
+	case HFI1_IB_CFG_PKEYS:
+		if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+			set_partition_keys(ppd);
+		break;
+
+	default:
+		if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+			dd_dev_info(ppd->dd,
+			  "%s: which %s, val 0x%x: not implemented\n",
+			  __func__, ib_cfg_name(which), val);
+		break;
+	}
+	return ret;
+}
+
+/* begin functions related to vl arbitration table caching */
+static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
+{
+	int i;
+
+	BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+			VL_ARB_LOW_PRIO_TABLE_SIZE);
+	BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+			VL_ARB_HIGH_PRIO_TABLE_SIZE);
+
+	/*
+	 * Note that we always return values directly from the
+	 * 'vl_arb_cache' (and do no CSR reads) in response to a
+	 * 'Get(VLArbTable)'. This is obviously correct after a
+	 * 'Set(VLArbTable)', since the cache will then be up to
+	 * date. But it's also correct prior to any 'Set(VLArbTable)'
+	 * since then both the cache, and the relevant h/w registers
+	 * will be zeroed.
+	 */
+
+	for (i = 0; i < MAX_PRIO_TABLE; i++)
+		spin_lock_init(&ppd->vl_arb_cache[i].lock);
+}
+
+/*
+ * vl_arb_lock_cache
+ *
+ * All other vl_arb_* functions should be called only after locking
+ * the cache.
+ */
+static inline struct vl_arb_cache *
+vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+	if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE)
+		return NULL;
+	spin_lock(&ppd->vl_arb_cache[idx].lock);
+	return &ppd->vl_arb_cache[idx];
+}
+
+static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+	spin_unlock(&ppd->vl_arb_cache[idx].lock);
+}
+
+static void vl_arb_get_cache(struct vl_arb_cache *cache,
+			     struct ib_vl_weight_elem *vl)
+{
+	memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static void vl_arb_set_cache(struct vl_arb_cache *cache,
+			     struct ib_vl_weight_elem *vl)
+{
+	memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static int vl_arb_match_cache(struct vl_arb_cache *cache,
+			      struct ib_vl_weight_elem *vl)
+{
+	return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+/* end functions related to vl arbitration table caching */
+
+static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
+			  u32 size, struct ib_vl_weight_elem *vl)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u64 reg;
+	unsigned int i, is_up = 0;
+	int drain, ret = 0;
+
+	mutex_lock(&ppd->hls_lock);
+
+	if (ppd->host_link_state & HLS_UP)
+		is_up = 1;
+
+	drain = !is_ax(dd) && is_up;
+
+	if (drain)
+		/*
+		 * Before adjusting VL arbitration weights, empty per-VL
+		 * FIFOs, otherwise a packet whose VL weight is being
+		 * set to 0 could get stuck in a FIFO with no chance to
+		 * egress.
+		 */
+		ret = stop_drain_data_vls(dd);
+
+	if (ret) {
+		dd_dev_err(
+			dd,
+			"%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
+			__func__);
+		goto err;
+	}
+
+	for (i = 0; i < size; i++, vl++) {
+		/*
+		 * NOTE: The low priority shift and mask are used here, but
+		 * they are the same for both the low and high registers.
+		 */
+		reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
+				<< SEND_LOW_PRIORITY_LIST_VL_SHIFT)
+		      | (((u64)vl->weight
+				& SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
+				<< SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
+		write_csr(dd, target + (i * 8), reg);
+	}
+	pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
+
+	if (drain)
+		open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+	mutex_unlock(&ppd->hls_lock);
+
+	return ret;
+}
+
+/*
+ * Read one credit merge VL register.
+ */
+static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
+			   struct vl_limit *vll)
+{
+	u64 reg = read_csr(dd, csr);
+
+	vll->dedicated = cpu_to_be16(
+		(reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
+		& SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK);
+	vll->shared = cpu_to_be16(
+		(reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
+		& SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK);
+}
+
+/*
+ * Read the current credit merge limits.
+ */
+static int get_buffer_control(struct hfi1_devdata *dd,
+			      struct buffer_control *bc, u16 *overall_limit)
+{
+	u64 reg;
+	int i;
+
+	/* not all entries are filled in */
+	memset(bc, 0, sizeof(*bc));
+
+	/* OPA and HFI have a 1-1 mapping */
+	for (i = 0; i < TXE_NUM_DATA_VL; i++)
+		read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8*i), &bc->vl[i]);
+
+	/* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
+	read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
+
+	reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+	bc->overall_shared_limit = cpu_to_be16(
+		(reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+		& SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
+	if (overall_limit)
+		*overall_limit = (reg
+			>> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+			& SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
+	return sizeof(struct buffer_control);
+}
+
+static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+	u64 reg;
+	int i;
+
+	/* each register contains 16 SC->VLnt mappings, 4 bits each */
+	reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
+	for (i = 0; i < sizeof(u64); i++) {
+		u8 byte = *(((u8 *)&reg) + i);
+
+		dp->vlnt[2 * i] = byte & 0xf;
+		dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4;
+	}
+
+	reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
+	for (i = 0; i < sizeof(u64); i++) {
+		u8 byte = *(((u8 *)&reg) + i);
+
+		dp->vlnt[16 + (2 * i)] = byte & 0xf;
+		dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4;
+	}
+	return sizeof(struct sc2vlnt);
+}
+
+static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
+			      struct ib_vl_weight_elem *vl)
+{
+	unsigned int i;
+
+	for (i = 0; i < nelems; i++, vl++) {
+		vl->vl = 0xf;
+		vl->weight = 0;
+	}
+}
+
+static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+	write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
+		DC_SC_VL_VAL(15_0,
+		0, dp->vlnt[0] & 0xf,
+		1, dp->vlnt[1] & 0xf,
+		2, dp->vlnt[2] & 0xf,
+		3, dp->vlnt[3] & 0xf,
+		4, dp->vlnt[4] & 0xf,
+		5, dp->vlnt[5] & 0xf,
+		6, dp->vlnt[6] & 0xf,
+		7, dp->vlnt[7] & 0xf,
+		8, dp->vlnt[8] & 0xf,
+		9, dp->vlnt[9] & 0xf,
+		10, dp->vlnt[10] & 0xf,
+		11, dp->vlnt[11] & 0xf,
+		12, dp->vlnt[12] & 0xf,
+		13, dp->vlnt[13] & 0xf,
+		14, dp->vlnt[14] & 0xf,
+		15, dp->vlnt[15] & 0xf));
+	write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
+		DC_SC_VL_VAL(31_16,
+		16, dp->vlnt[16] & 0xf,
+		17, dp->vlnt[17] & 0xf,
+		18, dp->vlnt[18] & 0xf,
+		19, dp->vlnt[19] & 0xf,
+		20, dp->vlnt[20] & 0xf,
+		21, dp->vlnt[21] & 0xf,
+		22, dp->vlnt[22] & 0xf,
+		23, dp->vlnt[23] & 0xf,
+		24, dp->vlnt[24] & 0xf,
+		25, dp->vlnt[25] & 0xf,
+		26, dp->vlnt[26] & 0xf,
+		27, dp->vlnt[27] & 0xf,
+		28, dp->vlnt[28] & 0xf,
+		29, dp->vlnt[29] & 0xf,
+		30, dp->vlnt[30] & 0xf,
+		31, dp->vlnt[31] & 0xf));
+}
+
+static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
+			u16 limit)
+{
+	if (limit != 0)
+		dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
+			what, (int)limit, idx);
+}
+
+/* change only the shared limit portion of SendCmGLobalCredit */
+static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
+{
+	u64 reg;
+
+	reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+	reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
+	reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
+	write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* change only the total credit limit portion of SendCmGLobalCredit */
+static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
+{
+	u64 reg;
+
+	reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+	reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
+	reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
+	write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* set the given per-VL shared limit */
+static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+	u64 reg;
+	u32 addr;
+
+	if (vl < TXE_NUM_DATA_VL)
+		addr = SEND_CM_CREDIT_VL + (8 * vl);
+	else
+		addr = SEND_CM_CREDIT_VL15;
+
+	reg = read_csr(dd, addr);
+	reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
+	reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
+	write_csr(dd, addr, reg);
+}
+
+/* set the given per-VL dedicated limit */
+static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+	u64 reg;
+	u32 addr;
+
+	if (vl < TXE_NUM_DATA_VL)
+		addr = SEND_CM_CREDIT_VL + (8 * vl);
+	else
+		addr = SEND_CM_CREDIT_VL15;
+
+	reg = read_csr(dd, addr);
+	reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
+	reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
+	write_csr(dd, addr, reg);
+}
+
+/* spin until the given per-VL status mask bits clear */
+static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
+				     const char *which)
+{
+	unsigned long timeout;
+	u64 reg;
+
+	timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
+	while (1) {
+		reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
+
+		if (reg == 0)
+			return;	/* success */
+		if (time_after(jiffies, timeout))
+			break;		/* timed out */
+		udelay(1);
+	}
+
+	dd_dev_err(dd,
+		"%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
+		which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
+	/*
+	 * If this occurs, it is likely there was a credit loss on the link.
+	 * The only recovery from that is a link bounce.
+	 */
+	dd_dev_err(dd,
+		"Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
+}
+
+/*
+ * The number of credits on the VLs may be changed while everything
+ * is "live", but the following algorithm must be followed due to
+ * how the hardware is actually implemented.  In particular,
+ * Return_Credit_Status[] is the only correct status check.
+ *
+ * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
+ *     set Global_Shared_Credit_Limit = 0
+ *     use_all_vl = 1
+ * mask0 = all VLs that are changing either dedicated or shared limits
+ * set Shared_Limit[mask0] = 0
+ * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
+ * if (changing any dedicated limit)
+ *     mask1 = all VLs that are lowering dedicated limits
+ *     lower Dedicated_Limit[mask1]
+ *     spin until Return_Credit_Status[mask1] == 0
+ *     raise Dedicated_Limits
+ * raise Shared_Limits
+ * raise Global_Shared_Credit_Limit
+ *
+ * lower = if the new limit is lower, set the limit to the new value
+ * raise = if the new limit is higher than the current value (may be changed
+ *	earlier in the algorithm), set the new limit to the new value
+ */
+static int set_buffer_control(struct hfi1_devdata *dd,
+			      struct buffer_control *new_bc)
+{
+	u64 changing_mask, ld_mask, stat_mask;
+	int change_count;
+	int i, use_all_mask;
+	int this_shared_changing;
+	/*
+	 * A0: add the variable any_shared_limit_changing below and in the
+	 * algorithm above.  If removing A0 support, it can be removed.
+	 */
+	int any_shared_limit_changing;
+	struct buffer_control cur_bc;
+	u8 changing[OPA_MAX_VLS];
+	u8 lowering_dedicated[OPA_MAX_VLS];
+	u16 cur_total;
+	u32 new_total = 0;
+	const u64 all_mask =
+	SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
+	 | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
+	 | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
+	 | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
+	 | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
+	 | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
+	 | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
+	 | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
+	 | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
+
+#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
+#define NUM_USABLE_VLS 16	/* look at VL15 and less */
+
+
+	/* find the new total credits, do sanity check on unused VLs */
+	for (i = 0; i < OPA_MAX_VLS; i++) {
+		if (valid_vl(i)) {
+			new_total += be16_to_cpu(new_bc->vl[i].dedicated);
+			continue;
+		}
+		nonzero_msg(dd, i, "dedicated",
+			be16_to_cpu(new_bc->vl[i].dedicated));
+		nonzero_msg(dd, i, "shared",
+			be16_to_cpu(new_bc->vl[i].shared));
+		new_bc->vl[i].dedicated = 0;
+		new_bc->vl[i].shared = 0;
+	}
+	new_total += be16_to_cpu(new_bc->overall_shared_limit);
+	if (new_total > (u32)dd->link_credits)
+		return -EINVAL;
+	/* fetch the current values */
+	get_buffer_control(dd, &cur_bc, &cur_total);
+
+	/*
+	 * Create the masks we will use.
+	 */
+	memset(changing, 0, sizeof(changing));
+	memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
+	/* NOTE: Assumes that the individual VL bits are adjacent and in
+	   increasing order */
+	stat_mask =
+		SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
+	changing_mask = 0;
+	ld_mask = 0;
+	change_count = 0;
+	any_shared_limit_changing = 0;
+	for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
+		if (!valid_vl(i))
+			continue;
+		this_shared_changing = new_bc->vl[i].shared
+						!= cur_bc.vl[i].shared;
+		if (this_shared_changing)
+			any_shared_limit_changing = 1;
+		if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated
+				|| this_shared_changing) {
+			changing[i] = 1;
+			changing_mask |= stat_mask;
+			change_count++;
+		}
+		if (be16_to_cpu(new_bc->vl[i].dedicated) <
+					be16_to_cpu(cur_bc.vl[i].dedicated)) {
+			lowering_dedicated[i] = 1;
+			ld_mask |= stat_mask;
+		}
+	}
+
+	/* bracket the credit change with a total adjustment */
+	if (new_total > cur_total)
+		set_global_limit(dd, new_total);
+
+	/*
+	 * Start the credit change algorithm.
+	 */
+	use_all_mask = 0;
+	if ((be16_to_cpu(new_bc->overall_shared_limit) <
+				be16_to_cpu(cur_bc.overall_shared_limit))
+			|| (is_a0(dd) && any_shared_limit_changing)) {
+		set_global_shared(dd, 0);
+		cur_bc.overall_shared_limit = 0;
+		use_all_mask = 1;
+	}
+
+	for (i = 0; i < NUM_USABLE_VLS; i++) {
+		if (!valid_vl(i))
+			continue;
+
+		if (changing[i]) {
+			set_vl_shared(dd, i, 0);
+			cur_bc.vl[i].shared = 0;
+		}
+	}
+
+	wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
+		"shared");
+
+	if (change_count > 0) {
+		for (i = 0; i < NUM_USABLE_VLS; i++) {
+			if (!valid_vl(i))
+				continue;
+
+			if (lowering_dedicated[i]) {
+				set_vl_dedicated(dd, i,
+					be16_to_cpu(new_bc->vl[i].dedicated));
+				cur_bc.vl[i].dedicated =
+						new_bc->vl[i].dedicated;
+			}
+		}
+
+		wait_for_vl_status_clear(dd, ld_mask, "dedicated");
+
+		/* now raise all dedicated that are going up */
+		for (i = 0; i < NUM_USABLE_VLS; i++) {
+			if (!valid_vl(i))
+				continue;
+
+			if (be16_to_cpu(new_bc->vl[i].dedicated) >
+					be16_to_cpu(cur_bc.vl[i].dedicated))
+				set_vl_dedicated(dd, i,
+					be16_to_cpu(new_bc->vl[i].dedicated));
+		}
+	}
+
+	/* next raise all shared that are going up */
+	for (i = 0; i < NUM_USABLE_VLS; i++) {
+		if (!valid_vl(i))
+			continue;
+
+		if (be16_to_cpu(new_bc->vl[i].shared) >
+				be16_to_cpu(cur_bc.vl[i].shared))
+			set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
+	}
+
+	/* finally raise the global shared */
+	if (be16_to_cpu(new_bc->overall_shared_limit) >
+			be16_to_cpu(cur_bc.overall_shared_limit))
+		set_global_shared(dd,
+			be16_to_cpu(new_bc->overall_shared_limit));
+
+	/* bracket the credit change with a total adjustment */
+	if (new_total < cur_total)
+		set_global_limit(dd, new_total);
+	return 0;
+}
+
+/*
+ * Read the given fabric manager table. Return the size of the
+ * table (in bytes) on success, and a negative error code on
+ * failure.
+ */
+int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
+
+{
+	int size;
+	struct vl_arb_cache *vlc;
+
+	switch (which) {
+	case FM_TBL_VL_HIGH_ARB:
+		size = 256;
+		/*
+		 * OPA specifies 128 elements (of 2 bytes each), though
+		 * HFI supports only 16 elements in h/w.
+		 */
+		vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+		vl_arb_get_cache(vlc, t);
+		vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+		break;
+	case FM_TBL_VL_LOW_ARB:
+		size = 256;
+		/*
+		 * OPA specifies 128 elements (of 2 bytes each), though
+		 * HFI supports only 16 elements in h/w.
+		 */
+		vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+		vl_arb_get_cache(vlc, t);
+		vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+		break;
+	case FM_TBL_BUFFER_CONTROL:
+		size = get_buffer_control(ppd->dd, t, NULL);
+		break;
+	case FM_TBL_SC2VLNT:
+		size = get_sc2vlnt(ppd->dd, t);
+		break;
+	case FM_TBL_VL_PREEMPT_ELEMS:
+		size = 256;
+		/* OPA specifies 128 elements, of 2 bytes each */
+		get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
+		break;
+	case FM_TBL_VL_PREEMPT_MATRIX:
+		size = 256;
+		/*
+		 * OPA specifies that this is the same size as the VL
+		 * arbitration tables (i.e., 256 bytes).
+		 */
+		break;
+	default:
+		return -EINVAL;
+	}
+	return size;
+}
+
+/*
+ * Write the given fabric manager table.
+ */
+int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+	int ret = 0;
+	struct vl_arb_cache *vlc;
+
+	switch (which) {
+	case FM_TBL_VL_HIGH_ARB:
+		vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+		if (vl_arb_match_cache(vlc, t)) {
+			vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+			break;
+		}
+		vl_arb_set_cache(vlc, t);
+		vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+		ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
+				     VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
+		break;
+	case FM_TBL_VL_LOW_ARB:
+		vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+		if (vl_arb_match_cache(vlc, t)) {
+			vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+			break;
+		}
+		vl_arb_set_cache(vlc, t);
+		vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+		ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
+				     VL_ARB_LOW_PRIO_TABLE_SIZE, t);
+		break;
+	case FM_TBL_BUFFER_CONTROL:
+		ret = set_buffer_control(ppd->dd, t);
+		break;
+	case FM_TBL_SC2VLNT:
+		set_sc2vlnt(ppd->dd, t);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Disable all data VLs.
+ *
+ * Return 0 if disabled, non-zero if the VLs cannot be disabled.
+ */
+static int disable_data_vls(struct hfi1_devdata *dd)
+{
+	if (is_a0(dd))
+		return 1;
+
+	pio_send_control(dd, PSC_DATA_VL_DISABLE);
+
+	return 0;
+}
+
+/*
+ * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
+ * Just re-enables all data VLs (the "fill" part happens
+ * automatically - the name was chosen for symmetry with
+ * stop_drain_data_vls()).
+ *
+ * Return 0 if successful, non-zero if the VLs cannot be enabled.
+ */
+int open_fill_data_vls(struct hfi1_devdata *dd)
+{
+	if (is_a0(dd))
+		return 1;
+
+	pio_send_control(dd, PSC_DATA_VL_ENABLE);
+
+	return 0;
+}
+
+/*
+ * drain_data_vls() - assumes that disable_data_vls() has been called,
+ * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA
+ * engines to drop to 0.
+ */
+static void drain_data_vls(struct hfi1_devdata *dd)
+{
+	sc_wait(dd);
+	sdma_wait(dd);
+	pause_for_credit_return(dd);
+}
+
+/*
+ * stop_drain_data_vls() - disable, then drain all per-VL fifos.
+ *
+ * Use open_fill_data_vls() to resume using data VLs.  This pair is
+ * meant to be used like this:
+ *
+ * stop_drain_data_vls(dd);
+ * // do things with per-VL resources
+ * open_fill_data_vls(dd);
+ */
+int stop_drain_data_vls(struct hfi1_devdata *dd)
+{
+	int ret;
+
+	ret = disable_data_vls(dd);
+	if (ret == 0)
+		drain_data_vls(dd);
+
+	return ret;
+}
+
+/*
+ * Convert a nanosecond time to a cclock count.  No matter how slow
+ * the cclock, a non-zero ns will always have a non-zero result.
+ */
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
+{
+	u32 cclocks;
+
+	if (dd->icode == ICODE_FPGA_EMULATION)
+		cclocks = (ns * 1000) / FPGA_CCLOCK_PS;
+	else  /* simulation pretends to be ASIC */
+		cclocks = (ns * 1000) / ASIC_CCLOCK_PS;
+	if (ns && !cclocks)	/* if ns nonzero, must be at least 1 */
+		cclocks = 1;
+	return cclocks;
+}
+
+/*
+ * Convert a cclock count to nanoseconds. Not matter how slow
+ * the cclock, a non-zero cclocks will always have a non-zero result.
+ */
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
+{
+	u32 ns;
+
+	if (dd->icode == ICODE_FPGA_EMULATION)
+		ns = (cclocks * FPGA_CCLOCK_PS) / 1000;
+	else  /* simulation pretends to be ASIC */
+		ns = (cclocks * ASIC_CCLOCK_PS) / 1000;
+	if (cclocks && !ns)
+		ns = 1;
+	return ns;
+}
+
+/*
+ * Dynamically adjust the receive interrupt timeout for a context based on
+ * incoming packet rate.
+ *
+ * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
+ */
+static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
+{
+	struct hfi1_devdata *dd = rcd->dd;
+	u32 timeout = rcd->rcvavail_timeout;
+
+	/*
+	 * This algorithm doubles or halves the timeout depending on whether
+	 * the number of packets received in this interrupt were less than or
+	 * greater equal the interrupt count.
+	 *
+	 * The calculations below do not allow a steady state to be achieved.
+	 * Only at the endpoints it is possible to have an unchanging
+	 * timeout.
+	 */
+	if (npkts < rcv_intr_count) {
+		/*
+		 * Not enough packets arrived before the timeout, adjust
+		 * timeout downward.
+		 */
+		if (timeout < 2) /* already at minimum? */
+			return;
+		timeout >>= 1;
+	} else {
+		/*
+		 * More than enough packets arrived before the timeout, adjust
+		 * timeout upward.
+		 */
+		if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
+			return;
+		timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
+	}
+
+	rcd->rcvavail_timeout = timeout;
+	/* timeout cannot be larger than rcv_intr_timeout_csr which has already
+	   been verified to be in range */
+	write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
+		(u64)timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+}
+
+void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
+		    u32 intr_adjust, u32 npkts)
+{
+	struct hfi1_devdata *dd = rcd->dd;
+	u64 reg;
+	u32 ctxt = rcd->ctxt;
+
+	/*
+	 * Need to write timeout register before updating RcvHdrHead to ensure
+	 * that a new value is used when the HW decides to restart counting.
+	 */
+	if (intr_adjust)
+		adjust_rcv_timeout(rcd, npkts);
+	if (updegr) {
+		reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
+			<< RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
+		write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
+	}
+	mmiowb();
+	reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
+		(((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
+			<< RCV_HDR_HEAD_HEAD_SHIFT);
+	write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+	mmiowb();
+}
+
+u32 hdrqempty(struct hfi1_ctxtdata *rcd)
+{
+	u32 head, tail;
+
+	head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
+		& RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
+
+	if (rcd->rcvhdrtail_kvaddr)
+		tail = get_rcvhdrtail(rcd);
+	else
+		tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+
+	return head == tail;
+}
+
+/*
+ * Context Control and Receive Array encoding for buffer size:
+ *	0x0 invalid
+ *	0x1   4 KB
+ *	0x2   8 KB
+ *	0x3  16 KB
+ *	0x4  32 KB
+ *	0x5  64 KB
+ *	0x6 128 KB
+ *	0x7 256 KB
+ *	0x8 512 KB (Receive Array only)
+ *	0x9   1 MB (Receive Array only)
+ *	0xa   2 MB (Receive Array only)
+ *
+ *	0xB-0xF - reserved (Receive Array only)
+ *
+ *
+ * This routine assumes that the value has already been sanity checked.
+ */
+static u32 encoded_size(u32 size)
+{
+	switch (size) {
+	case   4*1024: return 0x1;
+	case   8*1024: return 0x2;
+	case  16*1024: return 0x3;
+	case  32*1024: return 0x4;
+	case  64*1024: return 0x5;
+	case 128*1024: return 0x6;
+	case 256*1024: return 0x7;
+	case 512*1024: return 0x8;
+	case   1*1024*1024: return 0x9;
+	case   2*1024*1024: return 0xa;
+	}
+	return 0x1;	/* if invalid, go with the minimum size */
+}
+
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
+{
+	struct hfi1_ctxtdata *rcd;
+	u64 rcvctrl, reg;
+	int did_enable = 0;
+
+	rcd = dd->rcd[ctxt];
+	if (!rcd)
+		return;
+
+	hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
+
+	rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
+	/* if the context already enabled, don't do the extra steps */
+	if ((op & HFI1_RCVCTRL_CTXT_ENB)
+			&& !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
+		/* reset the tail and hdr addresses, and sequence count */
+		write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
+				rcd->rcvhdrq_phys);
+		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+			write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+					rcd->rcvhdrqtailaddr_phys);
+		rcd->seq_cnt = 1;
+
+		/* reset the cached receive header queue head value */
+		rcd->head = 0;
+
+		/*
+		 * Zero the receive header queue so we don't get false
+		 * positives when checking the sequence number.  The
+		 * sequence numbers could land exactly on the same spot.
+		 * E.g. a rcd restart before the receive header wrapped.
+		 */
+		memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+
+		/* starting timeout */
+		rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
+
+		/* enable the context */
+		rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
+
+		/* clean the egr buffer size first */
+		rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+		rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
+				& RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
+					<< RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
+
+		/* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
+		write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
+		did_enable = 1;
+
+		/* zero RcvEgrIndexHead */
+		write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
+
+		/* set eager count and base index */
+		reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
+			& RCV_EGR_CTRL_EGR_CNT_MASK)
+		       << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
+			(((rcd->eager_base >> RCV_SHIFT)
+			  & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
+			 << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
+		write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
+
+		/*
+		 * Set TID (expected) count and base index.
+		 * rcd->expected_count is set to individual RcvArray entries,
+		 * not pairs, and the CSR takes a pair-count in groups of
+		 * four, so divide by 8.
+		 */
+		reg = (((rcd->expected_count >> RCV_SHIFT)
+					& RCV_TID_CTRL_TID_PAIR_CNT_MASK)
+				<< RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
+		      (((rcd->expected_base >> RCV_SHIFT)
+					& RCV_TID_CTRL_TID_BASE_INDEX_MASK)
+				<< RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
+		write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
+		if (ctxt == VL15CTXT)
+			write_csr(dd, RCV_VL15, VL15CTXT);
+	}
+	if (op & HFI1_RCVCTRL_CTXT_DIS) {
+		write_csr(dd, RCV_VL15, 0);
+		rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
+	}
+	if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+		rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+	if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+		rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+	if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
+		rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+	if (op & HFI1_RCVCTRL_TAILUPD_DIS)
+		rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+	if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
+		rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+	if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
+		rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+	if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
+		/* In one-packet-per-eager mode, the size comes from
+		   the RcvArray entry. */
+		rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+		rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+	}
+	if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
+		rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+	if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
+		rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+	if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
+		rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+	if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
+		rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+	if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
+		rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+	rcd->rcvctrl = rcvctrl;
+	hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
+	write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
+
+	/* work around sticky RcvCtxtStatus.BlockedRHQFull */
+	if (did_enable
+	    && (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
+		reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+		if (reg != 0) {
+			dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
+				ctxt, reg);
+			read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+			write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
+			write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
+			read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+			reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+			dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
+				ctxt, reg, reg == 0 ? "not" : "still");
+		}
+	}
+
+	if (did_enable) {
+		/*
+		 * The interrupt timeout and count must be set after
+		 * the context is enabled to take effect.
+		 */
+		/* set interrupt timeout */
+		write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
+			(u64)rcd->rcvavail_timeout <<
+				RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+
+		/* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
+		reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
+		write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+	}
+
+	if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
+		/*
+		 * If the context has been disabled and the Tail Update has
+		 * been cleared, clear the RCV_HDR_TAIL_ADDR CSR so
+		 * it doesn't contain an address that is invalid.
+		 */
+		write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, 0);
+}
+
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
+		    u64 **cntrp)
+{
+	int ret;
+	u64 val = 0;
+
+	if (namep) {
+		ret = dd->cntrnameslen;
+		if (pos != 0) {
+			dd_dev_err(dd, "read_cntrs does not support indexing");
+			return 0;
+		}
+		*namep = dd->cntrnames;
+	} else {
+		const struct cntr_entry *entry;
+		int i, j;
+
+		ret = (dd->ndevcntrs) * sizeof(u64);
+		if (pos != 0) {
+			dd_dev_err(dd, "read_cntrs does not support indexing");
+			return 0;
+		}
+
+		/* Get the start of the block of counters */
+		*cntrp = dd->cntrs;
+
+		/*
+		 * Now go and fill in each counter in the block.
+		 */
+		for (i = 0; i < DEV_CNTR_LAST; i++) {
+			entry = &dev_cntrs[i];
+			hfi1_cdbg(CNTR, "reading %s", entry->name);
+			if (entry->flags & CNTR_DISABLED) {
+				/* Nothing */
+				hfi1_cdbg(CNTR, "\tDisabled\n");
+			} else {
+				if (entry->flags & CNTR_VL) {
+					hfi1_cdbg(CNTR, "\tPer VL\n");
+					for (j = 0; j < C_VL_COUNT; j++) {
+						val = entry->rw_cntr(entry,
+								  dd, j,
+								  CNTR_MODE_R,
+								  0);
+						hfi1_cdbg(
+						   CNTR,
+						   "\t\tRead 0x%llx for %d\n",
+						   val, j);
+						dd->cntrs[entry->offset + j] =
+									    val;
+					}
+				} else {
+					val = entry->rw_cntr(entry, dd,
+							CNTR_INVALID_VL,
+							CNTR_MODE_R, 0);
+					dd->cntrs[entry->offset] = val;
+					hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+				}
+			}
+		}
+	}
+	return ret;
+}
+
+/*
+ * Used by sysfs to create files for hfi stats to read
+ */
+u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
+			char **namep, u64 **cntrp)
+{
+	int ret;
+	u64 val = 0;
+
+	if (namep) {
+		ret = dd->portcntrnameslen;
+		if (pos != 0) {
+			dd_dev_err(dd, "index not supported");
+			return 0;
+		}
+		*namep = dd->portcntrnames;
+	} else {
+		const struct cntr_entry *entry;
+		struct hfi1_pportdata *ppd;
+		int i, j;
+
+		ret = (dd->nportcntrs) * sizeof(u64);
+		if (pos != 0) {
+			dd_dev_err(dd, "indexing not supported");
+			return 0;
+		}
+		ppd = (struct hfi1_pportdata *)(dd + 1 + port);
+		*cntrp = ppd->cntrs;
+
+		for (i = 0; i < PORT_CNTR_LAST; i++) {
+			entry = &port_cntrs[i];
+			hfi1_cdbg(CNTR, "reading %s", entry->name);
+			if (entry->flags & CNTR_DISABLED) {
+				/* Nothing */
+				hfi1_cdbg(CNTR, "\tDisabled\n");
+				continue;
+			}
+
+			if (entry->flags & CNTR_VL) {
+				hfi1_cdbg(CNTR, "\tPer VL");
+				for (j = 0; j < C_VL_COUNT; j++) {
+					val = entry->rw_cntr(entry, ppd, j,
+							       CNTR_MODE_R,
+							       0);
+					hfi1_cdbg(
+					   CNTR,
+					   "\t\tRead 0x%llx for %d",
+					   val, j);
+					ppd->cntrs[entry->offset + j] = val;
+				}
+			} else {
+				val = entry->rw_cntr(entry, ppd,
+						       CNTR_INVALID_VL,
+						       CNTR_MODE_R,
+						       0);
+				ppd->cntrs[entry->offset] = val;
+				hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+			}
+		}
+	}
+	return ret;
+}
+
+static void free_cntrs(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd;
+	int i;
+
+	if (dd->synth_stats_timer.data)
+		del_timer_sync(&dd->synth_stats_timer);
+	dd->synth_stats_timer.data = 0;
+	ppd = (struct hfi1_pportdata *)(dd + 1);
+	for (i = 0; i < dd->num_pports; i++, ppd++) {
+		kfree(ppd->cntrs);
+		kfree(ppd->scntrs);
+		free_percpu(ppd->ibport_data.rc_acks);
+		free_percpu(ppd->ibport_data.rc_qacks);
+		free_percpu(ppd->ibport_data.rc_delayed_comp);
+		ppd->cntrs = NULL;
+		ppd->scntrs = NULL;
+		ppd->ibport_data.rc_acks = NULL;
+		ppd->ibport_data.rc_qacks = NULL;
+		ppd->ibport_data.rc_delayed_comp = NULL;
+	}
+	kfree(dd->portcntrnames);
+	dd->portcntrnames = NULL;
+	kfree(dd->cntrs);
+	dd->cntrs = NULL;
+	kfree(dd->scntrs);
+	dd->scntrs = NULL;
+	kfree(dd->cntrnames);
+	dd->cntrnames = NULL;
+}
+
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
+static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
+			      u64 *psval, void *context, int vl)
+{
+	u64 val;
+	u64 sval = *psval;
+
+	if (entry->flags & CNTR_DISABLED) {
+		dd_dev_err(dd, "Counter %s not enabled", entry->name);
+		return 0;
+	}
+
+	hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+	val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
+
+	/* If its a synthetic counter there is more work we need to do */
+	if (entry->flags & CNTR_SYNTH) {
+		if (sval == CNTR_MAX) {
+			/* No need to read already saturated */
+			return CNTR_MAX;
+		}
+
+		if (entry->flags & CNTR_32BIT) {
+			/* 32bit counters can wrap multiple times */
+			u64 upper = sval >> 32;
+			u64 lower = (sval << 32) >> 32;
+
+			if (lower > val) { /* hw wrapped */
+				if (upper == CNTR_32BIT_MAX)
+					val = CNTR_MAX;
+				else
+					upper++;
+			}
+
+			if (val != CNTR_MAX)
+				val = (upper << 32) | val;
+
+		} else {
+			/* If we rolled we are saturated */
+			if ((val < sval) || (val > CNTR_MAX))
+				val = CNTR_MAX;
+		}
+	}
+
+	*psval = val;
+
+	hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+	return val;
+}
+
+static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
+			       struct cntr_entry *entry,
+			       u64 *psval, void *context, int vl, u64 data)
+{
+	u64 val;
+
+	if (entry->flags & CNTR_DISABLED) {
+		dd_dev_err(dd, "Counter %s not enabled", entry->name);
+		return 0;
+	}
+
+	hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+	if (entry->flags & CNTR_SYNTH) {
+		*psval = data;
+		if (entry->flags & CNTR_32BIT) {
+			val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+					     (data << 32) >> 32);
+			val = data; /* return the full 64bit value */
+		} else {
+			val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+					     data);
+		}
+	} else {
+		val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
+	}
+
+	*psval = val;
+
+	hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+	return val;
+}
+
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
+{
+	struct cntr_entry *entry;
+	u64 *sval;
+
+	entry = &dev_cntrs[index];
+	sval = dd->scntrs + entry->offset;
+
+	if (vl != CNTR_INVALID_VL)
+		sval += vl;
+
+	return read_dev_port_cntr(dd, entry, sval, dd, vl);
+}
+
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
+{
+	struct cntr_entry *entry;
+	u64 *sval;
+
+	entry = &dev_cntrs[index];
+	sval = dd->scntrs + entry->offset;
+
+	if (vl != CNTR_INVALID_VL)
+		sval += vl;
+
+	return write_dev_port_cntr(dd, entry, sval, dd, vl, data);
+}
+
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
+{
+	struct cntr_entry *entry;
+	u64 *sval;
+
+	entry = &port_cntrs[index];
+	sval = ppd->scntrs + entry->offset;
+
+	if (vl != CNTR_INVALID_VL)
+		sval += vl;
+
+	if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+	    (index <= C_RCV_HDR_OVF_LAST)) {
+		/* We do not want to bother for disabled contexts */
+		return 0;
+	}
+
+	return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl);
+}
+
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
+{
+	struct cntr_entry *entry;
+	u64 *sval;
+
+	entry = &port_cntrs[index];
+	sval = ppd->scntrs + entry->offset;
+
+	if (vl != CNTR_INVALID_VL)
+		sval += vl;
+
+	if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+	    (index <= C_RCV_HDR_OVF_LAST)) {
+		/* We do not want to bother for disabled contexts */
+		return 0;
+	}
+
+	return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
+}
+
+static void update_synth_timer(unsigned long opaque)
+{
+	u64 cur_tx;
+	u64 cur_rx;
+	u64 total_flits;
+	u8 update = 0;
+	int i, j, vl;
+	struct hfi1_pportdata *ppd;
+	struct cntr_entry *entry;
+
+	struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+
+	/*
+	 * Rather than keep beating on the CSRs pick a minimal set that we can
+	 * check to watch for potential roll over. We can do this by looking at
+	 * the number of flits sent/recv. If the total flits exceeds 32bits then
+	 * we have to iterate all the counters and update.
+	 */
+	entry = &dev_cntrs[C_DC_RCV_FLITS];
+	cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+	entry = &dev_cntrs[C_DC_XMIT_FLITS];
+	cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+	hfi1_cdbg(
+	    CNTR,
+	    "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
+	    dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
+
+	if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
+		/*
+		 * May not be strictly necessary to update but it won't hurt and
+		 * simplifies the logic here.
+		 */
+		update = 1;
+		hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
+			  dd->unit);
+	} else {
+		total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
+		hfi1_cdbg(CNTR,
+			  "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
+			  total_flits, (u64)CNTR_32BIT_MAX);
+		if (total_flits >= CNTR_32BIT_MAX) {
+			hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
+				  dd->unit);
+			update = 1;
+		}
+	}
+
+	if (update) {
+		hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
+		for (i = 0; i < DEV_CNTR_LAST; i++) {
+			entry = &dev_cntrs[i];
+			if (entry->flags & CNTR_VL) {
+				for (vl = 0; vl < C_VL_COUNT; vl++)
+					read_dev_cntr(dd, i, vl);
+			} else {
+				read_dev_cntr(dd, i, CNTR_INVALID_VL);
+			}
+		}
+		ppd = (struct hfi1_pportdata *)(dd + 1);
+		for (i = 0; i < dd->num_pports; i++, ppd++) {
+			for (j = 0; j < PORT_CNTR_LAST; j++) {
+				entry = &port_cntrs[j];
+				if (entry->flags & CNTR_VL) {
+					for (vl = 0; vl < C_VL_COUNT; vl++)
+						read_port_cntr(ppd, j, vl);
+				} else {
+					read_port_cntr(ppd, j, CNTR_INVALID_VL);
+				}
+			}
+		}
+
+		/*
+		 * We want the value in the register. The goal is to keep track
+		 * of the number of "ticks" not the counter value. In other
+		 * words if the register rolls we want to notice it and go ahead
+		 * and force an update.
+		 */
+		entry = &dev_cntrs[C_DC_XMIT_FLITS];
+		dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+						CNTR_MODE_R, 0);
+
+		entry = &dev_cntrs[C_DC_RCV_FLITS];
+		dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+						CNTR_MODE_R, 0);
+
+		hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
+			  dd->unit, dd->last_tx, dd->last_rx);
+
+	} else {
+		hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
+	}
+
+mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+}
+
+#define C_MAX_NAME 13 /* 12 chars + one for /0 */
+static int init_cntrs(struct hfi1_devdata *dd)
+{
+	int i, rcv_ctxts, index, j;
+	size_t sz;
+	char *p;
+	char name[C_MAX_NAME];
+	struct hfi1_pportdata *ppd;
+
+	/* set up the stats timer; the add_timer is done at the end */
+	init_timer(&dd->synth_stats_timer);
+	dd->synth_stats_timer.function = update_synth_timer;
+	dd->synth_stats_timer.data = (unsigned long) dd;
+
+	/***********************/
+	/* per device counters */
+	/***********************/
+
+	/* size names and determine how many we have*/
+	dd->ndevcntrs = 0;
+	sz = 0;
+	index = 0;
+
+	for (i = 0; i < DEV_CNTR_LAST; i++) {
+		hfi1_dbg_early("Init cntr %s\n", dev_cntrs[i].name);
+		if (dev_cntrs[i].flags & CNTR_DISABLED) {
+			hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
+			continue;
+		}
+
+		if (dev_cntrs[i].flags & CNTR_VL) {
+			hfi1_dbg_early("\tProcessing VL cntr\n");
+			dev_cntrs[i].offset = index;
+			for (j = 0; j < C_VL_COUNT; j++) {
+				memset(name, '\0', C_MAX_NAME);
+				snprintf(name, C_MAX_NAME, "%s%d",
+					dev_cntrs[i].name,
+					vl_from_idx(j));
+				sz += strlen(name);
+				sz++;
+				hfi1_dbg_early("\t\t%s\n", name);
+				dd->ndevcntrs++;
+				index++;
+			}
+		} else {
+			/* +1 for newline  */
+			sz += strlen(dev_cntrs[i].name) + 1;
+			dd->ndevcntrs++;
+			dev_cntrs[i].offset = index;
+			index++;
+			hfi1_dbg_early("\tAdding %s\n", dev_cntrs[i].name);
+		}
+	}
+
+	/* allocate space for the counter values */
+	dd->cntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+	if (!dd->cntrs)
+		goto bail;
+
+	dd->scntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+	if (!dd->scntrs)
+		goto bail;
+
+
+	/* allocate space for the counter names */
+	dd->cntrnameslen = sz;
+	dd->cntrnames = kmalloc(sz, GFP_KERNEL);
+	if (!dd->cntrnames)
+		goto bail;
+
+	/* fill in the names */
+	for (p = dd->cntrnames, i = 0, index = 0; i < DEV_CNTR_LAST; i++) {
+		if (dev_cntrs[i].flags & CNTR_DISABLED) {
+			/* Nothing */
+		} else {
+			if (dev_cntrs[i].flags & CNTR_VL) {
+				for (j = 0; j < C_VL_COUNT; j++) {
+					memset(name, '\0', C_MAX_NAME);
+					snprintf(name, C_MAX_NAME, "%s%d",
+						dev_cntrs[i].name,
+						vl_from_idx(j));
+					memcpy(p, name, strlen(name));
+					p += strlen(name);
+					*p++ = '\n';
+				}
+			} else {
+				memcpy(p, dev_cntrs[i].name,
+				       strlen(dev_cntrs[i].name));
+				p += strlen(dev_cntrs[i].name);
+				*p++ = '\n';
+			}
+			index++;
+		}
+	}
+
+	/*********************/
+	/* per port counters */
+	/*********************/
+
+	/*
+	 * Go through the counters for the overflows and disable the ones we
+	 * don't need. This varies based on platform so we need to do it
+	 * dynamically here.
+	 */
+	rcv_ctxts = dd->num_rcv_contexts;
+	for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
+	     i <= C_RCV_HDR_OVF_LAST; i++) {
+		port_cntrs[i].flags |= CNTR_DISABLED;
+	}
+
+	/* size port counter names and determine how many we have*/
+	sz = 0;
+	dd->nportcntrs = 0;
+	for (i = 0; i < PORT_CNTR_LAST; i++) {
+		hfi1_dbg_early("Init pcntr %s\n", port_cntrs[i].name);
+		if (port_cntrs[i].flags & CNTR_DISABLED) {
+			hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
+			continue;
+		}
+
+		if (port_cntrs[i].flags & CNTR_VL) {
+			hfi1_dbg_early("\tProcessing VL cntr\n");
+			port_cntrs[i].offset = dd->nportcntrs;
+			for (j = 0; j < C_VL_COUNT; j++) {
+				memset(name, '\0', C_MAX_NAME);
+				snprintf(name, C_MAX_NAME, "%s%d",
+					port_cntrs[i].name,
+					vl_from_idx(j));
+				sz += strlen(name);
+				sz++;
+				hfi1_dbg_early("\t\t%s\n", name);
+				dd->nportcntrs++;
+			}
+		} else {
+			/* +1 for newline  */
+			sz += strlen(port_cntrs[i].name) + 1;
+			port_cntrs[i].offset = dd->nportcntrs;
+			dd->nportcntrs++;
+			hfi1_dbg_early("\tAdding %s\n", port_cntrs[i].name);
+		}
+	}
+
+	/* allocate space for the counter names */
+	dd->portcntrnameslen = sz;
+	dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
+	if (!dd->portcntrnames)
+		goto bail;
+
+	/* fill in port cntr names */
+	for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
+		if (port_cntrs[i].flags & CNTR_DISABLED)
+			continue;
+
+		if (port_cntrs[i].flags & CNTR_VL) {
+			for (j = 0; j < C_VL_COUNT; j++) {
+				memset(name, '\0', C_MAX_NAME);
+				snprintf(name, C_MAX_NAME, "%s%d",
+					port_cntrs[i].name,
+					vl_from_idx(j));
+				memcpy(p, name, strlen(name));
+				p += strlen(name);
+				*p++ = '\n';
+			}
+		} else {
+			memcpy(p, port_cntrs[i].name,
+			       strlen(port_cntrs[i].name));
+			p += strlen(port_cntrs[i].name);
+			*p++ = '\n';
+		}
+	}
+
+	/* allocate per port storage for counter values */
+	ppd = (struct hfi1_pportdata *)(dd + 1);
+	for (i = 0; i < dd->num_pports; i++, ppd++) {
+		ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+		if (!ppd->cntrs)
+			goto bail;
+
+		ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+		if (!ppd->scntrs)
+			goto bail;
+	}
+
+	/* CPU counters need to be allocated and zeroed */
+	if (init_cpu_counters(dd))
+		goto bail;
+
+	mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+	return 0;
+bail:
+	free_cntrs(dd);
+	return -ENOMEM;
+}
+
+
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+	switch (chip_lstate) {
+	default:
+		dd_dev_err(dd,
+			 "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+			 chip_lstate);
+		/* fall through */
+	case LSTATE_DOWN:
+		return IB_PORT_DOWN;
+	case LSTATE_INIT:
+		return IB_PORT_INIT;
+	case LSTATE_ARMED:
+		return IB_PORT_ARMED;
+	case LSTATE_ACTIVE:
+		return IB_PORT_ACTIVE;
+	}
+}
+
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
+{
+	/* look at the HFI meta-states only */
+	switch (chip_pstate & 0xf0) {
+	default:
+		dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+			chip_pstate);
+		/* fall through */
+	case PLS_DISABLED:
+		return IB_PORTPHYSSTATE_DISABLED;
+	case PLS_OFFLINE:
+		return OPA_PORTPHYSSTATE_OFFLINE;
+	case PLS_POLLING:
+		return IB_PORTPHYSSTATE_POLLING;
+	case PLS_CONFIGPHY:
+		return IB_PORTPHYSSTATE_TRAINING;
+	case PLS_LINKUP:
+		return IB_PORTPHYSSTATE_LINKUP;
+	case PLS_PHYTEST:
+		return IB_PORTPHYSSTATE_PHY_TEST;
+	}
+}
+
+/* return the OPA port logical state name */
+const char *opa_lstate_name(u32 lstate)
+{
+	static const char * const port_logical_names[] = {
+		"PORT_NOP",
+		"PORT_DOWN",
+		"PORT_INIT",
+		"PORT_ARMED",
+		"PORT_ACTIVE",
+		"PORT_ACTIVE_DEFER",
+	};
+	if (lstate < ARRAY_SIZE(port_logical_names))
+		return port_logical_names[lstate];
+	return "unknown";
+}
+
+/* return the OPA port physical state name */
+const char *opa_pstate_name(u32 pstate)
+{
+	static const char * const port_physical_names[] = {
+		"PHYS_NOP",
+		"reserved1",
+		"PHYS_POLL",
+		"PHYS_DISABLED",
+		"PHYS_TRAINING",
+		"PHYS_LINKUP",
+		"PHYS_LINK_ERR_RECOVER",
+		"PHYS_PHY_TEST",
+		"reserved8",
+		"PHYS_OFFLINE",
+		"PHYS_GANGED",
+		"PHYS_TEST",
+	};
+	if (pstate < ARRAY_SIZE(port_physical_names))
+		return port_physical_names[pstate];
+	return "unknown";
+}
+
+/*
+ * Read the hardware link state and set the driver's cached value of it.
+ * Return the (new) current value.
+ */
+u32 get_logical_state(struct hfi1_pportdata *ppd)
+{
+	u32 new_state;
+
+	new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
+	if (new_state != ppd->lstate) {
+		dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
+			opa_lstate_name(new_state), new_state);
+		ppd->lstate = new_state;
+	}
+	/*
+	 * Set port status flags in the page mapped into userspace
+	 * memory. Do it here to ensure a reliable state - this is
+	 * the only function called by all state handling code.
+	 * Always set the flags due to the fact that the cache value
+	 * might have been changed explicitly outside of this
+	 * function.
+	 */
+	if (ppd->statusp) {
+		switch (ppd->lstate) {
+		case IB_PORT_DOWN:
+		case IB_PORT_INIT:
+			*ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+					   HFI1_STATUS_IB_READY);
+			break;
+		case IB_PORT_ARMED:
+			*ppd->statusp |= HFI1_STATUS_IB_CONF;
+			break;
+		case IB_PORT_ACTIVE:
+			*ppd->statusp |= HFI1_STATUS_IB_READY;
+			break;
+		}
+	}
+	return ppd->lstate;
+}
+
+/**
+ * wait_logical_linkstate - wait for an IB link state change to occur
+ * @ppd: port device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for IB link state change to occur.
+ * For now, take the easy polling route.
+ * Returns 0 if state reached, otherwise -ETIMEDOUT.
+ */
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+				  int msecs)
+{
+	unsigned long timeout;
+
+	timeout = jiffies + msecs_to_jiffies(msecs);
+	while (1) {
+		if (get_logical_state(ppd) == state)
+			return 0;
+		if (time_after(jiffies, timeout))
+			break;
+		msleep(20);
+	}
+	dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state);
+
+	return -ETIMEDOUT;
+}
+
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
+{
+	static u32 remembered_state = 0xff;
+	u32 pstate;
+	u32 ib_pstate;
+
+	pstate = read_physical_state(ppd->dd);
+	ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
+	if (remembered_state != ib_pstate) {
+		dd_dev_info(ppd->dd,
+			"%s: physical state changed to %s (0x%x), phy 0x%x\n",
+			__func__, opa_pstate_name(ib_pstate), ib_pstate,
+			pstate);
+		remembered_state = ib_pstate;
+	}
+	return ib_pstate;
+}
+
+/*
+ * Read/modify/write ASIC_QSFP register bits as selected by mask
+ * data: 0 or 1 in the positions depending on what needs to be written
+ * dir: 0 for read, 1 for write
+ * mask: select by setting
+ *      I2CCLK  (bit 0)
+ *      I2CDATA (bit 1)
+ */
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+		  u32 mask)
+{
+	u64 qsfp_oe, target_oe;
+
+	target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+	if (mask) {
+		/* We are writing register bits, so lock access */
+		dir &= mask;
+		data &= mask;
+
+		qsfp_oe = read_csr(dd, target_oe);
+		qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
+		write_csr(dd, target_oe, qsfp_oe);
+	}
+	/* We are exclusively reading bits here, but it is unlikely
+	 * we'll get valid data when we set the direction of the pin
+	 * in the same call, so read should call this function again
+	 * to get valid data
+	 */
+	return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+int hfi1_init_ctxt(struct send_context *sc)
+{
+	if (sc != NULL) {
+		struct hfi1_devdata *dd = sc->dd;
+		u64 reg;
+		u8 set = (sc->type == SC_USER ?
+			  HFI1_CAP_IS_USET(STATIC_RATE_CTRL) :
+			  HFI1_CAP_IS_KSET(STATIC_RATE_CTRL));
+		reg = read_kctxt_csr(dd, sc->hw_context,
+				     SEND_CTXT_CHECK_ENABLE);
+		if (set)
+			CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+		else
+			SET_STATIC_RATE_CONTROL_SMASK(reg);
+		write_kctxt_csr(dd, sc->hw_context,
+				SEND_CTXT_CHECK_ENABLE, reg);
+	}
+	return 0;
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
+{
+	int ret = 0;
+	u64 reg;
+
+	if (dd->icode != ICODE_RTL_SILICON) {
+		if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+			dd_dev_info(dd, "%s: tempsense not supported by HW\n",
+				    __func__);
+		return -EINVAL;
+	}
+	reg = read_csr(dd, ASIC_STS_THERM);
+	temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
+		      ASIC_STS_THERM_CURR_TEMP_MASK);
+	temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
+			ASIC_STS_THERM_LO_TEMP_MASK);
+	temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
+			ASIC_STS_THERM_HI_TEMP_MASK);
+	temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
+			  ASIC_STS_THERM_CRIT_TEMP_MASK);
+	/* triggers is a 3-bit value - 1 bit per trigger. */
+	temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
+
+	return ret;
+}
+
+/* ========================================================================= */
+
+/*
+ * Enable/disable chip from delivering interrupts.
+ */
+void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+{
+	int i;
+
+	/*
+	 * In HFI, the mask needs to be 1 to allow interrupts.
+	 */
+	if (enable) {
+		u64 cce_int_mask;
+		const int qsfp1_int_smask = QSFP1_INT % 64;
+		const int qsfp2_int_smask = QSFP2_INT % 64;
+
+		/* enable all interrupts */
+		for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+			write_csr(dd, CCE_INT_MASK + (8*i), ~(u64)0);
+
+		/*
+		 * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
+		 * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
+		 * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
+		 * the index of the appropriate CSR in the CCEIntMask CSR array
+		 */
+		cce_int_mask = read_csr(dd, CCE_INT_MASK +
+						(8*(QSFP1_INT/64)));
+		if (dd->hfi1_id) {
+			cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
+			write_csr(dd, CCE_INT_MASK + (8*(QSFP1_INT/64)),
+					cce_int_mask);
+		} else {
+			cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
+			write_csr(dd, CCE_INT_MASK + (8*(QSFP2_INT/64)),
+					cce_int_mask);
+		}
+	} else {
+		for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+			write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+	}
+}
+
+/*
+ * Clear all interrupt sources on the chip.
+ */
+static void clear_all_interrupts(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+		write_csr(dd, CCE_INT_CLEAR + (8*i), ~(u64)0);
+
+	write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
+	write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
+	write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
+	write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
+	write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
+	write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
+	write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0);
+	for (i = 0; i < dd->chip_send_contexts; i++)
+		write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0);
+	for (i = 0; i < dd->chip_sdma_engines; i++)
+		write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
+
+	write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0);
+	write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0);
+	write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
+}
+
+/* Move to pcie.c? */
+static void disable_intx(struct pci_dev *pdev)
+{
+	pci_intx(pdev, 0);
+}
+
+static void clean_up_interrupts(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* remove irqs - must happen before disabling/turning off */
+	if (dd->num_msix_entries) {
+		/* MSI-X */
+		struct hfi1_msix_entry *me = dd->msix_entries;
+
+		for (i = 0; i < dd->num_msix_entries; i++, me++) {
+			if (me->arg == NULL) /* => no irq, no affinity */
+				break;
+			irq_set_affinity_hint(dd->msix_entries[i].msix.vector,
+					NULL);
+			free_irq(me->msix.vector, me->arg);
+		}
+	} else {
+		/* INTx */
+		if (dd->requested_intx_irq) {
+			free_irq(dd->pcidev->irq, dd);
+			dd->requested_intx_irq = 0;
+		}
+	}
+
+	/* turn off interrupts */
+	if (dd->num_msix_entries) {
+		/* MSI-X */
+		hfi1_nomsix(dd);
+	} else {
+		/* INTx */
+		disable_intx(dd->pcidev);
+	}
+
+	/* clean structures */
+	for (i = 0; i < dd->num_msix_entries; i++)
+		free_cpumask_var(dd->msix_entries[i].mask);
+	kfree(dd->msix_entries);
+	dd->msix_entries = NULL;
+	dd->num_msix_entries = 0;
+}
+
+/*
+ * Remap the interrupt source from the general handler to the given MSI-X
+ * interrupt.
+ */
+static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+{
+	u64 reg;
+	int m, n;
+
+	/* clear from the handled mask of the general interrupt */
+	m = isrc / 64;
+	n = isrc % 64;
+	dd->gi_mask[m] &= ~((u64)1 << n);
+
+	/* direct the chip source to the given MSI-X interrupt */
+	m = isrc / 8;
+	n = isrc % 8;
+	reg = read_csr(dd, CCE_INT_MAP + (8*m));
+	reg &= ~((u64)0xff << (8*n));
+	reg |= ((u64)msix_intr & 0xff) << (8*n);
+	write_csr(dd, CCE_INT_MAP + (8*m), reg);
+}
+
+static void remap_sdma_interrupts(struct hfi1_devdata *dd,
+				  int engine, int msix_intr)
+{
+	/*
+	 * SDMA engine interrupt sources grouped by type, rather than
+	 * engine.  Per-engine interrupts are as follows:
+	 *	SDMA
+	 *	SDMAProgress
+	 *	SDMAIdle
+	 */
+	remap_intr(dd, IS_SDMA_START + 0*TXE_NUM_SDMA_ENGINES + engine,
+		msix_intr);
+	remap_intr(dd, IS_SDMA_START + 1*TXE_NUM_SDMA_ENGINES + engine,
+		msix_intr);
+	remap_intr(dd, IS_SDMA_START + 2*TXE_NUM_SDMA_ENGINES + engine,
+		msix_intr);
+}
+
+static void remap_receive_available_interrupt(struct hfi1_devdata *dd,
+					      int rx, int msix_intr)
+{
+	remap_intr(dd, IS_RCVAVAIL_START + rx, msix_intr);
+}
+
+static int request_intx_irq(struct hfi1_devdata *dd)
+{
+	int ret;
+
+	snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME"_%d",
+		dd->unit);
+	ret = request_irq(dd->pcidev->irq, general_interrupt,
+				  IRQF_SHARED, dd->intx_name, dd);
+	if (ret)
+		dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
+				ret);
+	else
+		dd->requested_intx_irq = 1;
+	return ret;
+}
+
+static int request_msix_irqs(struct hfi1_devdata *dd)
+{
+	const struct cpumask *local_mask;
+	cpumask_var_t def, rcv;
+	bool def_ret, rcv_ret;
+	int first_general, last_general;
+	int first_sdma, last_sdma;
+	int first_rx, last_rx;
+	int first_cpu, restart_cpu, curr_cpu;
+	int rcv_cpu, sdma_cpu;
+	int i, ret = 0, possible;
+	int ht;
+
+	/* calculate the ranges we are going to use */
+	first_general = 0;
+	first_sdma = last_general = first_general + 1;
+	first_rx = last_sdma = first_sdma + dd->num_sdma;
+	last_rx = first_rx + dd->n_krcv_queues;
+
+	/*
+	 * Interrupt affinity.
+	 *
+	 * non-rcv avail gets a default mask that
+	 * starts as possible cpus with threads reset
+	 * and each rcv avail reset.
+	 *
+	 * rcv avail gets node relative 1 wrapping back
+	 * to the node relative 1 as necessary.
+	 *
+	 */
+	local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+	/* if first cpu is invalid, use NUMA 0 */
+	if (cpumask_first(local_mask) >= nr_cpu_ids)
+		local_mask = topology_core_cpumask(0);
+
+	def_ret = zalloc_cpumask_var(&def, GFP_KERNEL);
+	rcv_ret = zalloc_cpumask_var(&rcv, GFP_KERNEL);
+	if (!def_ret || !rcv_ret)
+		goto bail;
+	/* use local mask as default */
+	cpumask_copy(def, local_mask);
+	possible = cpumask_weight(def);
+	/* disarm threads from default */
+	ht = cpumask_weight(
+			topology_sibling_cpumask(cpumask_first(local_mask)));
+	for (i = possible/ht; i < possible; i++)
+		cpumask_clear_cpu(i, def);
+	/* reset possible */
+	possible = cpumask_weight(def);
+	/* def now has full cores on chosen node*/
+	first_cpu = cpumask_first(def);
+	if (nr_cpu_ids >= first_cpu)
+		first_cpu++;
+	restart_cpu = first_cpu;
+	curr_cpu = restart_cpu;
+
+	for (i = first_cpu; i < dd->n_krcv_queues + first_cpu; i++) {
+		cpumask_clear_cpu(curr_cpu, def);
+		cpumask_set_cpu(curr_cpu, rcv);
+		if (curr_cpu >= possible)
+			curr_cpu = restart_cpu;
+		else
+			curr_cpu++;
+	}
+	/* def mask has non-rcv, rcv has recv mask */
+	rcv_cpu = cpumask_first(rcv);
+	sdma_cpu = cpumask_first(def);
+
+	/*
+	 * Sanity check - the code expects all SDMA chip source
+	 * interrupts to be in the same CSR, starting at bit 0.  Verify
+	 * that this is true by checking the bit location of the start.
+	 */
+	BUILD_BUG_ON(IS_SDMA_START % 64);
+
+	for (i = 0; i < dd->num_msix_entries; i++) {
+		struct hfi1_msix_entry *me = &dd->msix_entries[i];
+		const char *err_info;
+		irq_handler_t handler;
+		void *arg;
+		int idx;
+		struct hfi1_ctxtdata *rcd = NULL;
+		struct sdma_engine *sde = NULL;
+
+		/* obtain the arguments to request_irq */
+		if (first_general <= i && i < last_general) {
+			idx = i - first_general;
+			handler = general_interrupt;
+			arg = dd;
+			snprintf(me->name, sizeof(me->name),
+				DRIVER_NAME"_%d", dd->unit);
+			err_info = "general";
+		} else if (first_sdma <= i && i < last_sdma) {
+			idx = i - first_sdma;
+			sde = &dd->per_sdma[idx];
+			handler = sdma_interrupt;
+			arg = sde;
+			snprintf(me->name, sizeof(me->name),
+				DRIVER_NAME"_%d sdma%d", dd->unit, idx);
+			err_info = "sdma";
+			remap_sdma_interrupts(dd, idx, i);
+		} else if (first_rx <= i && i < last_rx) {
+			idx = i - first_rx;
+			rcd = dd->rcd[idx];
+			/* no interrupt if no rcd */
+			if (!rcd)
+				continue;
+			/*
+			 * Set the interrupt register and mask for this
+			 * context's interrupt.
+			 */
+			rcd->ireg = (IS_RCVAVAIL_START+idx) / 64;
+			rcd->imask = ((u64)1) <<
+					((IS_RCVAVAIL_START+idx) % 64);
+			handler = receive_context_interrupt;
+			arg = rcd;
+			snprintf(me->name, sizeof(me->name),
+				DRIVER_NAME"_%d kctxt%d", dd->unit, idx);
+			err_info = "receive context";
+			remap_receive_available_interrupt(dd, idx, i);
+		} else {
+			/* not in our expected range - complain, then
+			   ignore it */
+			dd_dev_err(dd,
+				"Unexpected extra MSI-X interrupt %d\n", i);
+			continue;
+		}
+		/* no argument, no interrupt */
+		if (arg == NULL)
+			continue;
+		/* make sure the name is terminated */
+		me->name[sizeof(me->name)-1] = 0;
+
+		ret = request_irq(me->msix.vector, handler, 0, me->name, arg);
+		if (ret) {
+			dd_dev_err(dd,
+				"unable to allocate %s interrupt, vector %d, index %d, err %d\n",
+				 err_info, me->msix.vector, idx, ret);
+			return ret;
+		}
+		/*
+		 * assign arg after request_irq call, so it will be
+		 * cleaned up
+		 */
+		me->arg = arg;
+
+		if (!zalloc_cpumask_var(
+			&dd->msix_entries[i].mask,
+			GFP_KERNEL))
+			goto bail;
+		if (handler == sdma_interrupt) {
+			dd_dev_info(dd, "sdma engine %d cpu %d\n",
+				sde->this_idx, sdma_cpu);
+			cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask);
+			sdma_cpu = cpumask_next(sdma_cpu, def);
+			if (sdma_cpu >= nr_cpu_ids)
+				sdma_cpu = cpumask_first(def);
+		} else if (handler == receive_context_interrupt) {
+			dd_dev_info(dd, "rcv ctxt %d cpu %d\n",
+				rcd->ctxt, rcv_cpu);
+			cpumask_set_cpu(rcv_cpu, dd->msix_entries[i].mask);
+			rcv_cpu = cpumask_next(rcv_cpu, rcv);
+			if (rcv_cpu >= nr_cpu_ids)
+				rcv_cpu = cpumask_first(rcv);
+		} else {
+			/* otherwise first def */
+			dd_dev_info(dd, "%s cpu %d\n",
+				err_info, cpumask_first(def));
+			cpumask_set_cpu(
+				cpumask_first(def), dd->msix_entries[i].mask);
+		}
+		irq_set_affinity_hint(
+			dd->msix_entries[i].msix.vector,
+			dd->msix_entries[i].mask);
+	}
+
+out:
+	free_cpumask_var(def);
+	free_cpumask_var(rcv);
+	return ret;
+bail:
+	ret = -ENOMEM;
+	goto  out;
+}
+
+/*
+ * Set the general handler to accept all interrupts, remap all
+ * chip interrupts back to MSI-X 0.
+ */
+static void reset_interrupts(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* all interrupts handled by the general handler */
+	for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+		dd->gi_mask[i] = ~(u64)0;
+
+	/* all chip interrupts map to MSI-X 0 */
+	for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+		write_csr(dd, CCE_INT_MAP + (8*i), 0);
+}
+
+static int set_up_interrupts(struct hfi1_devdata *dd)
+{
+	struct hfi1_msix_entry *entries;
+	u32 total, request;
+	int i, ret;
+	int single_interrupt = 0; /* we expect to have all the interrupts */
+
+	/*
+	 * Interrupt count:
+	 *	1 general, "slow path" interrupt (includes the SDMA engines
+	 *		slow source, SDMACleanupDone)
+	 *	N interrupts - one per used SDMA engine
+	 *	M interrupt - one per kernel receive context
+	 */
+	total = 1 + dd->num_sdma + dd->n_krcv_queues;
+
+	entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
+	if (!entries) {
+		dd_dev_err(dd, "cannot allocate msix table\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	/* 1-1 MSI-X entry assignment */
+	for (i = 0; i < total; i++)
+		entries[i].msix.entry = i;
+
+	/* ask for MSI-X interrupts */
+	request = total;
+	request_msix(dd, &request, entries);
+
+	if (request == 0) {
+		/* using INTx */
+		/* dd->num_msix_entries already zero */
+		kfree(entries);
+		single_interrupt = 1;
+		dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
+	} else {
+		/* using MSI-X */
+		dd->num_msix_entries = request;
+		dd->msix_entries = entries;
+
+		if (request != total) {
+			/* using MSI-X, with reduced interrupts */
+			dd_dev_err(
+				dd,
+				"cannot handle reduced interrupt case, want %u, got %u\n",
+				total, request);
+			ret = -EINVAL;
+			goto fail;
+		}
+		dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+	}
+
+	/* mask all interrupts */
+	set_intr_state(dd, 0);
+	/* clear all pending interrupts */
+	clear_all_interrupts(dd);
+
+	/* reset general handler mask, chip MSI-X mappings */
+	reset_interrupts(dd);
+
+	if (single_interrupt)
+		ret = request_intx_irq(dd);
+	else
+		ret = request_msix_irqs(dd);
+	if (ret)
+		goto fail;
+
+	return 0;
+
+fail:
+	clean_up_interrupts(dd);
+	return ret;
+}
+
+/*
+ * Set up context values in dd.  Sets:
+ *
+ *	num_rcv_contexts - number of contexts being used
+ *	n_krcv_queues - number of kernel contexts
+ *	first_user_ctxt - first non-kernel context in array of contexts
+ *	freectxts  - number of free user contexts
+ *	num_send_contexts - number of PIO send contexts being used
+ */
+static int set_up_context_variables(struct hfi1_devdata *dd)
+{
+	int num_kernel_contexts;
+	int num_user_contexts;
+	int total_contexts;
+	int ret;
+	unsigned ngroups;
+
+	/*
+	 * Kernel contexts: (to be fixed later):
+	 * - min or 2 or 1 context/numa
+	 * - Context 0 - default/errors
+	 * - Context 1 - VL15
+	 */
+	if (n_krcvqs)
+		num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS;
+	else
+		num_kernel_contexts = num_online_nodes();
+	num_kernel_contexts =
+		max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
+	/*
+	 * Every kernel receive context needs an ACK send context.
+	 * one send context is allocated for each VL{0-7} and VL15
+	 */
+	if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
+		dd_dev_err(dd,
+			   "Reducing # kernel rcv contexts to: %d, from %d\n",
+			   (int)(dd->chip_send_contexts - num_vls - 1),
+			   (int)num_kernel_contexts);
+		num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
+	}
+	/*
+	 * User contexts: (to be fixed later)
+	 *	- set to num_rcv_contexts if non-zero
+	 *	- default to 1 user context per CPU
+	 */
+	if (num_rcv_contexts)
+		num_user_contexts = num_rcv_contexts;
+	else
+		num_user_contexts = num_online_cpus();
+
+	total_contexts = num_kernel_contexts + num_user_contexts;
+
+	/*
+	 * Adjust the counts given a global max.
+	 */
+	if (total_contexts > dd->chip_rcv_contexts) {
+		dd_dev_err(dd,
+			   "Reducing # user receive contexts to: %d, from %d\n",
+			   (int)(dd->chip_rcv_contexts - num_kernel_contexts),
+			   (int)num_user_contexts);
+		num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
+		/* recalculate */
+		total_contexts = num_kernel_contexts + num_user_contexts;
+	}
+
+	/* the first N are kernel contexts, the rest are user contexts */
+	dd->num_rcv_contexts = total_contexts;
+	dd->n_krcv_queues = num_kernel_contexts;
+	dd->first_user_ctxt = num_kernel_contexts;
+	dd->freectxts = num_user_contexts;
+	dd_dev_info(dd,
+		"rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
+		(int)dd->chip_rcv_contexts,
+		(int)dd->num_rcv_contexts,
+		(int)dd->n_krcv_queues,
+		(int)dd->num_rcv_contexts - dd->n_krcv_queues);
+
+	/*
+	 * Receive array allocation:
+	 *   All RcvArray entries are divided into groups of 8. This
+	 *   is required by the hardware and will speed up writes to
+	 *   consecutive entries by using write-combining of the entire
+	 *   cacheline.
+	 *
+	 *   The number of groups are evenly divided among all contexts.
+	 *   any left over groups will be given to the first N user
+	 *   contexts.
+	 */
+	dd->rcv_entries.group_size = RCV_INCREMENT;
+	ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
+	dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
+	dd->rcv_entries.nctxt_extra = ngroups -
+		(dd->num_rcv_contexts * dd->rcv_entries.ngroups);
+	dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
+		    dd->rcv_entries.ngroups,
+		    dd->rcv_entries.nctxt_extra);
+	if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
+	    MAX_EAGER_ENTRIES * 2) {
+		dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
+			dd->rcv_entries.group_size;
+		dd_dev_info(dd,
+		   "RcvArray group count too high, change to %u\n",
+		   dd->rcv_entries.ngroups);
+		dd->rcv_entries.nctxt_extra = 0;
+	}
+	/*
+	 * PIO send contexts
+	 */
+	ret = init_sc_pools_and_sizes(dd);
+	if (ret >= 0) {	/* success */
+		dd->num_send_contexts = ret;
+		dd_dev_info(
+			dd,
+			"send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+			dd->chip_send_contexts,
+			dd->num_send_contexts,
+			dd->sc_sizes[SC_KERNEL].count,
+			dd->sc_sizes[SC_ACK].count,
+			dd->sc_sizes[SC_USER].count);
+		ret = 0;	/* success */
+	}
+
+	return ret;
+}
+
+/*
+ * Set the device/port partition key table. The MAD code
+ * will ensure that, at least, the partial management
+ * partition key is present in the table.
+ */
+static void set_partition_keys(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u64 reg = 0;
+	int i;
+
+	dd_dev_info(dd, "Setting partition keys\n");
+	for (i = 0; i < hfi1_get_npkeys(dd); i++) {
+		reg |= (ppd->pkeys[i] &
+			RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
+			((i % 4) *
+			 RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
+		/* Each register holds 4 PKey values. */
+		if ((i % 4) == 3) {
+			write_csr(dd, RCV_PARTITION_KEY +
+				  ((i - 3) * 2), reg);
+			reg = 0;
+		}
+	}
+
+	/* Always enable HW pkeys check when pkeys table is set */
+	add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
+}
+
+/*
+ * These CSRs and memories are uninitialized on reset and must be
+ * written before reading to set the ECC/parity bits.
+ *
+ * NOTE: All user context CSRs that are not mmaped write-only
+ * (e.g. the TID flows) must be initialized even if the driver never
+ * reads them.
+ */
+static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
+{
+	int i, j;
+
+	/* CceIntMap */
+	for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+		write_csr(dd, CCE_INT_MAP+(8*i), 0);
+
+	/* SendCtxtCreditReturnAddr */
+	for (i = 0; i < dd->chip_send_contexts; i++)
+		write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+
+	/* PIO Send buffers */
+	/* SDMA Send buffers */
+	/* These are not normally read, and (presently) have no method
+	   to be read, so are not pre-initialized */
+
+	/* RcvHdrAddr */
+	/* RcvHdrTailAddr */
+	/* RcvTidFlowTable */
+	for (i = 0; i < dd->chip_rcv_contexts; i++) {
+		write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+		write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+		for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
+			write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE+(8*j), 0);
+	}
+
+	/* RcvArray */
+	for (i = 0; i < dd->chip_rcv_array_count; i++)
+		write_csr(dd, RCV_ARRAY + (8*i),
+					RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+
+	/* RcvQPMapTable */
+	for (i = 0; i < 32; i++)
+		write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+}
+
+/*
+ * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
+ */
+static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
+			     u64 ctrl_bits)
+{
+	unsigned long timeout;
+	u64 reg;
+
+	/* is the condition present? */
+	reg = read_csr(dd, CCE_STATUS);
+	if ((reg & status_bits) == 0)
+		return;
+
+	/* clear the condition */
+	write_csr(dd, CCE_CTRL, ctrl_bits);
+
+	/* wait for the condition to clear */
+	timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
+	while (1) {
+		reg = read_csr(dd, CCE_STATUS);
+		if ((reg & status_bits) == 0)
+			return;
+		if (time_after(jiffies, timeout)) {
+			dd_dev_err(dd,
+				"Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
+				status_bits, reg & status_bits);
+			return;
+		}
+		udelay(1);
+	}
+}
+
+/* set CCE CSRs to chip reset defaults */
+static void reset_cce_csrs(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* CCE_REVISION read-only */
+	/* CCE_REVISION2 read-only */
+	/* CCE_CTRL - bits clear automatically */
+	/* CCE_STATUS read-only, use CceCtrl to clear */
+	clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
+	clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
+	clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
+	for (i = 0; i < CCE_NUM_SCRATCH; i++)
+		write_csr(dd, CCE_SCRATCH + (8 * i), 0);
+	/* CCE_ERR_STATUS read-only */
+	write_csr(dd, CCE_ERR_MASK, 0);
+	write_csr(dd, CCE_ERR_CLEAR, ~0ull);
+	/* CCE_ERR_FORCE leave alone */
+	for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
+		write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
+	write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
+	/* CCE_PCIE_CTRL leave alone */
+	for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
+		write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
+		write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
+					CCE_MSIX_TABLE_UPPER_RESETCSR);
+	}
+	for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
+		/* CCE_MSIX_PBA read-only */
+		write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
+		write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
+	}
+	for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+		write_csr(dd, CCE_INT_MAP, 0);
+	for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+		/* CCE_INT_STATUS read-only */
+		write_csr(dd, CCE_INT_MASK + (8 * i), 0);
+		write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
+		/* CCE_INT_FORCE leave alone */
+		/* CCE_INT_BLOCKED read-only */
+	}
+	for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
+		write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
+}
+
+/* set ASIC CSRs to chip reset defaults */
+static void reset_asic_csrs(struct hfi1_devdata *dd)
+{
+	static DEFINE_MUTEX(asic_mutex);
+	static int called;
+	int i;
+
+	/*
+	 * If the HFIs are shared between separate nodes or VMs,
+	 * then more will need to be done here.  One idea is a module
+	 * parameter that returns early, letting the first power-on or
+	 * a known first load do the reset and blocking all others.
+	 */
+
+	/*
+	 * These CSRs should only be reset once - the first one here will
+	 * do the work.  Use a mutex so that a non-first caller waits until
+	 * the first is finished before it can proceed.
+	 */
+	mutex_lock(&asic_mutex);
+	if (called)
+		goto done;
+	called = 1;
+
+	if (dd->icode != ICODE_FPGA_EMULATION) {
+		/* emulation does not have an SBus - leave these alone */
+		/*
+		 * All writes to ASIC_CFG_SBUS_REQUEST do something.
+		 * Notes:
+		 * o The reset is not zero if aimed at the core.  See the
+		 *   SBus documentation for details.
+		 * o If the SBus firmware has been updated (e.g. by the BIOS),
+		 *   will the reset revert that?
+		 */
+		/* ASIC_CFG_SBUS_REQUEST leave alone */
+		write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+	}
+	/* ASIC_SBUS_RESULT read-only */
+	write_csr(dd, ASIC_STS_SBUS_COUNTERS, 0);
+	for (i = 0; i < ASIC_NUM_SCRATCH; i++)
+		write_csr(dd, ASIC_CFG_SCRATCH + (8 * i), 0);
+	write_csr(dd, ASIC_CFG_MUTEX, 0);	/* this will clear it */
+	write_csr(dd, ASIC_CFG_DRV_STR, 0);
+	write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0);
+	/* ASIC_STS_THERM read-only */
+	/* ASIC_CFG_RESET leave alone */
+
+	write_csr(dd, ASIC_PCIE_SD_HOST_CMD, 0);
+	/* ASIC_PCIE_SD_HOST_STATUS read-only */
+	write_csr(dd, ASIC_PCIE_SD_INTRPT_DATA_CODE, 0);
+	write_csr(dd, ASIC_PCIE_SD_INTRPT_ENABLE, 0);
+	/* ASIC_PCIE_SD_INTRPT_PROGRESS read-only */
+	write_csr(dd, ASIC_PCIE_SD_INTRPT_STATUS, ~0ull); /* clear */
+	/* ASIC_HFI0_PCIE_SD_INTRPT_RSPD_DATA read-only */
+	/* ASIC_HFI1_PCIE_SD_INTRPT_RSPD_DATA read-only */
+	for (i = 0; i < 16; i++)
+		write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (8 * i), 0);
+
+	/* ASIC_GPIO_IN read-only */
+	write_csr(dd, ASIC_GPIO_OE, 0);
+	write_csr(dd, ASIC_GPIO_INVERT, 0);
+	write_csr(dd, ASIC_GPIO_OUT, 0);
+	write_csr(dd, ASIC_GPIO_MASK, 0);
+	/* ASIC_GPIO_STATUS read-only */
+	write_csr(dd, ASIC_GPIO_CLEAR, ~0ull);
+	/* ASIC_GPIO_FORCE leave alone */
+
+	/* ASIC_QSFP1_IN read-only */
+	write_csr(dd, ASIC_QSFP1_OE, 0);
+	write_csr(dd, ASIC_QSFP1_INVERT, 0);
+	write_csr(dd, ASIC_QSFP1_OUT, 0);
+	write_csr(dd, ASIC_QSFP1_MASK, 0);
+	/* ASIC_QSFP1_STATUS read-only */
+	write_csr(dd, ASIC_QSFP1_CLEAR, ~0ull);
+	/* ASIC_QSFP1_FORCE leave alone */
+
+	/* ASIC_QSFP2_IN read-only */
+	write_csr(dd, ASIC_QSFP2_OE, 0);
+	write_csr(dd, ASIC_QSFP2_INVERT, 0);
+	write_csr(dd, ASIC_QSFP2_OUT, 0);
+	write_csr(dd, ASIC_QSFP2_MASK, 0);
+	/* ASIC_QSFP2_STATUS read-only */
+	write_csr(dd, ASIC_QSFP2_CLEAR, ~0ull);
+	/* ASIC_QSFP2_FORCE leave alone */
+
+	write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_RESETCSR);
+	/* this also writes a NOP command, clearing paging mode */
+	write_csr(dd, ASIC_EEP_ADDR_CMD, 0);
+	write_csr(dd, ASIC_EEP_DATA, 0);
+
+done:
+	mutex_unlock(&asic_mutex);
+}
+
+/* set MISC CSRs to chip reset defaults */
+static void reset_misc_csrs(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < 32; i++) {
+		write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
+		write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
+		write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
+	}
+	/* MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
+	   only be written 128-byte chunks */
+	/* init RSA engine to clear lingering errors */
+	write_csr(dd, MISC_CFG_RSA_CMD, 1);
+	write_csr(dd, MISC_CFG_RSA_MU, 0);
+	write_csr(dd, MISC_CFG_FW_CTRL, 0);
+	/* MISC_STS_8051_DIGEST read-only */
+	/* MISC_STS_SBM_DIGEST read-only */
+	/* MISC_STS_PCIE_DIGEST read-only */
+	/* MISC_STS_FAB_DIGEST read-only */
+	/* MISC_ERR_STATUS read-only */
+	write_csr(dd, MISC_ERR_MASK, 0);
+	write_csr(dd, MISC_ERR_CLEAR, ~0ull);
+	/* MISC_ERR_FORCE leave alone */
+}
+
+/* set TXE CSRs to chip reset defaults */
+static void reset_txe_csrs(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/*
+	 * TXE Kernel CSRs
+	 */
+	write_csr(dd, SEND_CTRL, 0);
+	__cm_reset(dd, 0);	/* reset CM internal state */
+	/* SEND_CONTEXTS read-only */
+	/* SEND_DMA_ENGINES read-only */
+	/* SEND_PIO_MEM_SIZE read-only */
+	/* SEND_DMA_MEM_SIZE read-only */
+	write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
+	pio_reset_all(dd);	/* SEND_PIO_INIT_CTXT */
+	/* SEND_PIO_ERR_STATUS read-only */
+	write_csr(dd, SEND_PIO_ERR_MASK, 0);
+	write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
+	/* SEND_PIO_ERR_FORCE leave alone */
+	/* SEND_DMA_ERR_STATUS read-only */
+	write_csr(dd, SEND_DMA_ERR_MASK, 0);
+	write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
+	/* SEND_DMA_ERR_FORCE leave alone */
+	/* SEND_EGRESS_ERR_STATUS read-only */
+	write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
+	write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
+	/* SEND_EGRESS_ERR_FORCE leave alone */
+	write_csr(dd, SEND_BTH_QP, 0);
+	write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
+	write_csr(dd, SEND_SC2VLT0, 0);
+	write_csr(dd, SEND_SC2VLT1, 0);
+	write_csr(dd, SEND_SC2VLT2, 0);
+	write_csr(dd, SEND_SC2VLT3, 0);
+	write_csr(dd, SEND_LEN_CHECK0, 0);
+	write_csr(dd, SEND_LEN_CHECK1, 0);
+	/* SEND_ERR_STATUS read-only */
+	write_csr(dd, SEND_ERR_MASK, 0);
+	write_csr(dd, SEND_ERR_CLEAR, ~0ull);
+	/* SEND_ERR_FORCE read-only */
+	for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
+		write_csr(dd, SEND_LOW_PRIORITY_LIST + (8*i), 0);
+	for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
+		write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8*i), 0);
+	for (i = 0; i < dd->chip_send_contexts/NUM_CONTEXTS_PER_SET; i++)
+		write_csr(dd, SEND_CONTEXT_SET_CTRL + (8*i), 0);
+	for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
+		write_csr(dd, SEND_COUNTER_ARRAY32 + (8*i), 0);
+	for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
+		write_csr(dd, SEND_COUNTER_ARRAY64 + (8*i), 0);
+	write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
+	write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+					SEND_CM_GLOBAL_CREDIT_RESETCSR);
+	/* SEND_CM_CREDIT_USED_STATUS read-only */
+	write_csr(dd, SEND_CM_TIMER_CTRL, 0);
+	write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
+	write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
+	write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
+	write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
+	for (i = 0; i < TXE_NUM_DATA_VL; i++)
+		write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+	write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+	/* SEND_CM_CREDIT_USED_VL read-only */
+	/* SEND_CM_CREDIT_USED_VL15 read-only */
+	/* SEND_EGRESS_CTXT_STATUS read-only */
+	/* SEND_EGRESS_SEND_DMA_STATUS read-only */
+	write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
+	/* SEND_EGRESS_ERR_INFO read-only */
+	/* SEND_EGRESS_ERR_SOURCE read-only */
+
+	/*
+	 * TXE Per-Context CSRs
+	 */
+	for (i = 0; i < dd->chip_send_contexts; i++) {
+		write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
+		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
+		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
+	}
+
+	/*
+	 * TXE Per-SDMA CSRs
+	 */
+	for (i = 0; i < dd->chip_sdma_engines; i++) {
+		write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+		/* SEND_DMA_STATUS read-only */
+		write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
+		/* SEND_DMA_HEAD read-only */
+		write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
+		/* SEND_DMA_IDLE_CNT read-only */
+		write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
+		/* SEND_DMA_DESC_FETCHED_CNT read-only */
+		/* SEND_DMA_ENG_ERR_STATUS read-only */
+		write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
+		/* SEND_DMA_ENG_ERR_FORCE leave alone */
+		write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
+		write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
+	}
+}
+
+/*
+ * Expect on entry:
+ * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
+ */
+static void init_rbufs(struct hfi1_devdata *dd)
+{
+	u64 reg;
+	int count;
+
+	/*
+	 * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
+	 * clear.
+	 */
+	count = 0;
+	while (1) {
+		reg = read_csr(dd, RCV_STATUS);
+		if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
+			    | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
+			break;
+		/*
+		 * Give up after 1ms - maximum wait time.
+		 *
+		 * RBuf size is 148KiB.  Slowest possible is PCIe Gen1 x1 at
+		 * 250MB/s bandwidth.  Lower rate to 66% for overhead to get:
+		 *	148 KB / (66% * 250MB/s) = 920us
+		 */
+		if (count++ > 500) {
+			dd_dev_err(dd,
+				"%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
+				__func__, reg);
+			break;
+		}
+		udelay(2); /* do not busy-wait the CSR */
+	}
+
+	/* start the init - expect RcvCtrl to be 0 */
+	write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
+
+	/*
+	 * Read to force the write of Rcvtrl.RxRbufInit.  There is a brief
+	 * period after the write before RcvStatus.RxRbufInitDone is valid.
+	 * The delay in the first run through the loop below is sufficient and
+	 * required before the first read of RcvStatus.RxRbufInintDone.
+	 */
+	read_csr(dd, RCV_CTRL);
+
+	/* wait for the init to finish */
+	count = 0;
+	while (1) {
+		/* delay is required first time through - see above */
+		udelay(2); /* do not busy-wait the CSR */
+		reg = read_csr(dd, RCV_STATUS);
+		if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
+			break;
+
+		/* give up after 100us - slowest possible at 33MHz is 73us */
+		if (count++ > 50) {
+			dd_dev_err(dd,
+				"%s: RcvStatus.RxRbufInit not set, continuing\n",
+				__func__);
+			break;
+		}
+	}
+}
+
+/* set RXE CSRs to chip reset defaults */
+static void reset_rxe_csrs(struct hfi1_devdata *dd)
+{
+	int i, j;
+
+	/*
+	 * RXE Kernel CSRs
+	 */
+	write_csr(dd, RCV_CTRL, 0);
+	init_rbufs(dd);
+	/* RCV_STATUS read-only */
+	/* RCV_CONTEXTS read-only */
+	/* RCV_ARRAY_CNT read-only */
+	/* RCV_BUF_SIZE read-only */
+	write_csr(dd, RCV_BTH_QP, 0);
+	write_csr(dd, RCV_MULTICAST, 0);
+	write_csr(dd, RCV_BYPASS, 0);
+	write_csr(dd, RCV_VL15, 0);
+	/* this is a clear-down */
+	write_csr(dd, RCV_ERR_INFO,
+			RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+	/* RCV_ERR_STATUS read-only */
+	write_csr(dd, RCV_ERR_MASK, 0);
+	write_csr(dd, RCV_ERR_CLEAR, ~0ull);
+	/* RCV_ERR_FORCE leave alone */
+	for (i = 0; i < 32; i++)
+		write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+	for (i = 0; i < 4; i++)
+		write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
+	for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
+		write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
+	for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
+		write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
+	for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
+		write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
+		write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
+		write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
+	}
+	for (i = 0; i < 32; i++)
+		write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
+
+	/*
+	 * RXE Kernel and User Per-Context CSRs
+	 */
+	for (i = 0; i < dd->chip_rcv_contexts; i++) {
+		/* kernel */
+		write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+		/* RCV_CTXT_STATUS read-only */
+		write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
+		write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
+		write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
+		write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+		write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
+		write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
+		write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
+		write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+		write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
+		write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
+
+		/* user */
+		/* RCV_HDR_TAIL read-only */
+		write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
+		/* RCV_EGR_INDEX_TAIL read-only */
+		write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
+		/* RCV_EGR_OFFSET_TAIL read-only */
+		for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
+			write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j),
+				0);
+		}
+	}
+}
+
+/*
+ * Set sc2vl tables.
+ *
+ * They power on to zeros, so to avoid send context errors
+ * they need to be set:
+ *
+ * SC 0-7 -> VL 0-7 (respectively)
+ * SC 15  -> VL 15
+ * otherwise
+ *        -> VL 0
+ */
+static void init_sc2vl_tables(struct hfi1_devdata *dd)
+{
+	int i;
+	/* init per architecture spec, constrained by hardware capability */
+
+	/* HFI maps sent packets */
+	write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
+		0,
+		0, 0, 1, 1,
+		2, 2, 3, 3,
+		4, 4, 5, 5,
+		6, 6, 7, 7));
+	write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
+		1,
+		8, 0, 9, 0,
+		10, 0, 11, 0,
+		12, 0, 13, 0,
+		14, 0, 15, 15));
+	write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
+		2,
+		16, 0, 17, 0,
+		18, 0, 19, 0,
+		20, 0, 21, 0,
+		22, 0, 23, 0));
+	write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
+		3,
+		24, 0, 25, 0,
+		26, 0, 27, 0,
+		28, 0, 29, 0,
+		30, 0, 31, 0));
+
+	/* DC maps received packets */
+	write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
+		15_0,
+		0, 0, 1, 1,  2, 2,  3, 3,  4, 4,  5, 5,  6, 6,  7,  7,
+		8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
+	write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
+		31_16,
+		16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
+		24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
+
+	/* initialize the cached sc2vl values consistently with h/w */
+	for (i = 0; i < 32; i++) {
+		if (i < 8 || i == 15)
+			*((u8 *)(dd->sc2vl) + i) = (u8)i;
+		else
+			*((u8 *)(dd->sc2vl) + i) = 0;
+	}
+}
+
+/*
+ * Read chip sizes and then reset parts to sane, disabled, values.  We cannot
+ * depend on the chip going through a power-on reset - a driver may be loaded
+ * and unloaded many times.
+ *
+ * Do not write any CSR values to the chip in this routine - there may be
+ * a reset following the (possible) FLR in this routine.
+ *
+ */
+static void init_chip(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/*
+	 * Put the HFI CSRs in a known state.
+	 * Combine this with a DC reset.
+	 *
+	 * Stop the device from doing anything while we do a
+	 * reset.  We know there are no other active users of
+	 * the device since we are now in charge.  Turn off
+	 * off all outbound and inbound traffic and make sure
+	 * the device does not generate any interrupts.
+	 */
+
+	/* disable send contexts and SDMA engines */
+	write_csr(dd, SEND_CTRL, 0);
+	for (i = 0; i < dd->chip_send_contexts; i++)
+		write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+	for (i = 0; i < dd->chip_sdma_engines; i++)
+		write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+	/* disable port (turn off RXE inbound traffic) and contexts */
+	write_csr(dd, RCV_CTRL, 0);
+	for (i = 0; i < dd->chip_rcv_contexts; i++)
+		write_csr(dd, RCV_CTXT_CTRL, 0);
+	/* mask all interrupt sources */
+	for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+		write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+
+	/*
+	 * DC Reset: do a full DC reset before the register clear.
+	 * A recommended length of time to hold is one CSR read,
+	 * so reread the CceDcCtrl.  Then, hold the DC in reset
+	 * across the clear.
+	 */
+	write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+	(void) read_csr(dd, CCE_DC_CTRL);
+
+	if (use_flr) {
+		/*
+		 * A FLR will reset the SPC core and part of the PCIe.
+		 * The parts that need to be restored have already been
+		 * saved.
+		 */
+		dd_dev_info(dd, "Resetting CSRs with FLR\n");
+
+		/* do the FLR, the DC reset will remain */
+		hfi1_pcie_flr(dd);
+
+		/* restore command and BARs */
+		restore_pci_variables(dd);
+
+		if (is_a0(dd)) {
+			dd_dev_info(dd, "Resetting CSRs with FLR\n");
+			hfi1_pcie_flr(dd);
+			restore_pci_variables(dd);
+		}
+
+	} else {
+		dd_dev_info(dd, "Resetting CSRs with writes\n");
+		reset_cce_csrs(dd);
+		reset_txe_csrs(dd);
+		reset_rxe_csrs(dd);
+		reset_asic_csrs(dd);
+		reset_misc_csrs(dd);
+	}
+	/* clear the DC reset */
+	write_csr(dd, CCE_DC_CTRL, 0);
+	/* Set the LED off */
+	if (is_a0(dd))
+		setextled(dd, 0);
+	/*
+	 * Clear the QSFP reset.
+	 * A0 leaves the out lines floating on power on, then on an FLR
+	 * enforces a 0 on all out pins.  The driver does not touch
+	 * ASIC_QSFPn_OUT otherwise.  This leaves RESET_N low and
+	 * anything  plugged constantly in reset, if it pays attention
+	 * to RESET_N.
+	 * A prime example of this is SiPh. For now, set all pins high.
+	 * I2CCLK and I2CDAT will change per direction, and INT_N and
+	 * MODPRS_N are input only and their value is ignored.
+	 */
+	if (is_a0(dd)) {
+		write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
+		write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
+	}
+}
+
+static void init_early_variables(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* assign link credit variables */
+	dd->vau = CM_VAU;
+	dd->link_credits = CM_GLOBAL_CREDITS;
+	if (is_a0(dd))
+		dd->link_credits--;
+	dd->vcu = cu_to_vcu(hfi1_cu);
+	/* enough room for 8 MAD packets plus header - 17K */
+	dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
+	if (dd->vl15_init > dd->link_credits)
+		dd->vl15_init = dd->link_credits;
+
+	write_uninitialized_csrs_and_memories(dd);
+
+	if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+		for (i = 0; i < dd->num_pports; i++) {
+			struct hfi1_pportdata *ppd = &dd->pport[i];
+
+			set_partition_keys(ppd);
+		}
+	init_sc2vl_tables(dd);
+}
+
+static void init_kdeth_qp(struct hfi1_devdata *dd)
+{
+	/* user changed the KDETH_QP */
+	if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
+		/* out of range or illegal value */
+		dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
+		kdeth_qp = 0;
+	}
+	if (kdeth_qp == 0)	/* not set, or failed range check */
+		kdeth_qp = DEFAULT_KDETH_QP;
+
+	write_csr(dd, SEND_BTH_QP,
+			(kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK)
+				<< SEND_BTH_QP_KDETH_QP_SHIFT);
+
+	write_csr(dd, RCV_BTH_QP,
+			(kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK)
+				<< RCV_BTH_QP_KDETH_QP_SHIFT);
+}
+
+/**
+ * init_qpmap_table
+ * @dd - device data
+ * @first_ctxt - first context
+ * @last_ctxt - first context
+ *
+ * This return sets the qpn mapping table that
+ * is indexed by qpn[8:1].
+ *
+ * The routine will round robin the 256 settings
+ * from first_ctxt to last_ctxt.
+ *
+ * The first/last looks ahead to having specialized
+ * receive contexts for mgmt and bypass.  Normal
+ * verbs traffic will assumed to be on a range
+ * of receive contexts.
+ */
+static void init_qpmap_table(struct hfi1_devdata *dd,
+			     u32 first_ctxt,
+			     u32 last_ctxt)
+{
+	u64 reg = 0;
+	u64 regno = RCV_QP_MAP_TABLE;
+	int i;
+	u64 ctxt = first_ctxt;
+
+	for (i = 0; i < 256;) {
+		if (ctxt == VL15CTXT) {
+			ctxt++;
+			if (ctxt > last_ctxt)
+				ctxt = first_ctxt;
+			continue;
+		}
+		reg |= ctxt << (8 * (i % 8));
+		i++;
+		ctxt++;
+		if (ctxt > last_ctxt)
+			ctxt = first_ctxt;
+		if (i % 8 == 0) {
+			write_csr(dd, regno, reg);
+			reg = 0;
+			regno += 8;
+		}
+	}
+	if (i % 8)
+		write_csr(dd, regno, reg);
+
+	add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
+			| RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @first_context
+ *
+ * This routine initializes Rule 0 and the
+ * RSM map table to implement qos.
+ *
+ * If all of the limit tests succeed,
+ * qos is applied based on the array
+ * interpretation of krcvqs where
+ * entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn
+ * bits (m) are computed to feed both the RSM map table
+ * and the single rule.
+ *
+ */
+static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
+{
+	u8 max_by_vl = 0;
+	unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+	u64 *rsmmap;
+	u64 reg;
+	u8  rxcontext = is_a0(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
+
+	/* validate */
+	if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+	    num_vls == 1 ||
+	    krcvqsset <= 1)
+		goto bail;
+	for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+		if (krcvqs[i] > max_by_vl)
+			max_by_vl = krcvqs[i];
+	if (max_by_vl > 32)
+		goto bail;
+	qpns_per_vl = __roundup_pow_of_two(max_by_vl);
+	/* determine bits vl */
+	n = ilog2(num_vls);
+	/* determine bits for qpn */
+	m = ilog2(qpns_per_vl);
+	if ((m + n) > 7)
+		goto bail;
+	if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+		goto bail;
+	rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
+	memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
+	/* init the local copy of the table */
+	for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+		unsigned tctxt;
+
+		for (qpn = 0, tctxt = ctxt;
+		     krcvqs[i] && qpn < qpns_per_vl; qpn++) {
+			unsigned idx, regoff, regidx;
+
+			/* generate index <= 128 */
+			idx = (qpn << n) ^ i;
+			regoff = (idx % 8) * 8;
+			regidx = idx / 8;
+			reg = rsmmap[regidx];
+			/* replace 0xff with context number */
+			reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
+				<< regoff);
+			reg |= (u64)(tctxt++) << regoff;
+			rsmmap[regidx] = reg;
+			if (tctxt == ctxt + krcvqs[i])
+				tctxt = ctxt;
+		}
+		ctxt += krcvqs[i];
+	}
+	/* flush cached copies to chip */
+	for (i = 0; i < NUM_MAP_REGS; i++)
+		write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
+	/* add rule0 */
+	write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
+		RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK
+			<< RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
+		2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+	write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
+		LRH_BTH_MATCH_OFFSET
+			<< RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+		LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+		LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+		((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+		QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+		((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+	write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
+		LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
+		LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
+		LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
+		LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
+	/* Enable RSM */
+	add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+	kfree(rsmmap);
+	/* map everything else (non-VL15) to context 0 */
+	init_qpmap_table(
+		dd,
+		0,
+		0);
+	dd->qos_shift = n + 1;
+	return;
+bail:
+	dd->qos_shift = 1;
+	init_qpmap_table(
+		dd,
+		dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0,
+		dd->n_krcv_queues - 1);
+}
+
+static void init_rxe(struct hfi1_devdata *dd)
+{
+	/* enable all receive errors */
+	write_csr(dd, RCV_ERR_MASK, ~0ull);
+	/* setup QPN map table - start where VL15 context leaves off */
+	init_qos(
+		dd,
+		dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0);
+	/*
+	 * make sure RcvCtrl.RcvWcb <= PCIe Device Control
+	 * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
+	 * space, PciCfgCap2.MaxPayloadSize in HFI).  There is only one
+	 * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
+	 * Max_PayLoad_Size set to its minimum of 128.
+	 *
+	 * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
+	 * (64 bytes).  Max_Payload_Size is possibly modified upward in
+	 * tune_pcie_caps() which is called after this routine.
+	 */
+}
+
+static void init_other(struct hfi1_devdata *dd)
+{
+	/* enable all CCE errors */
+	write_csr(dd, CCE_ERR_MASK, ~0ull);
+	/* enable *some* Misc errors */
+	write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
+	/* enable all DC errors, except LCB */
+	write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
+	write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
+}
+
+/*
+ * Fill out the given AU table using the given CU.  A CU is defined in terms
+ * AUs.  The table is a an encoding: given the index, how many AUs does that
+ * represent?
+ *
+ * NOTE: Assumes that the register layout is the same for the
+ * local and remote tables.
+ */
+static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
+			       u32 csr0to3, u32 csr4to7)
+{
+	write_csr(dd, csr0to3,
+		   0ull <<
+			SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT
+		|  1ull <<
+			SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT
+		|  2ull * cu <<
+			SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT
+		|  4ull * cu <<
+			SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
+	write_csr(dd, csr4to7,
+		   8ull * cu <<
+			SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT
+		| 16ull * cu <<
+			SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT
+		| 32ull * cu <<
+			SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT
+		| 64ull * cu <<
+			SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
+
+}
+
+static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+	assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
+					SEND_CM_LOCAL_AU_TABLE4_TO7);
+}
+
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+	assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
+					SEND_CM_REMOTE_AU_TABLE4_TO7);
+}
+
+static void init_txe(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* enable all PIO, SDMA, general, and Egress errors */
+	write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
+	write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
+	write_csr(dd, SEND_ERR_MASK, ~0ull);
+	write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
+
+	/* enable all per-context and per-SDMA engine errors */
+	for (i = 0; i < dd->chip_send_contexts; i++)
+		write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
+	for (i = 0; i < dd->chip_sdma_engines; i++)
+		write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);
+
+	/* set the local CU to AU mapping */
+	assign_local_cm_au_table(dd, dd->vcu);
+
+	/*
+	 * Set reasonable default for Credit Return Timer
+	 * Don't set on Simulator - causes it to choke.
+	 */
+	if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+		write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
+}
+
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
+{
+	struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+	unsigned sctxt;
+	int ret = 0;
+	u64 reg;
+
+	if (!rcd || !rcd->sc) {
+		ret = -EINVAL;
+		goto done;
+	}
+	sctxt = rcd->sc->hw_context;
+	reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
+		((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
+		 SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
+	/* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
+	if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
+		reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
+	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
+	/*
+	 * Enable send-side J_KEY integrity check, unless this is A0 h/w
+	 * (due to A0 erratum).
+	 */
+	if (!is_a0(dd)) {
+		reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+		reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+		write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+	}
+
+	/* Enable J_KEY check on receive context. */
+	reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
+		((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
+		 RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
+	write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
+done:
+	return ret;
+}
+
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+	struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+	unsigned sctxt;
+	int ret = 0;
+	u64 reg;
+
+	if (!rcd || !rcd->sc) {
+		ret = -EINVAL;
+		goto done;
+	}
+	sctxt = rcd->sc->hw_context;
+	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
+	/*
+	 * Disable send-side J_KEY integrity check, unless this is A0 h/w.
+	 * This check would not have been enabled for A0 h/w, see
+	 * set_ctxt_jkey().
+	 */
+	if (!is_a0(dd)) {
+		reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+		reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+		write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+	}
+	/* Turn off the J_KEY on the receive side */
+	write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
+done:
+	return ret;
+}
+
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
+{
+	struct hfi1_ctxtdata *rcd;
+	unsigned sctxt;
+	int ret = 0;
+	u64 reg;
+
+	if (ctxt < dd->num_rcv_contexts)
+		rcd = dd->rcd[ctxt];
+	else {
+		ret = -EINVAL;
+		goto done;
+	}
+	if (!rcd || !rcd->sc) {
+		ret = -EINVAL;
+		goto done;
+	}
+	sctxt = rcd->sc->hw_context;
+	reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
+		SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
+	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
+	reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+	reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+done:
+	return ret;
+}
+
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+	struct hfi1_ctxtdata *rcd;
+	unsigned sctxt;
+	int ret = 0;
+	u64 reg;
+
+	if (ctxt < dd->num_rcv_contexts)
+		rcd = dd->rcd[ctxt];
+	else {
+		ret = -EINVAL;
+		goto done;
+	}
+	if (!rcd || !rcd->sc) {
+		ret = -EINVAL;
+		goto done;
+	}
+	sctxt = rcd->sc->hw_context;
+	reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+	reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+done:
+	return ret;
+}
+
+/*
+ * Start doing the clean up the the chip. Our clean up happens in multiple
+ * stages and this is just the first.
+ */
+void hfi1_start_cleanup(struct hfi1_devdata *dd)
+{
+	free_cntrs(dd);
+	free_rcverr(dd);
+	clean_up_interrupts(dd);
+}
+
+#define HFI_BASE_GUID(dev) \
+	((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
+
+/*
+ * Certain chip functions need to be initialized only once per asic
+ * instead of per-device. This function finds the peer device and
+ * checks whether that chip initialization needs to be done by this
+ * device.
+ */
+static void asic_should_init(struct hfi1_devdata *dd)
+{
+	unsigned long flags;
+	struct hfi1_devdata *tmp, *peer = NULL;
+
+	spin_lock_irqsave(&hfi1_devs_lock, flags);
+	/* Find our peer device */
+	list_for_each_entry(tmp, &hfi1_dev_list, list) {
+		if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
+		    dd->unit != tmp->unit) {
+			peer = tmp;
+			break;
+		}
+	}
+
+	/*
+	 * "Claim" the ASIC for initialization if it hasn't been
+	 " "claimed" yet.
+	 */
+	if (!peer || !(peer->flags & HFI1_DO_INIT_ASIC))
+		dd->flags |= HFI1_DO_INIT_ASIC;
+	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+}
+
+/**
+ * Allocate an initialize the device structure for the hfi.
+ * @dev: the pci_dev for hfi1_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, initializes, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
+				  const struct pci_device_id *ent)
+{
+	struct hfi1_devdata *dd;
+	struct hfi1_pportdata *ppd;
+	u64 reg;
+	int i, ret;
+	static const char * const inames[] = { /* implementation names */
+		"RTL silicon",
+		"RTL VCS simulation",
+		"RTL FPGA emulation",
+		"Functional simulator"
+	};
+
+	dd = hfi1_alloc_devdata(pdev,
+		NUM_IB_PORTS * sizeof(struct hfi1_pportdata));
+	if (IS_ERR(dd))
+		goto bail;
+	ppd = dd->pport;
+	for (i = 0; i < dd->num_pports; i++, ppd++) {
+		int vl;
+		/* init common fields */
+		hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
+		/* DC supports 4 link widths */
+		ppd->link_width_supported =
+			OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
+			OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
+		ppd->link_width_downgrade_supported =
+			ppd->link_width_supported;
+		/* start out enabling only 4X */
+		ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
+		ppd->link_width_downgrade_enabled =
+					ppd->link_width_downgrade_supported;
+		/* link width active is 0 when link is down */
+		/* link width downgrade active is 0 when link is down */
+
+		if (num_vls < HFI1_MIN_VLS_SUPPORTED
+			|| num_vls > HFI1_MAX_VLS_SUPPORTED) {
+			hfi1_early_err(&pdev->dev,
+				       "Invalid num_vls %u, using %u VLs\n",
+				    num_vls, HFI1_MAX_VLS_SUPPORTED);
+			num_vls = HFI1_MAX_VLS_SUPPORTED;
+		}
+		ppd->vls_supported = num_vls;
+		ppd->vls_operational = ppd->vls_supported;
+		/* Set the default MTU. */
+		for (vl = 0; vl < num_vls; vl++)
+			dd->vld[vl].mtu = hfi1_max_mtu;
+		dd->vld[15].mtu = MAX_MAD_PACKET;
+		/*
+		 * Set the initial values to reasonable default, will be set
+		 * for real when link is up.
+		 */
+		ppd->lstate = IB_PORT_DOWN;
+		ppd->overrun_threshold = 0x4;
+		ppd->phy_error_threshold = 0xf;
+		ppd->port_crc_mode_enabled = link_crc_mask;
+		/* initialize supported LTP CRC mode */
+		ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+		/* initialize enabled LTP CRC mode */
+		ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
+		/* start in offline */
+		ppd->host_link_state = HLS_DN_OFFLINE;
+		init_vl_arb_caches(ppd);
+	}
+
+	dd->link_default = HLS_DN_POLL;
+
+	/*
+	 * Do remaining PCIe setup and save PCIe values in dd.
+	 * Any error printing is already done by the init code.
+	 * On return, we have the chip mapped.
+	 */
+	ret = hfi1_pcie_ddinit(dd, pdev, ent);
+	if (ret < 0)
+		goto bail_free;
+
+	/* verify that reads actually work, save revision for reset check */
+	dd->revision = read_csr(dd, CCE_REVISION);
+	if (dd->revision == ~(u64)0) {
+		dd_dev_err(dd, "cannot read chip CSRs\n");
+		ret = -EINVAL;
+		goto bail_cleanup;
+	}
+	dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
+			& CCE_REVISION_CHIP_REV_MAJOR_MASK;
+	dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+			& CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+	/* obtain the hardware ID - NOT related to unit, which is a
+	   software enumeration */
+	reg = read_csr(dd, CCE_REVISION2);
+	dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
+					& CCE_REVISION2_HFI_ID_MASK;
+	/* the variable size will remove unwanted bits */
+	dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
+	dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
+	dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
+		dd->icode < ARRAY_SIZE(inames) ? inames[dd->icode] : "unknown",
+		(int)dd->irev);
+
+	/* speeds the hardware can support */
+	dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
+	/* speeds allowed to run at */
+	dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
+	/* give a reasonable active value, will be set on link up */
+	dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
+
+	dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
+	dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
+	dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
+	dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
+	dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
+	/* fix up link widths for emulation _p */
+	ppd = dd->pport;
+	if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
+		ppd->link_width_supported =
+			ppd->link_width_enabled =
+			ppd->link_width_downgrade_supported =
+			ppd->link_width_downgrade_enabled =
+				OPA_LINK_WIDTH_1X;
+	}
+	/* insure num_vls isn't larger than number of sdma engines */
+	if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
+		dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
+				num_vls, HFI1_MAX_VLS_SUPPORTED);
+		ppd->vls_supported = num_vls = HFI1_MAX_VLS_SUPPORTED;
+		ppd->vls_operational = ppd->vls_supported;
+	}
+
+	/*
+	 * Convert the ns parameter to the 64 * cclocks used in the CSR.
+	 * Limit the max if larger than the field holds.  If timeout is
+	 * non-zero, then the calculated field will be at least 1.
+	 *
+	 * Must be after icode is set up - the cclock rate depends
+	 * on knowing the hardware being used.
+	 */
+	dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
+	if (dd->rcv_intr_timeout_csr >
+			RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
+		dd->rcv_intr_timeout_csr =
+			RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
+	else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
+		dd->rcv_intr_timeout_csr = 1;
+
+	/* obtain chip sizes, reset chip CSRs */
+	init_chip(dd);
+
+	/* read in the PCIe link speed information */
+	ret = pcie_speeds(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	/* needs to be done before we look for the peer device */
+	read_guid(dd);
+
+	asic_should_init(dd);
+
+	/* read in firmware */
+	ret = hfi1_firmware_init(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	/*
+	 * In general, the PCIe Gen3 transition must occur after the
+	 * chip has been idled (so it won't initiate any PCIe transactions
+	 * e.g. an interrupt) and before the driver changes any registers
+	 * (the transition will reset the registers).
+	 *
+	 * In particular, place this call after:
+	 * - init_chip()     - the chip will not initiate any PCIe transactions
+	 * - pcie_speeds()   - reads the current link speed
+	 * - hfi1_firmware_init() - the needed firmware is ready to be
+	 *			    downloaded
+	 */
+	ret = do_pcie_gen3_transition(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	/* start setting dd values and adjusting CSRs */
+	init_early_variables(dd);
+
+	parse_platform_config(dd);
+
+	/* add board names as they are defined */
+	dd->boardname = kmalloc(64, GFP_KERNEL);
+	if (!dd->boardname)
+		goto bail_cleanup;
+	snprintf(dd->boardname, 64, "Board ID 0x%llx",
+		 dd->revision >> CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT
+		    & CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK);
+
+	snprintf(dd->boardversion, BOARD_VERS_MAX,
+		 "ChipABI %u.%u, %s, ChipRev %u.%u, SW Compat %llu\n",
+		 HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
+		 dd->boardname,
+		 (u32)dd->majrev,
+		 (u32)dd->minrev,
+		 (dd->revision >> CCE_REVISION_SW_SHIFT)
+		    & CCE_REVISION_SW_MASK);
+
+	ret = set_up_context_variables(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	/* set initial RXE CSRs */
+	init_rxe(dd);
+	/* set initial TXE CSRs */
+	init_txe(dd);
+	/* set initial non-RXE, non-TXE CSRs */
+	init_other(dd);
+	/* set up KDETH QP prefix in both RX and TX CSRs */
+	init_kdeth_qp(dd);
+
+	/* send contexts must be set up before receive contexts */
+	ret = init_send_contexts(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	ret = hfi1_create_ctxts(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
+	/*
+	 * rcd[0] is guaranteed to be valid by this point. Also, all
+	 * context are using the same value, as per the module parameter.
+	 */
+	dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
+
+	ret = init_pervl_scs(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	/* sdma init */
+	for (i = 0; i < dd->num_pports; ++i) {
+		ret = sdma_init(dd, i);
+		if (ret)
+			goto bail_cleanup;
+	}
+
+	/* use contexts created by hfi1_create_ctxts */
+	ret = set_up_interrupts(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	/* set up LCB access - must be after set_up_interrupts() */
+	init_lcb_access(dd);
+
+	snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
+		 dd->base_guid & 0xFFFFFF);
+
+	dd->oui1 = dd->base_guid >> 56 & 0xFF;
+	dd->oui2 = dd->base_guid >> 48 & 0xFF;
+	dd->oui3 = dd->base_guid >> 40 & 0xFF;
+
+	ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
+	if (ret)
+		goto bail_clear_intr;
+	check_fabric_firmware_versions(dd);
+
+	thermal_init(dd);
+
+	ret = init_cntrs(dd);
+	if (ret)
+		goto bail_clear_intr;
+
+	ret = init_rcverr(dd);
+	if (ret)
+		goto bail_free_cntrs;
+
+	ret = eprom_init(dd);
+	if (ret)
+		goto bail_free_rcverr;
+
+	goto bail;
+
+bail_free_rcverr:
+	free_rcverr(dd);
+bail_free_cntrs:
+	free_cntrs(dd);
+bail_clear_intr:
+	clean_up_interrupts(dd);
+bail_cleanup:
+	hfi1_pcie_ddcleanup(dd);
+bail_free:
+	hfi1_free_devdata(dd);
+	dd = ERR_PTR(ret);
+bail:
+	return dd;
+}
+
+static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
+			u32 dw_len)
+{
+	u32 delta_cycles;
+	u32 current_egress_rate = ppd->current_egress_rate;
+	/* rates here are in units of 10^6 bits/sec */
+
+	if (desired_egress_rate == -1)
+		return 0; /* shouldn't happen */
+
+	if (desired_egress_rate >= current_egress_rate)
+		return 0; /* we can't help go faster, only slower */
+
+	delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
+			egress_cycles(dw_len * 4, current_egress_rate);
+
+	return (u16)delta_cycles;
+}
+
+
+/**
+ * create_pbc - build a pbc for transmission
+ * @flags: special case flags or-ed in built pbc
+ * @srate: static rate
+ * @vl: vl
+ * @dwlen: dword length (header words + data words + pbc words)
+ *
+ * Create a PBC with the given flags, rate, VL, and length.
+ *
+ * NOTE: The PBC created will not insert any HCRC - all callers but one are
+ * for verbs, which does not use this PSM feature.  The lone other caller
+ * is for the diagnostic interface which calls this if the user does not
+ * supply their own PBC.
+ */
+u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
+	       u32 dw_len)
+{
+	u64 pbc, delay = 0;
+
+	if (unlikely(srate_mbs))
+		delay = delay_cycles(ppd, srate_mbs, dw_len);
+
+	pbc = flags
+		| (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+		| ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
+		| (vl & PBC_VL_MASK) << PBC_VL_SHIFT
+		| (dw_len & PBC_LENGTH_DWS_MASK)
+			<< PBC_LENGTH_DWS_SHIFT;
+
+	return pbc;
+}
+
+#define SBUS_THERMAL    0x4f
+#define SBUS_THERM_MONITOR_MODE 0x1
+
+#define THERM_FAILURE(dev, ret, reason) \
+	dd_dev_err((dd),						\
+		   "Thermal sensor initialization failed: %s (%d)\n",	\
+		   (reason), (ret))
+
+/*
+ * Initialize the Avago Thermal sensor.
+ *
+ * After initialization, enable polling of thermal sensor through
+ * SBus interface. In order for this to work, the SBus Master
+ * firmware has to be loaded due to the fact that the HW polling
+ * logic uses SBus interrupts, which are not supported with
+ * default firmware. Otherwise, no data will be returned through
+ * the ASIC_STS_THERM CSR.
+ */
+static int thermal_init(struct hfi1_devdata *dd)
+{
+	int ret = 0;
+
+	if (dd->icode != ICODE_RTL_SILICON ||
+	    !(dd->flags & HFI1_DO_INIT_ASIC))
+		return ret;
+
+	acquire_hw_mutex(dd);
+	dd_dev_info(dd, "Initializing thermal sensor\n");
+	/* Thermal Sensor Initialization */
+	/*    Step 1: Reset the Thermal SBus Receiver */
+	ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+				RESET_SBUS_RECEIVER, 0);
+	if (ret) {
+		THERM_FAILURE(dd, ret, "Bus Reset");
+		goto done;
+	}
+	/*    Step 2: Set Reset bit in Thermal block */
+	ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+				WRITE_SBUS_RECEIVER, 0x1);
+	if (ret) {
+		THERM_FAILURE(dd, ret, "Therm Block Reset");
+		goto done;
+	}
+	/*    Step 3: Write clock divider value (100MHz -> 2MHz) */
+	ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
+				WRITE_SBUS_RECEIVER, 0x32);
+	if (ret) {
+		THERM_FAILURE(dd, ret, "Write Clock Div");
+		goto done;
+	}
+	/*    Step 4: Select temperature mode */
+	ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
+				WRITE_SBUS_RECEIVER,
+				SBUS_THERM_MONITOR_MODE);
+	if (ret) {
+		THERM_FAILURE(dd, ret, "Write Mode Sel");
+		goto done;
+	}
+	/*    Step 5: De-assert block reset and start conversion */
+	ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+				WRITE_SBUS_RECEIVER, 0x2);
+	if (ret) {
+		THERM_FAILURE(dd, ret, "Write Reset Deassert");
+		goto done;
+	}
+	/*    Step 5.1: Wait for first conversion (21.5ms per spec) */
+	msleep(22);
+
+	/* Enable polling of thermal readings */
+	write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+done:
+	release_hw_mutex(dd);
+	return ret;
+}
+
+static void handle_temp_err(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd = &dd->pport[0];
+	/*
+	 * Thermal Critical Interrupt
+	 * Put the device into forced freeze mode, take link down to
+	 * offline, and put DC into reset.
+	 */
+	dd_dev_emerg(dd,
+		     "Critical temperature reached! Forcing device into freeze mode!\n");
+	dd->flags |= HFI1_FORCED_FREEZE;
+	start_freeze_handling(ppd, FREEZE_SELF|FREEZE_ABORT);
+	/*
+	 * Shut DC down as much and as quickly as possible.
+	 *
+	 * Step 1: Take the link down to OFFLINE. This will cause the
+	 *         8051 to put the Serdes in reset. However, we don't want to
+	 *         go through the entire link state machine since we want to
+	 *         shutdown ASAP. Furthermore, this is not a graceful shutdown
+	 *         but rather an attempt to save the chip.
+	 *         Code below is almost the same as quiet_serdes() but avoids
+	 *         all the extra work and the sleeps.
+	 */
+	ppd->driver_link_ready = 0;
+	ppd->link_enabled = 0;
+	set_physical_link_state(dd, PLS_OFFLINE |
+				(OPA_LINKDOWN_REASON_SMA_DISABLED << 8));
+	/*
+	 * Step 2: Shutdown LCB and 8051
+	 *         After shutdown, do not restore DC_CFG_RESET value.
+	 */
+	dc_shutdown(dd);
+}
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h
new file mode 100644
index 0000000..f89a432
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip.h
@@ -0,0 +1,1035 @@
+#ifndef _CHIP_H
+#define _CHIP_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the defines that is specific to the HFI chip
+ */
+
+/* sizes */
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define NUM_INTERRUPT_SOURCES 768
+#define RXE_NUM_CONTEXTS 160
+#define RXE_PER_CONTEXT_SIZE 0x1000	/* 4k */
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_NUM_DATA_VL 8
+#define TXE_NUM_CONTEXTS 160
+#define TXE_NUM_SDMA_ENGINES 16
+#define NUM_CONTEXTS_PER_SET 8
+#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
+#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
+#define VL_ARB_TABLE_SIZE 16
+#define TXE_NUM_32_BIT_COUNTER 7
+#define TXE_NUM_64_BIT_COUNTER 30
+#define TXE_NUM_DATA_VL 8
+#define TXE_PIO_SIZE (32 * 0x100000)	/* 32 MB */
+#define PIO_BLOCK_SIZE 64			/* bytes */
+#define SDMA_BLOCK_SIZE 64			/* bytes */
+#define RCV_BUF_BLOCK_SIZE 64               /* bytes */
+#define PIO_CMASK 0x7ff	/* counter mask for free and fill counters */
+#define MAX_EAGER_ENTRIES    2048	/* max receive eager entries */
+#define MAX_TID_PAIR_ENTRIES 1024	/* max receive expected pairs */
+/* Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed
+   at 64 bytes for all generation one devices */
+#define CM_VAU 3
+/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
+#define CM_GLOBAL_CREDITS 0x940
+/* Number of PKey entries in the HW */
+#define MAX_PKEY_VALUES 16
+
+#include "chip_registers.h"
+
+#define RXE_PER_CONTEXT_USER   (RXE + RXE_PER_CONTEXT_OFFSET)
+#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
+
+/* PBC flags */
+#define PBC_INTR		(1ull << 31)
+#define PBC_DC_INFO_SHIFT	(30)
+#define PBC_DC_INFO		(1ull << PBC_DC_INFO_SHIFT)
+#define PBC_TEST_EBP	(1ull << 29)
+#define PBC_PACKET_BYPASS	(1ull << 28)
+#define PBC_CREDIT_RETURN	(1ull << 25)
+#define PBC_INSERT_BYPASS_ICRC (1ull << 24)
+#define PBC_TEST_BAD_ICRC	(1ull << 23)
+#define PBC_FECN		(1ull << 22)
+
+/* PbcInsertHcrc field settings */
+#define PBC_IHCRC_LKDETH 0x0	/* insert @ local KDETH offset */
+#define PBC_IHCRC_GKDETH 0x1	/* insert @ global KDETH offset */
+#define PBC_IHCRC_NONE   0x2	/* no HCRC inserted */
+
+/* PBC fields */
+#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
+#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
+#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
+	(PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
+	PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+
+#define PBC_INSERT_HCRC_SHIFT 26
+#define PBC_INSERT_HCRC_MASK 0x3ull
+#define PBC_INSERT_HCRC_SMASK \
+	(PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
+
+#define PBC_VL_SHIFT 12
+#define PBC_VL_MASK 0xfull
+#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
+
+#define PBC_LENGTH_DWS_SHIFT 0
+#define PBC_LENGTH_DWS_MASK 0xfffull
+#define PBC_LENGTH_DWS_SMASK \
+	(PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
+
+/* Credit Return Fields */
+#define CR_COUNTER_SHIFT 0
+#define CR_COUNTER_MASK 0x7ffull
+#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
+
+#define CR_STATUS_SHIFT 11
+#define CR_STATUS_MASK 0x1ull
+#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
+#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
+	(CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
+	CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
+	(CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
+	CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
+#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
+	(CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
+	CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
+	(CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
+	CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
+
+/* interrupt source numbers */
+#define IS_GENERAL_ERR_START	  0
+#define IS_SDMAENG_ERR_START	 16
+#define IS_SENDCTXT_ERR_START	 32
+#define IS_SDMA_START		192 /* includes SDmaProgress,SDmaIdle */
+#define IS_VARIOUS_START		240
+#define IS_DC_START			248
+#define IS_RCVAVAIL_START		256
+#define IS_RCVURGENT_START		416
+#define IS_SENDCREDIT_START		576
+#define IS_RESERVED_START		736
+#define IS_MAX_SOURCES		768
+
+/* derived interrupt source values */
+#define IS_GENERAL_ERR_END		IS_SDMAENG_ERR_START
+#define IS_SDMAENG_ERR_END		IS_SENDCTXT_ERR_START
+#define IS_SENDCTXT_ERR_END		IS_SDMA_START
+#define IS_SDMA_END			IS_VARIOUS_START
+#define IS_VARIOUS_END		IS_DC_START
+#define IS_DC_END			IS_RCVAVAIL_START
+#define IS_RCVAVAIL_END		IS_RCVURGENT_START
+#define IS_RCVURGENT_END		IS_SENDCREDIT_START
+#define IS_SENDCREDIT_END		IS_RESERVED_START
+#define IS_RESERVED_END		IS_MAX_SOURCES
+
+/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
+#define QSFP1_INT		242
+#define QSFP2_INT		243
+
+/* DCC_CFG_PORT_CONFIG logical link states */
+#define LSTATE_DOWN    0x1
+#define LSTATE_INIT    0x2
+#define LSTATE_ARMED   0x3
+#define LSTATE_ACTIVE  0x4
+
+/* DC8051_STS_CUR_STATE port values (physical link states) */
+#define PLS_DISABLED			   0x30
+#define PLS_OFFLINE				   0x90
+#define PLS_OFFLINE_QUIET			   0x90
+#define PLS_OFFLINE_PLANNED_DOWN_INFORM	   0x91
+#define PLS_OFFLINE_READY_TO_QUIET_LT	   0x92
+#define PLS_OFFLINE_REPORT_FAILURE		   0x93
+#define PLS_OFFLINE_READY_TO_QUIET_BCC	   0x94
+#define PLS_POLLING				   0x20
+#define PLS_POLLING_QUIET			   0x20
+#define PLS_POLLING_ACTIVE			   0x21
+#define PLS_CONFIGPHY			   0x40
+#define PLS_CONFIGPHY_DEBOUCE		   0x40
+#define PLS_CONFIGPHY_ESTCOMM		   0x41
+#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT	   0x42
+#define PLS_CONFIGPHY_ESTcOMM_LOCAL_COMPLETE   0x43
+#define PLS_CONFIGPHY_OPTEQ			   0x44
+#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING	   0x44
+#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE	   0x45
+#define PLS_CONFIGPHY_VERIFYCAP		   0x46
+#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE	   0x46
+#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
+#define PLS_CONFIGLT			   0x48
+#define PLS_CONFIGLT_CONFIGURE		   0x48
+#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE	   0x49
+#define PLS_LINKUP				   0x50
+#define PLS_PHYTEST				   0xB0
+#define PLS_INTERNAL_SERDES_LOOPBACK	   0xe1
+#define PLS_QUICK_LINKUP			   0xe2
+
+/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
+#define HCMD_LOAD_CONFIG_DATA  0x01
+#define HCMD_READ_CONFIG_DATA  0x02
+#define HCMD_CHANGE_PHY_STATE  0x03
+#define HCMD_SEND_LCB_IDLE_MSG 0x04
+#define HCMD_MISC		   0x05
+#define HCMD_READ_LCB_IDLE_MSG 0x06
+#define HCMD_READ_LCB_CSR      0x07
+#define HCMD_INTERFACE_TEST	   0xff
+
+/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
+#define HCMD_SUCCESS 2
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
+#define SPICO_ROM_FAILED		    (1 <<  0)
+#define UNKNOWN_FRAME		    (1 <<  1)
+#define TARGET_BER_NOT_MET		    (1 <<  2)
+#define FAILED_SERDES_INTERNAL_LOOPBACK (1 <<  3)
+#define FAILED_SERDES_INIT		    (1 <<  4)
+#define FAILED_LNI_POLLING		    (1 <<  5)
+#define FAILED_LNI_DEBOUNCE		    (1 <<  6)
+#define FAILED_LNI_ESTBCOMM		    (1 <<  7)
+#define FAILED_LNI_OPTEQ		    (1 <<  8)
+#define FAILED_LNI_VERIFY_CAP1	    (1 <<  9)
+#define FAILED_LNI_VERIFY_CAP2	    (1 << 10)
+#define FAILED_LNI_CONFIGLT		    (1 << 11)
+
+#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
+			| FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
+			| FAILED_LNI_VERIFY_CAP1 \
+			| FAILED_LNI_VERIFY_CAP2 \
+			| FAILED_LNI_CONFIGLT)
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
+#define HOST_REQ_DONE	   (1 << 0)
+#define BC_PWR_MGM_MSG	   (1 << 1)
+#define BC_SMA_MSG		   (1 << 2)
+#define BC_BCC_UNKOWN_MSG	   (1 << 3)
+#define BC_IDLE_UNKNOWN_MSG	   (1 << 4)
+#define EXT_DEVICE_CFG_REQ	   (1 << 5)
+#define VERIFY_CAP_FRAME	   (1 << 6)
+#define LINKUP_ACHIEVED	   (1 << 7)
+#define LINK_GOING_DOWN	   (1 << 8)
+#define LINK_WIDTH_DOWNGRADED  (1 << 9)
+
+/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
+#define HREQ_LOAD_CONFIG	0x01
+#define HREQ_SAVE_CONFIG	0x02
+#define HREQ_READ_CONFIG	0x03
+#define HREQ_SET_TX_EQ_ABS	0x04
+#define HREQ_SET_TX_EQ_REL	0x05
+#define HREQ_ENABLE		0x06
+#define HREQ_CONFIG_DONE	0xfe
+#define HREQ_INTERFACE_TEST	0xff
+
+/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
+#define HREQ_INVALID		0x01
+#define HREQ_SUCCESS		0x02
+#define HREQ_NOT_SUPPORTED		0x03
+#define HREQ_FEATURE_NOT_SUPPORTED	0x04 /* request specific feature */
+#define HREQ_REQUEST_REJECTED	0xfe
+#define HREQ_EXECUTION_ONGOING	0xff
+
+/* MISC host command functions */
+#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
+#define HCMD_MISC_GRANT_LCB_ACCESS   0x2
+
+/* idle flit message types */
+#define IDLE_PHYSICAL_LINK_MGMT 0x1
+#define IDLE_CRU		    0x2
+#define IDLE_SMA		    0x3
+#define IDLE_POWER_MGMT	    0x4
+
+/* idle flit message send fields (both send and read) */
+#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
+#define IDLE_PAYLOAD_SHIFT 8
+#define IDLE_MSG_TYPE_MASK 0xf
+#define IDLE_MSG_TYPE_SHIFT 0
+
+/* idle flit message read fields */
+#define READ_IDLE_MSG_TYPE_MASK 0xf
+#define READ_IDLE_MSG_TYPE_SHIFT 0
+
+/* SMA idle flit payload commands */
+#define SMA_IDLE_ARM	1
+#define SMA_IDLE_ACTIVE 2
+
+/* DC_DC8051_CFG_MODE.GENERAL bits */
+#define DISABLE_SELF_GUID_CHECK 0x2
+
+/*
+ * Eager buffer minimum and maximum sizes supported by the hardware.
+ * All power-of-two sizes in between are supported as well.
+ * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory
+ * allocatable for Eager buffer to a single context. All others
+ * are limits for the RcvArray entries.
+ */
+#define MIN_EAGER_BUFFER       (4 * 1024)
+#define MAX_EAGER_BUFFER       (256 * 1024)
+#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
+#define MAX_EXPECTED_BUFFER    (2048 * 1024)
+
+/*
+ * Receive expected base and count and eager base and count increment -
+ * the CSR fields hold multiples of this value.
+ */
+#define RCV_SHIFT 3
+#define RCV_INCREMENT (1 << RCV_SHIFT)
+
+/*
+ * Receive header queue entry increment - the CSR holds multiples of
+ * this value.
+ */
+#define HDRQ_SIZE_SHIFT 5
+#define HDRQ_INCREMENT (1 << HDRQ_SIZE_SHIFT)
+
+/*
+ * Freeze handling flags
+ */
+#define FREEZE_ABORT     0x01	/* do not do recovery */
+#define FREEZE_SELF	     0x02	/* initiate the freeze */
+#define FREEZE_LINK_DOWN 0x04	/* link is down */
+
+/*
+ * Chip implementation codes.
+ */
+#define ICODE_RTL_SILICON		0x00
+#define ICODE_RTL_VCS_SIMULATION	0x01
+#define ICODE_FPGA_EMULATION	0x02
+#define ICODE_FUNCTIONAL_SIMULATOR	0x03
+
+/*
+ * 8051 data memory size.
+ */
+#define DC8051_DATA_MEM_SIZE 0x1000
+
+/*
+ * 8051 firmware registers
+ */
+#define NUM_GENERAL_FIELDS 0x17
+#define NUM_LANE_FIELDS    0x8
+
+/* 8051 general register Field IDs */
+#define TX_SETTINGS		     0x06
+#define VERIFY_CAP_LOCAL_PHY	     0x07
+#define VERIFY_CAP_LOCAL_FABRIC	     0x08
+#define VERIFY_CAP_LOCAL_LINK_WIDTH  0x09
+#define LOCAL_DEVICE_ID		     0x0a
+#define LOCAL_LNI_INFO		     0x0c
+#define REMOTE_LNI_INFO              0x0d
+#define MISC_STATUS		     0x0e
+#define VERIFY_CAP_REMOTE_PHY	     0x0f
+#define VERIFY_CAP_REMOTE_FABRIC     0x10
+#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
+#define LAST_LOCAL_STATE_COMPLETE    0x12
+#define LAST_REMOTE_STATE_COMPLETE   0x13
+#define LINK_QUALITY_INFO            0x14
+#define REMOTE_DEVICE_ID	     0x15
+
+/* Lane ID for general configuration registers */
+#define GENERAL_CONFIG 4
+
+/* LOAD_DATA 8051 command shifts and fields */
+#define LOAD_DATA_FIELD_ID_SHIFT 40
+#define LOAD_DATA_FIELD_ID_MASK 0xfull
+#define LOAD_DATA_LANE_ID_SHIFT 32
+#define LOAD_DATA_LANE_ID_MASK 0xfull
+#define LOAD_DATA_DATA_SHIFT   0x0
+#define LOAD_DATA_DATA_MASK   0xffffffffull
+
+/* READ_DATA 8051 command shifts and fields */
+#define READ_DATA_FIELD_ID_SHIFT 40
+#define READ_DATA_FIELD_ID_MASK 0xffull
+#define READ_DATA_LANE_ID_SHIFT 32
+#define READ_DATA_LANE_ID_MASK 0xffull
+#define READ_DATA_DATA_SHIFT   0x0
+#define READ_DATA_DATA_MASK   0xffffffffull
+
+/* TX settings fields */
+#define ENABLE_LANE_TX_SHIFT		0
+#define ENABLE_LANE_TX_MASK		0xff
+#define TX_POLARITY_INVERSION_SHIFT	8
+#define TX_POLARITY_INVERSION_MASK	0xff
+#define RX_POLARITY_INVERSION_SHIFT	16
+#define RX_POLARITY_INVERSION_MASK	0xff
+#define MAX_RATE_SHIFT			24
+#define MAX_RATE_MASK			0xff
+
+/* verify capability PHY fields */
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT	0x4
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK	0x1
+#define POWER_MANAGEMENT_SHIFT			0x0
+#define POWER_MANAGEMENT_MASK			0xf
+
+/* 8051 lane register Field IDs */
+#define SPICO_FW_VERSION 0x7	/* SPICO firmware version */
+
+/* SPICO firmware version fields */
+#define SPICO_ROM_VERSION_SHIFT 0
+#define SPICO_ROM_VERSION_MASK 0xffff
+#define SPICO_ROM_PROD_ID_SHIFT 16
+#define SPICO_ROM_PROD_ID_MASK 0xffff
+
+/* verify capability fabric fields */
+#define VAU_SHIFT	0
+#define VAU_MASK	0x0007
+#define Z_SHIFT		3
+#define Z_MASK		0x0001
+#define VCU_SHIFT	4
+#define VCU_MASK	0x0007
+#define VL15BUF_SHIFT	8
+#define VL15BUF_MASK	0x0fff
+#define CRC_SIZES_SHIFT 20
+#define CRC_SIZES_MASK	0x7
+
+/* verify capability local link width fields */
+#define LINK_WIDTH_SHIFT 0		/* also for remote link width */
+#define LINK_WIDTH_MASK 0xffff		/* also for remote link width */
+#define LOCAL_FLAG_BITS_SHIFT 16
+#define LOCAL_FLAG_BITS_MASK 0xff
+#define MISC_CONFIG_BITS_SHIFT 24
+#define MISC_CONFIG_BITS_MASK 0xff
+
+/* verify capability remote link width fields */
+#define REMOTE_TX_RATE_SHIFT 16
+#define REMOTE_TX_RATE_MASK 0xff
+
+/* LOCAL_DEVICE_ID fields */
+#define LOCAL_DEVICE_REV_SHIFT 0
+#define LOCAL_DEVICE_REV_MASK 0xff
+#define LOCAL_DEVICE_ID_SHIFT 8
+#define LOCAL_DEVICE_ID_MASK 0xffff
+
+/* REMOTE_DEVICE_ID fields */
+#define REMOTE_DEVICE_REV_SHIFT 0
+#define REMOTE_DEVICE_REV_MASK 0xff
+#define REMOTE_DEVICE_ID_SHIFT 8
+#define REMOTE_DEVICE_ID_MASK 0xffff
+
+/* local LNI link width fields */
+#define ENABLE_LANE_RX_SHIFT 16
+#define ENABLE_LANE_RX_MASK  0xff
+
+/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
+#define MGMT_ALLOWED_SHIFT 23
+#define MGMT_ALLOWED_MASK 0x1
+
+/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
+#define LINK_QUALITY_SHIFT 24
+#define LINK_QUALITY_MASK  0x7
+
+/*
+ * mask, shift for reading 'planned_down_remote_reason_code'
+ * from LINK_QUALITY_INFO field
+ */
+#define DOWN_REMOTE_REASON_SHIFT 16
+#define DOWN_REMOTE_REASON_MASK  0xff
+
+/* verify capability PHY power management bits */
+#define PWRM_BER_CONTROL	0x1
+#define PWRM_BANDWIDTH_CONTROL	0x2
+
+/* verify capability fabric CRC size bits */
+enum {
+	CAP_CRC_14B = (1 << 0), /* 14b CRC */
+	CAP_CRC_48B = (1 << 1), /* 48b CRC */
+	CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
+};
+
+#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
+
+/* misc status version fields */
+#define STS_FM_VERSION_A_SHIFT 16
+#define STS_FM_VERSION_A_MASK  0xff
+#define STS_FM_VERSION_B_SHIFT 24
+#define STS_FM_VERSION_B_MASK  0xff
+
+/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
+#define LCB_CRC_16B			0x0	/* 16b CRC */
+#define LCB_CRC_14B			0x1	/* 14b CRC */
+#define LCB_CRC_48B			0x2	/* 48b CRC */
+#define LCB_CRC_12B_16B_PER_LANE	0x3	/* 12b-16b per lane CRC */
+
+/* the following enum is (almost) a copy/paste of the definition
+ * in the OPA spec, section 20.2.2.6.8 (PortInfo) */
+enum {
+	PORT_LTP_CRC_MODE_NONE = 0,
+	PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
+	PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
+	PORT_LTP_CRC_MODE_48 = 4,
+		/* 48-bit overlapping LTP CRC mode (optional) */
+	PORT_LTP_CRC_MODE_PER_LANE = 8
+		/* 12 to 16 bit per lane LTP CRC mode (optional) */
+};
+
+/* timeouts */
+#define LINK_RESTART_DELAY 1000		/* link restart delay, in ms */
+#define TIMEOUT_8051_START 5000         /* 8051 start timeout, in ms */
+#define DC8051_COMMAND_TIMEOUT 20000	/* DC8051 command timeout, in ms */
+#define FREEZE_STATUS_TIMEOUT 20	/* wait for freeze indicators, in ms */
+#define VL_STATUS_CLEAR_TIMEOUT 5000	/* per-VL status clear, in ms */
+#define CCE_STATUS_TIMEOUT 10		/* time to clear CCE Status, in ms */
+
+/* cclock tick time, in picoseconds per tick: 1/speed * 10^12  */
+#define ASIC_CCLOCK_PS  1242	/* 805 MHz */
+#define FPGA_CCLOCK_PS 30300	/*  33 MHz */
+
+/*
+ * Mask of enabled MISC errors.  Do not enable the two RSA engine errors -
+ * see firmware.c:run_rsa() for details.
+ */
+#define DRIVER_MISC_MASK \
+	(~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
+		| MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
+
+/* valid values for the loopback module parameter */
+#define LOOPBACK_NONE	0	/* no loopback - default */
+#define LOOPBACK_SERDES 1
+#define LOOPBACK_LCB	2
+#define LOOPBACK_CABLE	3	/* external cable */
+
+/* read and write hardware registers */
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
+
+/*
+ * The *_kctxt_* flavor of the CSR read/write functions are for
+ * per-context or per-SDMA CSRs that are not mappable to user-space.
+ * Their spacing is not a PAGE_SIZE multiple.
+ */
+static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+				 u32 offset0)
+{
+	/* kernel per-context CSRs are separated by 0x100 */
+	return read_csr(dd, offset0 + (0x100 * ctxt));
+}
+
+static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
+				   u32 offset0, u64 value)
+{
+	/* kernel per-context CSRs are separated by 0x100 */
+	write_csr(dd, offset0 + (0x100 * ctxt), value);
+}
+
+int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
+int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
+
+void __iomem *get_csr_addr(
+	struct hfi1_devdata *dd,
+	u32 offset);
+
+static inline void __iomem *get_kctxt_csr_addr(
+	struct hfi1_devdata *dd,
+	int ctxt,
+	u32 offset0)
+{
+	return get_csr_addr(dd, offset0 + (0x100 * ctxt));
+}
+
+/*
+ * The *_uctxt_* flavor of the CSR read/write functions are for
+ * per-context CSRs that are mappable to user space. All these CSRs
+ * are spaced by a PAGE_SIZE multiple in order to be mappable to
+ * different processes without exposing other contexts' CSRs
+ */
+static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+				 u32 offset0)
+{
+	/* user per-context CSRs are separated by 0x1000 */
+	return read_csr(dd, offset0 + (0x1000 * ctxt));
+}
+
+static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
+				   u32 offset0, u64 value)
+{
+	/* user per-context CSRs are separated by 0x1000 */
+	write_csr(dd, offset0 + (0x1000 * ctxt), value);
+}
+
+u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32);
+
+/* firmware.c */
+#define NUM_PCIE_SERDES 16	/* number of PCIe serdes on the SBus */
+extern const u8 pcie_serdes_broadcast[];
+extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
+/* SBus commands */
+#define RESET_SBUS_RECEIVER 0x20
+#define WRITE_SBUS_RECEIVER 0x21
+void sbus_request(struct hfi1_devdata *dd,
+		  u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+int sbus_request_slow(struct hfi1_devdata *dd,
+		      u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+void set_sbus_fast_mode(struct hfi1_devdata *dd);
+void clear_sbus_fast_mode(struct hfi1_devdata *dd);
+int hfi1_firmware_init(struct hfi1_devdata *dd);
+int load_pcie_firmware(struct hfi1_devdata *dd);
+int load_firmware(struct hfi1_devdata *dd);
+void dispose_firmware(void);
+int acquire_hw_mutex(struct hfi1_devdata *dd);
+void release_hw_mutex(struct hfi1_devdata *dd);
+void fabric_serdes_reset(struct hfi1_devdata *dd);
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
+
+/* chip.c */
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
+void read_guid(struct hfi1_devdata *dd);
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+			  u8 neigh_reason, u8 rem_reason);
+int set_link_state(struct hfi1_pportdata *, u32 state);
+int port_ltp_to_cap(int port_ltp);
+void handle_verify_cap(struct work_struct *work);
+void handle_freeze(struct work_struct *work);
+void handle_link_up(struct work_struct *work);
+void handle_link_down(struct work_struct *work);
+void handle_link_downgrade(struct work_struct *work);
+void handle_link_bounce(struct work_struct *work);
+void handle_sma_message(struct work_struct *work);
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
+int send_idle_sma(struct hfi1_devdata *dd, u64 message);
+int start_link(struct hfi1_pportdata *ppd);
+void init_qsfp(struct hfi1_pportdata *ppd);
+int bringup_serdes(struct hfi1_pportdata *ppd);
+void set_intr_state(struct hfi1_devdata *dd, u32 enable);
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
+				 int refresh_widths);
+void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
+int stop_drain_data_vls(struct hfi1_devdata *dd);
+int open_fill_data_vls(struct hfi1_devdata *dd);
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
+void get_linkup_link_widths(struct hfi1_pportdata *ppd);
+void read_ltp_rtt(struct hfi1_devdata *dd);
+void clear_linkup_counters(struct hfi1_devdata *dd);
+u32 hdrqempty(struct hfi1_ctxtdata *rcd);
+int is_a0(struct hfi1_devdata *dd);
+int is_ax(struct hfi1_devdata *dd);
+int is_bx(struct hfi1_devdata *dd);
+u32 read_physical_state(struct hfi1_devdata *dd);
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
+u32 get_logical_state(struct hfi1_pportdata *ppd);
+const char *opa_lstate_name(u32 lstate);
+const char *opa_pstate_name(u32 pstate);
+u32 driver_physical_state(struct hfi1_pportdata *ppd);
+u32 driver_logical_state(struct hfi1_pportdata *ppd);
+
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+#define LCB_START DC_LCB_CSRS
+#define LCB_END   DC_8051_CSRS /* next block is 8051 */
+static inline int is_lcb_offset(u32 offset)
+{
+	return (offset >= LCB_START && offset < LCB_END);
+}
+
+extern uint num_vls;
+
+extern uint disable_integrity;
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
+
+/* Per VL indexes */
+enum {
+	C_VL_0 = 0,
+	C_VL_1,
+	C_VL_2,
+	C_VL_3,
+	C_VL_4,
+	C_VL_5,
+	C_VL_6,
+	C_VL_7,
+	C_VL_15,
+	C_VL_COUNT
+};
+
+static inline int vl_from_idx(int idx)
+{
+	return (idx == C_VL_15 ? 15 : idx);
+}
+
+static inline int idx_from_vl(int vl)
+{
+	return (vl == 15 ? C_VL_15 : vl);
+}
+
+/* Per device counter indexes */
+enum {
+	C_RCV_OVF = 0,
+	C_RX_TID_FULL,
+	C_RX_TID_INVALID,
+	C_RX_TID_FLGMS,
+	C_RX_CTX_RHQS,
+	C_RX_CTX_EGRS,
+	C_RCV_TID_FLSMS,
+	C_CCE_PCI_CR_ST,
+	C_CCE_PCI_TR_ST,
+	C_CCE_PIO_WR_ST,
+	C_CCE_ERR_INT,
+	C_CCE_SDMA_INT,
+	C_CCE_MISC_INT,
+	C_CCE_RCV_AV_INT,
+	C_CCE_RCV_URG_INT,
+	C_CCE_SEND_CR_INT,
+	C_DC_UNC_ERR,
+	C_DC_RCV_ERR,
+	C_DC_FM_CFG_ERR,
+	C_DC_RMT_PHY_ERR,
+	C_DC_DROPPED_PKT,
+	C_DC_MC_XMIT_PKTS,
+	C_DC_MC_RCV_PKTS,
+	C_DC_XMIT_CERR,
+	C_DC_RCV_CERR,
+	C_DC_RCV_FCC,
+	C_DC_XMIT_FCC,
+	C_DC_XMIT_FLITS,
+	C_DC_RCV_FLITS,
+	C_DC_XMIT_PKTS,
+	C_DC_RCV_PKTS,
+	C_DC_RX_FLIT_VL,
+	C_DC_RX_PKT_VL,
+	C_DC_RCV_FCN,
+	C_DC_RCV_FCN_VL,
+	C_DC_RCV_BCN,
+	C_DC_RCV_BCN_VL,
+	C_DC_RCV_BBL,
+	C_DC_RCV_BBL_VL,
+	C_DC_MARK_FECN,
+	C_DC_MARK_FECN_VL,
+	C_DC_TOTAL_CRC,
+	C_DC_CRC_LN0,
+	C_DC_CRC_LN1,
+	C_DC_CRC_LN2,
+	C_DC_CRC_LN3,
+	C_DC_CRC_MULT_LN,
+	C_DC_TX_REPLAY,
+	C_DC_RX_REPLAY,
+	C_DC_SEQ_CRC_CNT,
+	C_DC_ESC0_ONLY_CNT,
+	C_DC_ESC0_PLUS1_CNT,
+	C_DC_ESC0_PLUS2_CNT,
+	C_DC_REINIT_FROM_PEER_CNT,
+	C_DC_SBE_CNT,
+	C_DC_MISC_FLG_CNT,
+	C_DC_PRF_GOOD_LTP_CNT,
+	C_DC_PRF_ACCEPTED_LTP_CNT,
+	C_DC_PRF_RX_FLIT_CNT,
+	C_DC_PRF_TX_FLIT_CNT,
+	C_DC_PRF_CLK_CNTR,
+	C_DC_PG_DBG_FLIT_CRDTS_CNT,
+	C_DC_PG_STS_PAUSE_COMPLETE_CNT,
+	C_DC_PG_STS_TX_SBE_CNT,
+	C_DC_PG_STS_TX_MBE_CNT,
+	C_SW_CPU_INTR,
+	C_SW_CPU_RCV_LIM,
+	C_SW_VTX_WAIT,
+	C_SW_PIO_WAIT,
+	C_SW_KMEM_WAIT,
+	DEV_CNTR_LAST  /* Must be kept last */
+};
+
+/* Per port counter indexes */
+enum {
+	C_TX_UNSUP_VL = 0,
+	C_TX_INVAL_LEN,
+	C_TX_MM_LEN_ERR,
+	C_TX_UNDERRUN,
+	C_TX_FLOW_STALL,
+	C_TX_DROPPED,
+	C_TX_HDR_ERR,
+	C_TX_PKT,
+	C_TX_WORDS,
+	C_TX_WAIT,
+	C_TX_FLIT_VL,
+	C_TX_PKT_VL,
+	C_TX_WAIT_VL,
+	C_RX_PKT,
+	C_RX_WORDS,
+	C_SW_LINK_DOWN,
+	C_SW_LINK_UP,
+	C_SW_XMIT_DSCD,
+	C_SW_XMIT_DSCD_VL,
+	C_SW_XMIT_CSTR_ERR,
+	C_SW_RCV_CSTR_ERR,
+	C_SW_IBP_LOOP_PKTS,
+	C_SW_IBP_RC_RESENDS,
+	C_SW_IBP_RNR_NAKS,
+	C_SW_IBP_OTHER_NAKS,
+	C_SW_IBP_RC_TIMEOUTS,
+	C_SW_IBP_PKT_DROPS,
+	C_SW_IBP_DMA_WAIT,
+	C_SW_IBP_RC_SEQNAK,
+	C_SW_IBP_RC_DUPREQ,
+	C_SW_IBP_RDMA_SEQ,
+	C_SW_IBP_UNALIGNED,
+	C_SW_IBP_SEQ_NAK,
+	C_SW_CPU_RC_ACKS,
+	C_SW_CPU_RC_QACKS,
+	C_SW_CPU_RC_DELAYED_COMP,
+	C_RCV_HDR_OVF_0,
+	C_RCV_HDR_OVF_1,
+	C_RCV_HDR_OVF_2,
+	C_RCV_HDR_OVF_3,
+	C_RCV_HDR_OVF_4,
+	C_RCV_HDR_OVF_5,
+	C_RCV_HDR_OVF_6,
+	C_RCV_HDR_OVF_7,
+	C_RCV_HDR_OVF_8,
+	C_RCV_HDR_OVF_9,
+	C_RCV_HDR_OVF_10,
+	C_RCV_HDR_OVF_11,
+	C_RCV_HDR_OVF_12,
+	C_RCV_HDR_OVF_13,
+	C_RCV_HDR_OVF_14,
+	C_RCV_HDR_OVF_15,
+	C_RCV_HDR_OVF_16,
+	C_RCV_HDR_OVF_17,
+	C_RCV_HDR_OVF_18,
+	C_RCV_HDR_OVF_19,
+	C_RCV_HDR_OVF_20,
+	C_RCV_HDR_OVF_21,
+	C_RCV_HDR_OVF_22,
+	C_RCV_HDR_OVF_23,
+	C_RCV_HDR_OVF_24,
+	C_RCV_HDR_OVF_25,
+	C_RCV_HDR_OVF_26,
+	C_RCV_HDR_OVF_27,
+	C_RCV_HDR_OVF_28,
+	C_RCV_HDR_OVF_29,
+	C_RCV_HDR_OVF_30,
+	C_RCV_HDR_OVF_31,
+	C_RCV_HDR_OVF_32,
+	C_RCV_HDR_OVF_33,
+	C_RCV_HDR_OVF_34,
+	C_RCV_HDR_OVF_35,
+	C_RCV_HDR_OVF_36,
+	C_RCV_HDR_OVF_37,
+	C_RCV_HDR_OVF_38,
+	C_RCV_HDR_OVF_39,
+	C_RCV_HDR_OVF_40,
+	C_RCV_HDR_OVF_41,
+	C_RCV_HDR_OVF_42,
+	C_RCV_HDR_OVF_43,
+	C_RCV_HDR_OVF_44,
+	C_RCV_HDR_OVF_45,
+	C_RCV_HDR_OVF_46,
+	C_RCV_HDR_OVF_47,
+	C_RCV_HDR_OVF_48,
+	C_RCV_HDR_OVF_49,
+	C_RCV_HDR_OVF_50,
+	C_RCV_HDR_OVF_51,
+	C_RCV_HDR_OVF_52,
+	C_RCV_HDR_OVF_53,
+	C_RCV_HDR_OVF_54,
+	C_RCV_HDR_OVF_55,
+	C_RCV_HDR_OVF_56,
+	C_RCV_HDR_OVF_57,
+	C_RCV_HDR_OVF_58,
+	C_RCV_HDR_OVF_59,
+	C_RCV_HDR_OVF_60,
+	C_RCV_HDR_OVF_61,
+	C_RCV_HDR_OVF_62,
+	C_RCV_HDR_OVF_63,
+	C_RCV_HDR_OVF_64,
+	C_RCV_HDR_OVF_65,
+	C_RCV_HDR_OVF_66,
+	C_RCV_HDR_OVF_67,
+	C_RCV_HDR_OVF_68,
+	C_RCV_HDR_OVF_69,
+	C_RCV_HDR_OVF_70,
+	C_RCV_HDR_OVF_71,
+	C_RCV_HDR_OVF_72,
+	C_RCV_HDR_OVF_73,
+	C_RCV_HDR_OVF_74,
+	C_RCV_HDR_OVF_75,
+	C_RCV_HDR_OVF_76,
+	C_RCV_HDR_OVF_77,
+	C_RCV_HDR_OVF_78,
+	C_RCV_HDR_OVF_79,
+	C_RCV_HDR_OVF_80,
+	C_RCV_HDR_OVF_81,
+	C_RCV_HDR_OVF_82,
+	C_RCV_HDR_OVF_83,
+	C_RCV_HDR_OVF_84,
+	C_RCV_HDR_OVF_85,
+	C_RCV_HDR_OVF_86,
+	C_RCV_HDR_OVF_87,
+	C_RCV_HDR_OVF_88,
+	C_RCV_HDR_OVF_89,
+	C_RCV_HDR_OVF_90,
+	C_RCV_HDR_OVF_91,
+	C_RCV_HDR_OVF_92,
+	C_RCV_HDR_OVF_93,
+	C_RCV_HDR_OVF_94,
+	C_RCV_HDR_OVF_95,
+	C_RCV_HDR_OVF_96,
+	C_RCV_HDR_OVF_97,
+	C_RCV_HDR_OVF_98,
+	C_RCV_HDR_OVF_99,
+	C_RCV_HDR_OVF_100,
+	C_RCV_HDR_OVF_101,
+	C_RCV_HDR_OVF_102,
+	C_RCV_HDR_OVF_103,
+	C_RCV_HDR_OVF_104,
+	C_RCV_HDR_OVF_105,
+	C_RCV_HDR_OVF_106,
+	C_RCV_HDR_OVF_107,
+	C_RCV_HDR_OVF_108,
+	C_RCV_HDR_OVF_109,
+	C_RCV_HDR_OVF_110,
+	C_RCV_HDR_OVF_111,
+	C_RCV_HDR_OVF_112,
+	C_RCV_HDR_OVF_113,
+	C_RCV_HDR_OVF_114,
+	C_RCV_HDR_OVF_115,
+	C_RCV_HDR_OVF_116,
+	C_RCV_HDR_OVF_117,
+	C_RCV_HDR_OVF_118,
+	C_RCV_HDR_OVF_119,
+	C_RCV_HDR_OVF_120,
+	C_RCV_HDR_OVF_121,
+	C_RCV_HDR_OVF_122,
+	C_RCV_HDR_OVF_123,
+	C_RCV_HDR_OVF_124,
+	C_RCV_HDR_OVF_125,
+	C_RCV_HDR_OVF_126,
+	C_RCV_HDR_OVF_127,
+	C_RCV_HDR_OVF_128,
+	C_RCV_HDR_OVF_129,
+	C_RCV_HDR_OVF_130,
+	C_RCV_HDR_OVF_131,
+	C_RCV_HDR_OVF_132,
+	C_RCV_HDR_OVF_133,
+	C_RCV_HDR_OVF_134,
+	C_RCV_HDR_OVF_135,
+	C_RCV_HDR_OVF_136,
+	C_RCV_HDR_OVF_137,
+	C_RCV_HDR_OVF_138,
+	C_RCV_HDR_OVF_139,
+	C_RCV_HDR_OVF_140,
+	C_RCV_HDR_OVF_141,
+	C_RCV_HDR_OVF_142,
+	C_RCV_HDR_OVF_143,
+	C_RCV_HDR_OVF_144,
+	C_RCV_HDR_OVF_145,
+	C_RCV_HDR_OVF_146,
+	C_RCV_HDR_OVF_147,
+	C_RCV_HDR_OVF_148,
+	C_RCV_HDR_OVF_149,
+	C_RCV_HDR_OVF_150,
+	C_RCV_HDR_OVF_151,
+	C_RCV_HDR_OVF_152,
+	C_RCV_HDR_OVF_153,
+	C_RCV_HDR_OVF_154,
+	C_RCV_HDR_OVF_155,
+	C_RCV_HDR_OVF_156,
+	C_RCV_HDR_OVF_157,
+	C_RCV_HDR_OVF_158,
+	C_RCV_HDR_OVF_159,
+	PORT_CNTR_LAST /* Must be kept last */
+};
+
+u64 get_all_cpu_total(u64 __percpu *cntr);
+void hfi1_start_cleanup(struct hfi1_devdata *dd);
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
+struct hfi1_message_header *hfi1_get_msgheader(
+				struct hfi1_devdata *dd, __le32 *rhf_addr);
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+			struct hfi1_ctxt_info *kinfo);
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+		  u32 mask);
+int hfi1_init_ctxt(struct send_context *sc);
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+		  u32 type, unsigned long pa, u16 order);
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
+		    u64 **cntrp);
+u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
+			char **namep, u64 **cntrp);
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
+
+/*
+ * Interrupt source table.
+ *
+ * Each entry is an interrupt source "type".  It is ordered by increasing
+ * number.
+ */
+struct is_table {
+	int start;	 /* interrupt source type start */
+	int end;	 /* interrupt source type end */
+	/* routine that returns the name of the interrupt source */
+	char *(*is_name)(char *name, size_t size, unsigned int source);
+	/* routine to call when receiving an interrupt */
+	void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
+};
+
+#endif /* _CHIP_H */
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h
new file mode 100644
index 0000000..6521030
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip_registers.h
@@ -0,0 +1,1289 @@
+#ifndef DEF_CHIP_REG
+#define DEF_CHIP_REG
+
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define CORE		0x000000000000
+#define CCE			(CORE + 0x000000000000)
+#define ASIC		(CORE + 0x000000400000)
+#define MISC		(CORE + 0x000000500000)
+#define DC_TOP_CSRS		(CORE + 0x000000600000)
+#define CHIP_DEBUG		(CORE + 0x000000700000)
+#define RXE			(CORE + 0x000001000000)
+#define TXE			(CORE + 0x000001800000)
+#define DCC_CSRS		(DC_TOP_CSRS + 0x000000000000)
+#define DC_LCB_CSRS		(DC_TOP_CSRS + 0x000000001000)
+#define DC_8051_CSRS		(DC_TOP_CSRS + 0x000000002000)
+#define PCIE		0
+
+#define ASIC_NUM_SCRATCH 4
+#define CCE_ERR_INT_CNT 0
+#define CCE_MISC_INT_CNT 2
+#define CCE_NUM_32_BIT_COUNTERS 3
+#define CCE_NUM_32_BIT_INT_COUNTERS 6
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define CCE_NUM_MSIX_PBAS 4
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_SCRATCH 4
+#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
+#define CCE_PCIE_TRGT_STALL_CNT 0
+#define CCE_PIO_WR_STALL_CNT 1
+#define CCE_RCV_AVAIL_INT_CNT 3
+#define CCE_RCV_URGENT_INT_CNT 4
+#define CCE_SDMA_INT_CNT 1
+#define CCE_SEND_CREDIT_INT_CNT 5
+#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
+#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
+#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
+#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
+#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
+#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
+#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
+#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
+#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
+#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
+#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
+#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
+#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
+#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
+#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
+#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
+#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
+#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
+#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
+#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
+#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
+#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
+#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
+#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
+#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
+#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
+#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
+#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
+#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
+#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
+#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
+#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
+#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
+#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
+#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
+#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
+#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
+#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
+#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
+#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
+#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
+#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
+#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
+#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
+#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
+#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
+#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
+#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
+#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
+#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
+#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
+#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
+#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
+#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
+#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
+#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
+#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
+#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
+#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
+#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
+#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
+#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
+#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
+#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
+#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
+#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
+#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
+#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
+#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
+#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
+#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
+#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
+#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
+#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
+#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
+#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
+#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
+#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
+#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
+#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
+#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
+#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
+#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
+#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
+#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
+#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
+#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
+#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
+#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
+#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
+#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
+#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
+#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
+#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
+#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
+#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
+#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
+#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
+#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
+#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
+#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
+#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
+#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
+#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
+#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
+#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
+#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
+#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
+#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
+#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
+#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
+#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
+#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
+#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
+#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
+#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
+#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
+#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
+#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
+#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
+#define DC_LCB_CFG_RUN_EN_SHIFT 0
+#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
+#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
+#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
+#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
+#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
+#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
+#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
+#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
+#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
+#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
+#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
+#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
+#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
+#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
+#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
+#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
+#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
+#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
+#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
+#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
+#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
+#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
+#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
+#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
+#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
+#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
+#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
+#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
+#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
+#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
+#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
+#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
+#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
+#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
+#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
+#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
+#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
+#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
+#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
+#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
+#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
+#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
+#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
+#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
+#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
+#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_BUF_OVFL_CNT 10
+#define RCV_CONTEXT_EGR_STALL 22
+#define RCV_CONTEXT_RHQ_STALL 21
+#define RCV_DATA_PKT_CNT 0
+#define RCV_DWORD_CNT 1
+#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
+#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
+#define RCV_TID_FULL_ERR_CNT 18
+#define RCV_TID_VALID_ERR_CNT 19
+#define RXE_NUM_32_BIT_COUNTERS 24
+#define RXE_NUM_64_BIT_COUNTERS 2
+#define RXE_NUM_RSM_INSTANCES 4
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_PER_CONTEXT_OFFSET 0x0300000
+#define SEND_DATA_PKT_CNT 0
+#define SEND_DATA_PKT_VL0_CNT 12
+#define SEND_DATA_VL0_CNT 3
+#define SEND_DROPPED_PKT_CNT 5
+#define SEND_DWORD_CNT 1
+#define SEND_FLOW_STALL_CNT 4
+#define SEND_HEADERS_ERR_CNT 6
+#define SEND_LEN_ERR_CNT 1
+#define SEND_MAX_MIN_LEN_ERR_CNT 2
+#define SEND_UNDERRUN_CNT 3
+#define SEND_UNSUP_VL_ERR_CNT 0
+#define SEND_WAIT_CNT 2
+#define SEND_WAIT_VL0_CNT 21
+#define TXE_PIO_SEND_OFFSET 0x0800000
+#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
+#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
+#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
+#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
+#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
+#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
+#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
+#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
+#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
+#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
+#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
+#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
+#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
+#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
+#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
+#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
+#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
+#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
+#define ASIC_EEP_DATA (ASIC + 0x000000000310)
+#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
+#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
+#define ASIC_GPIO_IN (ASIC + 0x000000000200)
+#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
+#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
+#define ASIC_GPIO_OE (ASIC + 0x000000000208)
+#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
+#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
+#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
+#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
+#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
+#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
+#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
+#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
+#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
+#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
+#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
+#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
+#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
+#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
+#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
+#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
+#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
+#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
+#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
+#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
+#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
+#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
+#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
+#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
+#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
+#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
+#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
+#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
+#define ASIC_STS_THERM (ASIC + 0x000000000058)
+#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
+#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
+#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
+#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
+#define ASIC_STS_THERM_LOW_SHIFT 13
+#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
+#define CCE_CTRL (CCE + 0x000000000010)
+#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
+#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
+#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
+#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
+#define CCE_DC_CTRL (CCE + 0x0000000000B8)
+#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
+#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
+#define CCE_ERR_CLEAR (CCE + 0x000000000050)
+#define CCE_ERR_MASK (CCE + 0x000000000048)
+#define CCE_ERR_STATUS (CCE + 0x000000000040)
+#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
+		0x200ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
+		0x800ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
+		0x400ull
+#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
+#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
+#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
+#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
+#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
+#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
+#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
+#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
+#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
+#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
+#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
+#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
+#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
+#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
+#define CCE_INT_CLEAR (CCE + 0x000000110A00)
+#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
+#define CCE_INT_FORCE (CCE + 0x000000110B00)
+#define CCE_INT_MAP (CCE + 0x000000110500)
+#define CCE_INT_MASK (CCE + 0x000000110900)
+#define CCE_INT_STATUS (CCE + 0x000000110800)
+#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
+#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
+#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
+#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
+#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
+#define CCE_REVISION (CCE + 0x000000000000)
+#define CCE_REVISION2 (CCE + 0x000000000008)
+#define CCE_REVISION2_HFI_ID_MASK 0x1ull
+#define CCE_REVISION2_HFI_ID_SHIFT 0
+#define CCE_REVISION2_IMPL_CODE_SHIFT 8
+#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
+#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
+#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
+#define CCE_REVISION_SW_MASK 0xFFull
+#define CCE_REVISION_SW_SHIFT 24
+#define CCE_SCRATCH (CCE + 0x000000000020)
+#define CCE_STATUS (CCE + 0x000000000018)
+#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
+#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
+#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
+#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
+#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
+#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
+#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
+#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
+#define MISC_CFG_FW_CTRL (MISC + 0x000000001000)
+#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull
+#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
+#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
+#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
+#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
+#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
+#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
+#define MISC_ERR_CLEAR (MISC + 0x000000002010)
+#define MISC_ERR_MASK (MISC + 0x000000002008)
+#define MISC_ERR_STATUS (MISC + 0x000000002000)
+#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
+#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
+#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
+#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
+#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
+#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
+#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
+#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
+#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
+#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
+#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
+#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0)
+#define PCI_CFG_REG1 (PCIE + 0x000000000004)
+#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
+#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
+#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
+#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
+#define RCV_ARRAY (RXE + 0x000000200000)
+#define RCV_ARRAY_CNT (RXE + 0x000000000018)
+#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
+#define RCV_ARRAY_RT_ADDR_SHIFT 0
+#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
+#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
+#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
+#define RCV_BTH_QP (RXE + 0x000000000028)
+#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
+#define RCV_BTH_QP_KDETH_QP_SHIFT 16
+#define RCV_BYPASS (RXE + 0x000000000038)
+#define RCV_CONTEXTS (RXE + 0x000000000010)
+#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
+#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
+#define RCV_CTRL (RXE + 0x000000000000)
+#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
+#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
+#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
+#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
+#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
+#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
+#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
+#define RCV_CTXT_CTRL (RXE + 0x000000100000)
+#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
+#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull
+#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
+#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
+#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
+#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
+#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
+#define RCV_CTXT_STATUS (RXE + 0x000000100008)
+#define RCV_EGR_CTRL (RXE + 0x000000100010)
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
+#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
+#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32
+#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
+#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
+#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
+#define RCV_ERR_CLEAR (RXE + 0x000000000070)
+#define RCV_ERR_INFO (RXE + 0x000000000050)
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull
+#define RCV_ERR_MASK (RXE + 0x000000000068)
+#define RCV_ERR_STATUS (RXE + 0x000000000060)
+#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
+		0x4000000000000000ull
+#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
+#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
+		0x40000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+		0x20000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+		0x800000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+		0x400000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+		0x10000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+		0x20000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+		0x200000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+		0x8000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+		0x1000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
+#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
+#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
+#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
+#define RCV_HDR_ADDR (RXE + 0x000000100028)
+#define RCV_HDR_CNT (RXE + 0x000000100030)
+#define RCV_HDR_CNT_CNT_MASK 0x1FFull
+#define RCV_HDR_CNT_CNT_SHIFT 0
+#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
+#define RCV_HDR_HEAD (RXE + 0x000000300008)
+#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
+#define RCV_HDR_HEAD_COUNTER_SHIFT 32
+#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
+#define RCV_HDR_HEAD_HEAD_SHIFT 0
+#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull
+#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
+#define RCV_HDR_SIZE (RXE + 0x000000100040)
+#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
+#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
+#define RCV_HDR_TAIL (RXE + 0x000000300000)
+#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
+#define RCV_KEY_CTRL (RXE + 0x000000100020)
+#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
+#define RCV_MULTICAST (RXE + 0x000000000030)
+#define RCV_PARTITION_KEY (RXE + 0x000000000200)
+#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
+#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16
+#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
+#define RCV_RSM_CFG (RXE + 0x000000000600)
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
+#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
+#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
+#define RCV_RSM_MATCH (RXE + 0x000000000800)
+#define RCV_RSM_MATCH_MASK1_SHIFT 0
+#define RCV_RSM_MATCH_MASK2_SHIFT 16
+#define RCV_RSM_MATCH_VALUE1_SHIFT 8
+#define RCV_RSM_MATCH_VALUE2_SHIFT 24
+#define RCV_RSM_SELECT (RXE + 0x000000000700)
+#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
+#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
+#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
+#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
+#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
+#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
+#define RCV_STATUS (RXE + 0x000000000008)
+#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
+#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
+#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
+#define RCV_TID_CTRL (RXE + 0x000000100018)
+#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
+#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
+#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
+#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32
+#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
+#define RCV_VL15 (RXE + 0x000000000048)
+#define SEND_BTH_QP (TXE + 0x0000000000A0)
+#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
+#define SEND_BTH_QP_KDETH_QP_SHIFT 16
+#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510)
+#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
+		0x1000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
+		0x8000000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
+		0x2000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
+		0x4000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
+		0x8000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
+		0x10000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
+		0x20000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
+		0x40000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
+		0x80000000000000ull
+#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
+#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
+#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull
+#define SEND_CM_CTRL (TXE + 0x000000000500)
+#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
+#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull
+#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
+#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
+#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull
+#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
+#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
+#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
+#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
+#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
+#define SEND_CONTEXTS (TXE + 0x000000000010)
+#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
+#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
+#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
+#define SEND_CTRL (TXE + 0x000000000000)
+#define SEND_CTRL_CM_RESET_SMASK 0x4ull
+#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
+#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
+		0x200000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+		0x100000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+		0x80000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
+		0x40000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+		0x8000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
+		0x4000ull
+#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
+#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull
+#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
+#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
+#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
+#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
+#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
+#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
+#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
+#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
+#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
+#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
+#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
+#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull
+#define SEND_CTXT_CTRL (TXE + 0x000000100000)
+#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
+#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
+#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
+#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
+#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
+#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
+#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040)
+#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
+#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
+#define SEND_CTXT_STATUS (TXE + 0x000000100008)
+#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
+#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010)
+#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080)
+#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+		0x100000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+		0x80000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+		0x8000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
+#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
+#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
+#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
+#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
+#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
+#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
+#define SEND_DMA_CTRL (TXE + 0x000000200000)
+#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
+#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
+#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
+#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
+#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
+#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
+#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18
+#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
+#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060)
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
+		0x40000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
+		0x20000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
+		0x100ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
+		0x10000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
+#define SEND_DMA_ENGINES (TXE + 0x000000000018)
+#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
+#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
+#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
+#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
+#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
+#define SEND_DMA_HEAD (TXE + 0x000000200028)
+#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
+#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
+#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
+#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
+#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
+#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
+#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
+#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
+#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
+#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
+#define SEND_DMA_STATUS (TXE + 0x000000200008)
+#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull
+#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull
+#define SEND_DMA_TAIL (TXE + 0x000000200020)
+#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800)
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
+		0x3FFFull
+#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
+#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00)
+#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
+#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
+#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
+#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
+#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
+#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
+#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
+#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
+#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
+#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
+#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
+#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
+#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080)
+#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
+		0x200000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
+		0x20000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
+		0x800000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
+		0x2000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
+		0x200000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
+		0x8ull
+#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
+		0x400000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
+		0x1000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
+		0x100000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
+		0x2000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
+		0x200000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
+		0x4000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
+		0x400000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
+		0x8000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
+		0x800000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
+		0x10000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
+		0x1000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
+		0x20000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
+		0x2000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
+		0x40000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
+		0x4000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
+		0x80000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
+		0x8000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
+		0x100000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
+		0x10000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
+		0x1000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
+		0x8000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
+		0x100000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
+		0x800000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
+		0x4000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
+		0x80000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
+		0x800ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
+		0x10000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
+		0x4000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
+		0x8000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
+		0x10000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
+		0x20000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
+		0x40000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
+		0x80000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
+		0x20000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
+		0x40000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
+		0x80000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
+		0x100000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
+		0x200000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
+		0x400000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
+		0x800000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
+		0x1000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
+		0x2000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
+		0x100ull
+#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+		0x3FFFull
+#define SEND_ERR_CLEAR (TXE + 0x0000000000F0)
+#define SEND_ERR_MASK (TXE + 0x0000000000E8)
+#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
+#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
+#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
+#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
+#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
+#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
+#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12
+#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
+#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48
+#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12
+#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
+#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
+#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
+#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
+#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
+#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
+#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+		0x1000000ull
+#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
+#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
+#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+		0x100000000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
+#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+		0x200000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+		0x400000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
+		0x800000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+		0x100ull
+#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
+#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
+#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
+#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
+#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
+#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
+#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
+#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
+#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
+#define SEND_SC2VLT0_SC0_SHIFT 0
+#define SEND_SC2VLT0_SC1_SHIFT 8
+#define SEND_SC2VLT0_SC2_SHIFT 16
+#define SEND_SC2VLT0_SC3_SHIFT 24
+#define SEND_SC2VLT0_SC4_SHIFT 32
+#define SEND_SC2VLT0_SC5_SHIFT 40
+#define SEND_SC2VLT0_SC6_SHIFT 48
+#define SEND_SC2VLT0_SC7_SHIFT 56
+#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
+#define SEND_SC2VLT1_SC10_SHIFT 16
+#define SEND_SC2VLT1_SC11_SHIFT 24
+#define SEND_SC2VLT1_SC12_SHIFT 32
+#define SEND_SC2VLT1_SC13_SHIFT 40
+#define SEND_SC2VLT1_SC14_SHIFT 48
+#define SEND_SC2VLT1_SC15_SHIFT 56
+#define SEND_SC2VLT1_SC8_SHIFT 0
+#define SEND_SC2VLT1_SC9_SHIFT 8
+#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
+#define SEND_SC2VLT2_SC16_SHIFT 0
+#define SEND_SC2VLT2_SC17_SHIFT 8
+#define SEND_SC2VLT2_SC18_SHIFT 16
+#define SEND_SC2VLT2_SC19_SHIFT 24
+#define SEND_SC2VLT2_SC20_SHIFT 32
+#define SEND_SC2VLT2_SC21_SHIFT 40
+#define SEND_SC2VLT2_SC22_SHIFT 48
+#define SEND_SC2VLT2_SC23_SHIFT 56
+#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
+#define SEND_SC2VLT3_SC24_SHIFT 0
+#define SEND_SC2VLT3_SC25_SHIFT 8
+#define SEND_SC2VLT3_SC26_SHIFT 16
+#define SEND_SC2VLT3_SC27_SHIFT 24
+#define SEND_SC2VLT3_SC28_SHIFT 32
+#define SEND_SC2VLT3_SC29_SHIFT 40
+#define SEND_SC2VLT3_SC30_SHIFT 48
+#define SEND_SC2VLT3_SC31_SHIFT 56
+#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
+#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
+#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
+#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
+#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
+#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
+#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
+#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
+#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
+#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
+#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
+#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
+#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
+#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
+#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
+
+#endif          /* DEF_CHIP_REG */
diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h
new file mode 100644
index 0000000..5f22937
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/common.h
@@ -0,0 +1,415 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _COMMON_H
+#define _COMMON_H
+
+#include <rdma/hfi/hfi1_user.h>
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fast path code
+ * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fast path code
+ * _HFI1_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ */
+
+/*
+ * If a packet's QP[23:16] bits match this value, then it is
+ * a PSM packet and the hardware will expect a KDETH header
+ * following the BTH.
+ */
+#define DEFAULT_KDETH_QP 0x80
+
+/* driver/hw feature set bitmask */
+#define HFI1_CAP_USER_SHIFT      24
+#define HFI1_CAP_MASK            ((1UL << HFI1_CAP_USER_SHIFT) - 1)
+/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
+#define HFI1_CAP_LOCKED_SHIFT    63
+#define HFI1_CAP_LOCKED_MASK     0x1ULL
+#define HFI1_CAP_LOCKED_SMASK    (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
+/* extra bits used between kernel and user processes */
+#define HFI1_CAP_MISC_SHIFT      (HFI1_CAP_USER_SHIFT * 2)
+#define HFI1_CAP_MISC_MASK       ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
+					   HFI1_CAP_MISC_SHIFT)) - 1)
+
+#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
+#define HFI1_CAP_KCLEAR(cap)						\
+	({								\
+		hfi1_cap_mask &= ~HFI1_CAP_##cap;			\
+		hfi1_cap_mask;						\
+	})
+#define HFI1_CAP_USET(cap)						\
+	({								\
+		hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+		hfi1_cap_mask;						\
+		})
+#define HFI1_CAP_UCLEAR(cap)						\
+	({								\
+		hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+		hfi1_cap_mask;						\
+	})
+#define HFI1_CAP_SET(cap)						\
+	({								\
+		hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap <<	\
+						  HFI1_CAP_USER_SHIFT)); \
+		hfi1_cap_mask;						\
+	})
+#define HFI1_CAP_CLEAR(cap)						\
+	({								\
+		hfi1_cap_mask &= ~(HFI1_CAP_##cap |			\
+				  (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
+		hfi1_cap_mask;						\
+	})
+#define HFI1_CAP_LOCK()							\
+	({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
+#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
+/*
+ * The set of capability bits that can be changed after initial load
+ * This set is the same for kernel and user contexts. However, for
+ * user contexts, the set can be further filtered by using the
+ * HFI1_CAP_RESERVED_MASK bits.
+ */
+#define HFI1_CAP_WRITABLE_MASK   (HFI1_CAP_SDMA_AHG |			\
+				 HFI1_CAP_HDRSUPP |			\
+				 HFI1_CAP_MULTI_PKT_EGR |		\
+				 HFI1_CAP_NODROP_RHQ_FULL |		\
+				 HFI1_CAP_NODROP_EGR_FULL |		\
+				 HFI1_CAP_ALLOW_PERM_JKEY |		\
+				 HFI1_CAP_STATIC_RATE_CTRL |		\
+				 HFI1_CAP_PRINT_UNIMPL)
+/*
+ * A set of capability bits that are "global" and are not allowed to be
+ * set in the user bitmask.
+ */
+#define HFI1_CAP_RESERVED_MASK   ((HFI1_CAP_SDMA |			\
+				  HFI1_CAP_USE_SDMA_HEAD |		\
+				  HFI1_CAP_EXTENDED_PSN |		\
+				  HFI1_CAP_PRINT_UNIMPL |		\
+				  HFI1_CAP_QSFP_ENABLED |		\
+				  HFI1_CAP_NO_INTEGRITY |		\
+				  HFI1_CAP_PKEY_CHECK) <<		\
+				 HFI1_CAP_USER_SHIFT)
+/*
+ * Set of capabilities that need to be enabled for kernel context in
+ * order to be allowed for user contexts, as well.
+ */
+#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
+/* Default enabled capabilities (both kernel and user) */
+#define HFI1_CAP_MASK_DEFAULT    (HFI1_CAP_HDRSUPP |			\
+				 HFI1_CAP_NODROP_RHQ_FULL |		\
+				 HFI1_CAP_NODROP_EGR_FULL |		\
+				 HFI1_CAP_SDMA |			\
+				 HFI1_CAP_PRINT_UNIMPL |		\
+				 HFI1_CAP_STATIC_RATE_CTRL |		\
+				 HFI1_CAP_QSFP_ENABLED |		\
+				 HFI1_CAP_PKEY_CHECK |			\
+				 HFI1_CAP_MULTI_PKT_EGR |		\
+				 HFI1_CAP_EXTENDED_PSN |		\
+				 ((HFI1_CAP_HDRSUPP |			\
+				   HFI1_CAP_MULTI_PKT_EGR |		\
+				   HFI1_CAP_STATIC_RATE_CTRL |		\
+				   HFI1_CAP_PKEY_CHECK |		\
+				   HFI1_CAP_EARLY_CREDIT_RETURN) <<	\
+				  HFI1_CAP_USER_SHIFT))
+/*
+ * A bitmask of kernel/global capabilities that should be communicated
+ * to user level processes.
+ */
+#define HFI1_CAP_K2U (HFI1_CAP_SDMA |			\
+		     HFI1_CAP_EXTENDED_PSN |		\
+		     HFI1_CAP_PKEY_CHECK |		\
+		     HFI1_CAP_NO_INTEGRITY)
+
+#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR)
+
+#ifndef HFI1_KERN_TYPE
+#define HFI1_KERN_TYPE 0
+#endif
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a Intel release, or from the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of hfi1_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
+
+/*
+ * Define the driver version number.  This is something that refers only
+ * to the driver itself, not the software interfaces it supports.
+ */
+#ifndef HFI1_DRIVER_VERSION_BASE
+#define HFI1_DRIVER_VERSION_BASE "0.9-248"
+#endif
+
+/* create the final driver version string */
+#ifdef HFI1_IDSTR
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
+#else
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
+#endif
+
+/*
+ * Diagnostics can send a packet by writing the following
+ * struct to the diag packet special file.
+ *
+ * This allows a custom PBC qword, so that special modes and deliberate
+ * changes to CRCs can be used.
+ */
+#define _DIAG_PKT_VERS 1
+struct diag_pkt {
+	__u16 version;		/* structure version */
+	__u16 unit;		/* which device */
+	__u16 sw_index;		/* send sw index to use */
+	__u16 len;		/* data length, in bytes */
+	__u16 port;		/* port number */
+	__u16 unused;
+	__u32 flags;		/* call flags */
+	__u64 data;		/* user data pointer */
+	__u64 pbc;		/* PBC for the packet */
+};
+
+/* diag_pkt flags */
+#define F_DIAGPKT_WAIT 0x1	/* wait until packet is sent */
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ */
+
+/*
+ * Receive Header Flags
+ */
+#define RHF_PKT_LEN_SHIFT	0
+#define RHF_PKT_LEN_MASK	0xfffull
+#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
+
+#define RHF_RCV_TYPE_SHIFT	12
+#define RHF_RCV_TYPE_MASK	0x7ull
+#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
+
+#define RHF_USE_EGR_BFR_SHIFT	15
+#define RHF_USE_EGR_BFR_MASK	0x1ull
+#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
+
+#define RHF_EGR_INDEX_SHIFT	16
+#define RHF_EGR_INDEX_MASK	0x7ffull
+#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
+
+#define RHF_DC_INFO_SHIFT	27
+#define RHF_DC_INFO_MASK	0x1ull
+#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
+
+#define RHF_RCV_SEQ_SHIFT	28
+#define RHF_RCV_SEQ_MASK	0xfull
+#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
+
+#define RHF_EGR_OFFSET_SHIFT	32
+#define RHF_EGR_OFFSET_MASK	0xfffull
+#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
+#define RHF_HDRQ_OFFSET_SHIFT	44
+#define RHF_HDRQ_OFFSET_MASK	0x1ffull
+#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
+#define RHF_K_HDR_LEN_ERR	(0x1ull << 53)
+#define RHF_DC_UNC_ERR		(0x1ull << 54)
+#define RHF_DC_ERR		(0x1ull << 55)
+#define RHF_RCV_TYPE_ERR_SHIFT	56
+#define RHF_RCV_TYPE_ERR_MASK	0x7ul
+#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
+#define RHF_TID_ERR		(0x1ull << 59)
+#define RHF_LEN_ERR		(0x1ull << 60)
+#define RHF_ECC_ERR		(0x1ull << 61)
+#define RHF_VCRC_ERR		(0x1ull << 62)
+#define RHF_ICRC_ERR		(0x1ull << 63)
+
+#define RHF_ERROR_SMASK 0xffe0000000000000ull		/* bits 63:53 */
+
+/* RHF receive types */
+#define RHF_RCV_TYPE_EXPECTED 0
+#define RHF_RCV_TYPE_EAGER    1
+#define RHF_RCV_TYPE_IB       2 /* normal IB, IB Raw, or IPv6 */
+#define RHF_RCV_TYPE_ERROR    3
+#define RHF_RCV_TYPE_BYPASS   4
+#define RHF_RCV_TYPE_INVALID5 5
+#define RHF_RCV_TYPE_INVALID6 6
+#define RHF_RCV_TYPE_INVALID7 7
+
+/* RHF receive type error - expected packet errors */
+#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR	0x2
+#define RHF_RTE_EXPECTED_FLOW_GEN_ERR	0x4
+
+/* RHF receive type error - eager packet errors */
+#define RHF_RTE_EAGER_NO_ERR		0x0
+
+/* RHF receive type error - IB packet errors */
+#define RHF_RTE_IB_NO_ERR		0x0
+
+/* RHF receive type error - error packet errors */
+#define RHF_RTE_ERROR_NO_ERR		0x0
+#define RHF_RTE_ERROR_OP_CODE_ERR	0x1
+#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR	0x2
+#define RHF_RTE_ERROR_KHDR_HCRC_ERR	0x3
+#define RHF_RTE_ERROR_KHDR_KVER_ERR	0x4
+#define RHF_RTE_ERROR_CONTEXT_ERR	0x5
+#define RHF_RTE_ERROR_KHDR_TID_ERR	0x6
+
+/* RHF receive type error - bypass packet errors */
+#define RHF_RTE_BYPASS_NO_ERR		0x0
+
+/*
+ * This structure contains the first field common to all protocols
+ * that employ this chip.
+ */
+struct hfi1_message_header {
+	__be16 lrh[4];
+};
+
+/* IB - LRH header constants */
+#define HFI1_LRH_GRH 0x0003      /* 1. word of IB LRH - next header: GRH */
+#define HFI1_LRH_BTH 0x0002      /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define LIM_MGMT_P_KEY       0x7FFF
+#define FULL_MGMT_P_KEY      0xFFFF
+
+#define DEFAULT_P_KEY LIM_MGMT_P_KEY
+#define HFI1_PERMISSIVE_LID 0xFFFF
+#define HFI1_AETH_CREDIT_SHIFT 24
+#define HFI1_AETH_CREDIT_MASK 0x1F
+#define HFI1_AETH_CREDIT_INVAL 0x1F
+#define HFI1_MSN_MASK 0xFFFFFF
+#define HFI1_QPN_MASK 0xFFFFFF
+#define HFI1_FECN_SHIFT 31
+#define HFI1_FECN_MASK 1
+#define HFI1_FECN_SMASK (1 << HFI1_FECN_SHIFT)
+#define HFI1_BECN_SHIFT 30
+#define HFI1_BECN_MASK 1
+#define HFI1_BECN_SMASK (1 << HFI1_BECN_SHIFT)
+#define HFI1_MULTICAST_LID_BASE 0xC000
+
+static inline __u64 rhf_to_cpu(const __le32 *rbuf)
+{
+	return __le64_to_cpu(*((__le64 *)rbuf));
+}
+
+static inline u64 rhf_err_flags(u64 rhf)
+{
+	return rhf & RHF_ERROR_SMASK;
+}
+
+static inline u32 rhf_rcv_type(u64 rhf)
+{
+	return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
+}
+
+static inline u32 rhf_rcv_type_err(u64 rhf)
+{
+	return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
+}
+
+/* return size is in bytes, not DWORDs */
+static inline u32 rhf_pkt_len(u64 rhf)
+{
+	return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
+}
+
+static inline u32 rhf_egr_index(u64 rhf)
+{
+	return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
+}
+
+static inline u32 rhf_rcv_seq(u64 rhf)
+{
+	return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
+}
+
+/* returned offset is in DWORDS */
+static inline u32 rhf_hdrq_offset(u64 rhf)
+{
+	return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline u64 rhf_use_egr_bfr(u64 rhf)
+{
+	return rhf & RHF_USE_EGR_BFR_SMASK;
+}
+
+static inline u64 rhf_dc_info(u64 rhf)
+{
+	return rhf & RHF_DC_INFO_SMASK;
+}
+
+static inline u32 rhf_egr_buf_offset(u64 rhf)
+{
+	return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
+}
+#endif /* _COMMON_H */
diff --git a/drivers/staging/rdma/hfi1/cq.c b/drivers/staging/rdma/hfi1/cq.c
new file mode 100644
index 0000000..4f046ff
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/cq.c
@@ -0,0 +1,558 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+
+#include "verbs.h"
+#include "hfi.h"
+
+/**
+ * hfi1_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @sig: true if @entry is a solicited entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited)
+{
+	struct hfi1_cq_wc *wc;
+	unsigned long flags;
+	u32 head;
+	u32 next;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	/*
+	 * Note that the head pointer might be writable by user processes.
+	 * Take care to verify it is a sane value.
+	 */
+	wc = cq->queue;
+	head = wc->head;
+	if (head >= (unsigned) cq->ibcq.cqe) {
+		head = cq->ibcq.cqe;
+		next = 0;
+	} else
+		next = head + 1;
+	if (unlikely(next == wc->tail)) {
+		spin_unlock_irqrestore(&cq->lock, flags);
+		if (cq->ibcq.event_handler) {
+			struct ib_event ev;
+
+			ev.device = cq->ibcq.device;
+			ev.element.cq = &cq->ibcq;
+			ev.event = IB_EVENT_CQ_ERR;
+			cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+		}
+		return;
+	}
+	if (cq->ip) {
+		wc->uqueue[head].wr_id = entry->wr_id;
+		wc->uqueue[head].status = entry->status;
+		wc->uqueue[head].opcode = entry->opcode;
+		wc->uqueue[head].vendor_err = entry->vendor_err;
+		wc->uqueue[head].byte_len = entry->byte_len;
+		wc->uqueue[head].ex.imm_data =
+			(__u32 __force)entry->ex.imm_data;
+		wc->uqueue[head].qp_num = entry->qp->qp_num;
+		wc->uqueue[head].src_qp = entry->src_qp;
+		wc->uqueue[head].wc_flags = entry->wc_flags;
+		wc->uqueue[head].pkey_index = entry->pkey_index;
+		wc->uqueue[head].slid = entry->slid;
+		wc->uqueue[head].sl = entry->sl;
+		wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+		wc->uqueue[head].port_num = entry->port_num;
+		/* Make sure entry is written before the head index. */
+		smp_wmb();
+	} else
+		wc->kqueue[head] = *entry;
+	wc->head = next;
+
+	if (cq->notify == IB_CQ_NEXT_COMP ||
+	    (cq->notify == IB_CQ_SOLICITED &&
+	     (solicited || entry->status != IB_WC_SUCCESS))) {
+		struct kthread_worker *worker;
+		/*
+		 * This will cause send_complete() to be called in
+		 * another thread.
+		 */
+		smp_read_barrier_depends(); /* see hfi1_cq_exit */
+		worker = cq->dd->worker;
+		if (likely(worker)) {
+			cq->notify = IB_CQ_NONE;
+			cq->triggered++;
+			queue_kthread_work(worker, &cq->comptask);
+		}
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+/**
+ * hfi1_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context.  Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+	struct hfi1_cq *cq = to_icq(ibcq);
+	struct hfi1_cq_wc *wc;
+	unsigned long flags;
+	int npolled;
+	u32 tail;
+
+	/* The kernel can only poll a kernel completion queue */
+	if (cq->ip) {
+		npolled = -EINVAL;
+		goto bail;
+	}
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	wc = cq->queue;
+	tail = wc->tail;
+	if (tail > (u32) cq->ibcq.cqe)
+		tail = (u32) cq->ibcq.cqe;
+	for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+		if (tail == wc->head)
+			break;
+		/* The kernel doesn't need a RMB since it has the lock. */
+		*entry = wc->kqueue[tail];
+		if (tail >= cq->ibcq.cqe)
+			tail = 0;
+		else
+			tail++;
+	}
+	wc->tail = tail;
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+	return npolled;
+}
+
+static void send_complete(struct kthread_work *work)
+{
+	struct hfi1_cq *cq = container_of(work, struct hfi1_cq, comptask);
+
+	/*
+	 * The completion handler will most likely rearm the notification
+	 * and poll for all pending entries.  If a new completion entry
+	 * is added while we are in this routine, queue_work()
+	 * won't call us again until we return so we check triggered to
+	 * see if we need to call the handler again.
+	 */
+	for (;;) {
+		u8 triggered = cq->triggered;
+
+		/*
+		 * IPoIB connected mode assumes the callback is from a
+		 * soft IRQ. We simulate this by blocking "bottom halves".
+		 * See the implementation for ipoib_cm_handle_tx_wc(),
+		 * netif_tx_lock_bh() and netif_tx_lock().
+		 */
+		local_bh_disable();
+		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+		local_bh_enable();
+
+		if (cq->triggered == triggered)
+			return;
+	}
+}
+
+/**
+ * hfi1_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @attr: creation attributes
+ * @context: unused by the driver
+ * @udata: user data for libibverbs.so
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *hfi1_create_cq(
+	struct ib_device *ibdev,
+	const struct ib_cq_init_attr *attr,
+	struct ib_ucontext *context,
+	struct ib_udata *udata)
+{
+	struct hfi1_ibdev *dev = to_idev(ibdev);
+	struct hfi1_cq *cq;
+	struct hfi1_cq_wc *wc;
+	struct ib_cq *ret;
+	u32 sz;
+	unsigned int entries = attr->cqe;
+
+	if (attr->flags)
+		return ERR_PTR(-EINVAL);
+
+	if (entries < 1 || entries > hfi1_max_cqes)
+		return ERR_PTR(-EINVAL);
+
+	/* Allocate the completion queue structure. */
+	cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Allocate the completion queue entries and head/tail pointers.
+	 * This is allocated separately so that it can be resized and
+	 * also mapped into user space.
+	 * We need to use vmalloc() in order to support mmap and large
+	 * numbers of entries.
+	 */
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+	else
+		sz += sizeof(struct ib_wc) * (entries + 1);
+	wc = vmalloc_user(sz);
+	if (!wc) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_cq;
+	}
+
+	/*
+	 * Return the address of the WC as the offset to mmap.
+	 * See hfi1_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+
+		cq->ip = hfi1_create_mmap_info(dev, sz, context, wc);
+		if (!cq->ip) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_wc;
+		}
+
+		err = ib_copy_to_udata(udata, &cq->ip->offset,
+				       sizeof(cq->ip->offset));
+		if (err) {
+			ret = ERR_PTR(err);
+			goto bail_ip;
+		}
+	} else
+		cq->ip = NULL;
+
+	spin_lock(&dev->n_cqs_lock);
+	if (dev->n_cqs_allocated == hfi1_max_cqs) {
+		spin_unlock(&dev->n_cqs_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_cqs_allocated++;
+	spin_unlock(&dev->n_cqs_lock);
+
+	if (cq->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	/*
+	 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+	 * The number of entries should be >= the number requested or return
+	 * an error.
+	 */
+	cq->dd = dd_from_dev(dev);
+	cq->ibcq.cqe = entries;
+	cq->notify = IB_CQ_NONE;
+	cq->triggered = 0;
+	spin_lock_init(&cq->lock);
+	init_kthread_work(&cq->comptask, send_complete);
+	wc->head = 0;
+	wc->tail = 0;
+	cq->queue = wc;
+
+	ret = &cq->ibcq;
+
+	goto done;
+
+bail_ip:
+	kfree(cq->ip);
+bail_wc:
+	vfree(wc);
+bail_cq:
+	kfree(cq);
+done:
+	return ret;
+}
+
+/**
+ * hfi1_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int hfi1_destroy_cq(struct ib_cq *ibcq)
+{
+	struct hfi1_ibdev *dev = to_idev(ibcq->device);
+	struct hfi1_cq *cq = to_icq(ibcq);
+
+	flush_kthread_work(&cq->comptask);
+	spin_lock(&dev->n_cqs_lock);
+	dev->n_cqs_allocated--;
+	spin_unlock(&dev->n_cqs_lock);
+	if (cq->ip)
+		kref_put(&cq->ip->ref, hfi1_release_mmap_info);
+	else
+		vfree(cq->queue);
+	kfree(cq);
+
+	return 0;
+}
+
+/**
+ * hfi1_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context.  Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int hfi1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+	struct hfi1_cq *cq = to_icq(ibcq);
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+	/*
+	 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+	 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+	 */
+	if (cq->notify != IB_CQ_NEXT_COMP)
+		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+	if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+	    cq->queue->head != cq->queue->tail)
+		ret = 1;
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return ret;
+}
+
+/**
+ * hfi1_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Returns 0 for success.
+ */
+int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+	struct hfi1_cq *cq = to_icq(ibcq);
+	struct hfi1_cq_wc *old_wc;
+	struct hfi1_cq_wc *wc;
+	u32 head, tail, n;
+	int ret;
+	u32 sz;
+
+	if (cqe < 1 || cqe > hfi1_max_cqes) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Need to use vmalloc() if we want to support large #s of entries.
+	 */
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+	else
+		sz += sizeof(struct ib_wc) * (cqe + 1);
+	wc = vmalloc_user(sz);
+	if (!wc) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	/* Check that we can write the offset to mmap. */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		__u64 offset = 0;
+
+		ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+		if (ret)
+			goto bail_free;
+	}
+
+	spin_lock_irq(&cq->lock);
+	/*
+	 * Make sure head and tail are sane since they
+	 * might be user writable.
+	 */
+	old_wc = cq->queue;
+	head = old_wc->head;
+	if (head > (u32) cq->ibcq.cqe)
+		head = (u32) cq->ibcq.cqe;
+	tail = old_wc->tail;
+	if (tail > (u32) cq->ibcq.cqe)
+		tail = (u32) cq->ibcq.cqe;
+	if (head < tail)
+		n = cq->ibcq.cqe + 1 + head - tail;
+	else
+		n = head - tail;
+	if (unlikely((u32)cqe < n)) {
+		ret = -EINVAL;
+		goto bail_unlock;
+	}
+	for (n = 0; tail != head; n++) {
+		if (cq->ip)
+			wc->uqueue[n] = old_wc->uqueue[tail];
+		else
+			wc->kqueue[n] = old_wc->kqueue[tail];
+		if (tail == (u32) cq->ibcq.cqe)
+			tail = 0;
+		else
+			tail++;
+	}
+	cq->ibcq.cqe = cqe;
+	wc->head = n;
+	wc->tail = 0;
+	cq->queue = wc;
+	spin_unlock_irq(&cq->lock);
+
+	vfree(old_wc);
+
+	if (cq->ip) {
+		struct hfi1_ibdev *dev = to_idev(ibcq->device);
+		struct hfi1_mmap_info *ip = cq->ip;
+
+		hfi1_update_mmap_info(dev, ip, sz, wc);
+
+		/*
+		 * Return the offset to mmap.
+		 * See hfi1_mmap() for details.
+		 */
+		if (udata && udata->outlen >= sizeof(__u64)) {
+			ret = ib_copy_to_udata(udata, &ip->offset,
+					       sizeof(ip->offset));
+			if (ret)
+				goto bail;
+		}
+
+		spin_lock_irq(&dev->pending_lock);
+		if (list_empty(&ip->pending_mmaps))
+			list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = 0;
+	goto bail;
+
+bail_unlock:
+	spin_unlock_irq(&cq->lock);
+bail_free:
+	vfree(wc);
+bail:
+	return ret;
+}
+
+int hfi1_cq_init(struct hfi1_devdata *dd)
+{
+	int ret = 0;
+	int cpu;
+	struct task_struct *task;
+
+	if (dd->worker)
+		return 0;
+	dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL);
+	if (!dd->worker)
+		return -ENOMEM;
+	init_kthread_worker(dd->worker);
+	task = kthread_create_on_node(
+		kthread_worker_fn,
+		dd->worker,
+		dd->assigned_node_id,
+		"hfi1_cq%d", dd->unit);
+	if (IS_ERR(task))
+		goto task_fail;
+	cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id));
+	kthread_bind(task, cpu);
+	wake_up_process(task);
+out:
+	return ret;
+task_fail:
+	ret = PTR_ERR(task);
+	kfree(dd->worker);
+	dd->worker = NULL;
+	goto out;
+}
+
+void hfi1_cq_exit(struct hfi1_devdata *dd)
+{
+	struct kthread_worker *worker;
+
+	worker = dd->worker;
+	if (!worker)
+		return;
+	/* blocks future queuing from send_complete() */
+	dd->worker = NULL;
+	smp_wmb(); /* See hfi1_cq_enter */
+	flush_kthread_worker(worker);
+	kthread_stop(worker->task);
+	kfree(worker);
+}
diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/staging/rdma/hfi1/debugfs.c
new file mode 100644
index 0000000..acd2269
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/debugfs.c
@@ -0,0 +1,899 @@
+#ifdef CONFIG_DEBUG_FS
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+
+#include "hfi.h"
+#include "debugfs.h"
+#include "device.h"
+#include "qp.h"
+#include "sdma.h"
+
+static struct dentry *hfi1_dbg_root;
+
+#define private2dd(file) (file_inode(file)->i_private)
+#define private2ppd(file) (file_inode(file)->i_private)
+
+#define DEBUGFS_SEQ_FILE_OPS(name) \
+static const struct seq_operations _##name##_seq_ops = { \
+	.start = _##name##_seq_start, \
+	.next  = _##name##_seq_next, \
+	.stop  = _##name##_seq_stop, \
+	.show  = _##name##_seq_show \
+}
+#define DEBUGFS_SEQ_FILE_OPEN(name) \
+static int _##name##_open(struct inode *inode, struct file *s) \
+{ \
+	struct seq_file *seq; \
+	int ret; \
+	ret =  seq_open(s, &_##name##_seq_ops); \
+	if (ret) \
+		return ret; \
+	seq = s->private_data; \
+	seq->private = inode->i_private; \
+	return 0; \
+}
+
+#define DEBUGFS_FILE_OPS(name) \
+static const struct file_operations _##name##_file_ops = { \
+	.owner   = THIS_MODULE, \
+	.open    = _##name##_open, \
+	.read    = seq_read, \
+	.llseek  = seq_lseek, \
+	.release = seq_release \
+}
+
+#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode)	\
+do { \
+	struct dentry *ent; \
+	ent = debugfs_create_file(name, mode, parent, \
+		data, ops); \
+	if (!ent) \
+		pr_warn("create of %s failed\n", name); \
+} while (0)
+
+
+#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
+	DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
+
+static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+	struct hfi1_opcode_stats_perctx *opstats;
+
+	rcu_read_lock();
+	if (*pos >= ARRAY_SIZE(opstats->stats))
+		return NULL;
+	return pos;
+}
+
+static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct hfi1_opcode_stats_perctx *opstats;
+
+	++*pos;
+	if (*pos >= ARRAY_SIZE(opstats->stats))
+		return NULL;
+	return pos;
+}
+
+
+static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+static int _opcode_stats_seq_show(struct seq_file *s, void *v)
+{
+	loff_t *spos = v;
+	loff_t i = *spos, j;
+	u64 n_packets = 0, n_bytes = 0;
+	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+	for (j = 0; j < dd->first_user_ctxt; j++) {
+		if (!dd->rcd[j])
+			continue;
+		n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
+		n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
+	}
+	if (!n_packets && !n_bytes)
+		return SEQ_SKIP;
+	seq_printf(s, "%02llx %llu/%llu\n", i,
+		(unsigned long long) n_packets,
+		(unsigned long long) n_bytes);
+
+	return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(opcode_stats);
+DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
+DEBUGFS_FILE_OPS(opcode_stats);
+
+static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+	if (*pos >= dd->first_user_ctxt)
+		return NULL;
+	return pos;
+}
+
+static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+	if (v == SEQ_START_TOKEN)
+		return pos;
+
+	++*pos;
+	if (*pos >= dd->first_user_ctxt)
+		return NULL;
+	return pos;
+}
+
+static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
+{
+	/* nothing allocated */
+}
+
+static int _ctx_stats_seq_show(struct seq_file *s, void *v)
+{
+	loff_t *spos;
+	loff_t i, j;
+	u64 n_packets = 0;
+	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(s, "Ctx:npkts\n");
+		return 0;
+	}
+
+	spos = v;
+	i = *spos;
+
+	if (!dd->rcd[i])
+		return SEQ_SKIP;
+
+	for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
+		n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
+
+	if (!n_packets)
+		return SEQ_SKIP;
+
+	seq_printf(s, "  %llu:%llu\n", i, n_packets);
+	return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(ctx_stats);
+DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
+DEBUGFS_FILE_OPS(ctx_stats);
+
+static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+	struct qp_iter *iter;
+	loff_t n = *pos;
+
+	rcu_read_lock();
+	iter = qp_iter_init(s->private);
+	if (!iter)
+		return NULL;
+
+	while (n--) {
+		if (qp_iter_next(iter)) {
+			kfree(iter);
+			return NULL;
+		}
+	}
+
+	return iter;
+}
+
+static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct qp_iter *iter = iter_ptr;
+
+	(*pos)++;
+
+	if (qp_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
+{
+	struct qp_iter *iter = iter_ptr;
+
+	if (!iter)
+		return 0;
+
+	qp_iter_print(s, iter);
+
+	return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(qp_stats);
+DEBUGFS_SEQ_FILE_OPEN(qp_stats)
+DEBUGFS_FILE_OPS(qp_stats);
+
+static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+	struct hfi1_ibdev *ibd;
+	struct hfi1_devdata *dd;
+
+	rcu_read_lock();
+	ibd = (struct hfi1_ibdev *)s->private;
+	dd = dd_from_dev(ibd);
+	if (!dd->per_sdma || *pos >= dd->num_sdma)
+		return NULL;
+	return pos;
+}
+
+static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+	++*pos;
+	if (!dd->per_sdma || *pos >= dd->num_sdma)
+		return NULL;
+	return pos;
+}
+
+
+static void _sdes_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+static int _sdes_seq_show(struct seq_file *s, void *v)
+{
+	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+	loff_t *spos = v;
+	loff_t i = *spos;
+
+	sdma_seqfile_dump_sde(s, &dd->per_sdma[i]);
+	return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(sdes);
+DEBUGFS_SEQ_FILE_OPEN(sdes)
+DEBUGFS_FILE_OPS(sdes);
+
+/* read the per-device counters */
+static ssize_t dev_counters_read(struct file *file, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	u64 *counters;
+	size_t avail;
+	struct hfi1_devdata *dd;
+	ssize_t rval;
+
+	rcu_read_lock();
+	dd = private2dd(file);
+	avail = hfi1_read_cntrs(dd, *ppos, NULL, &counters);
+	rval =  simple_read_from_buffer(buf, count, ppos, counters, avail);
+	rcu_read_unlock();
+	return rval;
+}
+
+/* read the per-device counters */
+static ssize_t dev_names_read(struct file *file, char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	char *names;
+	size_t avail;
+	struct hfi1_devdata *dd;
+	ssize_t rval;
+
+	rcu_read_lock();
+	dd = private2dd(file);
+	avail = hfi1_read_cntrs(dd, *ppos, &names, NULL);
+	rval =  simple_read_from_buffer(buf, count, ppos, names, avail);
+	rcu_read_unlock();
+	return rval;
+}
+
+struct counter_info {
+	char *name;
+	const struct file_operations ops;
+};
+
+/*
+ * Could use file_inode(file)->i_ino to figure out which file,
+ * instead of separate routine for each, but for now, this works...
+ */
+
+/* read the per-port names (same for each port) */
+static ssize_t portnames_read(struct file *file, char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	char *names;
+	size_t avail;
+	struct hfi1_devdata *dd;
+	ssize_t rval;
+
+	rcu_read_lock();
+	dd = private2dd(file);
+	/* port number n/a here since names are constant */
+	avail = hfi1_read_portcntrs(dd, *ppos, 0, &names, NULL);
+	rval = simple_read_from_buffer(buf, count, ppos, names, avail);
+	rcu_read_unlock();
+	return rval;
+}
+
+/* read the per-port counters */
+static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	u64 *counters;
+	size_t avail;
+	struct hfi1_devdata *dd;
+	struct hfi1_pportdata *ppd;
+	ssize_t rval;
+
+	rcu_read_lock();
+	ppd = private2ppd(file);
+	dd = ppd->dd;
+	avail = hfi1_read_portcntrs(dd, *ppos, ppd->port - 1, NULL, &counters);
+	rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
+	rcu_read_unlock();
+	return rval;
+}
+
+/*
+ * read the per-port QSFP data for ppd
+ */
+static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct hfi1_pportdata *ppd;
+	char *tmp;
+	int ret;
+
+	rcu_read_lock();
+	ppd = private2ppd(file);
+	tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!tmp) {
+		rcu_read_unlock();
+		return -ENOMEM;
+	}
+
+	ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
+	if (ret > 0)
+		ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+	rcu_read_unlock();
+	kfree(tmp);
+	return ret;
+}
+
+/* Do an i2c write operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos, u32 target)
+{
+	struct hfi1_pportdata *ppd;
+	char *buff;
+	int ret;
+	int i2c_addr;
+	int offset;
+	int total_written;
+
+	rcu_read_lock();
+	ppd = private2ppd(file);
+
+	buff = kmalloc(count, GFP_KERNEL);
+	if (!buff) {
+		ret = -ENOMEM;
+		goto _return;
+	}
+
+	ret = copy_from_user(buff, buf, count);
+	if (ret > 0) {
+		ret = -EFAULT;
+		goto _free;
+	}
+
+	i2c_addr = (*ppos >> 16) & 0xff;
+	offset = *ppos & 0xffff;
+
+	total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count);
+	if (total_written < 0) {
+		ret = total_written;
+		goto _free;
+	}
+
+	*ppos += total_written;
+
+	ret = total_written;
+
+ _free:
+	kfree(buff);
+ _return:
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Do an i2c write operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	return __i2c_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c write operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	return __i2c_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do an i2c read operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos, u32 target)
+{
+	struct hfi1_pportdata *ppd;
+	char *buff;
+	int ret;
+	int i2c_addr;
+	int offset;
+	int total_read;
+
+	rcu_read_lock();
+	ppd = private2ppd(file);
+
+	buff = kmalloc(count, GFP_KERNEL);
+	if (!buff) {
+		ret = -ENOMEM;
+		goto _return;
+	}
+
+	i2c_addr = (*ppos >> 16) & 0xff;
+	offset = *ppos & 0xffff;
+
+	total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
+	if (total_read < 0) {
+		ret = total_read;
+		goto _free;
+	}
+
+	*ppos += total_read;
+
+	ret = copy_to_user(buf, buff, total_read);
+	if (ret > 0) {
+		ret = -EFAULT;
+		goto _free;
+	}
+
+	ret = total_read;
+
+ _free:
+	kfree(buff);
+ _return:
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Do an i2c read operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return __i2c_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c read operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return __i2c_debugfs_read(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP write operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos, u32 target)
+{
+	struct hfi1_pportdata *ppd;
+	char *buff;
+	int ret;
+	int total_written;
+
+	rcu_read_lock();
+	if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+		ret = -EINVAL;
+		goto _return;
+	}
+
+	ppd = private2ppd(file);
+
+	buff = kmalloc(count, GFP_KERNEL);
+	if (!buff) {
+		ret = -ENOMEM;
+		goto _return;
+	}
+
+	ret = copy_from_user(buff, buf, count);
+	if (ret > 0) {
+		ret = -EFAULT;
+		goto _free;
+	}
+
+	total_written = qsfp_write(ppd, target, *ppos, buff, count);
+	if (total_written < 0) {
+		ret = total_written;
+		goto _free;
+	}
+
+	*ppos += total_written;
+
+	ret = total_written;
+
+ _free:
+	kfree(buff);
+ _return:
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	return __qsfp_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	return __qsfp_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP read operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos, u32 target)
+{
+	struct hfi1_pportdata *ppd;
+	char *buff;
+	int ret;
+	int total_read;
+
+	rcu_read_lock();
+	if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+		ret = -EINVAL;
+		goto _return;
+	}
+
+	ppd = private2ppd(file);
+
+	buff = kmalloc(count, GFP_KERNEL);
+	if (!buff) {
+		ret = -ENOMEM;
+		goto _return;
+	}
+
+	total_read = qsfp_read(ppd, target, *ppos, buff, count);
+	if (total_read < 0) {
+		ret = total_read;
+		goto _free;
+	}
+
+	*ppos += total_read;
+
+	ret = copy_to_user(buf, buff, total_read);
+	if (ret > 0) {
+		ret = -EFAULT;
+		goto _free;
+	}
+
+	ret = total_read;
+
+ _free:
+	kfree(buff);
+ _return:
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return __qsfp_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return __qsfp_debugfs_read(file, buf, count, ppos, 1);
+}
+
+#define DEBUGFS_OPS(nm, readroutine, writeroutine)	\
+{ \
+	.name = nm, \
+	.ops = { \
+		.read = readroutine, \
+		.write = writeroutine, \
+		.llseek = generic_file_llseek, \
+	}, \
+}
+
+static const struct counter_info cntr_ops[] = {
+	DEBUGFS_OPS("counter_names", dev_names_read, NULL),
+	DEBUGFS_OPS("counters", dev_counters_read, NULL),
+	DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
+};
+
+static const struct counter_info port_cntr_ops[] = {
+	DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
+	DEBUGFS_OPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write),
+	DEBUGFS_OPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write),
+	DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
+	DEBUGFS_OPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write),
+	DEBUGFS_OPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write),
+};
+
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+	char name[sizeof("port0counters") + 1];
+	char link[10];
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+	struct hfi1_pportdata *ppd;
+	int unit = dd->unit;
+	int i, j;
+
+	if (!hfi1_dbg_root)
+		return;
+	snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
+	snprintf(link, sizeof(link), "%d", unit);
+	ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
+	if (!ibd->hfi1_ibdev_dbg) {
+		pr_warn("create of %s failed\n", name);
+		return;
+	}
+	ibd->hfi1_ibdev_link =
+		debugfs_create_symlink(link, hfi1_dbg_root, name);
+	if (!ibd->hfi1_ibdev_link) {
+		pr_warn("create of %s symlink failed\n", name);
+		return;
+	}
+	DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
+	DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
+	DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
+	DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
+	/* dev counter files */
+	for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
+		DEBUGFS_FILE_CREATE(cntr_ops[i].name,
+				    ibd->hfi1_ibdev_dbg,
+				    dd,
+				    &cntr_ops[i].ops, S_IRUGO);
+	/* per port files */
+	for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
+		for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
+			snprintf(name,
+				 sizeof(name),
+				 port_cntr_ops[i].name,
+				 j + 1);
+			DEBUGFS_FILE_CREATE(name,
+					    ibd->hfi1_ibdev_dbg,
+					    ppd,
+					    &port_cntr_ops[i].ops,
+					    port_cntr_ops[i].ops.write == NULL ?
+					    S_IRUGO : S_IRUGO|S_IWUSR);
+		}
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+	if (!hfi1_dbg_root)
+		goto out;
+	debugfs_remove(ibd->hfi1_ibdev_link);
+	debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
+out:
+	ibd->hfi1_ibdev_dbg = NULL;
+	synchronize_rcu();
+}
+
+/*
+ * driver stats field names, one line per stat, single string.  Used by
+ * programs like hfistats to print the stats in a way which works for
+ * different versions of drivers, without changing program source.
+ * if hfi1_ib_stats changes, this needs to change.  Names need to be
+ * 12 chars or less (w/o newline), for proper display by hfistats utility.
+ */
+static const char * const hfi1_statnames[] = {
+	/* must be element 0*/
+	"KernIntr",
+	"ErrorIntr",
+	"Tx_Errs",
+	"Rcv_Errs",
+	"H/W_Errs",
+	"NoPIOBufs",
+	"CtxtsOpen",
+	"RcvLen_Errs",
+	"EgrBufFull",
+	"EgrHdrFull"
+};
+
+static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+	rcu_read_lock();
+	if (*pos >= ARRAY_SIZE(hfi1_statnames))
+		return NULL;
+	return pos;
+}
+
+static void *_driver_stats_names_seq_next(
+	struct seq_file *s,
+	void *v,
+	loff_t *pos)
+{
+	++*pos;
+	if (*pos >= ARRAY_SIZE(hfi1_statnames))
+		return NULL;
+	return pos;
+}
+
+static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
+{
+	loff_t *spos = v;
+
+	seq_printf(s, "%s\n", hfi1_statnames[*spos]);
+	return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
+DEBUGFS_FILE_OPS(driver_stats_names);
+
+static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+	rcu_read_lock();
+	if (*pos >= ARRAY_SIZE(hfi1_statnames))
+		return NULL;
+	return pos;
+}
+
+static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	++*pos;
+	if (*pos >= ARRAY_SIZE(hfi1_statnames))
+		return NULL;
+	return pos;
+}
+
+static void _driver_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+static u64 hfi1_sps_ints(void)
+{
+	unsigned long flags;
+	struct hfi1_devdata *dd;
+	u64 sps_ints = 0;
+
+	spin_lock_irqsave(&hfi1_devs_lock, flags);
+	list_for_each_entry(dd, &hfi1_dev_list, list) {
+		sps_ints += get_all_cpu_total(dd->int_counter);
+	}
+	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	return sps_ints;
+}
+
+static int _driver_stats_seq_show(struct seq_file *s, void *v)
+{
+	loff_t *spos = v;
+	char *buffer;
+	u64 *stats = (u64 *)&hfi1_stats;
+	size_t sz = seq_get_buf(s, &buffer);
+
+	if (sz < sizeof(u64))
+		return SEQ_SKIP;
+	/* special case for interrupts */
+	if (*spos == 0)
+		*(u64 *)buffer = hfi1_sps_ints();
+	else
+		*(u64 *)buffer = stats[*spos];
+	seq_commit(s,  sizeof(u64));
+	return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats)
+DEBUGFS_FILE_OPS(driver_stats);
+
+void hfi1_dbg_init(void)
+{
+	hfi1_dbg_root  = debugfs_create_dir(DRIVER_NAME, NULL);
+	if (!hfi1_dbg_root)
+		pr_warn("init of debugfs failed\n");
+	DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
+	DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+}
+
+void hfi1_dbg_exit(void)
+{
+	debugfs_remove_recursive(hfi1_dbg_root);
+	hfi1_dbg_root = NULL;
+}
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/staging/rdma/hfi1/debugfs.h
new file mode 100644
index 0000000..92d6fe1
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/debugfs.h
@@ -0,0 +1,78 @@
+#ifndef _HFI1_DEBUGFS_H
+#define _HFI1_DEBUGFS_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_ibdev;
+#ifdef CONFIG_DEBUG_FS
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
+void hfi1_dbg_init(void);
+void hfi1_dbg_exit(void);
+#else
+static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+}
+
+void hfi1_dbg_init(void)
+{
+}
+
+void hfi1_dbg_exit(void)
+{
+}
+
+#endif
+
+#endif                          /* _HFI1_DEBUGFS_H */
diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/staging/rdma/hfi1/device.c
new file mode 100644
index 0000000..07c87a87
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/device.c
@@ -0,0 +1,142 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+
+#include "hfi.h"
+#include "device.h"
+
+static struct class *class;
+static dev_t hfi1_dev;
+
+int hfi1_cdev_init(int minor, const char *name,
+		   const struct file_operations *fops,
+		   struct cdev *cdev, struct device **devp)
+{
+	const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
+	struct device *device = NULL;
+	int ret;
+
+	cdev_init(cdev, fops);
+	cdev->owner = THIS_MODULE;
+	kobject_set_name(&cdev->kobj, name);
+
+	ret = cdev_add(cdev, dev, 1);
+	if (ret < 0) {
+		pr_err("Could not add cdev for minor %d, %s (err %d)\n",
+		       minor, name, -ret);
+		goto done;
+	}
+
+	device = device_create(class, NULL, dev, NULL, "%s", name);
+	if (!IS_ERR(device))
+		goto done;
+	ret = PTR_ERR(device);
+	device = NULL;
+	pr_err("Could not create device for minor %d, %s (err %d)\n",
+	       minor, name, -ret);
+	cdev_del(cdev);
+done:
+	*devp = device;
+	return ret;
+}
+
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
+{
+	struct device *device = *devp;
+
+	if (device) {
+		device_unregister(device);
+		*devp = NULL;
+
+		cdev_del(cdev);
+	}
+}
+
+static const char *hfi1_class_name = "hfi1";
+
+const char *class_name(void)
+{
+	return hfi1_class_name;
+}
+
+int __init dev_init(void)
+{
+	int ret;
+
+	ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
+	if (ret < 0) {
+		pr_err("Could not allocate chrdev region (err %d)\n", -ret);
+		goto done;
+	}
+
+	class = class_create(THIS_MODULE, class_name());
+	if (IS_ERR(class)) {
+		ret = PTR_ERR(class);
+		pr_err("Could not create device class (err %d)\n", -ret);
+		unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+	}
+
+done:
+	return ret;
+}
+
+void dev_cleanup(void)
+{
+	if (class) {
+		class_destroy(class);
+		class = NULL;
+	}
+
+	unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+}
diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/staging/rdma/hfi1/device.h
new file mode 100644
index 0000000..98caecd
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/device.h
@@ -0,0 +1,61 @@
+#ifndef _HFI1_DEVICE_H
+#define _HFI1_DEVICE_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+int hfi1_cdev_init(int minor, const char *name,
+		   const struct file_operations *fops,
+		   struct cdev *cdev, struct device **devp);
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
+const char *class_name(void);
+int __init dev_init(void);
+void dev_cleanup(void);
+
+#endif                          /* _HFI1_DEVICE_H */
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c
new file mode 100644
index 0000000..6777d6b
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/diag.c
@@ -0,0 +1,1873 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains support for diagnostic functions.  It is accessed by
+ * opening the hfi1_diag device, normally minor number 129.  Diagnostic use
+ * of the chip may render the chip or board unusable until the driver
+ * is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <rdma/ib_smi.h>
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+#define snoop_dbg(fmt, ...) \
+	hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__)
+
+/* Snoop option mask */
+#define SNOOP_DROP_SEND	(1 << 0)
+#define SNOOP_USE_METADATA	(1 << 1)
+
+static u8 snoop_flags;
+
+/*
+ * Extract packet length from LRH header.
+ * Why & 0x7FF? Because len is only 11 bits in case it wasn't 0'd we throw the
+ * bogus bits away. This is in Dwords so multiply by 4 to get size in bytes
+ */
+#define HFI1_GET_PKT_LEN(x)      (((be16_to_cpu((x)->lrh[2]) & 0x7FF)) << 2)
+
+enum hfi1_filter_status {
+	HFI1_FILTER_HIT,
+	HFI1_FILTER_ERR,
+	HFI1_FILTER_MISS
+};
+
+/* snoop processing functions */
+rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = {
+	[RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler,
+	[RHF_RCV_TYPE_EAGER]    = snoop_recv_handler,
+	[RHF_RCV_TYPE_IB]       = snoop_recv_handler,
+	[RHF_RCV_TYPE_ERROR]    = snoop_recv_handler,
+	[RHF_RCV_TYPE_BYPASS]   = snoop_recv_handler,
+	[RHF_RCV_TYPE_INVALID5] = process_receive_invalid,
+	[RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
+	[RHF_RCV_TYPE_INVALID7] = process_receive_invalid
+};
+
+/* Snoop packet structure */
+struct snoop_packet {
+	struct list_head list;
+	u32 total_len;
+	u8 data[];
+};
+
+/* Do not make these an enum or it will blow up the capture_md */
+#define PKT_DIR_EGRESS 0x0
+#define PKT_DIR_INGRESS 0x1
+
+/* Packet capture metadata returned to the user with the packet. */
+struct capture_md {
+	u8 port;
+	u8 dir;
+	u8 reserved[6];
+	union {
+		u64 pbc;
+		u64 rhf;
+	} u;
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev diagpkt_cdev;
+static struct device *diagpkt_device;
+
+static ssize_t diagpkt_write(struct file *fp, const char __user *data,
+				 size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+	.owner = THIS_MODULE,
+	.write = diagpkt_write,
+	.llseek = noop_llseek,
+};
+
+/*
+ * This is used for communication with user space for snoop extended IOCTLs
+ */
+struct hfi1_link_info {
+	__be64 node_guid;
+	u8 port_mode;
+	u8 port_state;
+	u16 link_speed_active;
+	u16 link_width_active;
+	u16 vl15_init;
+	u8 port_number;
+	/*
+	 * Add padding to make this a full IB SMP payload. Note: changing the
+	 * size of this structure will make the IOCTLs created with _IOWR
+	 * change.
+	 * Be sure to run tests on all IOCTLs when making changes to this
+	 * structure.
+	 */
+	u8 res[47];
+};
+
+/*
+ * This starts our ioctl sequence numbers *way* off from the ones
+ * defined in ib_core.
+ */
+#define SNOOP_CAPTURE_VERSION 0x1
+
+#define IB_IOCTL_MAGIC          0x1b /* See Documentation/ioctl-number.txt */
+#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC
+#define HFI1_SNOOP_IOC_BASE_SEQ 0x80
+
+#define HFI1_SNOOP_IOCGETLINKSTATE \
+	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ)
+#define HFI1_SNOOP_IOCSETLINKSTATE \
+	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+1)
+#define HFI1_SNOOP_IOCCLEARQUEUE \
+	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+2)
+#define HFI1_SNOOP_IOCCLEARFILTER \
+	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+3)
+#define HFI1_SNOOP_IOCSETFILTER \
+	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+4)
+#define HFI1_SNOOP_IOCGETVERSION \
+	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+5)
+#define HFI1_SNOOP_IOCSET_OPTS \
+	_IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6)
+
+/*
+ * These offsets +6/+7 could change, but these are already known and used
+ * IOCTL numbers so don't change them without a good reason.
+ */
+#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \
+	_IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6, \
+		struct hfi1_link_info)
+#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \
+	_IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+7, \
+		struct hfi1_link_info)
+
+static int hfi1_snoop_open(struct inode *in, struct file *fp);
+static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
+				size_t pkt_len, loff_t *off);
+static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
+				 size_t count, loff_t *off);
+static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg);
+static unsigned int hfi1_snoop_poll(struct file *fp,
+					struct poll_table_struct *wait);
+static int hfi1_snoop_release(struct inode *in, struct file *fp);
+
+struct hfi1_packet_filter_command {
+	int opcode;
+	int length;
+	void *value_ptr;
+};
+
+/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */
+#define HFI1_SNOOP_INGRESS 0x1
+#define HFI1_SNOOP_EGRESS  0x2
+
+enum hfi1_packet_filter_opcodes {
+	FILTER_BY_LID,
+	FILTER_BY_DLID,
+	FILTER_BY_MAD_MGMT_CLASS,
+	FILTER_BY_QP_NUMBER,
+	FILTER_BY_PKT_TYPE,
+	FILTER_BY_SERVICE_LEVEL,
+	FILTER_BY_PKEY,
+	FILTER_BY_DIRECTION,
+};
+
+static const struct file_operations snoop_file_ops = {
+	.owner = THIS_MODULE,
+	.open = hfi1_snoop_open,
+	.read = hfi1_snoop_read,
+	.unlocked_ioctl = hfi1_ioctl,
+	.poll = hfi1_snoop_poll,
+	.write = hfi1_snoop_write,
+	.release = hfi1_snoop_release
+};
+
+struct hfi1_filter_array {
+	int (*filter)(void *, void *, void *);
+};
+
+static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
+				      void *value);
+static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
+				     void *value);
+static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
+					void *value);
+static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value);
+
+static struct hfi1_filter_array hfi1_filters[] = {
+	{ hfi1_filter_lid },
+	{ hfi1_filter_dlid },
+	{ hfi1_filter_mad_mgmt_class },
+	{ hfi1_filter_qp_number },
+	{ hfi1_filter_ibpacket_type },
+	{ hfi1_filter_ib_service_level },
+	{ hfi1_filter_ib_pkey },
+	{ hfi1_filter_direction },
+};
+
+#define HFI1_MAX_FILTERS	ARRAY_SIZE(hfi1_filters)
+#define HFI1_DIAG_MINOR_BASE	129
+
+static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name);
+
+int hfi1_diag_add(struct hfi1_devdata *dd)
+{
+	char name[16];
+	int ret = 0;
+
+	snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(),
+		 dd->unit);
+	/*
+	 * Do this for each device as opposed to the normal diagpkt
+	 * interface which is one per host
+	 */
+	ret = hfi1_snoop_add(dd, name);
+	if (ret)
+		dd_dev_err(dd, "Unable to init snoop/capture device");
+
+	snprintf(name, sizeof(name), "%s_diagpkt", class_name());
+	if (atomic_inc_return(&diagpkt_count) == 1) {
+		ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name,
+				     &diagpkt_file_ops, &diagpkt_cdev,
+				     &diagpkt_device);
+	}
+
+	return ret;
+}
+
+/* this must be called w/ dd->snoop_in_lock held */
+static void drain_snoop_list(struct list_head *queue)
+{
+	struct list_head *pos, *q;
+	struct snoop_packet *packet;
+
+	list_for_each_safe(pos, q, queue) {
+		packet = list_entry(pos, struct snoop_packet, list);
+		list_del(pos);
+		kfree(packet);
+	}
+}
+
+static void hfi1_snoop_remove(struct hfi1_devdata *dd)
+{
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+	drain_snoop_list(&dd->hfi1_snoop.queue);
+	hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
+	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+}
+
+void hfi1_diag_remove(struct hfi1_devdata *dd)
+{
+
+	hfi1_snoop_remove(dd);
+	if (atomic_dec_and_test(&diagpkt_count))
+		hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
+	hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
+}
+
+
+/*
+ * Allocated structure shared between the credit return mechanism and
+ * diagpkt_send().
+ */
+struct diagpkt_wait {
+	struct completion credits_returned;
+	int code;
+	atomic_t count;
+};
+
+/*
+ * When each side is finished with the structure, they call this.
+ * The last user frees the structure.
+ */
+static void put_diagpkt_wait(struct diagpkt_wait *wait)
+{
+	if (atomic_dec_and_test(&wait->count))
+		kfree(wait);
+}
+
+/*
+ * Callback from the credit return code.  Set the complete, which
+ * will let diapkt_send() continue.
+ */
+static void diagpkt_complete(void *arg, int code)
+{
+	struct diagpkt_wait *wait = (struct diagpkt_wait *)arg;
+
+	wait->code = code;
+	complete(&wait->credits_returned);
+	put_diagpkt_wait(wait);	/* finished with the structure */
+}
+
+/**
+ * diagpkt_send - send a packet
+ * @dp: diag packet descriptor
+ */
+static ssize_t diagpkt_send(struct diag_pkt *dp)
+{
+	struct hfi1_devdata *dd;
+	struct send_context *sc;
+	struct pio_buf *pbuf;
+	u32 *tmpbuf = NULL;
+	ssize_t ret = 0;
+	u32 pkt_len, total_len;
+	pio_release_cb credit_cb = NULL;
+	void *credit_arg = NULL;
+	struct diagpkt_wait *wait = NULL;
+
+	dd = hfi1_lookup(dp->unit);
+	if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) {
+		ret = -ENODEV;
+		goto bail;
+	}
+	if (!(dd->flags & HFI1_INITTED)) {
+		/* no hardware, freeze, etc. */
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	if (dp->version != _DIAG_PKT_VERS) {
+		dd_dev_err(dd, "Invalid version %u for diagpkt_write\n",
+			    dp->version);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* send count must be an exact number of dwords */
+	if (dp->len & 3) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* there is only port 1 */
+	if (dp->port != 1) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* need a valid context */
+	if (dp->sw_index >= dd->num_send_contexts) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	/* can only use kernel contexts */
+	if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	/* must be allocated */
+	sc = dd->send_contexts[dp->sw_index].sc;
+	if (!sc) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	/* must be enabled */
+	if (!(sc->flags & SCF_ENABLED)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* allocate a buffer and copy the data in */
+	tmpbuf = vmalloc(dp->len);
+	if (!tmpbuf) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	if (copy_from_user(tmpbuf,
+			   (const void __user *) (unsigned long) dp->data,
+			   dp->len)) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	/*
+	 * pkt_len is how much data we have to write, includes header and data.
+	 * total_len is length of the packet in Dwords plus the PBC should not
+	 * include the CRC.
+	 */
+	pkt_len = dp->len >> 2;
+	total_len = pkt_len + 2; /* PBC + packet */
+
+	/* if 0, fill in a default */
+	if (dp->pbc == 0) {
+		struct hfi1_pportdata *ppd = dd->pport;
+
+		hfi1_cdbg(PKT, "Generating PBC");
+		dp->pbc = create_pbc(ppd, 0, 0, 0, total_len);
+	} else {
+		hfi1_cdbg(PKT, "Using passed in PBC");
+	}
+
+	hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc);
+
+	/*
+	 * The caller wants to wait until the packet is sent and to
+	 * check for errors.  The best we can do is wait until
+	 * the buffer credits are returned and check if any packet
+	 * error has occurred.  If there are any late errors, this
+	 * could miss it.  If there are other senders who generate
+	 * an error, this may find it.  However, in general, it
+	 * should catch most.
+	 */
+	if (dp->flags & F_DIAGPKT_WAIT) {
+		/* always force a credit return */
+		dp->pbc |= PBC_CREDIT_RETURN;
+		/* turn on credit return interrupts */
+		sc_add_credit_return_intr(sc);
+		wait = kmalloc(sizeof(*wait), GFP_KERNEL);
+		if (!wait) {
+			ret = -ENOMEM;
+			goto bail;
+		}
+		init_completion(&wait->credits_returned);
+		atomic_set(&wait->count, 2);
+		wait->code = PRC_OK;
+
+		credit_cb = diagpkt_complete;
+		credit_arg = wait;
+	}
+
+	pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg);
+	if (!pbuf) {
+		/*
+		 * No send buffer means no credit callback.  Undo
+		 * the wait set-up that was done above.  We free wait
+		 * because the callback will never be called.
+		 */
+		if (dp->flags & F_DIAGPKT_WAIT) {
+			sc_del_credit_return_intr(sc);
+			kfree(wait);
+			wait = NULL;
+		}
+		ret = -ENOSPC;
+		goto bail;
+	}
+
+	pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len);
+	/* no flush needed as the HW knows the packet size */
+
+	ret = sizeof(*dp);
+
+	if (dp->flags & F_DIAGPKT_WAIT) {
+		/* wait for credit return */
+		ret = wait_for_completion_interruptible(
+						&wait->credits_returned);
+		/*
+		 * If the wait returns an error, the wait was interrupted,
+		 * e.g. with a ^C in the user program.  The callback is
+		 * still pending.  This is OK as the wait structure is
+		 * kmalloc'ed and the structure will free itself when
+		 * all users are done with it.
+		 *
+		 * A context disable occurs on a send context restart, so
+		 * include that in the list of errors below to check for.
+		 * NOTE: PRC_FILL_ERR is at best informational and cannot
+		 * be depended on.
+		 */
+		if (!ret && (((wait->code & PRC_STATUS_ERR)
+				|| (wait->code & PRC_FILL_ERR)
+				|| (wait->code & PRC_SC_DISABLE))))
+			ret = -EIO;
+
+		put_diagpkt_wait(wait);	/* finished with the structure */
+		sc_del_credit_return_intr(sc);
+	}
+
+bail:
+	vfree(tmpbuf);
+	return ret;
+}
+
+static ssize_t diagpkt_write(struct file *fp, const char __user *data,
+				 size_t count, loff_t *off)
+{
+	struct hfi1_devdata *dd;
+	struct send_context *sc;
+	u8 vl;
+
+	struct diag_pkt dp;
+
+	if (count != sizeof(dp))
+		return -EINVAL;
+
+	if (copy_from_user(&dp, data, sizeof(dp)))
+		return -EFAULT;
+
+	/*
+	* The Send Context is derived from the PbcVL value
+	* if PBC is populated
+	*/
+	if (dp.pbc) {
+		dd = hfi1_lookup(dp.unit);
+		if (dd == NULL)
+			return -ENODEV;
+		vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
+		sc = dd->vld[vl].sc;
+		if (sc) {
+			dp.sw_index = sc->sw_index;
+			hfi1_cdbg(
+			       PKT,
+			       "Packet sent over VL %d via Send Context %u(%u)",
+			       vl, sc->sw_index, sc->hw_context);
+		}
+	}
+
+	return diagpkt_send(&dp);
+}
+
+static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name)
+{
+	int ret = 0;
+
+	dd->hfi1_snoop.mode_flag = 0;
+	spin_lock_init(&dd->hfi1_snoop.snoop_lock);
+	INIT_LIST_HEAD(&dd->hfi1_snoop.queue);
+	init_waitqueue_head(&dd->hfi1_snoop.waitq);
+
+	ret = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name,
+			     &snoop_file_ops,
+			     &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
+
+	if (ret) {
+		dd_dev_err(dd, "Couldn't create %s device: %d", name, ret);
+		hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev,
+				 &dd->hfi1_snoop.class_dev);
+	}
+
+	return ret;
+}
+
+static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in)
+{
+	int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE;
+	struct hfi1_devdata *dd = NULL;
+
+	dd = hfi1_lookup(unit);
+	return dd;
+
+}
+
+/* clear or restore send context integrity checks */
+static void adjust_integrity_checks(struct hfi1_devdata *dd)
+{
+	struct send_context *sc;
+	unsigned long sc_flags;
+	int i;
+
+	spin_lock_irqsave(&dd->sc_lock, sc_flags);
+	for (i = 0; i < dd->num_send_contexts; i++) {
+		int enable;
+
+		sc = dd->send_contexts[i].sc;
+
+		if (!sc)
+			continue;	/* not allocated */
+
+		enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
+			 dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE;
+
+		set_pio_integrity(sc);
+
+		if (enable) /* take HFI_CAP_* flags into account */
+			hfi1_init_ctxt(sc);
+	}
+	spin_unlock_irqrestore(&dd->sc_lock, sc_flags);
+}
+
+static int hfi1_snoop_open(struct inode *in, struct file *fp)
+{
+	int ret;
+	int mode_flag = 0;
+	unsigned long flags = 0;
+	struct hfi1_devdata *dd;
+	struct list_head *queue;
+
+	mutex_lock(&hfi1_mutex);
+
+	dd = hfi1_dd_from_sc_inode(in);
+	if (dd == NULL) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	/*
+	 * File mode determines snoop or capture. Some existing user
+	 * applications expect the capture device to be able to be opened RDWR
+	 * because they expect a dedicated capture device. For this reason we
+	 * support a module param to force capture mode even if the file open
+	 * mode matches snoop.
+	 */
+	if ((fp->f_flags & O_ACCMODE) == O_RDONLY) {
+		snoop_dbg("Capture Enabled");
+		mode_flag = HFI1_PORT_CAPTURE_MODE;
+	} else if ((fp->f_flags & O_ACCMODE) == O_RDWR) {
+		snoop_dbg("Snoop Enabled");
+		mode_flag = HFI1_PORT_SNOOP_MODE;
+	} else {
+		snoop_dbg("Invalid");
+		ret =  -EINVAL;
+		goto bail;
+	}
+	queue = &dd->hfi1_snoop.queue;
+
+	/*
+	 * We are not supporting snoop and capture at the same time.
+	 */
+	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+	if (dd->hfi1_snoop.mode_flag) {
+		ret = -EBUSY;
+		spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+		goto bail;
+	}
+
+	dd->hfi1_snoop.mode_flag = mode_flag;
+	drain_snoop_list(queue);
+
+	dd->hfi1_snoop.filter_callback = NULL;
+	dd->hfi1_snoop.filter_value = NULL;
+
+	/*
+	 * Send side packet integrity checks are not helpful when snooping so
+	 * disable and re-enable when we stop snooping.
+	 */
+	if (mode_flag == HFI1_PORT_SNOOP_MODE) {
+		/* clear after snoop mode is on */
+		adjust_integrity_checks(dd); /* clear */
+
+		/*
+		 * We also do not want to be doing the DLID LMC check for
+		 * ingressed packets.
+		 */
+		dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1);
+		write_csr(dd, DCC_CFG_PORT_CONFIG1,
+			  (dd->hfi1_snoop.dcc_cfg >> 32) << 32);
+	}
+
+	/*
+	 * As soon as we set these function pointers the recv and send handlers
+	 * are active. This is a race condition so we must make sure to drain
+	 * the queue and init filter values above. Technically we should add
+	 * locking here but all that will happen is on recv a packet will get
+	 * allocated and get stuck on the snoop_lock before getting added to the
+	 * queue. Same goes for send.
+	 */
+	dd->rhf_rcv_function_map = snoop_rhf_rcv_functions;
+	dd->process_pio_send = snoop_send_pio_handler;
+	dd->process_dma_send = snoop_send_pio_handler;
+	dd->pio_inline_send = snoop_inline_pio_send;
+
+	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+	ret = 0;
+
+bail:
+	mutex_unlock(&hfi1_mutex);
+
+	return ret;
+}
+
+static int hfi1_snoop_release(struct inode *in, struct file *fp)
+{
+	unsigned long flags = 0;
+	struct hfi1_devdata *dd;
+	int mode_flag;
+
+	dd = hfi1_dd_from_sc_inode(in);
+	if (dd == NULL)
+		return -ENODEV;
+
+	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+	/* clear the snoop mode before re-adjusting send context CSRs */
+	mode_flag = dd->hfi1_snoop.mode_flag;
+	dd->hfi1_snoop.mode_flag = 0;
+
+	/*
+	 * Drain the queue and clear the filters we are done with it. Don't
+	 * forget to restore the packet integrity checks
+	 */
+	drain_snoop_list(&dd->hfi1_snoop.queue);
+	if (mode_flag == HFI1_PORT_SNOOP_MODE) {
+		/* restore after snoop mode is clear */
+		adjust_integrity_checks(dd); /* restore */
+
+		/*
+		 * Also should probably reset the DCC_CONFIG1 register for DLID
+		 * checking on incoming packets again. Use the value saved when
+		 * opening the snoop device.
+		 */
+		write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg);
+	}
+
+	dd->hfi1_snoop.filter_callback = NULL;
+	kfree(dd->hfi1_snoop.filter_value);
+	dd->hfi1_snoop.filter_value = NULL;
+
+	/*
+	 * User is done snooping and capturing, return control to the normal
+	 * handler. Re-enable SDMA handling.
+	 */
+	dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+	dd->process_pio_send = hfi1_verbs_send_pio;
+	dd->process_dma_send = hfi1_verbs_send_dma;
+	dd->pio_inline_send = pio_copy;
+
+	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+	snoop_dbg("snoop/capture device released");
+
+	return 0;
+}
+
+static unsigned int hfi1_snoop_poll(struct file *fp,
+				    struct poll_table_struct *wait)
+{
+	int ret = 0;
+	unsigned long flags = 0;
+
+	struct hfi1_devdata *dd;
+
+	dd = hfi1_dd_from_sc_inode(fp->f_inode);
+	if (dd == NULL)
+		return -ENODEV;
+
+	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+	poll_wait(fp, &dd->hfi1_snoop.waitq, wait);
+	if (!list_empty(&dd->hfi1_snoop.queue))
+		ret |= POLLIN | POLLRDNORM;
+
+	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+	return ret;
+
+}
+
+static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
+				size_t count, loff_t *off)
+{
+	struct diag_pkt dpkt;
+	struct hfi1_devdata *dd;
+	size_t ret;
+	u8 byte_two, sl, sc5, sc4, vl, byte_one;
+	struct send_context *sc;
+	u32 len;
+	u64 pbc;
+	struct hfi1_ibport *ibp;
+	struct hfi1_pportdata *ppd;
+
+	dd = hfi1_dd_from_sc_inode(fp->f_inode);
+	if (dd == NULL)
+		return -ENODEV;
+
+	ppd = dd->pport;
+	snoop_dbg("received %lu bytes from user", count);
+
+	memset(&dpkt, 0, sizeof(struct diag_pkt));
+	dpkt.version = _DIAG_PKT_VERS;
+	dpkt.unit = dd->unit;
+	dpkt.port = 1;
+
+	if (likely(!(snoop_flags & SNOOP_USE_METADATA))) {
+		/*
+		* We need to generate the PBC and not let diagpkt_send do it,
+		* to do this we need the VL and the length in dwords.
+		* The VL can be determined by using the SL and looking up the
+		* SC. Then the SC can be converted into VL. The exception to
+		* this is those packets which are from an SMI queue pair.
+		* Since we can't detect anything about the QP here we have to
+		* rely on the SC. If its 0xF then we assume its SMI and
+		* do not look at the SL.
+		*/
+		if (copy_from_user(&byte_one, data, 1))
+			return -EINVAL;
+
+		if (copy_from_user(&byte_two, data+1, 1))
+			return -EINVAL;
+
+		sc4 = (byte_one >> 4) & 0xf;
+		if (sc4 == 0xF) {
+			snoop_dbg("Detected VL15 packet ignoring SL in packet");
+			vl = sc4;
+		} else {
+			sl = (byte_two >> 4) & 0xf;
+			ibp = to_iport(&dd->verbs_dev.ibdev, 1);
+			sc5 = ibp->sl_to_sc[sl];
+			vl = sc_to_vlt(dd, sc5);
+			if (vl != sc4) {
+				snoop_dbg("VL %d does not match SC %d of packet",
+					  vl, sc4);
+				return -EINVAL;
+			}
+		}
+
+		sc = dd->vld[vl].sc; /* Look up the context based on VL */
+		if (sc) {
+			dpkt.sw_index = sc->sw_index;
+			snoop_dbg("Sending on context %u(%u)", sc->sw_index,
+				  sc->hw_context);
+		} else {
+			snoop_dbg("Could not find context for vl %d", vl);
+			return -EINVAL;
+		}
+
+		len = (count >> 2) + 2; /* Add in PBC */
+		pbc = create_pbc(ppd, 0, 0, vl, len);
+	} else {
+		if (copy_from_user(&pbc, data, sizeof(pbc)))
+			return -EINVAL;
+		vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
+		sc = dd->vld[vl].sc; /* Look up the context based on VL */
+		if (sc) {
+			dpkt.sw_index = sc->sw_index;
+		} else {
+			snoop_dbg("Could not find context for vl %d", vl);
+			return -EINVAL;
+		}
+		data += sizeof(pbc);
+		count -= sizeof(pbc);
+	}
+	dpkt.len = count;
+	dpkt.data = (unsigned long)data;
+
+	snoop_dbg("PBC: vl=0x%llx Length=0x%llx",
+		  (pbc >> 12) & 0xf,
+		  (pbc & 0xfff));
+
+	dpkt.pbc = pbc;
+	ret = diagpkt_send(&dpkt);
+	/*
+	 * diagpkt_send only returns number of bytes in the diagpkt so patch
+	 * that up here before returning.
+	 */
+	if (ret == sizeof(dpkt))
+		return count;
+
+	return ret;
+}
+
+static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
+			       size_t pkt_len, loff_t *off)
+{
+	ssize_t ret = 0;
+	unsigned long flags = 0;
+	struct snoop_packet *packet = NULL;
+	struct hfi1_devdata *dd;
+
+	dd = hfi1_dd_from_sc_inode(fp->f_inode);
+	if (dd == NULL)
+		return -ENODEV;
+
+	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+	while (list_empty(&dd->hfi1_snoop.queue)) {
+		spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+		if (fp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(
+				dd->hfi1_snoop.waitq,
+				!list_empty(&dd->hfi1_snoop.queue)))
+			return -EINTR;
+
+		spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+	}
+
+	if (!list_empty(&dd->hfi1_snoop.queue)) {
+		packet = list_entry(dd->hfi1_snoop.queue.next,
+				    struct snoop_packet, list);
+		list_del(&packet->list);
+		spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+		if (pkt_len >= packet->total_len) {
+			if (copy_to_user(data, packet->data,
+				packet->total_len))
+				ret = -EFAULT;
+			else
+				ret = packet->total_len;
+		} else
+			ret = -EINVAL;
+
+		kfree(packet);
+	} else
+		spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+	return ret;
+}
+
+static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
+{
+	struct hfi1_devdata *dd;
+	void *filter_value = NULL;
+	long ret = 0;
+	int value = 0;
+	u8 physState = 0;
+	u8 linkState = 0;
+	u16 devState = 0;
+	unsigned long flags = 0;
+	unsigned long *argp = NULL;
+	struct hfi1_packet_filter_command filter_cmd = {0};
+	int mode_flag = 0;
+	struct hfi1_pportdata *ppd = NULL;
+	unsigned int index;
+	struct hfi1_link_info link_info;
+
+	dd = hfi1_dd_from_sc_inode(fp->f_inode);
+	if (dd == NULL)
+		return -ENODEV;
+
+	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+	mode_flag = dd->hfi1_snoop.mode_flag;
+
+	if (((_IOC_DIR(cmd) & _IOC_READ)
+	    && !access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd)))
+	    || ((_IOC_DIR(cmd) & _IOC_WRITE)
+	    && !access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd)))) {
+		ret = -EFAULT;
+	} else if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+	} else if ((mode_flag & HFI1_PORT_CAPTURE_MODE) &&
+		   (cmd != HFI1_SNOOP_IOCCLEARQUEUE) &&
+		   (cmd != HFI1_SNOOP_IOCCLEARFILTER) &&
+		   (cmd != HFI1_SNOOP_IOCSETFILTER)) {
+		/* Capture devices are allowed only 3 operations
+		 * 1.Clear capture queue
+		 * 2.Clear capture filter
+		 * 3.Set capture filter
+		 * Other are invalid.
+		 */
+		ret = -EINVAL;
+	} else {
+		switch (cmd) {
+		case HFI1_SNOOP_IOCSETLINKSTATE:
+			snoop_dbg("HFI1_SNOOP_IOCSETLINKSTATE is not valid");
+			ret = -EINVAL;
+			break;
+
+		case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA:
+			memset(&link_info, 0, sizeof(link_info));
+
+			ret = copy_from_user(&link_info,
+				(struct hfi1_link_info __user *)arg,
+				sizeof(link_info));
+			if (ret)
+				break;
+
+			value = link_info.port_state;
+			index = link_info.port_number;
+			if (index > dd->num_pports - 1) {
+				ret = -EINVAL;
+				break;
+			}
+
+			ppd = &dd->pport[index];
+			if (!ppd) {
+				ret = -EINVAL;
+				break;
+			}
+
+			/* What we want to transition to */
+			physState = (value >> 4) & 0xF;
+			linkState = value & 0xF;
+			snoop_dbg("Setting link state 0x%x", value);
+
+			switch (linkState) {
+			case IB_PORT_NOP:
+				if (physState == 0)
+					break;
+					/* fall through */
+			case IB_PORT_DOWN:
+				switch (physState) {
+				case 0:
+					devState = HLS_DN_DOWNDEF;
+					break;
+				case 2:
+					devState = HLS_DN_POLL;
+					break;
+				case 3:
+					devState = HLS_DN_DISABLE;
+					break;
+				default:
+					ret = -EINVAL;
+					goto done;
+				}
+				ret = set_link_state(ppd, devState);
+				break;
+			case IB_PORT_ARMED:
+				ret = set_link_state(ppd, HLS_UP_ARMED);
+				if (!ret)
+					send_idle_sma(dd, SMA_IDLE_ARM);
+				break;
+			case IB_PORT_ACTIVE:
+				ret = set_link_state(ppd, HLS_UP_ACTIVE);
+				if (!ret)
+					send_idle_sma(dd, SMA_IDLE_ACTIVE);
+				break;
+			default:
+				ret = -EINVAL;
+				break;
+			}
+
+			if (ret)
+				break;
+			/* fall through */
+		case HFI1_SNOOP_IOCGETLINKSTATE:
+		case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA:
+			if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) {
+				memset(&link_info, 0, sizeof(link_info));
+				ret = copy_from_user(&link_info,
+					(struct hfi1_link_info __user *)arg,
+					sizeof(link_info));
+				index = link_info.port_number;
+			} else {
+				ret = __get_user(index, (int __user *) arg);
+				if (ret !=  0)
+					break;
+			}
+
+			if (index > dd->num_pports - 1) {
+				ret = -EINVAL;
+				break;
+			}
+
+			ppd = &dd->pport[index];
+			if (!ppd) {
+				ret = -EINVAL;
+				break;
+			}
+			value = hfi1_ibphys_portstate(ppd);
+			value <<= 4;
+			value |= driver_lstate(ppd);
+
+			snoop_dbg("Link port | Link State: %d", value);
+
+			if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) ||
+			    (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) {
+				link_info.port_state = value;
+				link_info.node_guid = cpu_to_be64(ppd->guid);
+				link_info.link_speed_active =
+							ppd->link_speed_active;
+				link_info.link_width_active =
+							ppd->link_width_active;
+				ret = copy_to_user(
+					(struct hfi1_link_info __user *)arg,
+					&link_info, sizeof(link_info));
+			} else {
+				ret = __put_user(value, (int __user *)arg);
+			}
+			break;
+
+		case HFI1_SNOOP_IOCCLEARQUEUE:
+			snoop_dbg("Clearing snoop queue");
+			drain_snoop_list(&dd->hfi1_snoop.queue);
+			break;
+
+		case HFI1_SNOOP_IOCCLEARFILTER:
+			snoop_dbg("Clearing filter");
+			if (dd->hfi1_snoop.filter_callback) {
+				/* Drain packets first */
+				drain_snoop_list(&dd->hfi1_snoop.queue);
+				dd->hfi1_snoop.filter_callback = NULL;
+			}
+			kfree(dd->hfi1_snoop.filter_value);
+			dd->hfi1_snoop.filter_value = NULL;
+			break;
+
+		case HFI1_SNOOP_IOCSETFILTER:
+			snoop_dbg("Setting filter");
+			/* just copy command structure */
+			argp = (unsigned long *)arg;
+			ret = copy_from_user(&filter_cmd, (void __user *)argp,
+					     sizeof(filter_cmd));
+			if (ret < 0) {
+				pr_alert("Error copying filter command\n");
+				break;
+			}
+			if (filter_cmd.opcode >= HFI1_MAX_FILTERS) {
+				pr_alert("Invalid opcode in request\n");
+				ret = -EINVAL;
+				break;
+			}
+
+			snoop_dbg("Opcode %d Len %d Ptr %p",
+				   filter_cmd.opcode, filter_cmd.length,
+				   filter_cmd.value_ptr);
+
+			filter_value = kzalloc(
+						filter_cmd.length * sizeof(u8),
+						GFP_KERNEL);
+			if (!filter_value) {
+				pr_alert("Not enough memory\n");
+				ret = -ENOMEM;
+				break;
+			}
+			/* copy remaining data from userspace */
+			ret = copy_from_user((u8 *)filter_value,
+					(void __user *)filter_cmd.value_ptr,
+					filter_cmd.length);
+			if (ret < 0) {
+				kfree(filter_value);
+				pr_alert("Error copying filter data\n");
+				break;
+			}
+			/* Drain packets first */
+			drain_snoop_list(&dd->hfi1_snoop.queue);
+			dd->hfi1_snoop.filter_callback =
+				hfi1_filters[filter_cmd.opcode].filter;
+			/* just in case we see back to back sets */
+			kfree(dd->hfi1_snoop.filter_value);
+			dd->hfi1_snoop.filter_value = filter_value;
+
+			break;
+		case HFI1_SNOOP_IOCGETVERSION:
+			value = SNOOP_CAPTURE_VERSION;
+			snoop_dbg("Getting version: %d", value);
+			ret = __put_user(value, (int __user *)arg);
+			break;
+		case HFI1_SNOOP_IOCSET_OPTS:
+			snoop_flags = 0;
+			ret = __get_user(value, (int __user *) arg);
+			if (ret != 0)
+				break;
+
+			snoop_dbg("Setting snoop option %d", value);
+			if (value & SNOOP_DROP_SEND)
+				snoop_flags |= SNOOP_DROP_SEND;
+			if (value & SNOOP_USE_METADATA)
+				snoop_flags |= SNOOP_USE_METADATA;
+			break;
+		default:
+			ret = -ENOTTY;
+			break;
+		}
+	}
+done:
+	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+	return ret;
+}
+
+static void snoop_list_add_tail(struct snoop_packet *packet,
+				struct hfi1_devdata *dd)
+{
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+	if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) ||
+		   (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) {
+		list_add_tail(&packet->list, &dd->hfi1_snoop.queue);
+		snoop_dbg("Added packet to list");
+	}
+
+	/*
+	 * Technically we can could have closed the snoop device while waiting
+	 * on the above lock and it is gone now. The snoop mode_flag will
+	 * prevent us from adding the packet to the queue though.
+	 */
+
+	spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+	wake_up_interruptible(&dd->hfi1_snoop.waitq);
+}
+
+static inline int hfi1_filter_check(void *val, const char *msg)
+{
+	if (!val) {
+		snoop_dbg("Error invalid %s value for filter", msg);
+		return HFI1_FILTER_ERR;
+	}
+	return 0;
+}
+
+static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value)
+{
+	struct hfi1_ib_header *hdr;
+	int ret;
+
+	ret = hfi1_filter_check(ibhdr, "header");
+	if (ret)
+		return ret;
+	ret = hfi1_filter_check(value, "user");
+	if (ret)
+		return ret;
+	hdr = (struct hfi1_ib_header *)ibhdr;
+
+	if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) /* matches slid */
+		return HFI1_FILTER_HIT; /* matched */
+
+	return HFI1_FILTER_MISS; /* Not matched */
+}
+
+static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value)
+{
+	struct hfi1_ib_header *hdr;
+	int ret;
+
+	ret = hfi1_filter_check(ibhdr, "header");
+	if (ret)
+		return ret;
+	ret = hfi1_filter_check(value, "user");
+	if (ret)
+		return ret;
+
+	hdr = (struct hfi1_ib_header *)ibhdr;
+
+	if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1]))
+		return HFI1_FILTER_HIT;
+
+	return HFI1_FILTER_MISS;
+}
+
+/* Not valid for outgoing packets, send handler passes null for data*/
+static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
+				      void *value)
+{
+	struct hfi1_ib_header *hdr;
+	struct hfi1_other_headers *ohdr = NULL;
+	struct ib_smp *smp = NULL;
+	u32 qpn = 0;
+	int ret;
+
+	ret = hfi1_filter_check(ibhdr, "header");
+	if (ret)
+		return ret;
+	ret = hfi1_filter_check(packet_data, "packet_data");
+	if (ret)
+		return ret;
+	ret = hfi1_filter_check(value, "user");
+	if (ret)
+		return ret;
+
+	hdr = (struct hfi1_ib_header *)ibhdr;
+
+	/* Check for GRH */
+	if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+		ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
+	else
+		ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
+
+	qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF;
+	if (qpn <= 1) {
+		smp = (struct ib_smp *)packet_data;
+		if (*((u8 *)value) == smp->mgmt_class)
+			return HFI1_FILTER_HIT;
+		else
+			return HFI1_FILTER_MISS;
+	}
+	return HFI1_FILTER_ERR;
+}
+
+static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value)
+{
+
+	struct hfi1_ib_header *hdr;
+	struct hfi1_other_headers *ohdr = NULL;
+	int ret;
+
+	ret = hfi1_filter_check(ibhdr, "header");
+	if (ret)
+		return ret;
+	ret = hfi1_filter_check(value, "user");
+	if (ret)
+		return ret;
+
+	hdr = (struct hfi1_ib_header *)ibhdr;
+
+	/* Check for GRH */
+	if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+		ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
+	else
+		ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
+	if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF))
+		return HFI1_FILTER_HIT;
+
+	return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
+				     void *value)
+{
+	u32 lnh = 0;
+	u8 opcode = 0;
+	struct hfi1_ib_header *hdr;
+	struct hfi1_other_headers *ohdr = NULL;
+	int ret;
+
+	ret = hfi1_filter_check(ibhdr, "header");
+	if (ret)
+		return ret;
+	ret = hfi1_filter_check(value, "user");
+	if (ret)
+		return ret;
+
+	hdr = (struct hfi1_ib_header *)ibhdr;
+
+	lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
+
+	if (lnh == HFI1_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else if (lnh == HFI1_LRH_GRH)
+		ohdr = &hdr->u.l.oth;
+	else
+		return HFI1_FILTER_ERR;
+
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+	if (*((u8 *)value) == ((opcode >> 5) & 0x7))
+		return HFI1_FILTER_HIT;
+
+	return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
+					void *value)
+{
+	struct hfi1_ib_header *hdr;
+	int ret;
+
+	ret = hfi1_filter_check(ibhdr, "header");
+	if (ret)
+		return ret;
+	ret = hfi1_filter_check(value, "user");
+	if (ret)
+		return ret;
+
+	hdr = (struct hfi1_ib_header *)ibhdr;
+
+	if ((*((u8 *)value)) == ((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF))
+		return HFI1_FILTER_HIT;
+
+	return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value)
+{
+
+	u32 lnh = 0;
+	struct hfi1_ib_header *hdr;
+	struct hfi1_other_headers *ohdr = NULL;
+	int ret;
+
+	ret = hfi1_filter_check(ibhdr, "header");
+	if (ret)
+		return ret;
+	ret = hfi1_filter_check(value, "user");
+	if (ret)
+		return ret;
+
+	hdr = (struct hfi1_ib_header *)ibhdr;
+
+	lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
+	if (lnh == HFI1_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else if (lnh == HFI1_LRH_GRH)
+		ohdr = &hdr->u.l.oth;
+	else
+		return HFI1_FILTER_ERR;
+
+	/* P_key is 16-bit entity, however top most bit indicates
+	 * type of membership. 0 for limited and 1 for Full.
+	 * Limited members cannot accept information from other
+	 * Limited members, but communication is allowed between
+	 * every other combination of membership.
+	 * Hence we'll omit comparing top-most bit while filtering
+	 */
+
+	if ((*(u16 *)value & 0x7FFF) ==
+		((be32_to_cpu(ohdr->bth[0])) & 0x7FFF))
+		return HFI1_FILTER_HIT;
+
+	return HFI1_FILTER_MISS;
+}
+
+/*
+ * If packet_data is NULL then this is coming from one of the send functions.
+ * Thus we know if its an ingressed or egressed packet.
+ */
+static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value)
+{
+	u8 user_dir = *(u8 *)value;
+	int ret;
+
+	ret = hfi1_filter_check(value, "user");
+	if (ret)
+		return ret;
+
+	if (packet_data) {
+		/* Incoming packet */
+		if (user_dir & HFI1_SNOOP_INGRESS)
+			return HFI1_FILTER_HIT;
+	} else {
+		/* Outgoing packet */
+		if (user_dir & HFI1_SNOOP_EGRESS)
+			return HFI1_FILTER_HIT;
+	}
+
+	return HFI1_FILTER_MISS;
+}
+
+/*
+ * Allocate a snoop packet. The structure that is stored in the ring buffer, not
+ * to be confused with an hfi packet type.
+ */
+static struct snoop_packet *allocate_snoop_packet(u32 hdr_len,
+						  u32 data_len,
+						  u32 md_len)
+{
+
+	struct snoop_packet *packet = NULL;
+
+	packet = kzalloc(sizeof(struct snoop_packet) + hdr_len + data_len
+			 + md_len,
+			 GFP_ATOMIC | __GFP_NOWARN);
+	if (likely(packet))
+		INIT_LIST_HEAD(&packet->list);
+
+
+	return packet;
+}
+
+/*
+ * Instead of having snoop and capture code intermixed with the recv functions,
+ * both the interrupt handler and hfi1_ib_rcv() we are going to hijack the call
+ * and land in here for snoop/capture but if not enabled the call will go
+ * through as before. This gives us a single point to constrain all of the snoop
+ * snoop recv logic. There is nothing special that needs to happen for bypass
+ * packets. This routine should not try to look into the packet. It just copied
+ * it. There is no guarantee for filters when it comes to bypass packets as
+ * there is no specific support. Bottom line is this routine does now even know
+ * what a bypass packet is.
+ */
+int snoop_recv_handler(struct hfi1_packet *packet)
+{
+	struct hfi1_pportdata *ppd = packet->rcd->ppd;
+	struct hfi1_ib_header *hdr = packet->hdr;
+	int header_size = packet->hlen;
+	void *data = packet->ebuf;
+	u32 tlen = packet->tlen;
+	struct snoop_packet *s_packet = NULL;
+	int ret;
+	int snoop_mode = 0;
+	u32 md_len = 0;
+	struct capture_md md;
+
+	snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen,
+		  data);
+
+	trace_snoop_capture(ppd->dd, header_size, hdr, tlen - header_size,
+			    data);
+
+	if (!ppd->dd->hfi1_snoop.filter_callback) {
+		snoop_dbg("filter not set");
+		ret = HFI1_FILTER_HIT;
+	} else {
+		ret = ppd->dd->hfi1_snoop.filter_callback(hdr, data,
+					ppd->dd->hfi1_snoop.filter_value);
+	}
+
+	switch (ret) {
+	case HFI1_FILTER_ERR:
+		snoop_dbg("Error in filter call");
+		break;
+	case HFI1_FILTER_MISS:
+		snoop_dbg("Filter Miss");
+		break;
+	case HFI1_FILTER_HIT:
+
+		if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+			snoop_mode = 1;
+		if ((snoop_mode == 0) ||
+		    unlikely(snoop_flags & SNOOP_USE_METADATA))
+			md_len = sizeof(struct capture_md);
+
+
+		s_packet = allocate_snoop_packet(header_size,
+						 tlen - header_size,
+						 md_len);
+
+		if (unlikely(s_packet == NULL)) {
+			dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
+			break;
+		}
+
+		if (md_len > 0) {
+			memset(&md, 0, sizeof(struct capture_md));
+			md.port = 1;
+			md.dir = PKT_DIR_INGRESS;
+			md.u.rhf = packet->rhf;
+			memcpy(s_packet->data, &md, md_len);
+		}
+
+		/* We should always have a header */
+		if (hdr) {
+			memcpy(s_packet->data + md_len, hdr, header_size);
+		} else {
+			dd_dev_err(ppd->dd, "Unable to copy header to snoop/capture packet\n");
+			kfree(s_packet);
+			break;
+		}
+
+		/*
+		 * Packets with no data are possible. If there is no data needed
+		 * to take care of the last 4 bytes which are normally included
+		 * with data buffers and are included in tlen.  Since we kzalloc
+		 * the buffer we do not need to set any values but if we decide
+		 * not to use kzalloc we should zero them.
+		 */
+		if (data)
+			memcpy(s_packet->data + header_size + md_len, data,
+			       tlen - header_size);
+
+		s_packet->total_len = tlen + md_len;
+		snoop_list_add_tail(s_packet, ppd->dd);
+
+		/*
+		 * If we are snooping the packet not capturing then throw away
+		 * after adding to the list.
+		 */
+		snoop_dbg("Capturing packet");
+		if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) {
+			snoop_dbg("Throwing packet away");
+			/*
+			 * If we are dropping the packet we still may need to
+			 * handle the case where error flags are set, this is
+			 * normally done by the type specific handler but that
+			 * won't be called in this case.
+			 */
+			if (unlikely(rhf_err_flags(packet->rhf)))
+				handle_eflags(packet);
+
+			/* throw the packet on the floor */
+			return RHF_RCV_CONTINUE;
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * We do not care what type of packet came in here - just pass it off
+	 * to the normal handler.
+	 */
+	return ppd->dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)]
+			(packet);
+}
+
+/*
+ * Handle snooping and capturing packets when sdma is being used.
+ */
+int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+			   u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+			   u32 plen, u32 dwords, u64 pbc)
+{
+	pr_alert("Snooping/Capture of  Send DMA Packets Is Not Supported!\n");
+	snoop_dbg("Unsupported Operation");
+	return hfi1_verbs_send_dma(qp, ibhdr, hdrwords, ss, len, plen, dwords,
+				  0);
+}
+
+/*
+ * Handle snooping and capturing packets when pio is being used. Does not handle
+ * bypass packets. The only way to send a bypass packet currently is to use the
+ * diagpkt interface. When that interface is enable snoop/capture is not.
+ */
+int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+			   u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+			   u32 plen, u32 dwords, u64 pbc)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct snoop_packet *s_packet = NULL;
+	u32 *hdr = (u32 *)&ahdr->ibh;
+	u32 length = 0;
+	struct hfi1_sge_state temp_ss;
+	void *data = NULL;
+	void *data_start = NULL;
+	int ret;
+	int snoop_mode = 0;
+	int md_len = 0;
+	struct capture_md md;
+	u32 vl;
+	u32 hdr_len = hdrwords << 2;
+	u32 tlen = HFI1_GET_PKT_LEN(&ahdr->ibh);
+
+	md.u.pbc = 0;
+
+	snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u",
+		  hdrwords, len, plen, dwords, tlen);
+	if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+		snoop_mode = 1;
+	if ((snoop_mode == 0) ||
+	    unlikely(snoop_flags & SNOOP_USE_METADATA))
+		md_len = sizeof(struct capture_md);
+
+	/* not using ss->total_len as arg 2 b/c that does not count CRC */
+	s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len);
+
+	if (unlikely(s_packet == NULL)) {
+		dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
+		goto out;
+	}
+
+	s_packet->total_len = tlen + md_len;
+
+	if (md_len > 0) {
+		memset(&md, 0, sizeof(struct capture_md));
+		md.port = 1;
+		md.dir = PKT_DIR_EGRESS;
+		if (likely(pbc == 0)) {
+			vl = be16_to_cpu(ahdr->ibh.lrh[0]) >> 12;
+			md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen);
+		} else {
+			md.u.pbc = 0;
+		}
+		memcpy(s_packet->data, &md, md_len);
+	} else {
+		md.u.pbc = pbc;
+	}
+
+	/* Copy header */
+	if (likely(hdr)) {
+		memcpy(s_packet->data + md_len, hdr, hdr_len);
+	} else {
+		dd_dev_err(ppd->dd,
+			   "Unable to copy header to snoop/capture packet\n");
+		kfree(s_packet);
+		goto out;
+	}
+
+	if (ss) {
+		data = s_packet->data + hdr_len + md_len;
+		data_start = data;
+
+		/*
+		 * Copy SGE State
+		 * The update_sge() function below will not modify the
+		 * individual SGEs in the array. It will make a copy each time
+		 * and operate on that. So we only need to copy this instance
+		 * and it won't impact PIO.
+		 */
+		temp_ss = *ss;
+		length = len;
+
+		snoop_dbg("Need to copy %d bytes", length);
+		while (length) {
+			void *addr = temp_ss.sge.vaddr;
+			u32 slen = temp_ss.sge.length;
+
+			if (slen > length) {
+				slen = length;
+				snoop_dbg("slen %d > len %d", slen, length);
+			}
+			snoop_dbg("copy %d to %p", slen, addr);
+			memcpy(data, addr, slen);
+			update_sge(&temp_ss, slen);
+			length -= slen;
+			data += slen;
+			snoop_dbg("data is now %p bytes left %d", data, length);
+		}
+		snoop_dbg("Completed SGE copy");
+	}
+
+	/*
+	 * Why do the filter check down here? Because the event tracing has its
+	 * own filtering and we need to have the walked the SGE list.
+	 */
+	if (!ppd->dd->hfi1_snoop.filter_callback) {
+		snoop_dbg("filter not set\n");
+		ret = HFI1_FILTER_HIT;
+	} else {
+		ret = ppd->dd->hfi1_snoop.filter_callback(
+					&ahdr->ibh,
+					NULL,
+					ppd->dd->hfi1_snoop.filter_value);
+	}
+
+	switch (ret) {
+	case HFI1_FILTER_ERR:
+		snoop_dbg("Error in filter call");
+		/* fall through */
+	case HFI1_FILTER_MISS:
+		snoop_dbg("Filter Miss");
+		kfree(s_packet);
+		break;
+	case HFI1_FILTER_HIT:
+		snoop_dbg("Capturing packet");
+		snoop_list_add_tail(s_packet, ppd->dd);
+
+		if (unlikely((snoop_flags & SNOOP_DROP_SEND) &&
+			     (ppd->dd->hfi1_snoop.mode_flag &
+			      HFI1_PORT_SNOOP_MODE))) {
+			unsigned long flags;
+
+			snoop_dbg("Dropping packet");
+			if (qp->s_wqe) {
+				spin_lock_irqsave(&qp->s_lock, flags);
+				hfi1_send_complete(
+					qp,
+					qp->s_wqe,
+					IB_WC_SUCCESS);
+				spin_unlock_irqrestore(&qp->s_lock, flags);
+			} else if (qp->ibqp.qp_type == IB_QPT_RC) {
+				spin_lock_irqsave(&qp->s_lock, flags);
+				hfi1_rc_send_complete(qp, &ahdr->ibh);
+				spin_unlock_irqrestore(&qp->s_lock, flags);
+			}
+			return 0;
+		}
+		break;
+	default:
+		kfree(s_packet);
+		break;
+	}
+out:
+	return hfi1_verbs_send_pio(qp, ahdr, hdrwords, ss, len, plen, dwords,
+				  md.u.pbc);
+}
+
+/*
+ * Callers of this must pass a hfi1_ib_header type for the from ptr. Currently
+ * this can be used anywhere, but the intention is for inline ACKs for RC and
+ * CCA packets. We don't restrict this usage though.
+ */
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+			   u64 pbc, const void *from, size_t count)
+{
+	int snoop_mode = 0;
+	int md_len = 0;
+	struct capture_md md;
+	struct snoop_packet *s_packet = NULL;
+
+	/*
+	 * count is in dwords so we need to convert to bytes.
+	 * We also need to account for CRC which would be tacked on by hardware.
+	 */
+	int packet_len = (count << 2) + 4;
+	int ret;
+
+	snoop_dbg("ACK OUT: len %d", packet_len);
+
+	if (!dd->hfi1_snoop.filter_callback) {
+		snoop_dbg("filter not set");
+		ret = HFI1_FILTER_HIT;
+	} else {
+		ret = dd->hfi1_snoop.filter_callback(
+				(struct hfi1_ib_header *)from,
+				NULL,
+				dd->hfi1_snoop.filter_value);
+	}
+
+	switch (ret) {
+	case HFI1_FILTER_ERR:
+		snoop_dbg("Error in filter call");
+		/* fall through */
+	case HFI1_FILTER_MISS:
+		snoop_dbg("Filter Miss");
+		break;
+	case HFI1_FILTER_HIT:
+		snoop_dbg("Capturing packet");
+		if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+			snoop_mode = 1;
+		if ((snoop_mode == 0) ||
+		    unlikely(snoop_flags & SNOOP_USE_METADATA))
+			md_len = sizeof(struct capture_md);
+
+		s_packet = allocate_snoop_packet(packet_len, 0, md_len);
+
+		if (unlikely(s_packet == NULL)) {
+			dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n");
+			goto inline_pio_out;
+		}
+
+		s_packet->total_len = packet_len + md_len;
+
+		/* Fill in the metadata for the packet */
+		if (md_len > 0) {
+			memset(&md, 0, sizeof(struct capture_md));
+			md.port = 1;
+			md.dir = PKT_DIR_EGRESS;
+			md.u.pbc = pbc;
+			memcpy(s_packet->data, &md, md_len);
+		}
+
+		/* Add the packet data which is a single buffer */
+		memcpy(s_packet->data + md_len, from, packet_len);
+
+		snoop_list_add_tail(s_packet, dd);
+
+		if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) {
+			snoop_dbg("Dropping packet");
+			return;
+		}
+		break;
+	default:
+		break;
+	}
+
+inline_pio_out:
+	pio_copy(dd, pbuf, pbc, from, count);
+
+}
diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/staging/rdma/hfi1/dma.c
new file mode 100644
index 0000000..e03bd73
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/dma.c
@@ -0,0 +1,186 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+
+#include "verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
+			       size_t size, enum dma_data_direction direction)
+{
+	if (WARN_ON(!valid_dma_direction(direction)))
+		return BAD_DMA_ADDRESS;
+
+	return (u64) cpu_addr;
+}
+
+static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+				  enum dma_data_direction direction)
+{
+	/* This is a stub, nothing to be done here */
+}
+
+static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
+			     unsigned long offset, size_t size,
+			    enum dma_data_direction direction)
+{
+	u64 addr;
+
+	if (WARN_ON(!valid_dma_direction(direction)))
+		return BAD_DMA_ADDRESS;
+
+	if (offset + size > PAGE_SIZE)
+		return BAD_DMA_ADDRESS;
+
+	addr = (u64) page_address(page);
+	if (addr)
+		addr += offset;
+
+	return addr;
+}
+
+static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+				enum dma_data_direction direction)
+{
+	/* This is a stub, nothing to be done here */
+}
+
+static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+		       int nents, enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	u64 addr;
+	int i;
+	int ret = nents;
+
+	if (WARN_ON(!valid_dma_direction(direction)))
+		return BAD_DMA_ADDRESS;
+
+	for_each_sg(sgl, sg, nents, i) {
+		addr = (u64) page_address(sg_page(sg));
+		if (!addr) {
+			ret = 0;
+			break;
+		}
+		sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+		sg->dma_length = sg->length;
+#endif
+	}
+	return ret;
+}
+
+static void hfi1_unmap_sg(struct ib_device *dev,
+			  struct scatterlist *sg, int nents,
+			 enum dma_data_direction direction)
+{
+	/* This is a stub, nothing to be done here */
+}
+
+static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+				     size_t size, enum dma_data_direction dir)
+{
+}
+
+static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
+					size_t size,
+					enum dma_data_direction dir)
+{
+}
+
+static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
+				     u64 *dma_handle, gfp_t flag)
+{
+	struct page *p;
+	void *addr = NULL;
+
+	p = alloc_pages(flag, get_order(size));
+	if (p)
+		addr = page_address(p);
+	if (dma_handle)
+		*dma_handle = (u64) addr;
+	return addr;
+}
+
+static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
+				   void *cpu_addr, u64 dma_handle)
+{
+	free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
+	.mapping_error = hfi1_mapping_error,
+	.map_single = hfi1_dma_map_single,
+	.unmap_single = hfi1_dma_unmap_single,
+	.map_page = hfi1_dma_map_page,
+	.unmap_page = hfi1_dma_unmap_page,
+	.map_sg = hfi1_map_sg,
+	.unmap_sg = hfi1_unmap_sg,
+	.sync_single_for_cpu = hfi1_sync_single_for_cpu,
+	.sync_single_for_device = hfi1_sync_single_for_device,
+	.alloc_coherent = hfi1_dma_alloc_coherent,
+	.free_coherent = hfi1_dma_free_coherent
+};
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c
new file mode 100644
index 0000000..c0a5900
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/driver.c
@@ -0,0 +1,1241 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/prefetch.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the initialization code.
+ */
+const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
+
+DEFINE_SPINLOCK(hfi1_devs_lock);
+LIST_HEAD(hfi1_dev_list);
+DEFINE_MUTEX(hfi1_mutex);	/* general driver use */
+
+unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192");
+
+unsigned int hfi1_cu = 1;
+module_param_named(cu, hfi1_cu, uint, S_IRUGO);
+MODULE_PARM_DESC(cu, "Credit return units");
+
+unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
+static int hfi1_caps_set(const char *, const struct kernel_param *);
+static int hfi1_caps_get(char *, const struct kernel_param *);
+static const struct kernel_param_ops cap_ops = {
+	.set = hfi1_caps_set,
+	.get = hfi1_caps_get
+};
+module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
+MODULE_VERSION(HFI1_DRIVER_VERSION);
+
+/*
+ * MAX_PKT_RCV is the max # if packets processed per receive interrupt.
+ */
+#define MAX_PKT_RECV 64
+#define EGR_HEAD_UPDATE_THRESHOLD 16
+
+struct hfi1_ib_stats hfi1_stats;
+
+static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
+{
+	int ret = 0;
+	unsigned long *cap_mask_ptr = (unsigned long *)kp->arg,
+		cap_mask = *cap_mask_ptr, value, diff,
+		write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
+			      HFI1_CAP_WRITABLE_MASK);
+
+	ret = kstrtoul(val, 0, &value);
+	if (ret) {
+		pr_warn("Invalid module parameter value for 'cap_mask'\n");
+		goto done;
+	}
+	/* Get the changed bits (except the locked bit) */
+	diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
+
+	/* Remove any bits that are not allowed to change after driver load */
+	if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
+		pr_warn("Ignoring non-writable capability bits %#lx\n",
+			diff & ~write_mask);
+		diff &= write_mask;
+	}
+
+	/* Mask off any reserved bits */
+	diff &= ~HFI1_CAP_RESERVED_MASK;
+	/* Clear any previously set and changing bits */
+	cap_mask &= ~diff;
+	/* Update the bits with the new capability */
+	cap_mask |= (value & diff);
+	/* Check for any kernel/user restrictions */
+	diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
+		((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
+	cap_mask &= ~diff;
+	/* Set the bitmask to the final set */
+	*cap_mask_ptr = cap_mask;
+done:
+	return ret;
+}
+
+static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
+{
+	unsigned long cap_mask = *(unsigned long *)kp->arg;
+
+	cap_mask &= ~HFI1_CAP_LOCKED_SMASK;
+	cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
+
+	return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
+}
+
+const char *get_unit_name(int unit)
+{
+	static char iname[16];
+
+	snprintf(iname, sizeof(iname), DRIVER_NAME"_%u", unit);
+	return iname;
+}
+
+/*
+ * Return count of units with at least one port ACTIVE.
+ */
+int hfi1_count_active_units(void)
+{
+	struct hfi1_devdata *dd;
+	struct hfi1_pportdata *ppd;
+	unsigned long flags;
+	int pidx, nunits_active = 0;
+
+	spin_lock_irqsave(&hfi1_devs_lock, flags);
+	list_for_each_entry(dd, &hfi1_dev_list, list) {
+		if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
+			continue;
+		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+			ppd = dd->pport + pidx;
+			if (ppd->lid && ppd->linkup) {
+				nunits_active++;
+				break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	return nunits_active;
+}
+
+/*
+ * Return count of all units, optionally return in arguments
+ * the number of usable (present) units, and the number of
+ * ports that are up.
+ */
+int hfi1_count_units(int *npresentp, int *nupp)
+{
+	int nunits = 0, npresent = 0, nup = 0;
+	struct hfi1_devdata *dd;
+	unsigned long flags;
+	int pidx;
+	struct hfi1_pportdata *ppd;
+
+	spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+	list_for_each_entry(dd, &hfi1_dev_list, list) {
+		nunits++;
+		if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
+			npresent++;
+		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+			ppd = dd->pport + pidx;
+			if (ppd->lid && ppd->linkup)
+				nup++;
+		}
+	}
+
+	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+	if (npresentp)
+		*npresentp = npresent;
+	if (nupp)
+		*nupp = nup;
+
+	return nunits;
+}
+
+/*
+ * Get address of eager buffer from it's index (allocated in chunks, not
+ * contiguous).
+ */
+static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
+			       u8 *update)
+{
+	u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf);
+
+	*update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset;
+	return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) +
+			(offset * RCV_BUF_BLOCK_SIZE));
+}
+
+/*
+ * Validate and encode the a given RcvArray Buffer size.
+ * The function will check whether the given size falls within
+ * allowed size ranges for the respective type and, optionally,
+ * return the proper encoding.
+ */
+inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
+{
+	if (unlikely(!IS_ALIGNED(size, PAGE_SIZE)))
+		return 0;
+	if (unlikely(size < MIN_EAGER_BUFFER))
+		return 0;
+	if (size >
+	    (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
+		return 0;
+	if (encoded)
+		*encoded = ilog2(size / PAGE_SIZE) + 1;
+	return 1;
+}
+
+static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
+		       struct hfi1_packet *packet)
+{
+	struct hfi1_message_header *rhdr = packet->hdr;
+	u32 rte = rhf_rcv_type_err(packet->rhf);
+	int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+
+	if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+		return;
+
+	if (packet->rhf & RHF_TID_ERR) {
+		/* For TIDERR and RC QPs preemptively schedule a NAK */
+		struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
+		struct hfi1_other_headers *ohdr = NULL;
+		u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+		u16 lid  = be16_to_cpu(hdr->lrh[1]);
+		u32 qp_num;
+		u32 rcv_flags = 0;
+
+		/* Sanity check packet */
+		if (tlen < 24)
+			goto drop;
+
+		/* Check for GRH */
+		if (lnh == HFI1_LRH_BTH)
+			ohdr = &hdr->u.oth;
+		else if (lnh == HFI1_LRH_GRH) {
+			u32 vtf;
+
+			ohdr = &hdr->u.l.oth;
+			if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+				goto drop;
+			vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+			if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+				goto drop;
+			rcv_flags |= HFI1_HAS_GRH;
+		} else
+			goto drop;
+
+		/* Get the destination QP number. */
+		qp_num = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+		if (lid < HFI1_MULTICAST_LID_BASE) {
+			struct hfi1_qp *qp;
+
+			rcu_read_lock();
+			qp = hfi1_lookup_qpn(ibp, qp_num);
+			if (!qp) {
+				rcu_read_unlock();
+				goto drop;
+			}
+
+			/*
+			 * Handle only RC QPs - for other QP types drop error
+			 * packet.
+			 */
+			spin_lock(&qp->r_lock);
+
+			/* Check for valid receive state. */
+			if (!(ib_hfi1_state_ops[qp->state] &
+			      HFI1_PROCESS_RECV_OK)) {
+				ibp->n_pkt_drops++;
+			}
+
+			switch (qp->ibqp.qp_type) {
+			case IB_QPT_RC:
+				hfi1_rc_hdrerr(
+					rcd,
+					hdr,
+					rcv_flags,
+					qp);
+				break;
+			default:
+				/* For now don't handle any other QP types */
+				break;
+			}
+
+			spin_unlock(&qp->r_lock);
+			rcu_read_unlock();
+		} /* Unicast QP */
+	} /* Valid packet with TIDErr */
+
+	/* handle "RcvTypeErr" flags */
+	switch (rte) {
+	case RHF_RTE_ERROR_OP_CODE_ERR:
+	{
+		u32 opcode;
+		void *ebuf = NULL;
+		__be32 *bth = NULL;
+
+		if (rhf_use_egr_bfr(packet->rhf))
+			ebuf = packet->ebuf;
+
+		if (ebuf == NULL)
+			goto drop; /* this should never happen */
+
+		if (lnh == HFI1_LRH_BTH)
+			bth = (__be32 *)ebuf;
+		else if (lnh == HFI1_LRH_GRH)
+			bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
+		else
+			goto drop;
+
+		opcode = be32_to_cpu(bth[0]) >> 24;
+		opcode &= 0xff;
+
+		if (opcode == IB_OPCODE_CNP) {
+			/*
+			 * Only in pre-B0 h/w is the CNP_OPCODE handled
+			 * via this code path (errata 291394).
+			 */
+			struct hfi1_qp *qp = NULL;
+			u32 lqpn, rqpn;
+			u16 rlid;
+			u8 svc_type, sl, sc5;
+
+			sc5  = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
+			if (rhf_dc_info(packet->rhf))
+				sc5 |= 0x10;
+			sl = ibp->sc_to_sl[sc5];
+
+			lqpn = be32_to_cpu(bth[1]) & HFI1_QPN_MASK;
+			rcu_read_lock();
+			qp = hfi1_lookup_qpn(ibp, lqpn);
+			if (qp == NULL) {
+				rcu_read_unlock();
+				goto drop;
+			}
+
+			switch (qp->ibqp.qp_type) {
+			case IB_QPT_UD:
+				rlid = 0;
+				rqpn = 0;
+				svc_type = IB_CC_SVCTYPE_UD;
+				break;
+			case IB_QPT_UC:
+				rlid = be16_to_cpu(rhdr->lrh[3]);
+				rqpn = qp->remote_qpn;
+				svc_type = IB_CC_SVCTYPE_UC;
+				break;
+			default:
+				goto drop;
+			}
+
+			process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+			rcu_read_unlock();
+		}
+
+		packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
+		break;
+	}
+	default:
+		break;
+	}
+
+drop:
+	return;
+}
+
+static inline void init_packet(struct hfi1_ctxtdata *rcd,
+			      struct hfi1_packet *packet)
+{
+
+	packet->rsize = rcd->rcvhdrqentsize; /* words */
+	packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
+	packet->rcd = rcd;
+	packet->updegr = 0;
+	packet->etail = -1;
+	packet->rhf_addr = (__le32 *) rcd->rcvhdrq + rcd->head +
+			   rcd->dd->rhf_offset;
+	packet->rhf = rhf_to_cpu(packet->rhf_addr);
+	packet->rhqoff = rcd->head;
+	packet->numpkt = 0;
+	packet->rcv_flags = 0;
+}
+
+#ifndef CONFIG_PRESCAN_RXQ
+static void prescan_rxq(struct hfi1_packet *packet) {}
+#else /* CONFIG_PRESCAN_RXQ */
+static int prescan_receive_queue;
+
+static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr,
+			struct hfi1_other_headers *ohdr,
+			u64 rhf, struct ib_grh *grh)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	u32 bth1;
+	u8 sc5, svc_type;
+	int is_fecn, is_becn;
+
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_UD:
+		svc_type = IB_CC_SVCTYPE_UD;
+		break;
+	case IB_QPT_UC:	/* LATER */
+	case IB_QPT_RC:	/* LATER */
+	default:
+		return;
+	}
+
+	is_fecn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
+			HFI1_FECN_MASK;
+	is_becn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
+			HFI1_BECN_MASK;
+
+	sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+	if (rhf_dc_info(rhf))
+		sc5 |= 0x10;
+
+	if (is_fecn) {
+		u32 src_qpn = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
+		u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+		u16 dlid = be16_to_cpu(hdr->lrh[1]);
+		u16 slid = be16_to_cpu(hdr->lrh[3]);
+
+		return_cnp(ibp, qp, src_qpn, pkey, dlid, slid, sc5, grh);
+	}
+
+	if (is_becn) {
+		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+		u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+		u8 sl = ibp->sc_to_sl[sc5];
+
+		process_becn(ppd, sl, 0, lqpn, 0, svc_type);
+	}
+
+	/* turn off BECN, or FECN */
+	bth1 = be32_to_cpu(ohdr->bth[1]);
+	bth1 &= ~(HFI1_FECN_MASK << HFI1_FECN_SHIFT);
+	bth1 &= ~(HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+	ohdr->bth[1] = cpu_to_be32(bth1);
+}
+
+struct ps_mdata {
+	struct hfi1_ctxtdata *rcd;
+	u32 rsize;
+	u32 maxcnt;
+	u32 ps_head;
+	u32 ps_tail;
+	u32 ps_seq;
+};
+
+static inline void init_ps_mdata(struct ps_mdata *mdata,
+				 struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+
+	mdata->rcd = rcd;
+	mdata->rsize = packet->rsize;
+	mdata->maxcnt = packet->maxcnt;
+
+	if (rcd->ps_state.initialized == 0) {
+		mdata->ps_head = packet->rhqoff;
+		rcd->ps_state.initialized++;
+	} else
+		mdata->ps_head = rcd->ps_state.ps_head;
+
+	if (HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+		mdata->ps_tail = packet->hdrqtail;
+		mdata->ps_seq = 0; /* not used with DMA_RTAIL */
+	} else {
+		mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
+		mdata->ps_seq = rcd->seq_cnt;
+	}
+}
+
+static inline int ps_done(struct ps_mdata *mdata, u64 rhf)
+{
+	if (HFI1_CAP_IS_KSET(DMA_RTAIL))
+		return mdata->ps_head == mdata->ps_tail;
+	return mdata->ps_seq != rhf_rcv_seq(rhf);
+}
+
+static inline void update_ps_mdata(struct ps_mdata *mdata)
+{
+	struct hfi1_ctxtdata *rcd = mdata->rcd;
+
+	mdata->ps_head += mdata->rsize;
+	if (mdata->ps_head > mdata->maxcnt)
+		mdata->ps_head = 0;
+	rcd->ps_state.ps_head = mdata->ps_head;
+	if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+		if (++mdata->ps_seq > 13)
+			mdata->ps_seq = 1;
+	}
+}
+
+/*
+ * prescan_rxq - search through the receive queue looking for packets
+ * containing Excplicit Congestion Notifications (FECNs, or BECNs).
+ * When an ECN is found, process the Congestion Notification, and toggle
+ * it off.
+ */
+static void prescan_rxq(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct ps_mdata mdata;
+
+	if (!prescan_receive_queue)
+		return;
+
+	init_ps_mdata(&mdata, packet);
+
+	while (1) {
+		struct hfi1_devdata *dd = rcd->dd;
+		struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
+		__le32 *rhf_addr = (__le32 *) rcd->rcvhdrq + mdata.ps_head +
+					 dd->rhf_offset;
+		struct hfi1_qp *qp;
+		struct hfi1_ib_header *hdr;
+		struct hfi1_other_headers *ohdr;
+		struct ib_grh *grh = NULL;
+		u64 rhf = rhf_to_cpu(rhf_addr);
+		u32 etype = rhf_rcv_type(rhf), qpn;
+		int is_ecn = 0;
+		u8 lnh;
+
+		if (ps_done(&mdata, rhf))
+			break;
+
+		if (etype != RHF_RCV_TYPE_IB)
+			goto next;
+
+		hdr = (struct hfi1_ib_header *)
+			hfi1_get_msgheader(dd, rhf_addr);
+		lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+
+		if (lnh == HFI1_LRH_BTH)
+			ohdr = &hdr->u.oth;
+		else if (lnh == HFI1_LRH_GRH) {
+			ohdr = &hdr->u.l.oth;
+			grh = &hdr->u.l.grh;
+		} else
+			goto next; /* just in case */
+
+		is_ecn |= be32_to_cpu(ohdr->bth[1]) &
+			(HFI1_FECN_MASK << HFI1_FECN_SHIFT);
+		is_ecn |= be32_to_cpu(ohdr->bth[1]) &
+			(HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+
+		if (!is_ecn)
+			goto next;
+
+		qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+		rcu_read_lock();
+		qp = hfi1_lookup_qpn(ibp, qpn);
+
+		if (qp == NULL) {
+			rcu_read_unlock();
+			goto next;
+		}
+
+		process_ecn(qp, hdr, ohdr, rhf, grh);
+		rcu_read_unlock();
+next:
+		update_ps_mdata(&mdata);
+	}
+}
+#endif /* CONFIG_PRESCAN_RXQ */
+
+#define RCV_PKT_OK 0x0
+#define RCV_PKT_MAX 0x1
+
+static inline int process_rcv_packet(struct hfi1_packet *packet)
+{
+	int ret = RCV_PKT_OK;
+
+	packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
+					 packet->rhf_addr);
+	packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
+	packet->etype = rhf_rcv_type(packet->rhf);
+	/* total length */
+	packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+	/* retrieve eager buffer details */
+	packet->ebuf = NULL;
+	if (rhf_use_egr_bfr(packet->rhf)) {
+		packet->etail = rhf_egr_index(packet->rhf);
+		packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
+				 &packet->updegr);
+		/*
+		 * Prefetch the contents of the eager buffer.  It is
+		 * OK to send a negative length to prefetch_range().
+		 * The +2 is the size of the RHF.
+		 */
+		prefetch_range(packet->ebuf,
+			packet->tlen - ((packet->rcd->rcvhdrqentsize -
+				  (rhf_hdrq_offset(packet->rhf)+2)) * 4));
+	}
+
+	/*
+	 * Call a type specific handler for the packet. We
+	 * should be able to trust that etype won't be beyond
+	 * the range of valid indexes. If so something is really
+	 * wrong and we can probably just let things come
+	 * crashing down. There is no need to eat another
+	 * comparison in this performance critical code.
+	 */
+	packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
+	packet->numpkt++;
+
+	/* Set up for the next packet */
+	packet->rhqoff += packet->rsize;
+	if (packet->rhqoff >= packet->maxcnt)
+		packet->rhqoff = 0;
+
+	if (packet->numpkt == MAX_PKT_RECV) {
+		ret = RCV_PKT_MAX;
+		this_cpu_inc(*packet->rcd->dd->rcv_limit);
+	}
+
+	packet->rhf_addr = (__le32 *) packet->rcd->rcvhdrq + packet->rhqoff +
+				      packet->rcd->dd->rhf_offset;
+	packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+	return ret;
+}
+
+static inline void process_rcv_update(int last, struct hfi1_packet *packet)
+{
+	/*
+	 * Update head regs etc., every 16 packets, if not last pkt,
+	 * to help prevent rcvhdrq overflows, when many packets
+	 * are processed and queue is nearly full.
+	 * Don't request an interrupt for intermediate updates.
+	 */
+	if (!last && !(packet->numpkt & 0xf)) {
+		update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
+			       packet->etail, 0, 0);
+		packet->updegr = 0;
+	}
+	packet->rcv_flags = 0;
+}
+
+static inline void finish_packet(struct hfi1_packet *packet)
+{
+
+	/*
+	 * Nothing we need to free for the packet.
+	 *
+	 * The only thing we need to do is a final update and call for an
+	 * interrupt
+	 */
+	update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
+		       packet->etail, rcv_intr_dynamic, packet->numpkt);
+
+}
+
+static inline void process_rcv_qp_work(struct hfi1_packet *packet)
+{
+
+	struct hfi1_ctxtdata *rcd;
+	struct hfi1_qp *qp, *nqp;
+
+	rcd = packet->rcd;
+	rcd->head = packet->rhqoff;
+
+	/*
+	 * Iterate over all QPs waiting to respond.
+	 * The list won't change since the IRQ is only run on one CPU.
+	 */
+	list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
+		list_del_init(&qp->rspwait);
+		if (qp->r_flags & HFI1_R_RSP_NAK) {
+			qp->r_flags &= ~HFI1_R_RSP_NAK;
+			hfi1_send_rc_ack(rcd, qp, 0);
+		}
+		if (qp->r_flags & HFI1_R_RSP_SEND) {
+			unsigned long flags;
+
+			qp->r_flags &= ~HFI1_R_RSP_SEND;
+			spin_lock_irqsave(&qp->s_lock, flags);
+			if (ib_hfi1_state_ops[qp->state] &
+					HFI1_PROCESS_OR_FLUSH_SEND)
+				hfi1_schedule_send(qp);
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+		}
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+}
+
+/*
+ * Handle receive interrupts when using the no dma rtail option.
+ */
+void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd)
+{
+	u32 seq;
+	int last = 0;
+	struct hfi1_packet packet;
+
+	init_packet(rcd, &packet);
+	seq = rhf_rcv_seq(packet.rhf);
+	if (seq != rcd->seq_cnt)
+		goto bail;
+
+	prescan_rxq(&packet);
+
+	while (!last) {
+		last = process_rcv_packet(&packet);
+		seq = rhf_rcv_seq(packet.rhf);
+		if (++rcd->seq_cnt > 13)
+			rcd->seq_cnt = 1;
+		if (seq != rcd->seq_cnt)
+			last = 1;
+		process_rcv_update(last, &packet);
+	}
+	process_rcv_qp_work(&packet);
+bail:
+	finish_packet(&packet);
+}
+
+void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd)
+{
+	u32 hdrqtail;
+	int last = 0;
+	struct hfi1_packet packet;
+
+	init_packet(rcd, &packet);
+	hdrqtail = get_rcvhdrtail(rcd);
+	if (packet.rhqoff == hdrqtail)
+		goto bail;
+	smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+
+	prescan_rxq(&packet);
+
+	while (!last) {
+		last = process_rcv_packet(&packet);
+		if (packet.rhqoff == hdrqtail)
+			last = 1;
+		process_rcv_update(last, &packet);
+	}
+	process_rcv_qp_work(&packet);
+bail:
+	finish_packet(&packet);
+
+}
+
+static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < dd->first_user_ctxt; i++)
+		dd->rcd[i]->do_interrupt =
+			&handle_receive_interrupt_nodma_rtail;
+}
+
+static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < dd->first_user_ctxt; i++)
+		dd->rcd[i]->do_interrupt =
+			&handle_receive_interrupt_dma_rtail;
+}
+
+/*
+ * handle_receive_interrupt - receive a packet
+ * @rcd: the context
+ *
+ * Called from interrupt handler for errors or receive interrupt.
+ * This is the slow path interrupt handler.
+ */
+void handle_receive_interrupt(struct hfi1_ctxtdata *rcd)
+{
+
+	struct hfi1_devdata *dd = rcd->dd;
+	u32 hdrqtail;
+	int last = 0, needset = 1;
+	struct hfi1_packet packet;
+
+	init_packet(rcd, &packet);
+
+	if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+		u32 seq = rhf_rcv_seq(packet.rhf);
+
+		if (seq != rcd->seq_cnt)
+			goto bail;
+		hdrqtail = 0;
+	} else {
+		hdrqtail = get_rcvhdrtail(rcd);
+		if (packet.rhqoff == hdrqtail)
+			goto bail;
+		smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+	}
+
+	prescan_rxq(&packet);
+
+	while (!last) {
+
+		if (unlikely(dd->do_drop && atomic_xchg(&dd->drop_packet,
+			DROP_PACKET_OFF) == DROP_PACKET_ON)) {
+			dd->do_drop = 0;
+
+			/* On to the next packet */
+			packet.rhqoff += packet.rsize;
+			packet.rhf_addr = (__le32 *) rcd->rcvhdrq +
+					  packet.rhqoff +
+					  dd->rhf_offset;
+			packet.rhf = rhf_to_cpu(packet.rhf_addr);
+
+		} else {
+			last = process_rcv_packet(&packet);
+		}
+
+		if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+			u32 seq = rhf_rcv_seq(packet.rhf);
+
+			if (++rcd->seq_cnt > 13)
+				rcd->seq_cnt = 1;
+			if (seq != rcd->seq_cnt)
+				last = 1;
+			if (needset) {
+				dd_dev_info(dd,
+					"Switching to NO_DMA_RTAIL\n");
+				set_all_nodma_rtail(dd);
+				needset = 0;
+			}
+		} else {
+			if (packet.rhqoff == hdrqtail)
+				last = 1;
+			if (needset) {
+				dd_dev_info(dd,
+					    "Switching to DMA_RTAIL\n");
+				set_all_dma_rtail(dd);
+				needset = 0;
+			}
+		}
+
+		process_rcv_update(last, &packet);
+	}
+
+	process_rcv_qp_work(&packet);
+
+bail:
+	/*
+	 * Always write head at end, and setup rcv interrupt, even
+	 * if no packets were processed.
+	 */
+	finish_packet(&packet);
+}
+
+/*
+ * Convert a given MTU size to the on-wire MAD packet enumeration.
+ * Return -1 if the size is invalid.
+ */
+int mtu_to_enum(u32 mtu, int default_if_bad)
+{
+	switch (mtu) {
+	case     0: return OPA_MTU_0;
+	case   256: return OPA_MTU_256;
+	case   512: return OPA_MTU_512;
+	case  1024: return OPA_MTU_1024;
+	case  2048: return OPA_MTU_2048;
+	case  4096: return OPA_MTU_4096;
+	case  8192: return OPA_MTU_8192;
+	case 10240: return OPA_MTU_10240;
+	}
+	return default_if_bad;
+}
+
+u16 enum_to_mtu(int mtu)
+{
+	switch (mtu) {
+	case OPA_MTU_0:     return 0;
+	case OPA_MTU_256:   return 256;
+	case OPA_MTU_512:   return 512;
+	case OPA_MTU_1024:  return 1024;
+	case OPA_MTU_2048:  return 2048;
+	case OPA_MTU_4096:  return 4096;
+	case OPA_MTU_8192:  return 8192;
+	case OPA_MTU_10240: return 10240;
+	default: return 0xffff;
+	}
+}
+
+/*
+ * set_mtu - set the MTU
+ * @ppd: the per port data
+ *
+ * We can handle "any" incoming size, the issue here is whether we
+ * need to restrict our outgoing size.  We do not deal with what happens
+ * to programs that are already running when the size changes.
+ */
+int set_mtu(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	int i, drain, ret = 0, is_up = 0;
+
+	ppd->ibmtu = 0;
+	for (i = 0; i < ppd->vls_supported; i++)
+		if (ppd->ibmtu < dd->vld[i].mtu)
+			ppd->ibmtu = dd->vld[i].mtu;
+	ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
+
+	mutex_lock(&ppd->hls_lock);
+	if (ppd->host_link_state == HLS_UP_INIT
+			|| ppd->host_link_state == HLS_UP_ARMED
+			|| ppd->host_link_state == HLS_UP_ACTIVE)
+		is_up = 1;
+
+	drain = !is_ax(dd) && is_up;
+
+	if (drain)
+		/*
+		 * MTU is specified per-VL. To ensure that no packet gets
+		 * stuck (due, e.g., to the MTU for the packet's VL being
+		 * reduced), empty the per-VL FIFOs before adjusting MTU.
+		 */
+		ret = stop_drain_data_vls(dd);
+
+	if (ret) {
+		dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
+			   __func__);
+		goto err;
+	}
+
+	hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
+
+	if (drain)
+		open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+	mutex_unlock(&ppd->hls_lock);
+
+	return ret;
+}
+
+int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+
+	ppd->lid = lid;
+	ppd->lmc = lmc;
+	hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
+
+	dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid);
+
+	return 0;
+}
+
+/*
+ * Following deal with the "obviously simple" task of overriding the state
+ * of the LEDs, which normally indicate link physical and logical status.
+ * The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts.
+ * and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void run_led_override(unsigned long opaque)
+{
+	struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
+	struct hfi1_devdata *dd = ppd->dd;
+	int timeoff;
+	int ph_idx;
+
+	if (!(dd->flags & HFI1_INITTED))
+		return;
+
+	ph_idx = ppd->led_override_phase++ & 1;
+	ppd->led_override = ppd->led_override_vals[ph_idx];
+	timeoff = ppd->led_override_timeoff;
+
+	/*
+	 * don't re-fire the timer if user asked for it to be off; we let
+	 * it fire one more time after they turn it off to simplify
+	 */
+	if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+		mod_timer(&ppd->led_override_timer, jiffies + timeoff);
+}
+
+void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	int timeoff, freq;
+
+	if (!(dd->flags & HFI1_INITTED))
+		return;
+
+	/* First check if we are blinking. If not, use 1HZ polling */
+	timeoff = HZ;
+	freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+	if (freq) {
+		/* For blink, set each phase from one nybble of val */
+		ppd->led_override_vals[0] = val & 0xF;
+		ppd->led_override_vals[1] = (val >> 4) & 0xF;
+		timeoff = (HZ << 4)/freq;
+	} else {
+		/* Non-blink set both phases the same. */
+		ppd->led_override_vals[0] = val & 0xF;
+		ppd->led_override_vals[1] = val & 0xF;
+	}
+	ppd->led_override_timeoff = timeoff;
+
+	/*
+	 * If the timer has not already been started, do so. Use a "quick"
+	 * timeout so the function will be called soon, to look at our request.
+	 */
+	if (atomic_inc_return(&ppd->led_override_timer_active) == 1) {
+		/* Need to start timer */
+		init_timer(&ppd->led_override_timer);
+		ppd->led_override_timer.function = run_led_override;
+		ppd->led_override_timer.data = (unsigned long) ppd;
+		ppd->led_override_timer.expires = jiffies + 1;
+		add_timer(&ppd->led_override_timer);
+	} else {
+		if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+			mod_timer(&ppd->led_override_timer, jiffies + 1);
+		atomic_dec(&ppd->led_override_timer_active);
+	}
+}
+
+/**
+ * hfi1_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload).  We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.  For
+ * now, we only allow this if no user contexts are open that use chip resources
+ */
+int hfi1_reset_device(int unit)
+{
+	int ret, i;
+	struct hfi1_devdata *dd = hfi1_lookup(unit);
+	struct hfi1_pportdata *ppd;
+	unsigned long flags;
+	int pidx;
+
+	if (!dd) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	dd_dev_info(dd, "Reset on unit %u requested\n", unit);
+
+	if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
+		dd_dev_info(dd,
+			"Invalid unit number %u or not initialized or not present\n",
+			unit);
+		ret = -ENXIO;
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	if (dd->rcd)
+		for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+			if (!dd->rcd[i] || !dd->rcd[i]->cnt)
+				continue;
+			spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+			ret = -EBUSY;
+			goto bail;
+		}
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (atomic_read(&ppd->led_override_timer_active)) {
+			/* Need to stop LED timer, _then_ shut off LEDs */
+			del_timer_sync(&ppd->led_override_timer);
+			atomic_set(&ppd->led_override_timer_active, 0);
+		}
+
+		/* Shut off LEDs after we are sure timer is not running */
+		ppd->led_override = LED_OVER_BOTH_OFF;
+	}
+	if (dd->flags & HFI1_HAS_SEND_DMA)
+		sdma_exit(dd);
+
+	hfi1_reset_cpu_counters(dd);
+
+	ret = hfi1_init(dd, 1);
+
+	if (ret)
+		dd_dev_err(dd,
+			"Reinitialize unit %u after reset failed with %d\n",
+			unit, ret);
+	else
+		dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
+			unit);
+
+bail:
+	return ret;
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	u32 rte = rhf_rcv_type_err(packet->rhf);
+
+	dd_dev_err(rcd->dd,
+		"receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+		rcd->ctxt, packet->rhf,
+		packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+		packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+		packet->rhf & RHF_DC_ERR ? "dc " : "",
+		packet->rhf & RHF_TID_ERR ? "tid " : "",
+		packet->rhf & RHF_LEN_ERR ? "len " : "",
+		packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+		packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+		packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+		rte);
+
+	rcv_hdrerr(rcd, rcd->ppd, packet);
+}
+
+/*
+ * The following functions are called by the interrupt handler. They are type
+ * specific handlers for each packet type.
+ */
+int process_receive_ib(struct hfi1_packet *packet)
+{
+	trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
+			  packet->rcd->ctxt,
+			  rhf_err_flags(packet->rhf),
+			  RHF_RCV_TYPE_IB,
+			  packet->hlen,
+			  packet->tlen,
+			  packet->updegr,
+			  rhf_egr_index(packet->rhf));
+
+	if (unlikely(rhf_err_flags(packet->rhf))) {
+		handle_eflags(packet);
+		return RHF_RCV_CONTINUE;
+	}
+
+	hfi1_ib_rcv(packet);
+	return RHF_RCV_CONTINUE;
+}
+
+int process_receive_bypass(struct hfi1_packet *packet)
+{
+	if (unlikely(rhf_err_flags(packet->rhf)))
+		handle_eflags(packet);
+
+	dd_dev_err(packet->rcd->dd,
+	   "Bypass packets are not supported in normal operation. Dropping\n");
+	return RHF_RCV_CONTINUE;
+}
+
+int process_receive_error(struct hfi1_packet *packet)
+{
+	handle_eflags(packet);
+
+	if (unlikely(rhf_err_flags(packet->rhf)))
+		dd_dev_err(packet->rcd->dd,
+			   "Unhandled error packet received. Dropping.\n");
+
+	return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_expected(struct hfi1_packet *packet)
+{
+	if (unlikely(rhf_err_flags(packet->rhf)))
+		handle_eflags(packet);
+
+	dd_dev_err(packet->rcd->dd,
+		   "Unhandled expected packet received. Dropping.\n");
+	return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_eager(struct hfi1_packet *packet)
+{
+	if (unlikely(rhf_err_flags(packet->rhf)))
+		handle_eflags(packet);
+
+	dd_dev_err(packet->rcd->dd,
+		   "Unhandled eager packet received. Dropping.\n");
+	return RHF_RCV_CONTINUE;
+}
+
+int process_receive_invalid(struct hfi1_packet *packet)
+{
+	dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
+		rhf_rcv_type(packet->rhf));
+	return RHF_RCV_CONTINUE;
+}
diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c
new file mode 100644
index 0000000..b61d3ae
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/eprom.c
@@ -0,0 +1,475 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/delay.h>
+#include "hfi.h"
+#include "common.h"
+#include "eprom.h"
+
+/*
+ * The EPROM is logically divided into two partitions:
+ *	partition 0: the first 128K, visible from PCI ROM BAR
+ *	partition 1: the rest
+ */
+#define P0_SIZE (128 * 1024)
+#define P1_START P0_SIZE
+
+/* largest erase size supported by the controller */
+#define SIZE_32KB (32 * 1024)
+#define MASK_32KB (SIZE_32KB - 1)
+
+/* controller page size, in bytes */
+#define EP_PAGE_SIZE 256
+#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1)
+
+/* controller commands */
+#define CMD_SHIFT 24
+#define CMD_NOP			    (0)
+#define CMD_PAGE_PROGRAM(addr)	    ((0x02 << CMD_SHIFT) | addr)
+#define CMD_READ_DATA(addr)	    ((0x03 << CMD_SHIFT) | addr)
+#define CMD_READ_SR1		    ((0x05 << CMD_SHIFT))
+#define CMD_WRITE_ENABLE	    ((0x06 << CMD_SHIFT))
+#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr)
+#define CMD_CHIP_ERASE		    ((0x60 << CMD_SHIFT))
+#define CMD_READ_MANUF_DEV_ID	    ((0x90 << CMD_SHIFT))
+#define CMD_RELEASE_POWERDOWN_NOID  ((0xab << CMD_SHIFT))
+
+/* controller interface speeds */
+#define EP_SPEED_FULL 0x2	/* full speed */
+
+/* controller status register 1 bits */
+#define SR1_BUSY 0x1ull		/* the BUSY bit in SR1 */
+
+/* sleep length while waiting for controller */
+#define WAIT_SLEEP_US 100	/* must be larger than 5 (see usage) */
+#define COUNT_DELAY_SEC(n) ((n) * (1000000/WAIT_SLEEP_US))
+
+/* GPIO pins */
+#define EPROM_WP_N (1ull << 14)	/* EPROM write line */
+
+/*
+ * Use the EP mutex to guard against other callers from within the driver.
+ * Also covers usage of eprom_available.
+ */
+static DEFINE_MUTEX(eprom_mutex);
+static int eprom_available;	/* default: not available */
+
+/*
+ * Turn on external enable line that allows writing on the flash.
+ */
+static void write_enable(struct hfi1_devdata *dd)
+{
+	/* raise signal */
+	write_csr(dd, ASIC_GPIO_OUT,
+		read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N);
+	/* raise enable */
+	write_csr(dd, ASIC_GPIO_OE,
+		read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N);
+}
+
+/*
+ * Turn off external enable line that allows writing on the flash.
+ */
+static void write_disable(struct hfi1_devdata *dd)
+{
+	/* lower signal */
+	write_csr(dd, ASIC_GPIO_OUT,
+		read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N);
+	/* lower enable */
+	write_csr(dd, ASIC_GPIO_OE,
+		read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N);
+}
+
+/*
+ * Wait for the device to become not busy.  Must be called after all
+ * write or erase operations.
+ */
+static int wait_for_not_busy(struct hfi1_devdata *dd)
+{
+	unsigned long count = 0;
+	u64 reg;
+	int ret = 0;
+
+	/* starts page mode */
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1);
+	while (1) {
+		udelay(WAIT_SLEEP_US);
+		usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5);
+		count++;
+		reg = read_csr(dd, ASIC_EEP_DATA);
+		if ((reg & SR1_BUSY) == 0)
+			break;
+		/* 200s is the largest time for a 128Mb device */
+		if (count > COUNT_DELAY_SEC(200)) {
+			dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n");
+			ret = -ETIMEDOUT;
+			break; /* break, not goto - must stop page mode */
+		}
+	}
+
+	/* stop page mode with a NOP */
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP);
+
+	return ret;
+}
+
+/*
+ * Read the device ID from the SPI controller.
+ */
+static u32 read_device_id(struct hfi1_devdata *dd)
+{
+	/* read the Manufacture Device ID */
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID);
+	return (u32)read_csr(dd, ASIC_EEP_DATA);
+}
+
+/*
+ * Erase the whole flash.
+ */
+static int erase_chip(struct hfi1_devdata *dd)
+{
+	int ret;
+
+	write_enable(dd);
+
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE);
+	ret = wait_for_not_busy(dd);
+
+	write_disable(dd);
+
+	return ret;
+}
+
+/*
+ * Erase a range using the 32KB erase command.
+ */
+static int erase_32kb_range(struct hfi1_devdata *dd, u32 start, u32 end)
+{
+	int ret = 0;
+
+	if (end < start)
+		return -EINVAL;
+
+	if ((start & MASK_32KB) || (end & MASK_32KB)) {
+		dd_dev_err(dd,
+			"%s: non-aligned range (0x%x,0x%x) for a 32KB erase\n",
+			__func__, start, end);
+		return -EINVAL;
+	}
+
+	write_enable(dd);
+
+	for (; start < end; start += SIZE_32KB) {
+		write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+		write_csr(dd, ASIC_EEP_ADDR_CMD,
+						CMD_SECTOR_ERASE_32KB(start));
+		ret = wait_for_not_busy(dd);
+		if (ret)
+			goto done;
+	}
+
+done:
+	write_disable(dd);
+
+	return ret;
+}
+
+/*
+ * Read a 256 byte (64 dword) EPROM page.
+ * All callers have verified the offset is at a page boundary.
+ */
+static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result)
+{
+	int i;
+
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset));
+	for (i = 0; i < EP_PAGE_SIZE/sizeof(u32); i++)
+		result[i] = (u32)read_csr(dd, ASIC_EEP_DATA);
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */
+}
+
+/*
+ * Read length bytes starting at offset.  Copy to user address addr.
+ */
+static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
+{
+	u32 offset;
+	u32 buffer[EP_PAGE_SIZE/sizeof(u32)];
+	int ret = 0;
+
+	/* reject anything not on an EPROM page boundary */
+	if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
+		return -EINVAL;
+
+	for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
+		read_page(dd, start + offset, buffer);
+		if (copy_to_user((void __user *)(addr + offset),
+						buffer, EP_PAGE_SIZE)) {
+			ret = -EFAULT;
+			goto done;
+		}
+	}
+
+done:
+	return ret;
+}
+
+/*
+ * Write a 256 byte (64 dword) EPROM page.
+ * All callers have verified the offset is at a page boundary.
+ */
+static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data)
+{
+	int i;
+
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+	write_csr(dd, ASIC_EEP_DATA, data[0]);
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset));
+	for (i = 1; i < EP_PAGE_SIZE/sizeof(u32); i++)
+		write_csr(dd, ASIC_EEP_DATA, data[i]);
+	/* will close the open page */
+	return wait_for_not_busy(dd);
+}
+
+/*
+ * Write length bytes starting at offset.  Read from user address addr.
+ */
+static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
+{
+	u32 offset;
+	u32 buffer[EP_PAGE_SIZE/sizeof(u32)];
+	int ret = 0;
+
+	/* reject anything not on an EPROM page boundary */
+	if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
+		return -EINVAL;
+
+	write_enable(dd);
+
+	for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
+		if (copy_from_user(buffer, (void __user *)(addr + offset),
+						EP_PAGE_SIZE)) {
+			ret = -EFAULT;
+			goto done;
+		}
+		ret = write_page(dd, start + offset, buffer);
+		if (ret)
+			goto done;
+	}
+
+done:
+	write_disable(dd);
+	return ret;
+}
+
+/*
+ * Perform the given operation on the EPROM.  Called from user space.  The
+ * user credentials have already been checked.
+ *
+ * Return 0 on success, -ERRNO on error
+ */
+int handle_eprom_command(const struct hfi1_cmd *cmd)
+{
+	struct hfi1_devdata *dd;
+	u32 dev_id;
+	int ret = 0;
+
+	/*
+	 * The EPROM is per-device, so use unit 0 as that will always
+	 * exist.
+	 */
+	dd = hfi1_lookup(0);
+	if (!dd) {
+		pr_err("%s: cannot find unit 0!\n", __func__);
+		return -EINVAL;
+	}
+
+	/* lock against other callers touching the ASIC block */
+	mutex_lock(&eprom_mutex);
+
+	/* some platforms do not have an EPROM */
+	if (!eprom_available) {
+		ret = -ENOSYS;
+		goto done_asic;
+	}
+
+	/* lock against the other HFI on another OS */
+	ret = acquire_hw_mutex(dd);
+	if (ret) {
+		dd_dev_err(dd,
+			"%s: unable to acquire hw mutex, no EPROM support\n",
+			__func__);
+		goto done_asic;
+	}
+
+	dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n",
+		__func__, cmd->type, cmd->len, cmd->addr);
+
+	switch (cmd->type) {
+	case HFI1_CMD_EP_INFO:
+		if (cmd->len != sizeof(u32)) {
+			ret = -ERANGE;
+			break;
+		}
+		dev_id = read_device_id(dd);
+		/* addr points to a u32 user buffer */
+		if (copy_to_user((void __user *)cmd->addr, &dev_id,
+								sizeof(u32)))
+			ret = -EFAULT;
+		break;
+	case HFI1_CMD_EP_ERASE_CHIP:
+		ret = erase_chip(dd);
+		break;
+	case HFI1_CMD_EP_ERASE_P0:
+		if (cmd->len != P0_SIZE) {
+			ret = -ERANGE;
+			break;
+		}
+		ret = erase_32kb_range(dd, 0, cmd->len);
+		break;
+	case HFI1_CMD_EP_ERASE_P1:
+		/* check for overflow */
+		if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+			ret = -ERANGE;
+			break;
+		}
+		ret = erase_32kb_range(dd, P1_START, P1_START + cmd->len);
+		break;
+	case HFI1_CMD_EP_READ_P0:
+		if (cmd->len != P0_SIZE) {
+			ret = -ERANGE;
+			break;
+		}
+		ret = read_length(dd, 0, cmd->len, cmd->addr);
+		break;
+	case HFI1_CMD_EP_READ_P1:
+		/* check for overflow */
+		if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+			ret = -ERANGE;
+			break;
+		}
+		ret = read_length(dd, P1_START, cmd->len, cmd->addr);
+		break;
+	case HFI1_CMD_EP_WRITE_P0:
+		if (cmd->len > P0_SIZE) {
+			ret = -ERANGE;
+			break;
+		}
+		ret = write_length(dd, 0, cmd->len, cmd->addr);
+		break;
+	case HFI1_CMD_EP_WRITE_P1:
+		/* check for overflow */
+		if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+			ret = -ERANGE;
+			break;
+		}
+		ret = write_length(dd, P1_START, cmd->len, cmd->addr);
+		break;
+	default:
+		dd_dev_err(dd, "%s: unexpected command %d\n",
+			__func__, cmd->type);
+		ret = -EINVAL;
+		break;
+	}
+
+	release_hw_mutex(dd);
+done_asic:
+	mutex_unlock(&eprom_mutex);
+	return ret;
+}
+
+/*
+ * Initialize the EPROM handler.
+ */
+int eprom_init(struct hfi1_devdata *dd)
+{
+	int ret = 0;
+
+	/* only the discrete chip has an EPROM, nothing to do */
+	if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
+		return 0;
+
+	/* lock against other callers */
+	mutex_lock(&eprom_mutex);
+	if (eprom_available)	/* already initialized */
+		goto done_asic;
+
+	/*
+	 * Lock against the other HFI on another OS - the mutex above
+	 * would have caught anything in this driver.  It is OK if
+	 * both OSes reset the EPROM - as long as they don't do it at
+	 * the same time.
+	 */
+	ret = acquire_hw_mutex(dd);
+	if (ret) {
+		dd_dev_err(dd,
+			"%s: unable to acquire hw mutex, no EPROM support\n",
+			__func__);
+		goto done_asic;
+	}
+
+	/* reset EPROM to be sure it is in a good state */
+
+	/* set reset */
+	write_csr(dd, ASIC_EEP_CTL_STAT,
+					ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
+	/* clear reset, set speed */
+	write_csr(dd, ASIC_EEP_CTL_STAT,
+			EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
+
+	/* wake the device with command "release powerdown NoID" */
+	write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
+
+	eprom_available = 1;
+	release_hw_mutex(dd);
+done_asic:
+	mutex_unlock(&eprom_mutex);
+	return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/staging/rdma/hfi1/eprom.h
new file mode 100644
index 0000000..64a6427
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/eprom.h
@@ -0,0 +1,55 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_cmd;
+struct hfi1_devdata;
+
+int eprom_init(struct hfi1_devdata *dd);
+int handle_eprom_command(const struct hfi1_cmd *cmd);
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
new file mode 100644
index 0000000..4698617
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -0,0 +1,2140 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <asm/pgtable.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/cred.h>
+#include <linux/uio.h>
+
+#include "hfi.h"
+#include "pio.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "user_sdma.h"
+#include "eprom.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
+
+/*
+ * File operation functions
+ */
+static int hfi1_file_open(struct inode *, struct file *);
+static int hfi1_file_close(struct inode *, struct file *);
+static ssize_t hfi1_file_write(struct file *, const char __user *,
+			       size_t, loff_t *);
+static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
+static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
+static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
+
+static u64 kvirt_to_phys(void *);
+static int assign_ctxt(struct file *, struct hfi1_user_info *);
+static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
+static int user_init(struct file *);
+static int get_ctxt_info(struct file *, void __user *, __u32);
+static int get_base_info(struct file *, void __user *, __u32);
+static int setup_ctxt(struct file *);
+static int setup_subctxt(struct hfi1_ctxtdata *);
+static int get_user_context(struct file *, struct hfi1_user_info *,
+			    int, unsigned);
+static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
+static int allocate_ctxt(struct file *, struct hfi1_devdata *,
+			 struct hfi1_user_info *);
+static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
+static unsigned int poll_next(struct file *, struct poll_table_struct *);
+static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
+static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
+static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
+static int vma_fault(struct vm_area_struct *, struct vm_fault *);
+static int exp_tid_setup(struct file *, struct hfi1_tid_info *);
+static int exp_tid_free(struct file *, struct hfi1_tid_info *);
+static void unlock_exp_tids(struct hfi1_ctxtdata *);
+
+static const struct file_operations hfi1_file_ops = {
+	.owner = THIS_MODULE,
+	.write = hfi1_file_write,
+	.write_iter = hfi1_write_iter,
+	.open = hfi1_file_open,
+	.release = hfi1_file_close,
+	.poll = hfi1_poll,
+	.mmap = hfi1_file_mmap,
+	.llseek = noop_llseek,
+};
+
+static struct vm_operations_struct vm_ops = {
+	.fault = vma_fault,
+};
+
+/*
+ * Types of memories mapped into user processes' space
+ */
+enum mmap_types {
+	PIO_BUFS = 1,
+	PIO_BUFS_SOP,
+	PIO_CRED,
+	RCV_HDRQ,
+	RCV_EGRBUF,
+	UREGS,
+	EVENTS,
+	STATUS,
+	RTAIL,
+	SUBCTXT_UREGS,
+	SUBCTXT_RCV_HDRQ,
+	SUBCTXT_EGRBUF,
+	SDMA_COMP
+};
+
+/*
+ * Masks and offsets defining the mmap tokens
+ */
+#define HFI1_MMAP_OFFSET_MASK   0xfffULL
+#define HFI1_MMAP_OFFSET_SHIFT  0
+#define HFI1_MMAP_SUBCTXT_MASK  0xfULL
+#define HFI1_MMAP_SUBCTXT_SHIFT 12
+#define HFI1_MMAP_CTXT_MASK     0xffULL
+#define HFI1_MMAP_CTXT_SHIFT    16
+#define HFI1_MMAP_TYPE_MASK     0xfULL
+#define HFI1_MMAP_TYPE_SHIFT    24
+#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
+#define HFI1_MMAP_MAGIC_SHIFT   32
+
+#define HFI1_MMAP_MAGIC         0xdabbad00
+
+#define HFI1_MMAP_TOKEN_SET(field, val)	\
+	(((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
+#define HFI1_MMAP_TOKEN_GET(field, token) \
+	(((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
+#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
+	(HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
+	HFI1_MMAP_TOKEN_SET(TYPE, type) | \
+	HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
+	HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
+	HFI1_MMAP_TOKEN_SET(OFFSET, ((unsigned long)addr & ~PAGE_MASK)))
+
+#define EXP_TID_SET(field, value)			\
+	(((value) & EXP_TID_TID##field##_MASK) <<	\
+	 EXP_TID_TID##field##_SHIFT)
+#define EXP_TID_CLEAR(tid, field) {					\
+		(tid) &= ~(EXP_TID_TID##field##_MASK <<			\
+			   EXP_TID_TID##field##_SHIFT);			\
+			}
+#define EXP_TID_RESET(tid, field, value) do {				\
+		EXP_TID_CLEAR(tid, field);				\
+		(tid) |= EXP_TID_SET(field, value);			\
+	} while (0)
+
+#define dbg(fmt, ...)				\
+	pr_info(fmt, ##__VA_ARGS__)
+
+
+static inline int is_valid_mmap(u64 token)
+{
+	return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
+}
+
+static int hfi1_file_open(struct inode *inode, struct file *fp)
+{
+	/* The real work is performed later in assign_ctxt() */
+	fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
+	if (fp->private_data) /* no cpu affinity by default */
+		((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
+	return fp->private_data ? 0 : -ENOMEM;
+}
+
+static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
+			       size_t count, loff_t *offset)
+{
+	const struct hfi1_cmd __user *ucmd;
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_cmd cmd;
+	struct hfi1_user_info uinfo;
+	struct hfi1_tid_info tinfo;
+	ssize_t consumed = 0, copy = 0, ret = 0;
+	void *dest = NULL;
+	__u64 user_val = 0;
+	int uctxt_required = 1;
+	int must_be_root = 0;
+
+	if (count < sizeof(cmd)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ucmd = (const struct hfi1_cmd __user *)data;
+	if (copy_from_user(&cmd, ucmd, sizeof(cmd))) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	consumed = sizeof(cmd);
+
+	switch (cmd.type) {
+	case HFI1_CMD_ASSIGN_CTXT:
+		uctxt_required = 0;	/* assigned user context not required */
+		copy = sizeof(uinfo);
+		dest = &uinfo;
+		break;
+	case HFI1_CMD_SDMA_STATUS_UPD:
+	case HFI1_CMD_CREDIT_UPD:
+		copy = 0;
+		break;
+	case HFI1_CMD_TID_UPDATE:
+	case HFI1_CMD_TID_FREE:
+		copy = sizeof(tinfo);
+		dest = &tinfo;
+		break;
+	case HFI1_CMD_USER_INFO:
+	case HFI1_CMD_RECV_CTRL:
+	case HFI1_CMD_POLL_TYPE:
+	case HFI1_CMD_ACK_EVENT:
+	case HFI1_CMD_CTXT_INFO:
+	case HFI1_CMD_SET_PKEY:
+	case HFI1_CMD_CTXT_RESET:
+		copy = 0;
+		user_val = cmd.addr;
+		break;
+	case HFI1_CMD_EP_INFO:
+	case HFI1_CMD_EP_ERASE_CHIP:
+	case HFI1_CMD_EP_ERASE_P0:
+	case HFI1_CMD_EP_ERASE_P1:
+	case HFI1_CMD_EP_READ_P0:
+	case HFI1_CMD_EP_READ_P1:
+	case HFI1_CMD_EP_WRITE_P0:
+	case HFI1_CMD_EP_WRITE_P1:
+		uctxt_required = 0;	/* assigned user context not required */
+		must_be_root = 1;	/* validate user */
+		copy = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* If the command comes with user data, copy it. */
+	if (copy) {
+		if (copy_from_user(dest, (void __user *)cmd.addr, copy)) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		consumed += copy;
+	}
+
+	/*
+	 * Make sure there is a uctxt when needed.
+	 */
+	if (uctxt_required && !uctxt) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* only root can do these operations */
+	if (must_be_root && !capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto bail;
+	}
+
+	switch (cmd.type) {
+	case HFI1_CMD_ASSIGN_CTXT:
+		ret = assign_ctxt(fp, &uinfo);
+		if (ret < 0)
+			goto bail;
+		ret = setup_ctxt(fp);
+		if (ret)
+			goto bail;
+		ret = user_init(fp);
+		break;
+	case HFI1_CMD_CTXT_INFO:
+		ret = get_ctxt_info(fp, (void __user *)(unsigned long)
+				    user_val, cmd.len);
+		break;
+	case HFI1_CMD_USER_INFO:
+		ret = get_base_info(fp, (void __user *)(unsigned long)
+				    user_val, cmd.len);
+		break;
+	case HFI1_CMD_SDMA_STATUS_UPD:
+		break;
+	case HFI1_CMD_CREDIT_UPD:
+		if (uctxt && uctxt->sc)
+			sc_return_credits(uctxt->sc);
+		break;
+	case HFI1_CMD_TID_UPDATE:
+		ret = exp_tid_setup(fp, &tinfo);
+		if (!ret) {
+			unsigned long addr;
+			/*
+			 * Copy the number of tidlist entries we used
+			 * and the length of the buffer we registered.
+			 * These fields are adjacent in the structure so
+			 * we can copy them at the same time.
+			 */
+			addr = (unsigned long)cmd.addr +
+				offsetof(struct hfi1_tid_info, tidcnt);
+			if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+					 sizeof(tinfo.tidcnt) +
+					 sizeof(tinfo.length)))
+				ret = -EFAULT;
+		}
+		break;
+	case HFI1_CMD_TID_FREE:
+		ret = exp_tid_free(fp, &tinfo);
+		break;
+	case HFI1_CMD_RECV_CTRL:
+		ret = manage_rcvq(uctxt, subctxt_fp(fp), (int)user_val);
+		break;
+	case HFI1_CMD_POLL_TYPE:
+		uctxt->poll_type = (typeof(uctxt->poll_type))user_val;
+		break;
+	case HFI1_CMD_ACK_EVENT:
+		ret = user_event_ack(uctxt, subctxt_fp(fp), user_val);
+		break;
+	case HFI1_CMD_SET_PKEY:
+		if (HFI1_CAP_IS_USET(PKEY_CHECK))
+			ret = set_ctxt_pkey(uctxt, subctxt_fp(fp), user_val);
+		else
+			ret = -EPERM;
+		break;
+	case HFI1_CMD_CTXT_RESET: {
+		struct send_context *sc;
+		struct hfi1_devdata *dd;
+
+		if (!uctxt || !uctxt->dd || !uctxt->sc) {
+			ret = -EINVAL;
+			break;
+		}
+		/*
+		 * There is no protection here. User level has to
+		 * guarantee that no one will be writing to the send
+		 * context while it is being re-initialized.
+		 * If user level breaks that guarantee, it will break
+		 * it's own context and no one else's.
+		 */
+		dd = uctxt->dd;
+		sc = uctxt->sc;
+		/*
+		 * Wait until the interrupt handler has marked the
+		 * context as halted or frozen. Report error if we time
+		 * out.
+		 */
+		wait_event_interruptible_timeout(
+			sc->halt_wait, (sc->flags & SCF_HALTED),
+			msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+		if (!(sc->flags & SCF_HALTED)) {
+			ret = -ENOLCK;
+			break;
+		}
+		/*
+		 * If the send context was halted due to a Freeze,
+		 * wait until the device has been "unfrozen" before
+		 * resetting the context.
+		 */
+		if (sc->flags & SCF_FROZEN) {
+			wait_event_interruptible_timeout(
+				dd->event_queue,
+				!(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
+				msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+			if (dd->flags & HFI1_FROZEN) {
+				ret = -ENOLCK;
+				break;
+			}
+			if (dd->flags & HFI1_FORCED_FREEZE) {
+				/* Don't allow context reset if we are into
+				 * forced freeze */
+				ret = -ENODEV;
+				break;
+			}
+			sc_disable(sc);
+			ret = sc_enable(sc);
+			hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
+				     uctxt->ctxt);
+		} else
+			ret = sc_restart(sc);
+		if (!ret)
+			sc_return_credits(sc);
+		break;
+	}
+	case HFI1_CMD_EP_INFO:
+	case HFI1_CMD_EP_ERASE_CHIP:
+	case HFI1_CMD_EP_ERASE_P0:
+	case HFI1_CMD_EP_ERASE_P1:
+	case HFI1_CMD_EP_READ_P0:
+	case HFI1_CMD_EP_READ_P1:
+	case HFI1_CMD_EP_WRITE_P0:
+	case HFI1_CMD_EP_WRITE_P1:
+		ret = handle_eprom_command(&cmd);
+		break;
+	}
+
+	if (ret >= 0)
+		ret = consumed;
+bail:
+	return ret;
+}
+
+static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+	struct hfi1_user_sdma_pkt_q *pq;
+	struct hfi1_user_sdma_comp_q *cq;
+	int ret = 0, done = 0, reqs = 0;
+	unsigned long dim = from->nr_segs;
+
+	if (!user_sdma_comp_fp(kiocb->ki_filp) ||
+	    !user_sdma_pkt_fp(kiocb->ki_filp)) {
+		ret = -EIO;
+		goto done;
+	}
+
+	if (!iter_is_iovec(from) || !dim) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
+		  ctxt_fp(kiocb->ki_filp)->ctxt, subctxt_fp(kiocb->ki_filp),
+		  dim);
+	pq = user_sdma_pkt_fp(kiocb->ki_filp);
+	cq = user_sdma_comp_fp(kiocb->ki_filp);
+
+	if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
+		ret = -ENOSPC;
+		goto done;
+	}
+
+	while (dim) {
+		unsigned long count = 0;
+
+		ret = hfi1_user_sdma_process_request(
+			kiocb->ki_filp,	(struct iovec *)(from->iov + done),
+			dim, &count);
+		if (ret)
+			goto done;
+		dim -= count;
+		done += count;
+		reqs++;
+	}
+done:
+	return ret ? ret : reqs;
+}
+
+static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct hfi1_ctxtdata *uctxt;
+	struct hfi1_devdata *dd;
+	unsigned long flags, pfn;
+	u64 token = vma->vm_pgoff << PAGE_SHIFT,
+		memaddr = 0;
+	u8 subctxt, mapio = 0, vmf = 0, type;
+	ssize_t memlen = 0;
+	int ret = 0;
+	u16 ctxt;
+
+	uctxt = ctxt_fp(fp);
+	if (!is_valid_mmap(token) || !uctxt ||
+	    !(vma->vm_flags & VM_SHARED)) {
+		ret = -EINVAL;
+		goto done;
+	}
+	dd = uctxt->dd;
+	ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
+	subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
+	type = HFI1_MMAP_TOKEN_GET(TYPE, token);
+	if (ctxt != uctxt->ctxt || subctxt != subctxt_fp(fp)) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	flags = vma->vm_flags;
+
+	switch (type) {
+	case PIO_BUFS:
+	case PIO_BUFS_SOP:
+		memaddr = ((dd->physaddr + TXE_PIO_SEND) +
+				/* chip pio base */
+			   (uctxt->sc->hw_context * (1 << 16))) +
+				/* 64K PIO space / ctxt */
+			(type == PIO_BUFS_SOP ?
+				(TXE_PIO_SIZE / 2) : 0); /* sop? */
+		/*
+		 * Map only the amount allocated to the context, not the
+		 * entire available context's PIO space.
+		 */
+		memlen = ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE,
+			       PAGE_SIZE);
+		flags &= ~VM_MAYREAD;
+		flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+		mapio = 1;
+		break;
+	case PIO_CRED:
+		if (flags & VM_WRITE) {
+			ret = -EPERM;
+			goto done;
+		}
+		/*
+		 * The credit return location for this context could be on the
+		 * second or third page allocated for credit returns (if number
+		 * of enabled contexts > 64 and 128 respectively).
+		 */
+		memaddr = dd->cr_base[uctxt->numa_id].pa +
+			(((u64)uctxt->sc->hw_free -
+			  (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
+		memlen = PAGE_SIZE;
+		flags &= ~VM_MAYWRITE;
+		flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		/*
+		 * The driver has already allocated memory for credit
+		 * returns and programmed it into the chip. Has that
+		 * memory been flagged as non-cached?
+		 */
+		/* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
+		mapio = 1;
+		break;
+	case RCV_HDRQ:
+		memaddr = uctxt->rcvhdrq_phys;
+		memlen = uctxt->rcvhdrq_size;
+		break;
+	case RCV_EGRBUF: {
+		unsigned long addr;
+		int i;
+		/*
+		 * The RcvEgr buffer need to be handled differently
+		 * as multiple non-contiguous pages need to be mapped
+		 * into the user process.
+		 */
+		memlen = uctxt->egrbufs.size;
+		if ((vma->vm_end - vma->vm_start) != memlen) {
+			dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
+				   (vma->vm_end - vma->vm_start), memlen);
+			ret = -EINVAL;
+			goto done;
+		}
+		if (vma->vm_flags & VM_WRITE) {
+			ret = -EPERM;
+			goto done;
+		}
+		vma->vm_flags &= ~VM_MAYWRITE;
+		addr = vma->vm_start;
+		for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
+			ret = remap_pfn_range(
+				vma, addr,
+				uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
+				uctxt->egrbufs.buffers[i].len,
+				vma->vm_page_prot);
+			if (ret < 0)
+				goto done;
+			addr += uctxt->egrbufs.buffers[i].len;
+		}
+		ret = 0;
+		goto done;
+	}
+	case UREGS:
+		/*
+		 * Map only the page that contains this context's user
+		 * registers.
+		 */
+		memaddr = (unsigned long)
+			(dd->physaddr + RXE_PER_CONTEXT_USER)
+			+ (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
+		/*
+		 * TidFlow table is on the same page as the rest of the
+		 * user registers.
+		 */
+		memlen = PAGE_SIZE;
+		flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		mapio = 1;
+		break;
+	case EVENTS:
+		/*
+		 * Use the page where this context's flags are. User level
+		 * knows where it's own bitmap is within the page.
+		 */
+		memaddr = ((unsigned long)dd->events +
+			   ((uctxt->ctxt - dd->first_user_ctxt) *
+			    HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
+		memlen = PAGE_SIZE;
+		/*
+		 * v3.7 removes VM_RESERVED but the effect is kept by
+		 * using VM_IO.
+		 */
+		flags |= VM_IO | VM_DONTEXPAND;
+		vmf = 1;
+		break;
+	case STATUS:
+		memaddr = kvirt_to_phys((void *)dd->status);
+		memlen = PAGE_SIZE;
+		flags |= VM_IO | VM_DONTEXPAND;
+		break;
+	case RTAIL:
+		if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
+			/*
+			 * If the memory allocation failed, the context alloc
+			 * also would have failed, so we would never get here
+			 */
+			ret = -EINVAL;
+			goto done;
+		}
+		if (flags & VM_WRITE) {
+			ret = -EPERM;
+			goto done;
+		}
+		memaddr = uctxt->rcvhdrqtailaddr_phys;
+		memlen = PAGE_SIZE;
+		flags &= ~VM_MAYWRITE;
+		break;
+	case SUBCTXT_UREGS:
+		memaddr = (u64)uctxt->subctxt_uregbase;
+		memlen = PAGE_SIZE;
+		flags |= VM_IO | VM_DONTEXPAND;
+		vmf = 1;
+		break;
+	case SUBCTXT_RCV_HDRQ:
+		memaddr = (u64)uctxt->subctxt_rcvhdr_base;
+		memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
+		flags |= VM_IO | VM_DONTEXPAND;
+		vmf = 1;
+		break;
+	case SUBCTXT_EGRBUF:
+		memaddr = (u64)uctxt->subctxt_rcvegrbuf;
+		memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
+		flags |= VM_IO | VM_DONTEXPAND;
+		flags &= ~VM_MAYWRITE;
+		vmf = 1;
+		break;
+	case SDMA_COMP: {
+		struct hfi1_user_sdma_comp_q *cq;
+
+		if (!user_sdma_comp_fp(fp)) {
+			ret = -EFAULT;
+			goto done;
+		}
+		cq = user_sdma_comp_fp(fp);
+		memaddr = (u64)cq->comps;
+		memlen = ALIGN(sizeof(*cq->comps) * cq->nentries, PAGE_SIZE);
+		flags |= VM_IO | VM_DONTEXPAND;
+		vmf = 1;
+		break;
+	}
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if ((vma->vm_end - vma->vm_start) != memlen) {
+		hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
+			  uctxt->ctxt, subctxt_fp(fp),
+			  (vma->vm_end - vma->vm_start), memlen);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	vma->vm_flags = flags;
+	dd_dev_info(dd,
+		    "%s: %u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
+		    __func__, ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
+		    vma->vm_end - vma->vm_start, vma->vm_flags);
+	pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
+	if (vmf) {
+		vma->vm_pgoff = pfn;
+		vma->vm_ops = &vm_ops;
+		ret = 0;
+	} else if (mapio) {
+		ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+					 vma->vm_page_prot);
+	} else {
+		ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+				      vma->vm_page_prot);
+	}
+done:
+	return ret;
+}
+
+/*
+ * Local (non-chip) user memory is not mapped right away but as it is
+ * accessed by the user-level code.
+ */
+static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page;
+
+	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+	if (!page)
+		return VM_FAULT_SIGBUS;
+
+	get_page(page);
+	vmf->page = page;
+
+	return 0;
+}
+
+static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
+{
+	struct hfi1_ctxtdata *uctxt;
+	unsigned pollflag;
+
+	uctxt = ctxt_fp(fp);
+	if (!uctxt)
+		pollflag = POLLERR;
+	else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
+		pollflag = poll_urgent(fp, pt);
+	else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
+		pollflag = poll_next(fp, pt);
+	else /* invalid */
+		pollflag = POLLERR;
+
+	return pollflag;
+}
+
+static int hfi1_file_close(struct inode *inode, struct file *fp)
+{
+	struct hfi1_filedata *fdata = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+	struct hfi1_devdata *dd;
+	unsigned long flags, *ev;
+
+	fp->private_data = NULL;
+
+	if (!uctxt)
+		goto done;
+
+	hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
+	dd = uctxt->dd;
+	mutex_lock(&hfi1_mutex);
+
+	flush_wc();
+	/* drain user sdma queue */
+	if (fdata->pq)
+		hfi1_user_sdma_free_queues(fdata);
+
+	/*
+	 * Clear any left over, unhandled events so the next process that
+	 * gets this context doesn't get confused.
+	 */
+	ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+			   HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
+	*ev = 0;
+
+	if (--uctxt->cnt) {
+		uctxt->active_slaves &= ~(1 << fdata->subctxt);
+		uctxt->subpid[fdata->subctxt] = 0;
+		mutex_unlock(&hfi1_mutex);
+		goto done;
+	}
+
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	/*
+	 * Disable receive context and interrupt available, reset all
+	 * RcvCtxtCtrl bits to default values.
+	 */
+	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+		     HFI1_RCVCTRL_TIDFLOW_DIS |
+		     HFI1_RCVCTRL_INTRAVAIL_DIS |
+		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
+		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
+		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+	/* Clear the context's J_KEY */
+	hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
+	/*
+	 * Reset context integrity checks to default.
+	 * (writes to CSRs probably belong in chip.c)
+	 */
+	write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
+			hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
+	sc_disable(uctxt->sc);
+	uctxt->pid = 0;
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+	dd->rcd[uctxt->ctxt] = NULL;
+	uctxt->rcvwait_to = 0;
+	uctxt->piowait_to = 0;
+	uctxt->rcvnowait = 0;
+	uctxt->pionowait = 0;
+	uctxt->event_flags = 0;
+
+	hfi1_clear_tids(uctxt);
+	hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
+
+	if (uctxt->tid_pg_list)
+		unlock_exp_tids(uctxt);
+
+	hfi1_stats.sps_ctxts--;
+	dd->freectxts++;
+	mutex_unlock(&hfi1_mutex);
+	hfi1_free_ctxtdata(dd, uctxt);
+done:
+	kfree(fdata);
+	return 0;
+}
+
+/*
+ * Convert kernel *virtual* addresses to physical addresses.
+ * This is used to vmalloc'ed addresses.
+ */
+static u64 kvirt_to_phys(void *addr)
+{
+	struct page *page;
+	u64 paddr = 0;
+
+	page = vmalloc_to_page(addr);
+	if (page)
+		paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+	return paddr;
+}
+
+static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
+{
+	int i_minor, ret = 0;
+	unsigned swmajor, swminor, alg = HFI1_ALG_ACROSS;
+
+	swmajor = uinfo->userversion >> 16;
+	if (swmajor != HFI1_USER_SWMAJOR) {
+		ret = -ENODEV;
+		goto done;
+	}
+
+	swminor = uinfo->userversion & 0xffff;
+
+	if (uinfo->hfi1_alg < HFI1_ALG_COUNT)
+		alg = uinfo->hfi1_alg;
+
+	mutex_lock(&hfi1_mutex);
+	/* First, lets check if we need to setup a shared context? */
+	if (uinfo->subctxt_cnt)
+		ret = find_shared_ctxt(fp, uinfo);
+
+	/*
+	 * We execute the following block if we couldn't find a
+	 * shared context or if context sharing is not required.
+	 */
+	if (!ret) {
+		i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
+		ret = get_user_context(fp, uinfo, i_minor - 1, alg);
+	}
+	mutex_unlock(&hfi1_mutex);
+done:
+	return ret;
+}
+
+static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
+			    int devno, unsigned alg)
+{
+	struct hfi1_devdata *dd = NULL;
+	int ret = 0, devmax, npresent, nup, dev;
+
+	devmax = hfi1_count_units(&npresent, &nup);
+	if (!npresent) {
+		ret = -ENXIO;
+		goto done;
+	}
+	if (!nup) {
+		ret = -ENETDOWN;
+		goto done;
+	}
+	if (devno >= 0) {
+		dd = hfi1_lookup(devno);
+		if (!dd)
+			ret = -ENODEV;
+		else if (!dd->freectxts)
+			ret = -EBUSY;
+	} else {
+		struct hfi1_devdata *pdd;
+
+		if (alg == HFI1_ALG_ACROSS) {
+			unsigned free = 0U;
+
+			for (dev = 0; dev < devmax; dev++) {
+				pdd = hfi1_lookup(dev);
+				if (pdd && pdd->freectxts &&
+				    pdd->freectxts > free) {
+					dd = pdd;
+					free = pdd->freectxts;
+				}
+			}
+		} else {
+			for (dev = 0; dev < devmax; dev++) {
+				pdd = hfi1_lookup(dev);
+				if (pdd && pdd->freectxts) {
+					dd = pdd;
+					break;
+				}
+			}
+		}
+		if (!dd)
+			ret = -EBUSY;
+	}
+done:
+	return ret ? ret : allocate_ctxt(fp, dd, uinfo);
+}
+
+static int find_shared_ctxt(struct file *fp,
+			    const struct hfi1_user_info *uinfo)
+{
+	int devmax, ndev, i;
+	int ret = 0;
+
+	devmax = hfi1_count_units(NULL, NULL);
+
+	for (ndev = 0; ndev < devmax; ndev++) {
+		struct hfi1_devdata *dd = hfi1_lookup(ndev);
+
+		/* device portion of usable() */
+		if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
+			continue;
+		for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+			struct hfi1_ctxtdata *uctxt = dd->rcd[i];
+
+			/* Skip ctxts which are not yet open */
+			if (!uctxt || !uctxt->cnt)
+				continue;
+			/* Skip ctxt if it doesn't match the requested one */
+			if (memcmp(uctxt->uuid, uinfo->uuid,
+				   sizeof(uctxt->uuid)) ||
+			    uctxt->subctxt_id != uinfo->subctxt_id ||
+			    uctxt->subctxt_cnt != uinfo->subctxt_cnt)
+				continue;
+
+			/* Verify the sharing process matches the master */
+			if (uctxt->userversion != uinfo->userversion ||
+			    uctxt->cnt >= uctxt->subctxt_cnt) {
+				ret = -EINVAL;
+				goto done;
+			}
+			ctxt_fp(fp) = uctxt;
+			subctxt_fp(fp) = uctxt->cnt++;
+			uctxt->subpid[subctxt_fp(fp)] = current->pid;
+			uctxt->active_slaves |= 1 << subctxt_fp(fp);
+			ret = 1;
+			goto done;
+		}
+	}
+
+done:
+	return ret;
+}
+
+static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
+			 struct hfi1_user_info *uinfo)
+{
+	struct hfi1_ctxtdata *uctxt;
+	unsigned ctxt;
+	int ret;
+
+	if (dd->flags & HFI1_FROZEN) {
+		/*
+		 * Pick an error that is unique from all other errors
+		 * that are returned so the user process knows that
+		 * it tried to allocate while the SPC was frozen.  It
+		 * it should be able to retry with success in a short
+		 * while.
+		 */
+		return -EIO;
+	}
+
+	for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
+		if (!dd->rcd[ctxt])
+			break;
+
+	if (ctxt == dd->num_rcv_contexts)
+		return -EBUSY;
+
+	uctxt = hfi1_create_ctxtdata(dd->pport, ctxt);
+	if (!uctxt) {
+		dd_dev_err(dd,
+			   "Unable to allocate ctxtdata memory, failing open\n");
+		return -ENOMEM;
+	}
+	/*
+	 * Allocate and enable a PIO send context.
+	 */
+	uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
+			     uctxt->numa_id);
+	if (!uctxt->sc)
+		return -ENOMEM;
+
+	dbg("allocated send context %u(%u)\n", uctxt->sc->sw_index,
+		uctxt->sc->hw_context);
+	ret = sc_enable(uctxt->sc);
+	if (ret)
+		return ret;
+	/*
+	 * Setup shared context resources if the user-level has requested
+	 * shared contexts and this is the 'master' process.
+	 * This has to be done here so the rest of the sub-contexts find the
+	 * proper master.
+	 */
+	if (uinfo->subctxt_cnt && !subctxt_fp(fp)) {
+		ret = init_subctxts(uctxt, uinfo);
+		/*
+		 * On error, we don't need to disable and de-allocate the
+		 * send context because it will be done during file close
+		 */
+		if (ret)
+			return ret;
+	}
+	uctxt->userversion = uinfo->userversion;
+	uctxt->pid = current->pid;
+	uctxt->flags = HFI1_CAP_UGET(MASK);
+	init_waitqueue_head(&uctxt->wait);
+	strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
+	memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
+	uctxt->jkey = generate_jkey(current_uid());
+	INIT_LIST_HEAD(&uctxt->sdma_queues);
+	spin_lock_init(&uctxt->sdma_qlock);
+	hfi1_stats.sps_ctxts++;
+	dd->freectxts--;
+	ctxt_fp(fp) = uctxt;
+
+	return 0;
+}
+
+static int init_subctxts(struct hfi1_ctxtdata *uctxt,
+			 const struct hfi1_user_info *uinfo)
+{
+	int ret = 0;
+	unsigned num_subctxts;
+
+	num_subctxts = uinfo->subctxt_cnt;
+	if (num_subctxts > HFI1_MAX_SHARED_CTXTS) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	uctxt->subctxt_cnt = uinfo->subctxt_cnt;
+	uctxt->subctxt_id = uinfo->subctxt_id;
+	uctxt->active_slaves = 1;
+	uctxt->redirect_seq_cnt = 1;
+	set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+bail:
+	return ret;
+}
+
+static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
+{
+	int ret = 0;
+	unsigned num_subctxts = uctxt->subctxt_cnt;
+
+	uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
+	if (!uctxt->subctxt_uregbase) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	/* We can take the size of the RcvHdr Queue from the master */
+	uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
+						  num_subctxts);
+	if (!uctxt->subctxt_rcvhdr_base) {
+		ret = -ENOMEM;
+		goto bail_ureg;
+	}
+
+	uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
+						num_subctxts);
+	if (!uctxt->subctxt_rcvegrbuf) {
+		ret = -ENOMEM;
+		goto bail_rhdr;
+	}
+	goto bail;
+bail_rhdr:
+	vfree(uctxt->subctxt_rcvhdr_base);
+bail_ureg:
+	vfree(uctxt->subctxt_uregbase);
+	uctxt->subctxt_uregbase = NULL;
+bail:
+	return ret;
+}
+
+static int user_init(struct file *fp)
+{
+	int ret;
+	unsigned int rcvctrl_ops = 0;
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+
+	/* make sure that the context has already been setup */
+	if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	/*
+	 * Subctxts don't need to initialize anything since master
+	 * has done it.
+	 */
+	if (subctxt_fp(fp)) {
+		ret = wait_event_interruptible(uctxt->wait,
+			!test_bit(HFI1_CTXT_MASTER_UNINIT,
+			&uctxt->event_flags));
+		goto done;
+	}
+
+	/* initialize poll variables... */
+	uctxt->urgent = 0;
+	uctxt->urgent_poll = 0;
+
+	/*
+	 * Now enable the ctxt for receive.
+	 * For chips that are set to DMA the tail register to memory
+	 * when they change (and when the update bit transitions from
+	 * 0 to 1.  So for those chips, we turn it off and then back on.
+	 * This will (very briefly) affect any other open ctxts, but the
+	 * duration is very short, and therefore isn't an issue.  We
+	 * explicitly set the in-memory tail copy to 0 beforehand, so we
+	 * don't have to wait to be sure the DMA update has happened
+	 * (chip resets head/tail to 0 on transition to enable).
+	 */
+	if (uctxt->rcvhdrtail_kvaddr)
+		clear_rcvhdrtail(uctxt);
+
+	/* Setup J_KEY before enabling the context */
+	hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
+
+	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
+		rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
+	/*
+	 * Ignore the bit in the flags for now until proper
+	 * support for multiple packet per rcv array entry is
+	 * added.
+	 */
+	if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+		rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+		rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+		rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+
+	/* Notify any waiting slaves */
+	if (uctxt->subctxt_cnt) {
+		clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+		wake_up(&uctxt->wait);
+	}
+	ret = 0;
+
+done:
+	return ret;
+}
+
+static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
+{
+	struct hfi1_ctxt_info cinfo;
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	int ret = 0;
+
+	ret = hfi1_get_base_kinfo(uctxt, &cinfo);
+	if (ret < 0)
+		goto done;
+	cinfo.num_active = hfi1_count_active_units();
+	cinfo.unit = uctxt->dd->unit;
+	cinfo.ctxt = uctxt->ctxt;
+	cinfo.subctxt = subctxt_fp(fp);
+	cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
+				uctxt->dd->rcv_entries.group_size) +
+		uctxt->expected_count;
+	cinfo.credits = uctxt->sc->credits;
+	cinfo.numa_node = uctxt->numa_id;
+	cinfo.rec_cpu = fd->rec_cpu_num;
+	cinfo.send_ctxt = uctxt->sc->hw_context;
+
+	cinfo.egrtids = uctxt->egrbufs.alloced;
+	cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+	cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
+	cinfo.sdma_ring_size = user_sdma_comp_fp(fp)->nentries;
+	cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
+
+	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, subctxt_fp(fp), cinfo);
+	if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
+		ret = -EFAULT;
+done:
+	return ret;
+}
+
+static int setup_ctxt(struct file *fp)
+{
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_devdata *dd = uctxt->dd;
+	int ret = 0;
+
+	/*
+	 * Context should be set up only once (including allocation and
+	 * programming of eager buffers. This is done if context sharing
+	 * is not requested or by the master process.
+	 */
+	if (!uctxt->subctxt_cnt || !subctxt_fp(fp)) {
+		ret = hfi1_init_ctxt(uctxt->sc);
+		if (ret)
+			goto done;
+
+		/* Now allocate the RcvHdr queue and eager buffers. */
+		ret = hfi1_create_rcvhdrq(dd, uctxt);
+		if (ret)
+			goto done;
+		ret = hfi1_setup_eagerbufs(uctxt);
+		if (ret)
+			goto done;
+		if (uctxt->subctxt_cnt && !subctxt_fp(fp)) {
+			ret = setup_subctxt(uctxt);
+			if (ret)
+				goto done;
+		}
+		/* Setup Expected Rcv memories */
+		uctxt->tid_pg_list = vzalloc(uctxt->expected_count *
+					     sizeof(struct page **));
+		if (!uctxt->tid_pg_list) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		uctxt->physshadow = vzalloc(uctxt->expected_count *
+					    sizeof(*uctxt->physshadow));
+		if (!uctxt->physshadow) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		/* allocate expected TID map and initialize the cursor */
+		atomic_set(&uctxt->tidcursor, 0);
+		uctxt->numtidgroups = uctxt->expected_count /
+			dd->rcv_entries.group_size;
+		uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG +
+			!!(uctxt->numtidgroups % BITS_PER_LONG);
+		uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt *
+						sizeof(*uctxt->tidusemap),
+						GFP_KERNEL, uctxt->numa_id);
+		if (!uctxt->tidusemap) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		/*
+		 * In case that the number of groups is not a multiple of
+		 * 64 (the number of groups in a tidusemap element), mark
+		 * the extra ones as used. This will effectively make them
+		 * permanently used and should never be assigned. Otherwise,
+		 * the code which checks how many free groups we have will
+		 * get completely confused about the state of the bits.
+		 */
+		if (uctxt->numtidgroups % BITS_PER_LONG)
+			uctxt->tidusemap[uctxt->tidmapcnt - 1] =
+				~((1ULL << (uctxt->numtidgroups %
+					    BITS_PER_LONG)) - 1);
+		trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0,
+				       uctxt->tidusemap, uctxt->tidmapcnt);
+	}
+	ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
+	if (ret)
+		goto done;
+
+	set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
+done:
+	return ret;
+}
+
+static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
+{
+	struct hfi1_base_info binfo;
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_devdata *dd = uctxt->dd;
+	ssize_t sz;
+	unsigned offset;
+	int ret = 0;
+
+	trace_hfi1_uctxtdata(uctxt->dd, uctxt);
+
+	memset(&binfo, 0, sizeof(binfo));
+	binfo.hw_version = dd->revision;
+	binfo.sw_version = HFI1_KERN_SWVERSION;
+	binfo.bthqp = kdeth_qp;
+	binfo.jkey = uctxt->jkey;
+	/*
+	 * If more than 64 contexts are enabled the allocated credit
+	 * return will span two or three contiguous pages. Since we only
+	 * map the page containing the context's credit return address,
+	 * we need to calculate the offset in the proper page.
+	 */
+	offset = ((u64)uctxt->sc->hw_free -
+		  (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
+	binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
+					       subctxt_fp(fp), offset);
+	binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
+					    subctxt_fp(fp),
+					    uctxt->sc->base_addr);
+	binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
+						uctxt->ctxt,
+						subctxt_fp(fp),
+						uctxt->sc->base_addr);
+	binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
+					       subctxt_fp(fp),
+					       uctxt->rcvhdrq);
+	binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
+					       subctxt_fp(fp),
+					       uctxt->egrbufs.rcvtids[0].phys);
+	binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
+						 subctxt_fp(fp), 0);
+	/*
+	 * user regs are at
+	 * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
+	 */
+	binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
+					    subctxt_fp(fp), 0);
+	offset = ((((uctxt->ctxt - dd->first_user_ctxt) *
+		    HFI1_MAX_SHARED_CTXTS) + subctxt_fp(fp)) *
+		  sizeof(*dd->events)) & ~PAGE_MASK;
+	binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
+					      subctxt_fp(fp),
+					      offset);
+	binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
+					      subctxt_fp(fp),
+					      dd->status);
+	if (HFI1_CAP_IS_USET(DMA_RTAIL))
+		binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
+						       subctxt_fp(fp), 0);
+	if (uctxt->subctxt_cnt) {
+		binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
+							uctxt->ctxt,
+							subctxt_fp(fp), 0);
+		binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
+							 uctxt->ctxt,
+							 subctxt_fp(fp), 0);
+		binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
+							 uctxt->ctxt,
+							 subctxt_fp(fp), 0);
+	}
+	sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
+	if (copy_to_user(ubase, &binfo, sz))
+		ret = -EFAULT;
+	return ret;
+}
+
+static unsigned int poll_urgent(struct file *fp,
+				struct poll_table_struct *pt)
+{
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned pollflag;
+
+	poll_wait(fp, &uctxt->wait, pt);
+
+	spin_lock_irq(&dd->uctxt_lock);
+	if (uctxt->urgent != uctxt->urgent_poll) {
+		pollflag = POLLIN | POLLRDNORM;
+		uctxt->urgent_poll = uctxt->urgent;
+	} else {
+		pollflag = 0;
+		set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
+	}
+	spin_unlock_irq(&dd->uctxt_lock);
+
+	return pollflag;
+}
+
+static unsigned int poll_next(struct file *fp,
+			      struct poll_table_struct *pt)
+{
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned pollflag;
+
+	poll_wait(fp, &uctxt->wait, pt);
+
+	spin_lock_irq(&dd->uctxt_lock);
+	if (hdrqempty(uctxt)) {
+		set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
+		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
+		pollflag = 0;
+	} else
+		pollflag = POLLIN | POLLRDNORM;
+	spin_unlock_irq(&dd->uctxt_lock);
+
+	return pollflag;
+}
+
+/*
+ * Find all user contexts in use, and set the specified bit in their
+ * event mask.
+ * See also find_ctxt() for a similar use, that is specific to send buffers.
+ */
+int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
+{
+	struct hfi1_ctxtdata *uctxt;
+	struct hfi1_devdata *dd = ppd->dd;
+	unsigned ctxt;
+	int ret = 0;
+	unsigned long flags;
+
+	if (!dd->events) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
+	     ctxt++) {
+		uctxt = dd->rcd[ctxt];
+		if (uctxt) {
+			unsigned long *evs = dd->events +
+				(uctxt->ctxt - dd->first_user_ctxt) *
+				HFI1_MAX_SHARED_CTXTS;
+			int i;
+			/*
+			 * subctxt_cnt is 0 if not shared, so do base
+			 * separately, first, then remaining subctxt, if any
+			 */
+			set_bit(evtbit, evs);
+			for (i = 1; i < uctxt->subctxt_cnt; i++)
+				set_bit(evtbit, evs + i);
+		}
+	}
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+done:
+	return ret;
+}
+
+/**
+ * manage_rcvq - manage a context's receive queue
+ * @uctxt: the context
+ * @subctxt: the sub-context
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the context, for use in queue
+ * overflow conditions.  start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+		       int start_stop)
+{
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned int rcvctrl_op;
+
+	if (subctxt)
+		goto bail;
+	/* atomically clear receive enable ctxt. */
+	if (start_stop) {
+		/*
+		 * On enable, force in-memory copy of the tail register to
+		 * 0, so that protocol code doesn't have to worry about
+		 * whether or not the chip has yet updated the in-memory
+		 * copy or not on return from the system call. The chip
+		 * always resets it's tail register back to 0 on a
+		 * transition from disabled to enabled.
+		 */
+		if (uctxt->rcvhdrtail_kvaddr)
+			clear_rcvhdrtail(uctxt);
+		rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
+	} else
+		rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
+	hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
+	/* always; new head should be equal to new tail; see above */
+bail:
+	return 0;
+}
+
+/*
+ * clear the event notifier events for this context.
+ * User process then performs actions appropriate to bit having been
+ * set, if desired, and checks again in future.
+ */
+static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
+			  unsigned long events)
+{
+	int i;
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned long *evs;
+
+	if (!dd->events)
+		return 0;
+
+	evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+			    HFI1_MAX_SHARED_CTXTS) + subctxt;
+
+	for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
+		if (!test_bit(i, &events))
+			continue;
+		clear_bit(i, evs);
+	}
+	return 0;
+}
+
+#define num_user_pages(vaddr, len)					\
+	(1 + (((((unsigned long)(vaddr) +				\
+		 (unsigned long)(len) - 1) & PAGE_MASK) -		\
+	       ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
+
+/**
+ * tzcnt - count the number of trailing zeros in a 64bit value
+ * @value: the value to be examined
+ *
+ * Returns the number of trailing least significant zeros in the
+ * the input value. If the value is zero, return the number of
+ * bits of the value.
+ */
+static inline u8 tzcnt(u64 value)
+{
+	return value ? __builtin_ctzl(value) : sizeof(value) * 8;
+}
+
+static inline unsigned num_free_groups(unsigned long map, u16 *start)
+{
+	unsigned free;
+	u16 bitidx = *start;
+
+	if (bitidx >= BITS_PER_LONG)
+		return 0;
+	/* "Turn off" any bits set before our bit index */
+	map &= ~((1ULL << bitidx) - 1);
+	free = tzcnt(map) - bitidx;
+	while (!free && bitidx < BITS_PER_LONG) {
+		/* Zero out the last set bit so we look at the rest */
+		map &= ~(1ULL << bitidx);
+		/*
+		 * Account for the previously checked bits and advance
+		 * the bit index. We don't have to check for bitidx
+		 * getting bigger than BITS_PER_LONG here as it would
+		 * mean extra instructions that we don't need. If it
+		 * did happen, it would push free to a negative value
+		 * which will break the loop.
+		 */
+		free = tzcnt(map) - ++bitidx;
+	}
+	*start = bitidx;
+	return free;
+}
+
+static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+	int ret = 0;
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned tid, mapped = 0, npages, ngroups, exp_groups,
+		tidpairs = uctxt->expected_count / 2;
+	struct page **pages;
+	unsigned long vaddr, tidmap[uctxt->tidmapcnt];
+	dma_addr_t *phys;
+	u32 tidlist[tidpairs], pairidx = 0, tidcursor;
+	u16 useidx, idx, bitidx, tidcnt = 0;
+
+	vaddr = tinfo->vaddr;
+
+	if (vaddr & ~PAGE_MASK) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	npages = num_user_pages(vaddr, tinfo->length);
+	if (!npages) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+		       npages * PAGE_SIZE)) {
+		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+			   (void *)vaddr, npages);
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt);
+	memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs);
+
+	exp_groups = uctxt->expected_count / dd->rcv_entries.group_size;
+	/* which group set do we look at first? */
+	tidcursor = atomic_read(&uctxt->tidcursor);
+	useidx = (tidcursor >> 16) & 0xffff;
+	bitidx = tidcursor & 0xffff;
+
+	/*
+	 * Keep going until we've mapped all pages or we've exhausted all
+	 * RcvArray entries.
+	 * This iterates over the number of tidmaps + 1
+	 * (idx <= uctxt->tidmapcnt) so we check the bitmap which we
+	 * started from one more time for any free bits before the
+	 * starting point bit.
+	 */
+	for (mapped = 0, idx = 0;
+	     mapped < npages && idx <= uctxt->tidmapcnt;) {
+		u64 i, offset = 0;
+		unsigned free, pinned, pmapped = 0, bits_used;
+		u16 grp;
+
+		/*
+		 * "Reserve" the needed group bits under lock so other
+		 * processes can't step in the middle of it. Once
+		 * reserved, we don't need the lock anymore since we
+		 * are guaranteed the groups.
+		 */
+		spin_lock(&uctxt->exp_lock);
+		if (uctxt->tidusemap[useidx] == -1ULL ||
+		    bitidx >= BITS_PER_LONG) {
+			/* no free groups in the set, use the next */
+			useidx = (useidx + 1) % uctxt->tidmapcnt;
+			idx++;
+			bitidx = 0;
+			spin_unlock(&uctxt->exp_lock);
+			continue;
+		}
+		ngroups = ((npages - mapped) / dd->rcv_entries.group_size) +
+			!!((npages - mapped) % dd->rcv_entries.group_size);
+
+		/*
+		 * If we've gotten here, the current set of groups does have
+		 * one or more free groups.
+		 */
+		free = num_free_groups(uctxt->tidusemap[useidx], &bitidx);
+		if (!free) {
+			/*
+			 * Despite the check above, free could still come back
+			 * as 0 because we don't check the entire bitmap but
+			 * we start from bitidx.
+			 */
+			spin_unlock(&uctxt->exp_lock);
+			continue;
+		}
+		bits_used = min(free, ngroups);
+		tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx;
+		uctxt->tidusemap[useidx] |= tidmap[useidx];
+		spin_unlock(&uctxt->exp_lock);
+
+		/*
+		 * At this point, we know where in the map we have free bits.
+		 * properly offset into the various "shadow" arrays and compute
+		 * the RcvArray entry index.
+		 */
+		offset = ((useidx * BITS_PER_LONG) + bitidx) *
+			dd->rcv_entries.group_size;
+		pages = uctxt->tid_pg_list + offset;
+		phys = uctxt->physshadow + offset;
+		tid = uctxt->expected_base + offset;
+
+		/* Calculate how many pages we can pin based on free bits */
+		pinned = min((bits_used * dd->rcv_entries.group_size),
+			     (npages - mapped));
+		/*
+		 * Now that we know how many free RcvArray entries we have,
+		 * we can pin that many user pages.
+		 */
+		ret = hfi1_get_user_pages(vaddr + (mapped * PAGE_SIZE),
+					  pinned, pages);
+		if (ret) {
+			/*
+			 * We can't continue because the pages array won't be
+			 * initialized. This should never happen,
+			 * unless perhaps the user has mpin'ed the pages
+			 * themselves.
+			 */
+			dd_dev_info(dd,
+				    "Failed to lock addr %p, %u pages: errno %d\n",
+				    (void *) vaddr, pinned, -ret);
+			/*
+			 * Let go of the bits that we reserved since we are not
+			 * going to use them.
+			 */
+			spin_lock(&uctxt->exp_lock);
+			uctxt->tidusemap[useidx] &=
+				~(((1ULL << bits_used) - 1) << bitidx);
+			spin_unlock(&uctxt->exp_lock);
+			goto done;
+		}
+		/*
+		 * How many groups do we need based on how many pages we have
+		 * pinned?
+		 */
+		ngroups = (pinned / dd->rcv_entries.group_size) +
+			!!(pinned % dd->rcv_entries.group_size);
+		/*
+		 * Keep programming RcvArray entries for all the <ngroups> free
+		 * groups.
+		 */
+		for (i = 0, grp = 0; grp < ngroups; i++, grp++) {
+			unsigned j;
+			u32 pair_size = 0, tidsize;
+			/*
+			 * This inner loop will program an entire group or the
+			 * array of pinned pages (which ever limit is hit
+			 * first).
+			 */
+			for (j = 0; j < dd->rcv_entries.group_size &&
+				     pmapped < pinned; j++, pmapped++, tid++) {
+				tidsize = PAGE_SIZE;
+				phys[pmapped] = hfi1_map_page(dd->pcidev,
+						   pages[pmapped], 0,
+						   tidsize, PCI_DMA_FROMDEVICE);
+				trace_hfi1_exp_rcv_set(uctxt->ctxt,
+						       subctxt_fp(fp),
+						       tid, vaddr,
+						       phys[pmapped],
+						       pages[pmapped]);
+				/*
+				 * Each RcvArray entry is programmed with one
+				 * page * worth of memory. This will handle
+				 * the 8K MTU as well as anything smaller
+				 * due to the fact that both entries in the
+				 * RcvTidPair are programmed with a page.
+				 * PSM currently does not handle anything
+				 * bigger than 8K MTU, so should we even worry
+				 * about 10K here?
+				 */
+				hfi1_put_tid(dd, tid, PT_EXPECTED,
+					     phys[pmapped],
+					     ilog2(tidsize >> PAGE_SHIFT) + 1);
+				pair_size += tidsize >> PAGE_SHIFT;
+				EXP_TID_RESET(tidlist[pairidx], LEN, pair_size);
+				if (!(tid % 2)) {
+					tidlist[pairidx] |=
+					   EXP_TID_SET(IDX,
+						(tid - uctxt->expected_base)
+						       / 2);
+					tidlist[pairidx] |=
+						EXP_TID_SET(CTRL, 1);
+					tidcnt++;
+				} else {
+					tidlist[pairidx] |=
+						EXP_TID_SET(CTRL, 2);
+					pair_size = 0;
+					pairidx++;
+				}
+			}
+			/*
+			 * We've programmed the entire group (or as much of the
+			 * group as we'll use. Now, it's time to push it out...
+			 */
+			flush_wc();
+		}
+		mapped += pinned;
+		atomic_set(&uctxt->tidcursor,
+			   (((useidx & 0xffffff) << 16) |
+			    ((bitidx + bits_used) & 0xffffff)));
+	}
+	trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0, uctxt->tidusemap,
+			       uctxt->tidmapcnt);
+
+done:
+	/* If we've mapped anything, copy relevant info to user */
+	if (mapped) {
+		if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+				 tidlist, sizeof(tidlist[0]) * tidcnt)) {
+			ret = -EFAULT;
+			goto done;
+		}
+		/* copy TID info to user */
+		if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap,
+				 tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt))
+			ret = -EFAULT;
+	}
+bail:
+	/*
+	 * Calculate mapped length. New Exp TID protocol does not "unwind" and
+	 * report an error if it can't map the entire buffer. It just reports
+	 * the length that was mapped.
+	 */
+	tinfo->length = mapped * PAGE_SIZE;
+	tinfo->tidcnt = tidcnt;
+	return ret;
+}
+
+static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned long tidmap[uctxt->tidmapcnt];
+	struct page **pages;
+	dma_addr_t *phys;
+	u16 idx, bitidx, tid;
+	int ret = 0;
+
+	if (copy_from_user(&tidmap, (void __user *)(unsigned long)
+			   tinfo->tidmap,
+			   sizeof(tidmap[0]) * uctxt->tidmapcnt)) {
+		ret = -EFAULT;
+		goto done;
+	}
+	for (idx = 0; idx < uctxt->tidmapcnt; idx++) {
+		unsigned long map;
+
+		bitidx = 0;
+		if (!tidmap[idx])
+			continue;
+		map = tidmap[idx];
+		while ((bitidx = tzcnt(map)) < BITS_PER_LONG) {
+			int i, pcount = 0;
+			struct page *pshadow[dd->rcv_entries.group_size];
+			unsigned offset = ((idx * BITS_PER_LONG) + bitidx) *
+				dd->rcv_entries.group_size;
+
+			pages = uctxt->tid_pg_list + offset;
+			phys = uctxt->physshadow + offset;
+			tid = uctxt->expected_base + offset;
+			for (i = 0; i < dd->rcv_entries.group_size;
+			     i++, tid++) {
+				if (pages[i]) {
+					hfi1_put_tid(dd, tid, PT_INVALID,
+						      0, 0);
+					trace_hfi1_exp_rcv_free(uctxt->ctxt,
+								subctxt_fp(fp),
+								tid, phys[i],
+								pages[i]);
+					pci_unmap_page(dd->pcidev, phys[i],
+					      PAGE_SIZE, PCI_DMA_FROMDEVICE);
+					pshadow[pcount] = pages[i];
+					pages[i] = NULL;
+					pcount++;
+					phys[i] = 0;
+				}
+			}
+			flush_wc();
+			hfi1_release_user_pages(pshadow, pcount);
+			clear_bit(bitidx, &uctxt->tidusemap[idx]);
+			map &= ~(1ULL<<bitidx);
+		}
+	}
+	trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 1, uctxt->tidusemap,
+			       uctxt->tidmapcnt);
+done:
+	return ret;
+}
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt)
+{
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned tid;
+
+	dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n",
+		    uctxt->ctxt);
+	for (tid = 0; tid < uctxt->expected_count; tid++) {
+		struct page *p = uctxt->tid_pg_list[tid];
+		dma_addr_t phys;
+
+		if (!p)
+			continue;
+
+		phys = uctxt->physshadow[tid];
+		uctxt->physshadow[tid] = 0;
+		uctxt->tid_pg_list[tid] = NULL;
+		pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE);
+		hfi1_release_user_pages(&p, 1);
+	}
+}
+
+static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+			 u16 pkey)
+{
+	int ret = -ENOENT, i, intable = 0;
+	struct hfi1_pportdata *ppd = uctxt->ppd;
+	struct hfi1_devdata *dd = uctxt->dd;
+
+	if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
+		if (pkey == ppd->pkeys[i]) {
+			intable = 1;
+			break;
+		}
+
+	if (intable)
+		ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
+done:
+	return ret;
+}
+
+static int ui_open(struct inode *inode, struct file *filp)
+{
+	struct hfi1_devdata *dd;
+
+	dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev);
+	filp->private_data = dd; /* for other methods */
+	return 0;
+}
+
+static int ui_release(struct inode *inode, struct file *filp)
+{
+	/* nothing to do */
+	return 0;
+}
+
+static loff_t ui_lseek(struct file *filp, loff_t offset, int whence)
+{
+	struct hfi1_devdata *dd = filp->private_data;
+
+	switch (whence) {
+	case SEEK_SET:
+		break;
+	case SEEK_CUR:
+		offset += filp->f_pos;
+		break;
+	case SEEK_END:
+		offset = ((dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE) -
+			offset;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (offset < 0)
+		return -EINVAL;
+
+	if (offset >= (dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE)
+		return -EINVAL;
+
+	filp->f_pos = offset;
+
+	return filp->f_pos;
+}
+
+
+/* NOTE: assumes unsigned long is 8 bytes */
+static ssize_t ui_read(struct file *filp, char __user *buf, size_t count,
+			loff_t *f_pos)
+{
+	struct hfi1_devdata *dd = filp->private_data;
+	void __iomem *base = dd->kregbase;
+	unsigned long total, csr_off,
+		barlen = (dd->kregend - dd->kregbase);
+	u64 data;
+
+	/* only read 8 byte quantities */
+	if ((count % 8) != 0)
+		return -EINVAL;
+	/* offset must be 8-byte aligned */
+	if ((*f_pos % 8) != 0)
+		return -EINVAL;
+	/* destination buffer must be 8-byte aligned */
+	if ((unsigned long)buf % 8 != 0)
+		return -EINVAL;
+	/* must be in range */
+	if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE))
+		return -EINVAL;
+	/* only set the base if we are not starting past the BAR */
+	if (*f_pos < barlen)
+		base += *f_pos;
+	csr_off = *f_pos;
+	for (total = 0; total < count; total += 8, csr_off += 8) {
+		/* accessing LCB CSRs requires more checks */
+		if (is_lcb_offset(csr_off)) {
+			if (read_lcb_csr(dd, csr_off, (u64 *)&data))
+				break; /* failed */
+		}
+		/*
+		 * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a
+		 * false parity error.  Avoid the whole issue by not reading
+		 * them.  These registers are defined as having a read value
+		 * of 0.
+		 */
+		else if (csr_off == ASIC_GPIO_CLEAR
+				|| csr_off == ASIC_GPIO_FORCE
+				|| csr_off == ASIC_QSFP1_CLEAR
+				|| csr_off == ASIC_QSFP1_FORCE
+				|| csr_off == ASIC_QSFP2_CLEAR
+				|| csr_off == ASIC_QSFP2_FORCE)
+			data = 0;
+		else if (csr_off >= barlen) {
+			/*
+			 * read_8051_data can read more than just 8 bytes at
+			 * a time. However, folding this into the loop and
+			 * handling the reads in 8 byte increments allows us
+			 * to smoothly transition from chip memory to 8051
+			 * memory.
+			 */
+			if (read_8051_data(dd,
+					   (u32)(csr_off - barlen),
+					   sizeof(data), &data))
+				break; /* failed */
+		} else
+			data = readq(base + total);
+		if (put_user(data, (unsigned long __user *)(buf + total)))
+			break;
+	}
+	*f_pos += total;
+	return total;
+}
+
+/* NOTE: assumes unsigned long is 8 bytes */
+static ssize_t ui_write(struct file *filp, const char __user *buf,
+			size_t count, loff_t *f_pos)
+{
+	struct hfi1_devdata *dd = filp->private_data;
+	void __iomem *base;
+	unsigned long total, data, csr_off;
+	int in_lcb;
+
+	/* only write 8 byte quantities */
+	if ((count % 8) != 0)
+		return -EINVAL;
+	/* offset must be 8-byte aligned */
+	if ((*f_pos % 8) != 0)
+		return -EINVAL;
+	/* source buffer must be 8-byte aligned */
+	if ((unsigned long)buf % 8 != 0)
+		return -EINVAL;
+	/* must be in range */
+	if (*f_pos + count > dd->kregend - dd->kregbase)
+		return -EINVAL;
+
+	base = (void __iomem *)dd->kregbase + *f_pos;
+	csr_off = *f_pos;
+	in_lcb = 0;
+	for (total = 0; total < count; total += 8, csr_off += 8) {
+		if (get_user(data, (unsigned long __user *)(buf + total)))
+			break;
+		/* accessing LCB CSRs requires a special procedure */
+		if (is_lcb_offset(csr_off)) {
+			if (!in_lcb) {
+				int ret = acquire_lcb_access(dd, 1);
+
+				if (ret)
+					break;
+				in_lcb = 1;
+			}
+		} else {
+			if (in_lcb) {
+				release_lcb_access(dd, 1);
+				in_lcb = 0;
+			}
+		}
+		writeq(data, base + total);
+	}
+	if (in_lcb)
+		release_lcb_access(dd, 1);
+	*f_pos += total;
+	return total;
+}
+
+static const struct file_operations ui_file_ops = {
+	.owner = THIS_MODULE,
+	.llseek = ui_lseek,
+	.read = ui_read,
+	.write = ui_write,
+	.open = ui_open,
+	.release = ui_release,
+};
+#define UI_OFFSET 192	/* device minor offset for UI devices */
+static int create_ui = 1;
+
+static struct cdev wildcard_cdev;
+static struct device *wildcard_device;
+
+static atomic_t user_count = ATOMIC_INIT(0);
+
+static void user_remove(struct hfi1_devdata *dd)
+{
+	if (atomic_dec_return(&user_count) == 0)
+		hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device);
+
+	hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
+	hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device);
+}
+
+static int user_add(struct hfi1_devdata *dd)
+{
+	char name[10];
+	int ret;
+
+	if (atomic_inc_return(&user_count) == 1) {
+		ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops,
+				     &wildcard_cdev, &wildcard_device);
+		if (ret)
+			goto done;
+	}
+
+	snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
+	ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops,
+			     &dd->user_cdev, &dd->user_device);
+	if (ret)
+		goto done;
+
+	if (create_ui) {
+		snprintf(name, sizeof(name),
+			 "%s_ui%d", class_name(), dd->unit);
+		ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops,
+				     &dd->ui_cdev, &dd->ui_device);
+		if (ret)
+			goto done;
+	}
+
+	return 0;
+done:
+	user_remove(dd);
+	return ret;
+}
+
+/*
+ * Create per-unit files in /dev
+ */
+int hfi1_device_create(struct hfi1_devdata *dd)
+{
+	int r, ret;
+
+	r = user_add(dd);
+	ret = hfi1_diag_add(dd);
+	if (r && !ret)
+		ret = r;
+	return ret;
+}
+
+/*
+ * Remove per-unit files in /dev
+ * void, core kernel returns no errors for this stuff
+ */
+void hfi1_device_remove(struct hfi1_devdata *dd)
+{
+	user_remove(dd);
+	hfi1_diag_remove(dd);
+}
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c
new file mode 100644
index 0000000..5c2f2ed
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/firmware.c
@@ -0,0 +1,1620 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/firmware.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/crc32.h>
+
+#include "hfi.h"
+#include "trace.h"
+
+/*
+ * Make it easy to toggle firmware file name and if it gets loaded by
+ * editing the following. This may be something we do while in development
+ * but not necessarily something a user would ever need to use.
+ */
+#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
+#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
+#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
+#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
+#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
+#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
+
+static uint fw_8051_load = 1;
+static uint fw_fabric_serdes_load = 1;
+static uint fw_pcie_serdes_load = 1;
+static uint fw_sbus_load = 1;
+static uint platform_config_load = 1;
+
+/* Firmware file names get set in hfi1_firmware_init() based on the above */
+static char *fw_8051_name;
+static char *fw_fabric_serdes_name;
+static char *fw_sbus_name;
+static char *fw_pcie_serdes_name;
+static char *platform_config_name;
+
+#define SBUS_MAX_POLL_COUNT 100
+#define SBUS_COUNTER(reg, name) \
+	(((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
+	 ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
+
+/*
+ * Firmware security header.
+ */
+struct css_header {
+	u32 module_type;
+	u32 header_len;
+	u32 header_version;
+	u32 module_id;
+	u32 module_vendor;
+	u32 date;		/* BCD yyyymmdd */
+	u32 size;		/* in DWORDs */
+	u32 key_size;		/* in DWORDs */
+	u32 modulus_size;	/* in DWORDs */
+	u32 exponent_size;	/* in DWORDs */
+	u32 reserved[22];
+};
+/* expected field values */
+#define CSS_MODULE_TYPE	   0x00000006
+#define CSS_HEADER_LEN	   0x000000a1
+#define CSS_HEADER_VERSION 0x00010000
+#define CSS_MODULE_VENDOR  0x00008086
+
+#define KEY_SIZE      256
+#define MU_SIZE		8
+#define EXPONENT_SIZE	4
+
+/* the file itself */
+struct firmware_file {
+	struct css_header css_header;
+	u8 modulus[KEY_SIZE];
+	u8 exponent[EXPONENT_SIZE];
+	u8 signature[KEY_SIZE];
+	u8 firmware[];
+};
+
+struct augmented_firmware_file {
+	struct css_header css_header;
+	u8 modulus[KEY_SIZE];
+	u8 exponent[EXPONENT_SIZE];
+	u8 signature[KEY_SIZE];
+	u8 r2[KEY_SIZE];
+	u8 mu[MU_SIZE];
+	u8 firmware[];
+};
+
+/* augmented file size difference */
+#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
+						sizeof(struct firmware_file))
+
+struct firmware_details {
+	/* Linux core piece */
+	const struct firmware *fw;
+
+	struct css_header *css_header;
+	u8 *firmware_ptr;		/* pointer to binary data */
+	u32 firmware_len;		/* length in bytes */
+	u8 *modulus;			/* pointer to the modulus */
+	u8 *exponent;			/* pointer to the exponent */
+	u8 *signature;			/* pointer to the signature */
+	u8 *r2;				/* pointer to r2 */
+	u8 *mu;				/* pointer to mu */
+	struct augmented_firmware_file dummy_header;
+};
+
+/*
+ * The mutex protects fw_state, fw_err, and all of the firmware_details
+ * variables.
+ */
+static DEFINE_MUTEX(fw_mutex);
+enum fw_state {
+	FW_EMPTY,
+	FW_ACQUIRED,
+	FW_ERR
+};
+static enum fw_state fw_state = FW_EMPTY;
+static int fw_err;
+static struct firmware_details fw_8051;
+static struct firmware_details fw_fabric;
+static struct firmware_details fw_pcie;
+static struct firmware_details fw_sbus;
+static const struct firmware *platform_config;
+
+/* flags for turn_off_spicos() */
+#define SPICO_SBUS   0x1
+#define SPICO_FABRIC 0x2
+#define ENABLE_SPICO_SMASK 0x1
+
+/* security block commands */
+#define RSA_CMD_INIT  0x1
+#define RSA_CMD_START 0x2
+
+/* security block status */
+#define RSA_STATUS_IDLE   0x0
+#define RSA_STATUS_ACTIVE 0x1
+#define RSA_STATUS_DONE   0x2
+#define RSA_STATUS_FAILED 0x3
+
+/* RSA engine timeout, in ms */
+#define RSA_ENGINE_TIMEOUT 100 /* ms */
+
+/* hardware mutex timeout, in ms */
+#define HM_TIMEOUT 4000 /* 4 s */
+
+/* 8051 memory access timeout, in us */
+#define DC8051_ACCESS_TIMEOUT 100 /* us */
+
+/* the number of fabric SerDes on the SBus */
+#define NUM_FABRIC_SERDES 4
+
+/* SBus fabric SerDes addresses, one set per HFI */
+static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
+	{ 0x01, 0x02, 0x03, 0x04 },
+	{ 0x28, 0x29, 0x2a, 0x2b }
+};
+
+/* SBus PCIe SerDes addresses, one set per HFI */
+static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
+	{ 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
+	  0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
+	{ 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
+	  0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
+};
+
+/* SBus PCIe PCS addresses, one set per HFI */
+const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
+	{ 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
+	  0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
+	{ 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+	  0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
+};
+
+/* SBus fabric SerDes broadcast addresses, one per HFI */
+static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
+static const u8 all_fabric_serdes_broadcast = 0xe1;
+
+/* SBus PCIe SerDes broadcast addresses, one per HFI */
+const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
+static const u8 all_pcie_serdes_broadcast = 0xe0;
+
+/* forwards */
+static void dispose_one_firmware(struct firmware_details *fdet);
+
+/*
+ * Read a single 64-bit value from 8051 data memory.
+ *
+ * Expects:
+ * o caller to have already set up data read, no auto increment
+ * o caller to turn off read enable when finished
+ *
+ * The address argument is a byte offset.  Bits 0:2 in the address are
+ * ignored - i.e. the hardware will always do aligned 8-byte reads as if
+ * the lower bits are zero.
+ *
+ * Return 0 on success, -ENXIO on a read error (timeout).
+ */
+static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
+{
+	u64 reg;
+	int count;
+
+	/* start the read at the given address */
+	reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+			<< DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+		| DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
+	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+	/* wait until ACCESS_COMPLETED is set */
+	count = 0;
+	while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+		    & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+		    == 0) {
+		count++;
+		if (count > DC8051_ACCESS_TIMEOUT) {
+			dd_dev_err(dd, "timeout reading 8051 data\n");
+			return -ENXIO;
+		}
+		ndelay(10);
+	}
+
+	/* gather the data */
+	*result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
+
+	return 0;
+}
+
+/*
+ * Read 8051 data starting at addr, for len bytes.  Will read in 8-byte chunks.
+ * Return 0 on success, -errno on error.
+ */
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
+{
+	unsigned long flags;
+	u32 done;
+	int ret = 0;
+
+	spin_lock_irqsave(&dd->dc8051_memlock, flags);
+
+	/* data read set-up, no auto-increment */
+	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+	for (done = 0; done < len; addr += 8, done += 8, result++) {
+		ret = __read_8051_data(dd, addr, result);
+		if (ret)
+			break;
+	}
+
+	/* turn off read enable */
+	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+
+	spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
+
+	return ret;
+}
+
+/*
+ * Write data or code to the 8051 code or data RAM.
+ */
+static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
+		      const u8 *data, u32 len)
+{
+	u64 reg;
+	u32 offset;
+	int aligned, count;
+
+	/* check alignment */
+	aligned = ((unsigned long)data & 0x7) == 0;
+
+	/* write set-up */
+	reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
+		| DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
+	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
+
+	reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+			<< DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+		| DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
+	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+	/* write */
+	for (offset = 0; offset < len; offset += 8) {
+		int bytes = len - offset;
+
+		if (bytes < 8) {
+			reg = 0;
+			memcpy(&reg, &data[offset], bytes);
+		} else if (aligned) {
+			reg = *(u64 *)&data[offset];
+		} else {
+			memcpy(&reg, &data[offset], 8);
+		}
+		write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
+
+		/* wait until ACCESS_COMPLETED is set */
+		count = 0;
+		while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+		    & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+		    == 0) {
+			count++;
+			if (count > DC8051_ACCESS_TIMEOUT) {
+				dd_dev_err(dd, "timeout writing 8051 data\n");
+				return -ENXIO;
+			}
+			udelay(1);
+		}
+	}
+
+	/* turn off write access, auto increment (also sets to data access) */
+	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+	return 0;
+}
+
+/* return 0 if values match, non-zero and complain otherwise */
+static int invalid_header(struct hfi1_devdata *dd, const char *what,
+			  u32 actual, u32 expected)
+{
+	if (actual == expected)
+		return 0;
+
+	dd_dev_err(dd,
+		"invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
+		what, expected, actual);
+	return 1;
+}
+
+/*
+ * Verify that the static fields in the CSS header match.
+ */
+static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
+{
+	/* verify CSS header fields (most sizes are in DW, so add /4) */
+	if (invalid_header(dd, "module_type", css->module_type, CSS_MODULE_TYPE)
+			|| invalid_header(dd, "header_len", css->header_len,
+					(sizeof(struct firmware_file)/4))
+			|| invalid_header(dd, "header_version",
+					css->header_version, CSS_HEADER_VERSION)
+			|| invalid_header(dd, "module_vendor",
+					css->module_vendor, CSS_MODULE_VENDOR)
+			|| invalid_header(dd, "key_size",
+					css->key_size, KEY_SIZE/4)
+			|| invalid_header(dd, "modulus_size",
+					css->modulus_size, KEY_SIZE/4)
+			|| invalid_header(dd, "exponent_size",
+					css->exponent_size, EXPONENT_SIZE/4)) {
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Make sure there are at least some bytes after the prefix.
+ */
+static int payload_check(struct hfi1_devdata *dd, const char *name,
+			 long file_size, long prefix_size)
+{
+	/* make sure we have some payload */
+	if (prefix_size >= file_size) {
+		dd_dev_err(dd,
+			"firmware \"%s\", size %ld, must be larger than %ld bytes\n",
+			name, file_size, prefix_size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Request the firmware from the system.  Extract the pieces and fill in
+ * fdet.  If successful, the caller will need to call dispose_one_firmware().
+ * Returns 0 on success, -ERRNO on error.
+ */
+static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
+			       struct firmware_details *fdet)
+{
+	struct css_header *css;
+	int ret;
+
+	memset(fdet, 0, sizeof(*fdet));
+
+	ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
+	if (ret) {
+		dd_dev_err(dd, "cannot load firmware \"%s\", err %d\n",
+			name, ret);
+		return ret;
+	}
+
+	/* verify the firmware */
+	if (fdet->fw->size < sizeof(struct css_header)) {
+		dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
+		ret = -EINVAL;
+		goto done;
+	}
+	css = (struct css_header *)fdet->fw->data;
+
+	hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
+	hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
+	hfi1_cdbg(FIRMWARE, "CSS structure:");
+	hfi1_cdbg(FIRMWARE, "  module_type    0x%x", css->module_type);
+	hfi1_cdbg(FIRMWARE, "  header_len     0x%03x (0x%03x bytes)",
+		  css->header_len, 4 * css->header_len);
+	hfi1_cdbg(FIRMWARE, "  header_version 0x%x", css->header_version);
+	hfi1_cdbg(FIRMWARE, "  module_id      0x%x", css->module_id);
+	hfi1_cdbg(FIRMWARE, "  module_vendor  0x%x", css->module_vendor);
+	hfi1_cdbg(FIRMWARE, "  date           0x%x", css->date);
+	hfi1_cdbg(FIRMWARE, "  size           0x%03x (0x%03x bytes)",
+		  css->size, 4 * css->size);
+	hfi1_cdbg(FIRMWARE, "  key_size       0x%03x (0x%03x bytes)",
+		  css->key_size, 4 * css->key_size);
+	hfi1_cdbg(FIRMWARE, "  modulus_size   0x%03x (0x%03x bytes)",
+		  css->modulus_size, 4 * css->modulus_size);
+	hfi1_cdbg(FIRMWARE, "  exponent_size  0x%03x (0x%03x bytes)",
+		  css->exponent_size, 4 * css->exponent_size);
+	hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
+		  fdet->fw->size - sizeof(struct firmware_file));
+
+	/*
+	 * If the file does not have a valid CSS header, fail.
+	 * Otherwise, check the CSS size field for an expected size.
+	 * The augmented file has r2 and mu inserted after the header
+	 * was generated, so there will be a known difference between
+	 * the CSS header size and the actual file size.  Use this
+	 * difference to identify an augmented file.
+	 *
+	 * Note: css->size is in DWORDs, multiply by 4 to get bytes.
+	 */
+	ret = verify_css_header(dd, css);
+	if (ret) {
+		dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
+	} else if ((css->size*4) == fdet->fw->size) {
+		/* non-augmented firmware file */
+		struct firmware_file *ff = (struct firmware_file *)
+							fdet->fw->data;
+
+		/* make sure there are bytes in the payload */
+		ret = payload_check(dd, name, fdet->fw->size,
+						sizeof(struct firmware_file));
+		if (ret == 0) {
+			fdet->css_header = css;
+			fdet->modulus = ff->modulus;
+			fdet->exponent = ff->exponent;
+			fdet->signature = ff->signature;
+			fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
+			fdet->mu = fdet->dummy_header.mu; /* use dummy space */
+			fdet->firmware_ptr = ff->firmware;
+			fdet->firmware_len = fdet->fw->size -
+						sizeof(struct firmware_file);
+			/*
+			 * Header does not include r2 and mu - generate here.
+			 * For now, fail.
+			 */
+			dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
+			ret = -EINVAL;
+		}
+	} else if ((css->size*4) + AUGMENT_SIZE == fdet->fw->size) {
+		/* augmented firmware file */
+		struct augmented_firmware_file *aff =
+			(struct augmented_firmware_file *)fdet->fw->data;
+
+		/* make sure there are bytes in the payload */
+		ret = payload_check(dd, name, fdet->fw->size,
+					sizeof(struct augmented_firmware_file));
+		if (ret == 0) {
+			fdet->css_header = css;
+			fdet->modulus = aff->modulus;
+			fdet->exponent = aff->exponent;
+			fdet->signature = aff->signature;
+			fdet->r2 = aff->r2;
+			fdet->mu = aff->mu;
+			fdet->firmware_ptr = aff->firmware;
+			fdet->firmware_len = fdet->fw->size -
+					sizeof(struct augmented_firmware_file);
+		}
+	} else {
+		/* css->size check failed */
+		dd_dev_err(dd,
+			"invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
+			fdet->fw->size/4, (fdet->fw->size - AUGMENT_SIZE)/4,
+			css->size);
+
+		ret = -EINVAL;
+	}
+
+done:
+	/* if returning an error, clean up after ourselves */
+	if (ret)
+		dispose_one_firmware(fdet);
+	return ret;
+}
+
+static void dispose_one_firmware(struct firmware_details *fdet)
+{
+	release_firmware(fdet->fw);
+	fdet->fw = NULL;
+}
+
+/*
+ * Called by all HFIs when loading their firmware - i.e. device probe time.
+ * The first one will do the actual firmware load.  Use a mutex to resolve
+ * any possible race condition.
+ *
+ * The call to this routine cannot be moved to driver load because the kernel
+ * call request_firmware() requires a device which is only available after
+ * the first device probe.
+ */
+static int obtain_firmware(struct hfi1_devdata *dd)
+{
+	int err = 0;
+
+	mutex_lock(&fw_mutex);
+	if (fw_state == FW_ACQUIRED) {
+		goto done;	/* already acquired */
+	} else if (fw_state == FW_ERR) {
+		err = fw_err;
+		goto done;	/* already tried and failed */
+	}
+
+	if (fw_8051_load) {
+		err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
+		if (err)
+			goto done;
+	}
+
+	if (fw_fabric_serdes_load) {
+		err = obtain_one_firmware(dd, fw_fabric_serdes_name,
+			&fw_fabric);
+		if (err)
+			goto done;
+	}
+
+	if (fw_sbus_load) {
+		err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
+		if (err)
+			goto done;
+	}
+
+	if (fw_pcie_serdes_load) {
+		err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
+		if (err)
+			goto done;
+	}
+
+	if (platform_config_load) {
+		platform_config = NULL;
+		err = request_firmware(&platform_config, platform_config_name,
+						&dd->pcidev->dev);
+		if (err) {
+			err = 0;
+			platform_config = NULL;
+		}
+	}
+
+	/* success */
+	fw_state = FW_ACQUIRED;
+
+done:
+	if (err) {
+		fw_err = err;
+		fw_state = FW_ERR;
+	}
+	mutex_unlock(&fw_mutex);
+
+	return err;
+}
+
+/*
+ * Called when the driver unloads.  The timing is asymmetric with its
+ * counterpart, obtain_firmware().  If called at device remove time,
+ * then it is conceivable that another device could probe while the
+ * firmware is being disposed.  The mutexes can be moved to do that
+ * safely, but then the firmware would be requested from the OS multiple
+ * times.
+ *
+ * No mutex is needed as the driver is unloading and there cannot be any
+ * other callers.
+ */
+void dispose_firmware(void)
+{
+	dispose_one_firmware(&fw_8051);
+	dispose_one_firmware(&fw_fabric);
+	dispose_one_firmware(&fw_pcie);
+	dispose_one_firmware(&fw_sbus);
+
+	release_firmware(platform_config);
+	platform_config = NULL;
+
+	/* retain the error state, otherwise revert to empty */
+	if (fw_state != FW_ERR)
+		fw_state = FW_EMPTY;
+}
+
+/*
+ * Write a block of data to a given array CSR.  All calls will be in
+ * multiples of 8 bytes.
+ */
+static void write_rsa_data(struct hfi1_devdata *dd, int what,
+			   const u8 *data, int nbytes)
+{
+	int qw_size = nbytes/8;
+	int i;
+
+	if (((unsigned long)data & 0x7) == 0) {
+		/* aligned */
+		u64 *ptr = (u64 *)data;
+
+		for (i = 0; i < qw_size; i++, ptr++)
+			write_csr(dd, what + (8*i), *ptr);
+	} else {
+		/* not aligned */
+		for (i = 0; i < qw_size; i++, data += 8) {
+			u64 value;
+
+			memcpy(&value, data, 8);
+			write_csr(dd, what + (8*i), value);
+		}
+	}
+}
+
+/*
+ * Write a block of data to a given CSR as a stream of writes.  All calls will
+ * be in multiples of 8 bytes.
+ */
+static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
+				    const u8 *data, int nbytes)
+{
+	u64 *ptr = (u64 *)data;
+	int qw_size = nbytes/8;
+
+	for (; qw_size > 0; qw_size--, ptr++)
+		write_csr(dd, what, *ptr);
+}
+
+/*
+ * Download the signature and start the RSA mechanism.  Wait for
+ * RSA_ENGINE_TIMEOUT before giving up.
+ */
+static int run_rsa(struct hfi1_devdata *dd, const char *who,
+		   const u8 *signature)
+{
+	unsigned long timeout;
+	u64 reg;
+	u32 status;
+	int ret = 0;
+
+	/* write the signature */
+	write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
+
+	/* initialize RSA */
+	write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
+
+	/*
+	 * Make sure the engine is idle and insert a delay between the two
+	 * writes to MISC_CFG_RSA_CMD.
+	 */
+	status = (read_csr(dd, MISC_CFG_FW_CTRL)
+			   & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+			     >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+	if (status != RSA_STATUS_IDLE) {
+		dd_dev_err(dd, "%s security engine not idle - giving up\n",
+			who);
+		return -EBUSY;
+	}
+
+	/* start RSA */
+	write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
+
+	/*
+	 * Look for the result.
+	 *
+	 * The RSA engine is hooked up to two MISC errors.  The driver
+	 * masks these errors as they do not respond to the standard
+	 * error "clear down" mechanism.  Look for these errors here and
+	 * clear them when possible.  This routine will exit with the
+	 * errors of the current run still set.
+	 *
+	 * MISC_FW_AUTH_FAILED_ERR
+	 *	Firmware authorization failed.  This can be cleared by
+	 *	re-initializing the RSA engine, then clearing the status bit.
+	 *	Do not re-init the RSA angine immediately after a successful
+	 *	run - this will reset the current authorization.
+	 *
+	 * MISC_KEY_MISMATCH_ERR
+	 *	Key does not match.  The only way to clear this is to load
+	 *	a matching key then clear the status bit.  If this error
+	 *	is raised, it will persist outside of this routine until a
+	 *	matching key is loaded.
+	 */
+	timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
+	while (1) {
+		status = (read_csr(dd, MISC_CFG_FW_CTRL)
+			   & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+			     >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+
+		if (status == RSA_STATUS_IDLE) {
+			/* should not happen */
+			dd_dev_err(dd, "%s firmware security bad idle state\n",
+				who);
+			ret = -EINVAL;
+			break;
+		} else if (status == RSA_STATUS_DONE) {
+			/* finished successfully */
+			break;
+		} else if (status == RSA_STATUS_FAILED) {
+			/* finished unsuccessfully */
+			ret = -EINVAL;
+			break;
+		}
+		/* else still active */
+
+		if (time_after(jiffies, timeout)) {
+			/*
+			 * Timed out while active.  We can't reset the engine
+			 * if it is stuck active, but run through the
+			 * error code to see what error bits are set.
+			 */
+			dd_dev_err(dd, "%s firmware security time out\n", who);
+			ret = -ETIMEDOUT;
+			break;
+		}
+
+		msleep(20);
+	}
+
+	/*
+	 * Arrive here on success or failure.  Clear all RSA engine
+	 * errors.  All current errors will stick - the RSA logic is keeping
+	 * error high.  All previous errors will clear - the RSA logic
+	 * is not keeping the error high.
+	 */
+	write_csr(dd, MISC_ERR_CLEAR,
+			MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK
+			| MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
+	/*
+	 * All that is left are the current errors.  Print failure details,
+	 * if any.
+	 */
+	reg = read_csr(dd, MISC_ERR_STATUS);
+	if (ret) {
+		if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
+			dd_dev_err(dd, "%s firmware authorization failed\n",
+				who);
+		if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
+			dd_dev_err(dd, "%s firmware key mismatch\n", who);
+	}
+
+	return ret;
+}
+
+static void load_security_variables(struct hfi1_devdata *dd,
+				    struct firmware_details *fdet)
+{
+	/* Security variables a.  Write the modulus */
+	write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
+	/* Security variables b.  Write the r2 */
+	write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
+	/* Security variables c.  Write the mu */
+	write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
+	/* Security variables d.  Write the header */
+	write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD,
+			(u8 *)fdet->css_header, sizeof(struct css_header));
+}
+
+/* return the 8051 firmware state */
+static inline u32 get_firmware_state(struct hfi1_devdata *dd)
+{
+	u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+
+	return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT)
+				& DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
+}
+
+/*
+ * Wait until the firmware is up and ready to take host requests.
+ * Return 0 on success, -ETIMEDOUT on timeout.
+ */
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
+{
+	unsigned long timeout;
+
+	/* in the simulator, the fake 8051 is always ready */
+	if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+		return 0;
+
+	timeout = msecs_to_jiffies(mstimeout) + jiffies;
+	while (1) {
+		if (get_firmware_state(dd) == 0xa0)	/* ready */
+			return 0;
+		if (time_after(jiffies, timeout))	/* timed out */
+			return -ETIMEDOUT;
+		usleep_range(1950, 2050); /* sleep 2ms-ish */
+	}
+}
+
+/*
+ * Load the 8051 firmware.
+ */
+static int load_8051_firmware(struct hfi1_devdata *dd,
+			      struct firmware_details *fdet)
+{
+	u64 reg;
+	int ret;
+	u8 ver_a, ver_b;
+
+	/*
+	 * DC Reset sequence
+	 * Load DC 8051 firmware
+	 */
+	/*
+	 * DC reset step 1: Reset DC8051
+	 */
+	reg = DC_DC8051_CFG_RST_M8051W_SMASK
+		| DC_DC8051_CFG_RST_CRAM_SMASK
+		| DC_DC8051_CFG_RST_DRAM_SMASK
+		| DC_DC8051_CFG_RST_IRAM_SMASK
+		| DC_DC8051_CFG_RST_SFR_SMASK;
+	write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+	/*
+	 * DC reset step 2 (optional): Load 8051 data memory with link
+	 * configuration
+	 */
+
+	/*
+	 * DC reset step 3: Load DC8051 firmware
+	 */
+	/* release all but the core reset */
+	reg = DC_DC8051_CFG_RST_M8051W_SMASK;
+	write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+	/* Firmware load step 1 */
+	load_security_variables(dd, fdet);
+
+	/*
+	 * Firmware load step 2.  Clear MISC_CFG_FW_CTRL.FW_8051_LOADED
+	 */
+	write_csr(dd, MISC_CFG_FW_CTRL, 0);
+
+	/* Firmware load steps 3-5 */
+	ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
+							fdet->firmware_len);
+	if (ret)
+		return ret;
+
+	/*
+	 * DC reset step 4. Host starts the DC8051 firmware
+	 */
+	/*
+	 * Firmware load step 6.  Set MISC_CFG_FW_CTRL.FW_8051_LOADED
+	 */
+	write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
+
+	/* Firmware load steps 7-10 */
+	ret = run_rsa(dd, "8051", fdet->signature);
+	if (ret)
+		return ret;
+
+	/* clear all reset bits, releasing the 8051 */
+	write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+
+	/*
+	 * DC reset step 5. Wait for firmware to be ready to accept host
+	 * requests.
+	 */
+	ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+	if (ret) { /* timed out */
+		dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
+			get_firmware_state(dd));
+		return -ETIMEDOUT;
+	}
+
+	read_misc_status(dd, &ver_a, &ver_b);
+	dd_dev_info(dd, "8051 firmware version %d.%d\n",
+		(int)ver_b, (int)ver_a);
+	dd->dc8051_ver = dc8051_ver(ver_b, ver_a);
+
+	return 0;
+}
+
+/* SBus Master broadcast address */
+#define SBUS_MASTER_BROADCAST 0xfd
+
+/*
+ * Write the SBus request register
+ *
+ * No need for masking - the arguments are sized exactly.
+ */
+void sbus_request(struct hfi1_devdata *dd,
+		  u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+	write_csr(dd, ASIC_CFG_SBUS_REQUEST,
+		((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT)
+		| ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT)
+		| ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT)
+		| ((u64)receiver_addr
+			<< ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
+}
+
+/*
+ * Turn off the SBus and fabric serdes spicos.
+ *
+ * + Must be called with Sbus fast mode turned on.
+ * + Must be called after fabric serdes broadcast is set up.
+ * + Must be called before the 8051 is loaded - assumes 8051 is not loaded
+ *   when using MISC_CFG_FW_CTRL.
+ */
+static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
+{
+	/* only needed on A0 */
+	if (!is_a0(dd))
+		return;
+
+	dd_dev_info(dd, "Turning off spicos:%s%s\n",
+		flags & SPICO_SBUS ? " SBus" : "",
+		flags & SPICO_FABRIC ? " fabric" : "");
+
+	write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
+	/* disable SBus spico */
+	if (flags & SPICO_SBUS)
+		sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
+			WRITE_SBUS_RECEIVER, 0x00000040);
+
+	/* disable the fabric serdes spicos */
+	if (flags & SPICO_FABRIC)
+		sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
+			     0x07, WRITE_SBUS_RECEIVER, 0x00000000);
+	write_csr(dd, MISC_CFG_FW_CTRL, 0);
+}
+
+/*
+ *  Reset all of the fabric serdes for our HFI.
+ */
+void fabric_serdes_reset(struct hfi1_devdata *dd)
+{
+	u8 ra;
+
+	if (dd->icode != ICODE_RTL_SILICON) /* only for RTL */
+		return;
+
+	ra = fabric_serdes_broadcast[dd->hfi1_id];
+
+	acquire_hw_mutex(dd);
+	set_sbus_fast_mode(dd);
+	/* place SerDes in reset and disable SPICO */
+	sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+	/* wait 100 refclk cycles @ 156.25MHz => 640ns */
+	udelay(1);
+	/* remove SerDes reset */
+	sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+	/* turn SPICO enable on */
+	sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+	clear_sbus_fast_mode(dd);
+	release_hw_mutex(dd);
+}
+
+/* Access to the SBus in this routine should probably be serialized */
+int sbus_request_slow(struct hfi1_devdata *dd,
+		      u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+	u64 reg, count = 0;
+
+	sbus_request(dd, receiver_addr, data_addr, command, data_in);
+	write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+		  ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
+	/* Wait for both DONE and RCV_DATA_VALID to go high */
+	reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+	while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+		 (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) {
+		if (count++ >= SBUS_MAX_POLL_COUNT) {
+			u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+			/*
+			 * If the loop has timed out, we are OK if DONE bit
+			 * is set and RCV_DATA_VALID and EXECUTE counters
+			 * are the same. If not, we cannot proceed.
+			 */
+			if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+			    (SBUS_COUNTER(counts, RCV_DATA_VALID) ==
+			     SBUS_COUNTER(counts, EXECUTE)))
+				break;
+			return -ETIMEDOUT;
+		}
+		udelay(1);
+		reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+	}
+	count = 0;
+	write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+	/* Wait for DONE to clear after EXECUTE is cleared */
+	reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+	while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) {
+		if (count++ >= SBUS_MAX_POLL_COUNT)
+			return -ETIME;
+		udelay(1);
+		reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+	}
+	return 0;
+}
+
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+				       struct firmware_details *fdet)
+{
+	int i, err;
+	const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */
+
+	dd_dev_info(dd, "Downloading fabric firmware\n");
+
+	/* step 1: load security variables */
+	load_security_variables(dd, fdet);
+	/* step 2: place SerDes in reset and disable SPICO */
+	sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+	/* wait 100 refclk cycles @ 156.25MHz => 640ns */
+	udelay(1);
+	/* step 3:  remove SerDes reset */
+	sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+	/* step 4: assert IMEM override */
+	sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
+	/* step 5: download SerDes machine code */
+	for (i = 0; i < fdet->firmware_len; i += 4) {
+		sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER,
+					*(u32 *)&fdet->firmware_ptr[i]);
+	}
+	/* step 6: IMEM override off */
+	sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
+	/* step 7: turn ECC on */
+	sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+	/* steps 8-11: run the RSA engine */
+	err = run_rsa(dd, "fabric serdes", fdet->signature);
+	if (err)
+		return err;
+
+	/* step 12: turn SPICO enable on */
+	sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+	/* step 13: enable core hardware interrupts */
+	sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
+
+	return 0;
+}
+
+static int load_sbus_firmware(struct hfi1_devdata *dd,
+			      struct firmware_details *fdet)
+{
+	int i, err;
+	const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+	dd_dev_info(dd, "Downloading SBus firmware\n");
+
+	/* step 1: load security variables */
+	load_security_variables(dd, fdet);
+	/* step 2: place SPICO into reset and enable off */
+	sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
+	/* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
+	sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
+	/* step 4: set starting IMEM address for burst download */
+	sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
+	/* step 5: download the SBus Master machine code */
+	for (i = 0; i < fdet->firmware_len; i += 4) {
+		sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER,
+					*(u32 *)&fdet->firmware_ptr[i]);
+	}
+	/* step 6: set IMEM_CNTL_EN off */
+	sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
+	/* step 7: turn ECC on */
+	sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+	/* steps 8-11: run the RSA engine */
+	err = run_rsa(dd, "SBus", fdet->signature);
+	if (err)
+		return err;
+
+	/* step 12: set SPICO_ENABLE on */
+	sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+
+	return 0;
+}
+
+static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
+				     struct firmware_details *fdet)
+{
+	int i;
+	const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+	dd_dev_info(dd, "Downloading PCIe firmware\n");
+
+	/* step 1: load security variables */
+	load_security_variables(dd, fdet);
+	/* step 2: assert single step (halts the SBus Master spico) */
+	sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
+	/* step 3: enable XDMEM access */
+	sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
+	/* step 4: load firmware into SBus Master XDMEM */
+	/* NOTE: the dmem address, write_en, and wdata are all pre-packed,
+	   we only need to pick up the bytes and write them */
+	for (i = 0; i < fdet->firmware_len; i += 4) {
+		sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER,
+					*(u32 *)&fdet->firmware_ptr[i]);
+	}
+	/* step 5: disable XDMEM access */
+	sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+	/* step 6: allow SBus Spico to run */
+	sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
+
+	/* steps 7-11: run RSA, if it succeeds, firmware is available to
+	   be swapped */
+	return run_rsa(dd, "PCIe serdes", fdet->signature);
+}
+
+/*
+ * Set the given broadcast values on the given list of devices.
+ */
+static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
+				 const u8 *addrs, int count)
+{
+	while (--count >= 0) {
+		/*
+		 * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave
+		 * defaults for everything else.  Do not read-modify-write,
+		 * per instruction from the manufacturer.
+		 *
+		 * Register 0xfd:
+		 *	bits    what
+		 *	-----	---------------------------------
+		 *	  0	IGNORE_BROADCAST  (default 0)
+		 *	11:4	BROADCAST_GROUP_1 (default 0xff)
+		 *	23:16	BROADCAST_GROUP_2 (default 0xff)
+		 */
+		sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER,
+				(u32)bg1 << 4 | (u32)bg2 << 16);
+	}
+}
+
+int acquire_hw_mutex(struct hfi1_devdata *dd)
+{
+	unsigned long timeout;
+	int try = 0;
+	u8 mask = 1 << dd->hfi1_id;
+	u8 user;
+
+retry:
+	timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies;
+	while (1) {
+		write_csr(dd, ASIC_CFG_MUTEX, mask);
+		user = (u8)read_csr(dd, ASIC_CFG_MUTEX);
+		if (user == mask)
+			return 0; /* success */
+		if (time_after(jiffies, timeout))
+			break; /* timed out */
+		msleep(20);
+	}
+
+	/* timed out */
+	dd_dev_err(dd,
+		"Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
+		(u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up");
+
+	if (try == 0) {
+		/* break mutex and retry */
+		write_csr(dd, ASIC_CFG_MUTEX, 0);
+		try++;
+		goto retry;
+	}
+
+	return -EBUSY;
+}
+
+void release_hw_mutex(struct hfi1_devdata *dd)
+{
+	write_csr(dd, ASIC_CFG_MUTEX, 0);
+}
+
+void set_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+	write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+				ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
+}
+
+void clear_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+	u64 reg, count = 0;
+
+	reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+	while (SBUS_COUNTER(reg, EXECUTE) !=
+	       SBUS_COUNTER(reg, RCV_DATA_VALID)) {
+		if (count++ >= SBUS_MAX_POLL_COUNT)
+			break;
+		udelay(1);
+		reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+	}
+	write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+}
+
+int load_firmware(struct hfi1_devdata *dd)
+{
+	int ret;
+
+	if (fw_sbus_load || fw_fabric_serdes_load) {
+		ret = acquire_hw_mutex(dd);
+		if (ret)
+			return ret;
+
+		set_sbus_fast_mode(dd);
+
+		/*
+		 * The SBus contains part of the fabric firmware and so must
+		 * also be downloaded.
+		 */
+		if (fw_sbus_load) {
+			turn_off_spicos(dd, SPICO_SBUS);
+			ret = load_sbus_firmware(dd, &fw_sbus);
+			if (ret)
+				goto clear;
+		}
+
+		if (fw_fabric_serdes_load) {
+			set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
+					fabric_serdes_broadcast[dd->hfi1_id],
+					fabric_serdes_addrs[dd->hfi1_id],
+					NUM_FABRIC_SERDES);
+			turn_off_spicos(dd, SPICO_FABRIC);
+			ret = load_fabric_serdes_firmware(dd, &fw_fabric);
+		}
+
+clear:
+		clear_sbus_fast_mode(dd);
+		release_hw_mutex(dd);
+		if (ret)
+			return ret;
+	}
+
+	if (fw_8051_load) {
+		ret = load_8051_firmware(dd, &fw_8051);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int hfi1_firmware_init(struct hfi1_devdata *dd)
+{
+	/* only RTL can use these */
+	if (dd->icode != ICODE_RTL_SILICON) {
+		fw_fabric_serdes_load = 0;
+		fw_pcie_serdes_load = 0;
+		fw_sbus_load = 0;
+	}
+
+	/* no 8051 or QSFP on simulator */
+	if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+		fw_8051_load = 0;
+		platform_config_load = 0;
+	}
+
+	if (!fw_8051_name) {
+		if (dd->icode == ICODE_RTL_SILICON)
+			fw_8051_name = DEFAULT_FW_8051_NAME_ASIC;
+		else
+			fw_8051_name = DEFAULT_FW_8051_NAME_FPGA;
+	}
+	if (!fw_fabric_serdes_name)
+		fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
+	if (!fw_sbus_name)
+		fw_sbus_name = DEFAULT_FW_SBUS_NAME;
+	if (!fw_pcie_serdes_name)
+		fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
+	if (!platform_config_name)
+		platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
+
+	return obtain_firmware(dd);
+}
+
+int parse_platform_config(struct hfi1_devdata *dd)
+{
+	struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+	u32 *ptr = NULL;
+	u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0;
+	u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
+
+	if (platform_config == NULL) {
+		dd_dev_info(dd, "%s: Missing config file\n", __func__);
+		goto bail;
+	}
+	ptr = (u32 *)platform_config->data;
+
+	magic_num = *ptr;
+	ptr++;
+	if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
+		dd_dev_info(dd, "%s: Bad config file\n", __func__);
+		goto bail;
+	}
+
+	while (ptr < (u32 *)(platform_config->data + platform_config->size)) {
+		header1 = *ptr;
+		header2 = *(ptr + 1);
+		if (header1 != ~header2) {
+			dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
+				__func__, (ptr - (u32 *)platform_config->data));
+			goto bail;
+		}
+
+		record_idx = *ptr &
+			((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
+
+		table_length_dwords = (*ptr >>
+				PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
+		      ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
+
+		table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
+			((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
+
+		/* Done with this set of headers */
+		ptr += 2;
+
+		if (record_idx) {
+			/* data table */
+			switch (table_type) {
+			case PLATFORM_CONFIG_SYSTEM_TABLE:
+				pcfgcache->config_tables[table_type].num_table =
+									1;
+				break;
+			case PLATFORM_CONFIG_PORT_TABLE:
+				pcfgcache->config_tables[table_type].num_table =
+									2;
+				break;
+			case PLATFORM_CONFIG_RX_PRESET_TABLE:
+				/* fall through */
+			case PLATFORM_CONFIG_TX_PRESET_TABLE:
+				/* fall through */
+			case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+				/* fall through */
+			case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+				pcfgcache->config_tables[table_type].num_table =
+							table_length_dwords;
+				break;
+			default:
+				dd_dev_info(dd,
+				      "%s: Unknown data table %d, offset %ld\n",
+					__func__, table_type,
+				       (ptr - (u32 *)platform_config->data));
+				goto bail; /* We don't trust this file now */
+			}
+			pcfgcache->config_tables[table_type].table = ptr;
+		} else {
+			/* metadata table */
+			switch (table_type) {
+			case PLATFORM_CONFIG_SYSTEM_TABLE:
+				/* fall through */
+			case PLATFORM_CONFIG_PORT_TABLE:
+				/* fall through */
+			case PLATFORM_CONFIG_RX_PRESET_TABLE:
+				/* fall through */
+			case PLATFORM_CONFIG_TX_PRESET_TABLE:
+				/* fall through */
+			case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+				/* fall through */
+			case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+				break;
+			default:
+				dd_dev_info(dd,
+				  "%s: Unknown metadata table %d, offset %ld\n",
+				  __func__, table_type,
+				  (ptr - (u32 *)platform_config->data));
+				goto bail; /* We don't trust this file now */
+			}
+			pcfgcache->config_tables[table_type].table_metadata =
+									ptr;
+		}
+
+		/* Calculate and check table crc */
+		crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
+				(table_length_dwords * 4));
+		crc ^= ~(u32)0;
+
+		/* Jump the table */
+		ptr += table_length_dwords;
+		if (crc != *ptr) {
+			dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
+				__func__, (ptr - (u32 *)platform_config->data));
+			goto bail;
+		}
+		/* Jump the CRC DWORD */
+		ptr++;
+	}
+
+	pcfgcache->cache_valid = 1;
+	return 0;
+bail:
+	memset(pcfgcache, 0, sizeof(struct platform_config_cache));
+	return -EINVAL;
+}
+
+static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
+		int field, u32 *field_len_bits, u32 *field_start_bits)
+{
+	struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+	u32 *src_ptr = NULL;
+
+	if (!pcfgcache->cache_valid)
+		return -EINVAL;
+
+	switch (table) {
+	case PLATFORM_CONFIG_SYSTEM_TABLE:
+		/* fall through */
+	case PLATFORM_CONFIG_PORT_TABLE:
+		/* fall through */
+	case PLATFORM_CONFIG_RX_PRESET_TABLE:
+		/* fall through */
+	case PLATFORM_CONFIG_TX_PRESET_TABLE:
+		/* fall through */
+	case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+		/* fall through */
+	case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+		if (field && field < platform_config_table_limits[table])
+			src_ptr =
+			pcfgcache->config_tables[table].table_metadata + field;
+		break;
+	default:
+		dd_dev_info(dd, "%s: Unknown table\n", __func__);
+		break;
+	}
+
+	if (!src_ptr)
+		return -EINVAL;
+
+	if (field_start_bits)
+		*field_start_bits = *src_ptr &
+		      ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+
+	if (field_len_bits)
+		*field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
+		       & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+
+	return 0;
+}
+
+/* This is the central interface to getting data out of the platform config
+ * file. It depends on parse_platform_config() having populated the
+ * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
+ * validate the sanity of the cache.
+ *
+ * The non-obvious parameters:
+ * @table_index: Acts as a look up key into which instance of the tables the
+ * relevant field is fetched from.
+ *
+ * This applies to the data tables that have multiple instances. The port table
+ * is an exception to this rule as each HFI only has one port and thus the
+ * relevant table can be distinguished by hfi_id.
+ *
+ * @data: pointer to memory that will be populated with the field requested.
+ * @len: length of memory pointed by @data in bytes.
+ */
+int get_platform_config_field(struct hfi1_devdata *dd,
+			enum platform_config_table_type_encoding table_type,
+			int table_index, int field_index, u32 *data, u32 len)
+{
+	int ret = 0, wlen = 0, seek = 0;
+	u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
+	struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+	if (data)
+		memset(data, 0, len);
+	else
+		return -EINVAL;
+
+	ret = get_platform_fw_field_metadata(dd, table_type, field_index,
+					&field_len_bits, &field_start_bits);
+	if (ret)
+		return -EINVAL;
+
+	/* Convert length to bits */
+	len *= 8;
+
+	/* Our metadata function checked cache_valid and field_index for us */
+	switch (table_type) {
+	case PLATFORM_CONFIG_SYSTEM_TABLE:
+		src_ptr = pcfgcache->config_tables[table_type].table;
+
+		if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
+			if (len < field_len_bits)
+				return -EINVAL;
+
+			seek = field_start_bits/8;
+			wlen = field_len_bits/8;
+
+			src_ptr = (u32 *)((u8 *)src_ptr + seek);
+
+			/* We expect the field to be byte aligned and whole byte
+			 * lengths if we are here */
+			memcpy(data, src_ptr, wlen);
+			return 0;
+		}
+		break;
+	case PLATFORM_CONFIG_PORT_TABLE:
+		/* Port table is 4 DWORDS in META_VERSION 0 */
+		src_ptr = dd->hfi1_id ?
+			pcfgcache->config_tables[table_type].table + 4 :
+			pcfgcache->config_tables[table_type].table;
+		break;
+	case PLATFORM_CONFIG_RX_PRESET_TABLE:
+		/* fall through */
+	case PLATFORM_CONFIG_TX_PRESET_TABLE:
+		/* fall through */
+	case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+		/* fall through */
+	case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+		src_ptr = pcfgcache->config_tables[table_type].table;
+
+		if (table_index <
+			pcfgcache->config_tables[table_type].num_table)
+			src_ptr += table_index;
+		else
+			src_ptr = NULL;
+		break;
+	default:
+		dd_dev_info(dd, "%s: Unknown table\n", __func__);
+		break;
+	}
+
+	if (!src_ptr || len < field_len_bits)
+		return -EINVAL;
+
+	src_ptr += (field_start_bits/32);
+	*data = (*src_ptr >> (field_start_bits % 32)) &
+			((1 << field_len_bits) - 1);
+
+	return 0;
+}
+
+/*
+ * Download the firmware needed for the Gen3 PCIe SerDes.  An update
+ * to the SBus firmware is needed before updating the PCIe firmware.
+ *
+ * Note: caller must be holding the HW mutex.
+ */
+int load_pcie_firmware(struct hfi1_devdata *dd)
+{
+	int ret = 0;
+
+	/* both firmware loads below use the SBus */
+	set_sbus_fast_mode(dd);
+
+	if (fw_sbus_load) {
+		turn_off_spicos(dd, SPICO_SBUS);
+		ret = load_sbus_firmware(dd, &fw_sbus);
+		if (ret)
+			goto done;
+	}
+
+	if (fw_pcie_serdes_load) {
+		dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
+		set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
+					pcie_serdes_broadcast[dd->hfi1_id],
+					pcie_serdes_addrs[dd->hfi1_id],
+					NUM_PCIE_SERDES);
+		ret = load_pcie_serdes_firmware(dd, &fw_pcie);
+		if (ret)
+			goto done;
+	}
+
+done:
+	clear_sbus_fast_mode(dd);
+
+	return ret;
+}
+
+/*
+ * Read the GUID from the hardware, store it in dd.
+ */
+void read_guid(struct hfi1_devdata *dd)
+{
+	dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
+	dd_dev_info(dd, "GUID %llx",
+		(unsigned long long)dd->base_guid);
+}
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
new file mode 100644
index 0000000..8ca171b
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -0,0 +1,1821 @@
+#ifndef _HFI1_KERNEL_H
+#define _HFI1_KERNEL_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/sched.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+
+#include "chip_registers.h"
+#include "common.h"
+#include "verbs.h"
+#include "pio.h"
+#include "chip.h"
+#include "mad.h"
+#include "qsfp.h"
+#include "platform_config.h"
+
+/* bumped 1 from s/w major version of TrueScale */
+#define HFI1_CHIP_VERS_MAJ 3U
+
+/* don't care about this except printing */
+#define HFI1_CHIP_VERS_MIN 0U
+
+/* The Organization Unique Identifier (Mfg code), and its position in GUID */
+#define HFI1_OUI 0x001175
+#define HFI1_OUI_LSB 40
+
+#define DROP_PACKET_OFF		0
+#define DROP_PACKET_ON		1
+
+extern unsigned long hfi1_cap_mask;
+#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
+#define HFI1_CAP_UGET_MASK(mask, cap) \
+	(((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
+#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
+#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
+#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
+			HFI1_CAP_MISC_MASK)
+
+/*
+ * per driver stats, either not device nor port-specific, or
+ * summed over all of the devices and ports.
+ * They are described by name via ipathfs filesystem, so layout
+ * and number of elements can change without breaking compatibility.
+ * If members are added or deleted hfi1_statnames[] in debugfs.c must
+ * change to match.
+ */
+struct hfi1_ib_stats {
+	__u64 sps_ints; /* number of interrupts handled */
+	__u64 sps_errints; /* number of error interrupts */
+	__u64 sps_txerrs; /* tx-related packet errors */
+	__u64 sps_rcverrs; /* non-crc rcv packet errors */
+	__u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
+	__u64 sps_nopiobufs; /* no pio bufs avail from kernel */
+	__u64 sps_ctxts; /* number of contexts currently open */
+	__u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
+	__u64 sps_buffull;
+	__u64 sps_hdrfull;
+};
+
+extern struct hfi1_ib_stats hfi1_stats;
+extern const struct pci_error_handlers hfi1_pci_err_handler;
+
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
+
+/*
+ * Below contains all data related to a single context (formerly called port).
+ */
+
+#ifdef CONFIG_DEBUG_FS
+struct hfi1_opcode_stats_perctx;
+#endif
+
+/*
+ * struct ps_state keeps state associated with RX queue "prescanning"
+ * (prescanning for FECNs, and BECNs), if prescanning is in use.
+ */
+struct ps_state {
+	u32 ps_head;
+	int initialized;
+};
+
+struct ctxt_eager_bufs {
+	ssize_t size;            /* total size of eager buffers */
+	u32 count;               /* size of buffers array */
+	u32 numbufs;             /* number of buffers allocated */
+	u32 alloced;             /* number of rcvarray entries used */
+	u32 rcvtid_size;         /* size of each eager rcv tid */
+	u32 threshold;           /* head update threshold */
+	struct eager_buffer {
+		void *addr;
+		dma_addr_t phys;
+		ssize_t len;
+	} *buffers;
+	struct {
+		void *addr;
+		dma_addr_t phys;
+	} *rcvtids;
+};
+
+struct hfi1_ctxtdata {
+	/* shadow the ctxt's RcvCtrl register */
+	u64 rcvctrl;
+	/* rcvhdrq base, needs mmap before useful */
+	void *rcvhdrq;
+	/* kernel virtual address where hdrqtail is updated */
+	volatile __le64 *rcvhdrtail_kvaddr;
+	/*
+	 * Shared page for kernel to signal user processes that send buffers
+	 * need disarming.  The process should call HFI1_CMD_DISARM_BUFS
+	 * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
+	 */
+	unsigned long *user_event_mask;
+	/* when waiting for rcv or pioavail */
+	wait_queue_head_t wait;
+	/* rcvhdrq size (for freeing) */
+	size_t rcvhdrq_size;
+	/* number of rcvhdrq entries */
+	u16 rcvhdrq_cnt;
+	/* size of each of the rcvhdrq entries */
+	u16 rcvhdrqentsize;
+	/* mmap of hdrq, must fit in 44 bits */
+	dma_addr_t rcvhdrq_phys;
+	dma_addr_t rcvhdrqtailaddr_phys;
+	struct ctxt_eager_bufs egrbufs;
+	/* this receive context's assigned PIO ACK send context */
+	struct send_context *sc;
+
+	/* dynamic receive available interrupt timeout */
+	u32 rcvavail_timeout;
+	/*
+	 * number of opens (including slave sub-contexts) on this instance
+	 * (ignoring forks, dup, etc. for now)
+	 */
+	int cnt;
+	/*
+	 * how much space to leave at start of eager TID entries for
+	 * protocol use, on each TID
+	 */
+	/* instead of calculating it */
+	unsigned ctxt;
+	/* non-zero if ctxt is being shared. */
+	u16 subctxt_cnt;
+	/* non-zero if ctxt is being shared. */
+	u16 subctxt_id;
+	u8 uuid[16];
+	/* job key */
+	u16 jkey;
+	/* number of RcvArray groups for this context. */
+	u32 rcv_array_groups;
+	/* index of first eager TID entry. */
+	u32 eager_base;
+	/* number of expected TID entries */
+	u32 expected_count;
+	/* index of first expected TID entry. */
+	u32 expected_base;
+	/* cursor into the exp group sets */
+	atomic_t tidcursor;
+	/* number of exp TID groups assigned to the ctxt */
+	u16 numtidgroups;
+	/* size of exp TID group fields in tidusemap */
+	u16 tidmapcnt;
+	/* exp TID group usage bitfield array */
+	unsigned long *tidusemap;
+	/* pinned pages for exp sends, allocated at open */
+	struct page **tid_pg_list;
+	/* dma handles for exp tid pages */
+	dma_addr_t *physshadow;
+	/* lock protecting all Expected TID data */
+	spinlock_t exp_lock;
+	/* number of pio bufs for this ctxt (all procs, if shared) */
+	u32 piocnt;
+	/* first pio buffer for this ctxt */
+	u32 pio_base;
+	/* chip offset of PIO buffers for this ctxt */
+	u32 piobufs;
+	/* per-context configuration flags */
+	u16 flags;
+	/* per-context event flags for fileops/intr communication */
+	unsigned long event_flags;
+	/* WAIT_RCV that timed out, no interrupt */
+	u32 rcvwait_to;
+	/* WAIT_PIO that timed out, no interrupt */
+	u32 piowait_to;
+	/* WAIT_RCV already happened, no wait */
+	u32 rcvnowait;
+	/* WAIT_PIO already happened, no wait */
+	u32 pionowait;
+	/* total number of polled urgent packets */
+	u32 urgent;
+	/* saved total number of polled urgent packets for poll edge trigger */
+	u32 urgent_poll;
+	/* pid of process using this ctxt */
+	pid_t pid;
+	pid_t subpid[HFI1_MAX_SHARED_CTXTS];
+	/* same size as task_struct .comm[], command that opened context */
+	char comm[16];
+	/* so file ops can get at unit */
+	struct hfi1_devdata *dd;
+	/* so functions that need physical port can get it easily */
+	struct hfi1_pportdata *ppd;
+	/* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+	void *subctxt_uregbase;
+	/* An array of pages for the eager receive buffers * N */
+	void *subctxt_rcvegrbuf;
+	/* An array of pages for the eager header queue entries * N */
+	void *subctxt_rcvhdr_base;
+	/* The version of the library which opened this ctxt */
+	u32 userversion;
+	/* Bitmask of active slaves */
+	u32 active_slaves;
+	/* Type of packets or conditions we want to poll for */
+	u16 poll_type;
+	/* receive packet sequence counter */
+	u8 seq_cnt;
+	u8 redirect_seq_cnt;
+	/* ctxt rcvhdrq head offset */
+	u32 head;
+	u32 pkt_count;
+	/* QPs waiting for context processing */
+	struct list_head qp_wait_list;
+	/* interrupt handling */
+	u64 imask;	/* clear interrupt mask */
+	int ireg;	/* clear interrupt register */
+	unsigned numa_id; /* numa node of this context */
+	/* verbs stats per CTX */
+	struct hfi1_opcode_stats_perctx *opstats;
+	/*
+	 * This is the kernel thread that will keep making
+	 * progress on the user sdma requests behind the scenes.
+	 * There is one per context (shared contexts use the master's).
+	 */
+	struct task_struct *progress;
+	struct list_head sdma_queues;
+	spinlock_t sdma_qlock;
+
+#ifdef CONFIG_PRESCAN_RXQ
+	struct ps_state ps_state;
+#endif /* CONFIG_PRESCAN_RXQ */
+
+	/*
+	 * The interrupt handler for a particular receive context can vary
+	 * throughout it's lifetime. This is not a lock protected data member so
+	 * it must be updated atomically and the prev and new value must always
+	 * be valid. Worst case is we process an extra interrupt and up to 64
+	 * packets with the wrong interrupt handler.
+	 */
+	void (*do_interrupt)(struct hfi1_ctxtdata *rcd);
+};
+
+/*
+ * Represents a single packet at a high level. Put commonly computed things in
+ * here so we do not have to keep doing them over and over. The rule of thumb is
+ * if something is used one time to derive some value, store that something in
+ * here. If it is used multiple times, then store the result of that derivation
+ * in here.
+ */
+struct hfi1_packet {
+	void *ebuf;
+	void *hdr;
+	struct hfi1_ctxtdata *rcd;
+	__le32 *rhf_addr;
+	struct hfi1_qp *qp;
+	struct hfi1_other_headers *ohdr;
+	u64 rhf;
+	u32 maxcnt;
+	u32 rhqoff;
+	u32 hdrqtail;
+	int numpkt;
+	u16 tlen;
+	u16 hlen;
+	s16 etail;
+	u16 rsize;
+	u8 updegr;
+	u8 rcv_flags;
+	u8 etype;
+};
+
+static inline bool has_sc4_bit(struct hfi1_packet *p)
+{
+	return !!rhf_dc_info(p->rhf);
+}
+
+/*
+ * Private data for snoop/capture support.
+ */
+struct hfi1_snoop_data {
+	int mode_flag;
+	struct cdev cdev;
+	struct device *class_dev;
+	spinlock_t snoop_lock;
+	struct list_head queue;
+	wait_queue_head_t waitq;
+	void *filter_value;
+	int (*filter_callback)(void *hdr, void *data, void *value);
+	u64 dcc_cfg; /* saved value of DCC Cfg register */
+};
+
+/* snoop mode_flag values */
+#define HFI1_PORT_SNOOP_MODE     1U
+#define HFI1_PORT_CAPTURE_MODE   2U
+
+struct hfi1_sge_state;
+
+/*
+ * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
+ * Mostly for MADs that set or query link parameters, also ipath
+ * config interfaces
+ */
+#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
+#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
+#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
+#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
+#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
+#define HFI1_IB_CFG_SPD 5 /* current Link spd */
+#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
+#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
+#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
+#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
+#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
+#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
+#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
+#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
+#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
+#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
+#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
+#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
+#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
+#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
+#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
+
+/*
+ * HFI or Host Link States
+ *
+ * These describe the states the driver thinks the logical and physical
+ * states are in.  Used as an argument to set_link_state().  Implemented
+ * as bits for easy multi-state checking.  The actual state can only be
+ * one.
+ */
+#define __HLS_UP_INIT_BP	0
+#define __HLS_UP_ARMED_BP	1
+#define __HLS_UP_ACTIVE_BP	2
+#define __HLS_DN_DOWNDEF_BP	3	/* link down default */
+#define __HLS_DN_POLL_BP	4
+#define __HLS_DN_DISABLE_BP	5
+#define __HLS_DN_OFFLINE_BP	6
+#define __HLS_VERIFY_CAP_BP	7
+#define __HLS_GOING_UP_BP	8
+#define __HLS_GOING_OFFLINE_BP  9
+#define __HLS_LINK_COOLDOWN_BP 10
+
+#define HLS_UP_INIT	  (1 << __HLS_UP_INIT_BP)
+#define HLS_UP_ARMED	  (1 << __HLS_UP_ARMED_BP)
+#define HLS_UP_ACTIVE	  (1 << __HLS_UP_ACTIVE_BP)
+#define HLS_DN_DOWNDEF	  (1 << __HLS_DN_DOWNDEF_BP) /* link down default */
+#define HLS_DN_POLL	  (1 << __HLS_DN_POLL_BP)
+#define HLS_DN_DISABLE	  (1 << __HLS_DN_DISABLE_BP)
+#define HLS_DN_OFFLINE	  (1 << __HLS_DN_OFFLINE_BP)
+#define HLS_VERIFY_CAP	  (1 << __HLS_VERIFY_CAP_BP)
+#define HLS_GOING_UP	  (1 << __HLS_GOING_UP_BP)
+#define HLS_GOING_OFFLINE (1 << __HLS_GOING_OFFLINE_BP)
+#define HLS_LINK_COOLDOWN (1 << __HLS_LINK_COOLDOWN_BP)
+
+#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
+
+/* use this MTU size if none other is given */
+#define HFI1_DEFAULT_ACTIVE_MTU 8192
+/* use this MTU size as the default maximum */
+#define HFI1_DEFAULT_MAX_MTU 8192
+/* default partition key */
+#define DEFAULT_PKEY 0xffff
+
+/*
+ * Possible fabric manager config parameters for fm_{get,set}_table()
+ */
+#define FM_TBL_VL_HIGH_ARB		1 /* Get/set VL high prio weights */
+#define FM_TBL_VL_LOW_ARB		2 /* Get/set VL low prio weights */
+#define FM_TBL_BUFFER_CONTROL		3 /* Get/set Buffer Control */
+#define FM_TBL_SC2VLNT			4 /* Get/set SC->VLnt */
+#define FM_TBL_VL_PREEMPT_ELEMS		5 /* Get (no set) VL preempt elems */
+#define FM_TBL_VL_PREEMPT_MATRIX	6 /* Get (no set) VL preempt matrix */
+
+/*
+ * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
+ * these are bits so they can be combined, e.g.
+ * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
+ */
+#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
+#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
+#define HFI1_RCVCTRL_CTXT_ENB 0x04
+#define HFI1_RCVCTRL_CTXT_DIS 0x08
+#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
+#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
+#define HFI1_RCVCTRL_PKEY_ENB 0x40  /* Note, default is enabled */
+#define HFI1_RCVCTRL_PKEY_DIS 0x80
+#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
+#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
+#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
+#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
+#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
+#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+
+/* partition enforcement flags */
+#define HFI1_PART_ENFORCE_IN	0x1
+#define HFI1_PART_ENFORCE_OUT	0x2
+
+/* how often we check for synthetic counter wrap around */
+#define SYNTH_CNT_TIME 2
+
+/* Counter flags */
+#define CNTR_NORMAL		0x0 /* Normal counters, just read register */
+#define CNTR_SYNTH		0x1 /* Synthetic counters, saturate at all 1s */
+#define CNTR_DISABLED		0x2 /* Disable this counter */
+#define CNTR_32BIT		0x4 /* Simulate 64 bits for this counter */
+#define CNTR_VL			0x8 /* Per VL counter */
+#define CNTR_INVALID_VL		-1  /* Specifies invalid VL */
+#define CNTR_MODE_W		0x0
+#define CNTR_MODE_R		0x1
+
+/* VLs Supported/Operational */
+#define HFI1_MIN_VLS_SUPPORTED 1
+#define HFI1_MAX_VLS_SUPPORTED 8
+
+static inline void incr_cntr64(u64 *cntr)
+{
+	if (*cntr < (u64)-1LL)
+		(*cntr)++;
+}
+
+static inline void incr_cntr32(u32 *cntr)
+{
+	if (*cntr < (u32)-1LL)
+		(*cntr)++;
+}
+
+#define MAX_NAME_SIZE 64
+struct hfi1_msix_entry {
+	struct msix_entry msix;
+	void *arg;
+	char name[MAX_NAME_SIZE];
+	cpumask_var_t mask;
+};
+
+/* per-SL CCA information */
+struct cca_timer {
+	struct hrtimer hrtimer;
+	struct hfi1_pportdata *ppd; /* read-only */
+	int sl; /* read-only */
+	u16 ccti; /* read/write - current value of CCTI */
+};
+
+struct link_down_reason {
+	/*
+	 * SMA-facing value.  Should be set from .latest when
+	 * HLS_UP_* -> HLS_DN_* transition actually occurs.
+	 */
+	u8 sma;
+	u8 latest;
+};
+
+enum {
+	LO_PRIO_TABLE,
+	HI_PRIO_TABLE,
+	MAX_PRIO_TABLE
+};
+
+struct vl_arb_cache {
+	spinlock_t lock;
+	struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
+};
+
+/*
+ * The structure below encapsulates data relevant to a physical IB Port.
+ * Current chips support only one such port, but the separation
+ * clarifies things a bit. Note that to conform to IB conventions,
+ * port-numbers are one-based. The first or only port is port1.
+ */
+struct hfi1_pportdata {
+	struct hfi1_ibport ibport_data;
+
+	struct hfi1_devdata *dd;
+	struct kobject pport_cc_kobj;
+	struct kobject sc2vl_kobj;
+	struct kobject sl2sc_kobj;
+	struct kobject vl2mtu_kobj;
+
+	/* QSFP support */
+	struct qsfp_data qsfp_info;
+
+	/* GUID for this interface, in host order */
+	u64 guid;
+	/* GUID for peer interface, in host order */
+	u64 neighbor_guid;
+
+	/* up or down physical link state */
+	u32 linkup;
+
+	/*
+	 * this address is mapped read-only into user processes so they can
+	 * get status cheaply, whenever they want.  One qword of status per port
+	 */
+	u64 *statusp;
+
+	/* SendDMA related entries */
+
+	struct workqueue_struct *hfi1_wq;
+
+	/* move out of interrupt context */
+	struct work_struct link_vc_work;
+	struct work_struct link_up_work;
+	struct work_struct link_down_work;
+	struct work_struct sma_message_work;
+	struct work_struct freeze_work;
+	struct work_struct link_downgrade_work;
+	struct work_struct link_bounce_work;
+	/* host link state variables */
+	struct mutex hls_lock;
+	u32 host_link_state;
+
+	spinlock_t            sdma_alllock ____cacheline_aligned_in_smp;
+
+	u32 lstate;	/* logical link state */
+
+	/* these are the "32 bit" regs */
+
+	u32 ibmtu; /* The MTU programmed for this unit */
+	/*
+	 * Current max size IB packet (in bytes) including IB headers, that
+	 * we can send. Changes when ibmtu changes.
+	 */
+	u32 ibmaxlen;
+	u32 current_egress_rate; /* units [10^6 bits/sec] */
+	/* LID programmed for this instance */
+	u16 lid;
+	/* list of pkeys programmed; 0 if not set */
+	u16 pkeys[MAX_PKEY_VALUES];
+	u16 link_width_supported;
+	u16 link_width_downgrade_supported;
+	u16 link_speed_supported;
+	u16 link_width_enabled;
+	u16 link_width_downgrade_enabled;
+	u16 link_speed_enabled;
+	u16 link_width_active;
+	u16 link_width_downgrade_tx_active;
+	u16 link_width_downgrade_rx_active;
+	u16 link_speed_active;
+	u8 vls_supported;
+	u8 vls_operational;
+	/* LID mask control */
+	u8 lmc;
+	/* Rx Polarity inversion (compensate for ~tx on partner) */
+	u8 rx_pol_inv;
+
+	u8 hw_pidx;     /* physical port index */
+	u8 port;        /* IB port number and index into dd->pports - 1 */
+	/* type of neighbor node */
+	u8 neighbor_type;
+	u8 neighbor_normal;
+	u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
+	u8 neighbor_port_number;
+	u8 is_sm_config_started;
+	u8 offline_disabled_reason;
+	u8 is_active_optimize_enabled;
+	u8 driver_link_ready;	/* driver ready for active link */
+	u8 link_enabled;	/* link enabled? */
+	u8 linkinit_reason;
+	u8 local_tx_rate;	/* rate given to 8051 firmware */
+
+	/* placeholders for IB MAD packet settings */
+	u8 overrun_threshold;
+	u8 phy_error_threshold;
+
+	/* used to override LED behavior */
+	u8 led_override;  /* Substituted for normal value, if non-zero */
+	u16 led_override_timeoff; /* delta to next timer event */
+	u8 led_override_vals[2]; /* Alternates per blink-frame */
+	u8 led_override_phase; /* Just counts, LSB picks from vals[] */
+	atomic_t led_override_timer_active;
+	/* Used to flash LEDs in override mode */
+	struct timer_list led_override_timer;
+	u32 sm_trap_qp;
+	u32 sa_qp;
+
+	/*
+	 * cca_timer_lock protects access to the per-SL cca_timer
+	 * structures (specifically the ccti member).
+	 */
+	spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
+	struct cca_timer cca_timer[OPA_MAX_SLS];
+
+	/* List of congestion control table entries */
+	struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
+
+	/* congestion entries, each entry corresponding to a SL */
+	struct opa_congestion_setting_entry_shadow
+		congestion_entries[OPA_MAX_SLS];
+
+	/*
+	 * cc_state_lock protects (write) access to the per-port
+	 * struct cc_state.
+	 */
+	spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
+
+	struct cc_state __rcu *cc_state;
+
+	/* Total number of congestion control table entries */
+	u16 total_cct_entry;
+
+	/* Bit map identifying service level */
+	u32 cc_sl_control_map;
+
+	/* CA's max number of 64 entry units in the congestion control table */
+	u8 cc_max_table_entries;
+
+	/* begin congestion log related entries
+	 * cc_log_lock protects all congestion log related data */
+	spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
+	u8 threshold_cong_event_map[OPA_MAX_SLS/8];
+	u16 threshold_event_counter;
+	struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
+	int cc_log_idx; /* index for logging events */
+	int cc_mad_idx; /* index for reporting events */
+	/* end congestion log related entries */
+
+	struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
+
+	/* port relative counter buffer */
+	u64 *cntrs;
+	/* port relative synthetic counter buffer */
+	u64 *scntrs;
+	/* we synthesize port_xmit_discards from several egress errors */
+	u64 port_xmit_discards;
+	u64 port_xmit_constraint_errors;
+	u64 port_rcv_constraint_errors;
+	/* count of 'link_err' interrupts from DC */
+	u64 link_downed;
+	/* number of times link retrained successfully */
+	u64 link_up;
+	/* port_ltp_crc_mode is returned in 'portinfo' MADs */
+	u16 port_ltp_crc_mode;
+	/* port_crc_mode_enabled is the crc we support */
+	u8 port_crc_mode_enabled;
+	/* mgmt_allowed is also returned in 'portinfo' MADs */
+	u8 mgmt_allowed;
+	u8 part_enforce; /* partition enforcement flags */
+	struct link_down_reason local_link_down_reason;
+	struct link_down_reason neigh_link_down_reason;
+	/* Value to be sent to link peer on LinkDown .*/
+	u8 remote_link_down_reason;
+	/* Error events that will cause a port bounce. */
+	u32 port_error_action;
+};
+
+typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+typedef void (*opcode_handler)(struct hfi1_packet *packet);
+
+/* return values for the RHF receive functions */
+#define RHF_RCV_CONTINUE  0	/* keep going */
+#define RHF_RCV_DONE	  1	/* stop, this packet processed */
+#define RHF_RCV_REPROCESS 2	/* stop. retain this packet */
+
+struct rcv_array_data {
+	u8 group_size;
+	u16 ngroups;
+	u16 nctxt_extra;
+};
+
+struct per_vl_data {
+	u16 mtu;
+	struct send_context *sc;
+};
+
+/* 16 to directly index */
+#define PER_VL_SEND_CONTEXTS 16
+
+struct err_info_rcvport {
+	u8 status_and_code;
+	u64 packet_flit1;
+	u64 packet_flit2;
+};
+
+struct err_info_constraint {
+	u8 status;
+	u16 pkey;
+	u32 slid;
+};
+
+struct hfi1_temp {
+	unsigned int curr;       /* current temperature */
+	unsigned int lo_lim;     /* low temperature limit */
+	unsigned int hi_lim;     /* high temperature limit */
+	unsigned int crit_lim;   /* critical temperature limit */
+	u8 triggers;      /* temperature triggers */
+};
+
+/* device data struct now contains only "general per-device" info.
+ * fields related to a physical IB port are in a hfi1_pportdata struct.
+ */
+struct sdma_engine;
+struct sdma_vl_map;
+
+#define BOARD_VERS_MAX 96 /* how long the version string can be */
+#define SERIAL_MAX 16 /* length of the serial number */
+
+struct hfi1_devdata {
+	struct hfi1_ibdev verbs_dev;     /* must be first */
+	struct list_head list;
+	/* pointers to related structs for this device */
+	/* pci access data structure */
+	struct pci_dev *pcidev;
+	struct cdev user_cdev;
+	struct cdev diag_cdev;
+	struct cdev ui_cdev;
+	struct device *user_device;
+	struct device *diag_device;
+	struct device *ui_device;
+
+	/* mem-mapped pointer to base of chip regs */
+	u8 __iomem *kregbase;
+	/* end of mem-mapped chip space excluding sendbuf and user regs */
+	u8 __iomem *kregend;
+	/* physical address of chip for io_remap, etc. */
+	resource_size_t physaddr;
+	/* receive context data */
+	struct hfi1_ctxtdata **rcd;
+	/* send context data */
+	struct send_context_info *send_contexts;
+	/* map hardware send contexts to software index */
+	u8 *hw_to_sw;
+	/* spinlock for allocating and releasing send context resources */
+	spinlock_t sc_lock;
+	/* Per VL data. Enough for all VLs but not all elements are set/used. */
+	struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
+	/* seqlock for sc2vl */
+	seqlock_t sc2vl_lock;
+	u64 sc2vl[4];
+	/* Send Context initialization lock. */
+	spinlock_t sc_init_lock;
+
+	/* fields common to all SDMA engines */
+
+	/* default flags to last descriptor */
+	u64 default_desc1;
+	volatile __le64                    *sdma_heads_dma; /* DMA'ed by chip */
+	dma_addr_t                          sdma_heads_phys;
+	void                               *sdma_pad_dma; /* DMA'ed by chip */
+	dma_addr_t                          sdma_pad_phys;
+	/* for deallocation */
+	size_t                              sdma_heads_size;
+	/* number from the chip */
+	u32                                 chip_sdma_engines;
+	/* num used */
+	u32                                 num_sdma;
+	/* lock for sdma_map */
+	spinlock_t                          sde_map_lock;
+	/* array of engines sized by num_sdma */
+	struct sdma_engine                 *per_sdma;
+	/* array of vl maps */
+	struct sdma_vl_map __rcu           *sdma_map;
+	/* SPC freeze waitqueue and variable */
+	wait_queue_head_t		  sdma_unfreeze_wq;
+	atomic_t			  sdma_unfreeze_count;
+
+
+	/* hfi1_pportdata, points to array of (physical) port-specific
+	 * data structs, indexed by pidx (0..n-1)
+	 */
+	struct hfi1_pportdata *pport;
+
+	/* mem-mapped pointer to base of PIO buffers */
+	void __iomem *piobase;
+	/*
+	 * write-combining mem-mapped pointer to base of RcvArray
+	 * memory.
+	 */
+	void __iomem *rcvarray_wc;
+	/*
+	 * credit return base - a per-NUMA range of DMA address that
+	 * the chip will use to update the per-context free counter
+	 */
+	struct credit_return_base *cr_base;
+
+	/* send context numbers and sizes for each type */
+	struct sc_config_sizes sc_sizes[SC_MAX];
+
+	u32 lcb_access_count;		/* count of LCB users */
+
+	char *boardname; /* human readable board info */
+
+	/* device (not port) flags, basically device capabilities */
+	u32 flags;
+
+	/* reset value */
+	u64 z_int_counter;
+	u64 z_rcv_limit;
+	/* percpu int_counter */
+	u64 __percpu *int_counter;
+	u64 __percpu *rcv_limit;
+
+	/* number of receive contexts in use by the driver */
+	u32 num_rcv_contexts;
+	/* number of pio send contexts in use by the driver */
+	u32 num_send_contexts;
+	/*
+	 * number of ctxts available for PSM open
+	 */
+	u32 freectxts;
+	/* base receive interrupt timeout, in CSR units */
+	u32 rcv_intr_timeout_csr;
+
+	u64 __iomem *egrtidbase;
+	spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
+	spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
+	/* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+	spinlock_t uctxt_lock; /* rcd and user context changes */
+	/* exclusive access to 8051 */
+	spinlock_t dc8051_lock;
+	/* exclusive access to 8051 memory */
+	spinlock_t dc8051_memlock;
+	int dc8051_timed_out;	/* remember if the 8051 timed out */
+	/*
+	 * A page that will hold event notification bitmaps for all
+	 * contexts. This page will be mapped into all processes.
+	 */
+	unsigned long *events;
+	/*
+	 * per unit status, see also portdata statusp
+	 * mapped read-only into user processes so they can get unit and
+	 * IB link status cheaply
+	 */
+	struct hfi1_status *status;
+	u32 freezelen; /* max length of freezemsg */
+
+	/* revision register shadow */
+	u64 revision;
+	/* Base GUID for device (network order) */
+	u64 base_guid;
+
+	/* these are the "32 bit" regs */
+
+	/* value we put in kr_rcvhdrsize */
+	u32 rcvhdrsize;
+	/* number of receive contexts the chip supports */
+	u32 chip_rcv_contexts;
+	/* number of receive array entries */
+	u32 chip_rcv_array_count;
+	/* number of PIO send contexts the chip supports */
+	u32 chip_send_contexts;
+	/* number of bytes in the PIO memory buffer */
+	u32 chip_pio_mem_size;
+	/* number of bytes in the SDMA memory buffer */
+	u32 chip_sdma_mem_size;
+
+	/* size of each rcvegrbuffer */
+	u32 rcvegrbufsize;
+	/* log2 of above */
+	u16 rcvegrbufsize_shift;
+	/* both sides of the PCIe link are gen3 capable */
+	u8 link_gen3_capable;
+	/* localbus width (1, 2,4,8,16,32) from config space  */
+	u32 lbus_width;
+	/* localbus speed in MHz */
+	u32 lbus_speed;
+	int unit; /* unit # of this chip */
+	int node; /* home node of this chip */
+
+	/* save these PCI fields to restore after a reset */
+	u32 pcibar0;
+	u32 pcibar1;
+	u32 pci_rom;
+	u16 pci_command;
+	u16 pcie_devctl;
+	u16 pcie_lnkctl;
+	u16 pcie_devctl2;
+	u32 pci_msix0;
+	u32 pci_lnkctl3;
+	u32 pci_tph2;
+
+	/*
+	 * ASCII serial number, from flash, large enough for original
+	 * all digit strings, and longer serial number format
+	 */
+	u8 serial[SERIAL_MAX];
+	/* human readable board version */
+	u8 boardversion[BOARD_VERS_MAX];
+	u8 lbus_info[32]; /* human readable localbus info */
+	/* chip major rev, from CceRevision */
+	u8 majrev;
+	/* chip minor rev, from CceRevision */
+	u8 minrev;
+	/* hardware ID */
+	u8 hfi1_id;
+	/* implementation code */
+	u8 icode;
+	/* default link down value (poll/sleep) */
+	u8 link_default;
+	/* vAU of this device */
+	u8 vau;
+	/* vCU of this device */
+	u8 vcu;
+	/* link credits of this device */
+	u16 link_credits;
+	/* initial vl15 credits to use */
+	u16 vl15_init;
+
+	/* Misc small ints */
+	/* Number of physical ports available */
+	u8 num_pports;
+	/* Lowest context number which can be used by user processes */
+	u8 first_user_ctxt;
+	u8 n_krcv_queues;
+	u8 qos_shift;
+	u8 qpn_mask;
+
+	u16 rhf_offset; /* offset of RHF within receive header entry */
+	u16 irev;	/* implementation revision */
+	u16 dc8051_ver; /* 8051 firmware version */
+
+	struct platform_config_cache pcfg_cache;
+	/* control high-level access to qsfp */
+	struct mutex qsfp_i2c_mutex;
+
+	struct diag_client *diag_client;
+	spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
+
+	u8 psxmitwait_supported;
+	/* cycle length of PS* counters in HW (in picoseconds) */
+	u16 psxmitwait_check_rate;
+	/* high volume overflow errors deferred to tasklet */
+	struct tasklet_struct error_tasklet;
+	/* per device cq worker */
+	struct kthread_worker *worker;
+
+	/* MSI-X information */
+	struct hfi1_msix_entry *msix_entries;
+	u32 num_msix_entries;
+
+	/* INTx information */
+	u32 requested_intx_irq;		/* did we request one? */
+	char intx_name[MAX_NAME_SIZE];	/* INTx name */
+
+	/* general interrupt: mask of handled interrupts */
+	u64 gi_mask[CCE_NUM_INT_CSRS];
+
+	struct rcv_array_data rcv_entries;
+
+	/*
+	 * 64 bit synthetic counters
+	 */
+	struct timer_list synth_stats_timer;
+
+	/*
+	 * device counters
+	 */
+	char *cntrnames;
+	size_t cntrnameslen;
+	size_t ndevcntrs;
+	u64 *cntrs;
+	u64 *scntrs;
+
+	/*
+	 * remembered values for synthetic counters
+	 */
+	u64 last_tx;
+	u64 last_rx;
+
+	/*
+	 * per-port counters
+	 */
+	size_t nportcntrs;
+	char *portcntrnames;
+	size_t portcntrnameslen;
+
+	struct hfi1_snoop_data hfi1_snoop;
+
+	struct err_info_rcvport err_info_rcvport;
+	struct err_info_constraint err_info_rcv_constraint;
+	struct err_info_constraint err_info_xmit_constraint;
+	u8 err_info_uncorrectable;
+	u8 err_info_fmconfig;
+
+	atomic_t drop_packet;
+	u8 do_drop;
+
+	/* receive interrupt functions */
+	rhf_rcv_function_ptr *rhf_rcv_function_map;
+	rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
+
+	/*
+	 * Handlers for outgoing data so that snoop/capture does not
+	 * have to have its hooks in the send path
+	 */
+	int (*process_pio_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+				u32 hdrwords, struct hfi1_sge_state *ss,
+				u32 len, u32 plen, u32 dwords, u64 pbc);
+	int (*process_dma_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+				u32 hdrwords, struct hfi1_sge_state *ss,
+				u32 len, u32 plen, u32 dwords, u64 pbc);
+	void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+				u64 pbc, const void *from, size_t count);
+
+	/* OUI comes from the HW. Used everywhere as 3 separate bytes. */
+	u8 oui1;
+	u8 oui2;
+	u8 oui3;
+	/* Timer and counter used to detect RcvBufOvflCnt changes */
+	struct timer_list rcverr_timer;
+	u32 rcv_ovfl_cnt;
+
+	int assigned_node_id;
+	wait_queue_head_t event_queue;
+
+	/* Save the enabled LCB error bits */
+	u64 lcb_err_en;
+	u8 dc_shutdown;
+};
+
+/* 8051 firmware version helper */
+#define dc8051_ver(a, b) ((a) << 8 | (b))
+
+/* f_put_tid types */
+#define PT_EXPECTED 0
+#define PT_EAGER    1
+#define PT_INVALID  2
+
+/* Private data for file operations */
+struct hfi1_filedata {
+	struct hfi1_ctxtdata *uctxt;
+	unsigned subctxt;
+	struct hfi1_user_sdma_comp_q *cq;
+	struct hfi1_user_sdma_pkt_q *pq;
+	/* for cpu affinity; -1 if none */
+	int rec_cpu_num;
+};
+
+extern struct list_head hfi1_dev_list;
+extern spinlock_t hfi1_devs_lock;
+struct hfi1_devdata *hfi1_lookup(int unit);
+extern u32 hfi1_cpulist_count;
+extern unsigned long *hfi1_cpulist;
+
+extern unsigned int snoop_drop_send;
+extern unsigned int snoop_force_capture;
+int hfi1_init(struct hfi1_devdata *, int);
+int hfi1_count_units(int *npresentp, int *nupp);
+int hfi1_count_active_units(void);
+
+int hfi1_diag_add(struct hfi1_devdata *);
+void hfi1_diag_remove(struct hfi1_devdata *);
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
+
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
+
+int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
+int hfi1_create_ctxts(struct hfi1_devdata *dd);
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32);
+void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
+			 struct hfi1_devdata *, u8, u8);
+void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+
+void handle_receive_interrupt(struct hfi1_ctxtdata *);
+void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd);
+void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd);
+int hfi1_reset_device(int);
+
+/* return the driver's idea of the logical OPA port state */
+static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
+{
+	return ppd->lstate; /* use the cached value */
+}
+
+static inline u16 generate_jkey(kuid_t uid)
+{
+	return from_kuid(current_user_ns(), uid) & 0xffff;
+}
+
+/*
+ * active_egress_rate
+ *
+ * returns the active egress rate in units of [10^6 bits/sec]
+ */
+static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
+{
+	u16 link_speed = ppd->link_speed_active;
+	u16 link_width = ppd->link_width_active;
+	u32 egress_rate;
+
+	if (link_speed == OPA_LINK_SPEED_25G)
+		egress_rate = 25000;
+	else /* assume OPA_LINK_SPEED_12_5G */
+		egress_rate = 12500;
+
+	switch (link_width) {
+	case OPA_LINK_WIDTH_4X:
+		egress_rate *= 4;
+		break;
+	case OPA_LINK_WIDTH_3X:
+		egress_rate *= 3;
+		break;
+	case OPA_LINK_WIDTH_2X:
+		egress_rate *= 2;
+		break;
+	default:
+		/* assume IB_WIDTH_1X */
+		break;
+	}
+
+	return egress_rate;
+}
+
+/*
+ * egress_cycles
+ *
+ * Returns the number of 'fabric clock cycles' to egress a packet
+ * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
+ * rate is (approximately) 805 MHz, the units of the returned value
+ * are (1/805 MHz).
+ */
+static inline u32 egress_cycles(u32 len, u32 rate)
+{
+	u32 cycles;
+
+	/*
+	 * cycles is:
+	 *
+	 *          (length) [bits] / (rate) [bits/sec]
+	 *  ---------------------------------------------------
+	 *  fabric_clock_period == 1 /(805 * 10^6) [cycles/sec]
+	 */
+
+	cycles = len * 8; /* bits */
+	cycles *= 805;
+	cycles /= rate;
+
+	return cycles;
+}
+
+void set_link_ipg(struct hfi1_pportdata *ppd);
+void process_becn(struct hfi1_pportdata *ppd, u8 sl,  u16 rlid, u32 lqpn,
+		  u32 rqpn, u8 svc_type);
+void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn,
+		u32 pkey, u32 slid, u32 dlid, u8 sc5,
+		const struct ib_grh *old_grh);
+
+#define PACKET_EGRESS_TIMEOUT 350
+static inline void pause_for_credit_return(struct hfi1_devdata *dd)
+{
+	/* Pause at least 1us, to ensure chip returns all credits */
+	u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
+
+	udelay(usec ? usec : 1);
+}
+
+/**
+ * sc_to_vlt() reverse lookup sc to vl
+ * @dd - devdata
+ * @sc5 - 5 bit sc
+ */
+static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
+{
+	unsigned seq;
+	u8 rval;
+
+	if (sc5 >= OPA_MAX_SCS)
+		return (u8)(0xff);
+
+	do {
+		seq = read_seqbegin(&dd->sc2vl_lock);
+		rval = *(((u8 *)dd->sc2vl) + sc5);
+	} while (read_seqretry(&dd->sc2vl_lock, seq));
+
+	return rval;
+}
+
+#define PKEY_MEMBER_MASK 0x8000
+#define PKEY_LOW_15_MASK 0x7fff
+
+/*
+ * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for ingress partition keys
+ * specified in the OPAv1 spec., section 9.10.14.
+ */
+static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+	u16 mkey = pkey & PKEY_LOW_15_MASK;
+	u16 ment = ent & PKEY_LOW_15_MASK;
+
+	if (mkey == ment) {
+		/*
+		 * If pkey[15] is clear (limited partition member),
+		 * is bit 15 in the corresponding table element
+		 * clear (limited member)?
+		 */
+		if (!(pkey & PKEY_MEMBER_MASK))
+			return !!(ent & PKEY_MEMBER_MASK);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * ingress_pkey_table_search - search the entire pkey table for
+ * an entry which matches 'pkey'. return 0 if a match is found,
+ * and 1 otherwise.
+ */
+static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
+{
+	int i;
+
+	for (i = 0; i < MAX_PKEY_VALUES; i++) {
+		if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * ingress_pkey_table_fail - record a failure of ingress pkey validation,
+ * i.e., increment port_rcv_constraint_errors for the port, and record
+ * the 'error info' for this failure.
+ */
+static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
+				    u16 slid)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+
+	incr_cntr64(&ppd->port_rcv_constraint_errors);
+	if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
+		dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
+		dd->err_info_rcv_constraint.slid = slid;
+		dd->err_info_rcv_constraint.pkey = pkey;
+	}
+}
+
+/*
+ * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
+ * is a hint as to the best place in the partition key table to begin
+ * searching. This function should not be called on the data path because
+ * of performance reasons. On datapath pkey check is expected to be done
+ * by HW and rcv_pkey_check function should be called instead.
+ */
+static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+				     u8 sc5, u8 idx, u16 slid)
+{
+	if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+		return 0;
+
+	/* If SC15, pkey[0:14] must be 0x7fff */
+	if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+		goto bad;
+
+	/* Is the pkey = 0x0, or 0x8000? */
+	if ((pkey & PKEY_LOW_15_MASK) == 0)
+		goto bad;
+
+	/* The most likely matching pkey has index 'idx' */
+	if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
+		return 0;
+
+	/* no match - try the whole table */
+	if (!ingress_pkey_table_search(ppd, pkey))
+		return 0;
+
+bad:
+	ingress_pkey_table_fail(ppd, pkey, slid);
+	return 1;
+}
+
+/*
+ * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. It only ensures pkey is vlid for QP0. This function
+ * should be called on the data path instead of ingress_pkey_check
+ * as on data path, pkey check is done by HW (except for QP0).
+ */
+static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+				 u8 sc5, u16 slid)
+{
+	if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+		return 0;
+
+	/* If SC15, pkey[0:14] must be 0x7fff */
+	if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+		goto bad;
+
+	return 0;
+bad:
+	ingress_pkey_table_fail(ppd, pkey, slid);
+	return 1;
+}
+
+/* MTU handling */
+
+/* MTU enumeration, 256-4k match IB */
+#define OPA_MTU_0     0
+#define OPA_MTU_256   1
+#define OPA_MTU_512   2
+#define OPA_MTU_1024  3
+#define OPA_MTU_2048  4
+#define OPA_MTU_4096  5
+
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
+int mtu_to_enum(u32 mtu, int default_if_bad);
+u16 enum_to_mtu(int);
+static inline int valid_ib_mtu(unsigned int mtu)
+{
+	return mtu == 256 || mtu == 512 ||
+		mtu == 1024 || mtu == 2048 ||
+		mtu == 4096;
+}
+static inline int valid_opa_max_mtu(unsigned int mtu)
+{
+	return mtu >= 2048 &&
+		(valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240);
+}
+
+int set_mtu(struct hfi1_pportdata *);
+
+int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
+void hfi1_disable_after_error(struct hfi1_devdata *);
+int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
+int hfi1_rcvbuf_validate(u32, u8, u16 *);
+
+int fm_get_table(struct hfi1_pportdata *, int, void *);
+int fm_set_table(struct hfi1_pportdata *, int, void *);
+
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
+void reset_link_credits(struct hfi1_devdata *dd);
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
+
+int snoop_recv_handler(struct hfi1_packet *packet);
+int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+			   u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+			   u32 plen, u32 dwords, u64 pbc);
+int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+			   u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+			   u32 plen, u32 dwords, u64 pbc);
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+			   u64 pbc, const void *from, size_t count);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define ctxt_fp(fp) \
+	(((struct hfi1_filedata *)(fp)->private_data)->uctxt)
+#define subctxt_fp(fp) \
+	(((struct hfi1_filedata *)(fp)->private_data)->subctxt)
+#define tidcursor_fp(fp) \
+	(((struct hfi1_filedata *)(fp)->private_data)->tidcursor)
+#define user_sdma_pkt_fp(fp) \
+	(((struct hfi1_filedata *)(fp)->private_data)->pq)
+#define user_sdma_comp_fp(fp) \
+	(((struct hfi1_filedata *)(fp)->private_data)->cq)
+
+static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
+{
+	return ppd->dd;
+}
+
+static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
+{
+	return container_of(dev, struct hfi1_devdata, verbs_dev);
+}
+
+static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
+{
+	return dd_from_dev(to_idev(ibdev));
+}
+
+static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
+{
+	return container_of(ibp, struct hfi1_pportdata, ibport_data);
+}
+
+static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+	WARN_ON(pidx >= dd->num_pports);
+	return &dd->pport[pidx].ibport_data;
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ */
+static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
+{
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u16 ret;
+
+	if (index >= ARRAY_SIZE(ppd->pkeys))
+		ret = 0;
+	else
+		ret = ppd->pkeys[index];
+
+	return ret;
+}
+
+/*
+ * Readers of cc_state must call get_cc_state() under rcu_read_lock().
+ * Writers of cc_state must call get_cc_state() under cc_state_lock.
+ */
+static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
+{
+	return rcu_dereference(ppd->cc_state);
+}
+
+/*
+ * values for dd->flags (_device_ related flags)
+ */
+#define HFI1_INITTED           0x1    /* chip and driver up and initted */
+#define HFI1_PRESENT           0x2    /* chip accesses can be done */
+#define HFI1_FROZEN            0x4    /* chip in SPC freeze */
+#define HFI1_HAS_SDMA_TIMEOUT  0x8
+#define HFI1_HAS_SEND_DMA      0x10   /* Supports Send DMA */
+#define HFI1_FORCED_FREEZE     0x80   /* driver forced freeze mode */
+#define HFI1_DO_INIT_ASIC      0x100  /* This device will init the ASIC */
+
+/* IB dword length mask in PBC (lower 11 bits); same for all chips */
+#define HFI1_PBC_LENGTH_MASK                     ((1 << 11) - 1)
+
+
+/* ctxt_flag bit offsets */
+		/* context has been setup */
+#define HFI1_CTXT_SETUP_DONE 1
+		/* waiting for a packet to arrive */
+#define HFI1_CTXT_WAITING_RCV   2
+		/* master has not finished initializing */
+#define HFI1_CTXT_MASTER_UNINIT 4
+		/* waiting for an urgent packet to arrive */
+#define HFI1_CTXT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
+				  const struct pci_device_id *);
+void hfi1_free_devdata(struct hfi1_devdata *);
+void cc_state_reclaim(struct rcu_head *rcu);
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define HFI1_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define HFI1_LED_LOG 2  /* Logical (link) YELLOW LED */
+void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val);
+
+#define HFI1_CREDIT_RETURN_RATE (100)
+
+/*
+ * The number of words for the KDETH protocol field.  If this is
+ * larger then the actual field used, then part of the payload
+ * will be in the header.
+ *
+ * Optimally, we want this sized so that a typical case will
+ * use full cache lines.  The typical local KDETH header would
+ * be:
+ *
+ *	Bytes	Field
+ *	  8	LRH
+ *	 12	BHT
+ *	 ??	KDETH
+ *	  8	RHF
+ *	---
+ *	 28 + KDETH
+ *
+ * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
+ */
+#define DEFAULT_RCVHDRSIZE 9
+
+/*
+ * Maximal header byte count:
+ *
+ *	Bytes	Field
+ *	  8	LRH
+ *	 40	GRH (optional)
+ *	 12	BTH
+ *	 ??	KDETH
+ *	  8	RHF
+ *	---
+ *	 68 + KDETH
+ *
+ * We also want to maintain a cache line alignment to assist DMA'ing
+ * of the header bytes.  Round up to a good size.
+ */
+#define DEFAULT_RCVHDR_ENTSIZE 32
+
+int hfi1_get_user_pages(unsigned long, size_t, struct page **);
+void hfi1_release_user_pages(struct page **, size_t);
+
+static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+	*((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+	/*
+	 * volatile because it's a DMA target from the chip, routine is
+	 * inlined, and don't want register caching or reordering.
+	 */
+	return (u32) le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
+}
+
+/*
+ * sysfs interface.
+ */
+
+extern const char ib_hfi1_version[];
+
+int hfi1_device_create(struct hfi1_devdata *);
+void hfi1_device_remove(struct hfi1_devdata *);
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+			   struct kobject *kobj);
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
+/* Hook for sysfs read of QSFP */
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
+
+int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
+void hfi1_pcie_cleanup(struct pci_dev *);
+int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
+		     const struct pci_device_id *);
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
+void hfi1_pcie_flr(struct hfi1_devdata *);
+int pcie_speeds(struct hfi1_devdata *);
+void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
+void hfi1_enable_intx(struct pci_dev *);
+void hfi1_nomsix(struct hfi1_devdata *);
+void restore_pci_variables(struct hfi1_devdata *dd);
+int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+int parse_platform_config(struct hfi1_devdata *dd);
+int get_platform_config_field(struct hfi1_devdata *dd,
+			enum platform_config_table_type_encoding table_type,
+			int table_index, int field_index, u32 *data, u32 len);
+
+dma_addr_t hfi1_map_page(struct pci_dev *, struct page *, unsigned long,
+			 size_t, int);
+const char *get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+static inline void flush_wc(void)
+{
+	asm volatile("sfence" : : : "memory");
+}
+
+void handle_eflags(struct hfi1_packet *packet);
+int process_receive_ib(struct hfi1_packet *packet);
+int process_receive_bypass(struct hfi1_packet *packet);
+int process_receive_error(struct hfi1_packet *packet);
+int kdeth_process_expected(struct hfi1_packet *packet);
+int kdeth_process_eager(struct hfi1_packet *packet);
+int process_receive_invalid(struct hfi1_packet *packet);
+
+extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
+
+void update_sge(struct hfi1_sge_state *ss, u32 length);
+
+/* global module parameter variables */
+extern unsigned int hfi1_max_mtu;
+extern unsigned int hfi1_cu;
+extern unsigned int user_credit_return_threshold;
+extern uint num_rcv_contexts;
+extern unsigned n_krcvqs;
+extern u8 krcvqs[];
+extern int krcvqsset;
+extern uint kdeth_qp;
+extern uint loopback;
+extern uint quick_linkup;
+extern uint rcv_intr_timeout;
+extern uint rcv_intr_count;
+extern uint rcv_intr_dynamic;
+extern ushort link_crc_mask;
+
+extern struct mutex hfi1_mutex;
+
+/* Number of seconds before our card status check...  */
+#define STATUS_TIMEOUT 60
+
+#define DRIVER_NAME		"hfi1"
+#define HFI1_USER_MINOR_BASE     0
+#define HFI1_TRACE_MINOR         127
+#define HFI1_DIAGPKT_MINOR       128
+#define HFI1_DIAG_MINOR_BASE     129
+#define HFI1_SNOOP_CAPTURE_BASE  200
+#define HFI1_NMINORS             255
+
+#define PCI_VENDOR_ID_INTEL 0x8086
+#define PCI_DEVICE_ID_INTEL0 0x24f0
+#define PCI_DEVICE_ID_INTEL1 0x24f1
+
+#define HFI1_PKT_USER_SC_INTEGRITY					    \
+	(SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK	    \
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK		    \
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
+
+#define HFI1_PKT_KERNEL_SC_INTEGRITY					    \
+	(SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
+
+static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
+						  u16 ctxt_type)
+{
+	u64 base_sc_integrity =
+	SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
+	| SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+	| SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+	| SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
+	| SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
+	| SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+	| SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
+	| SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+	if (ctxt_type == SC_USER)
+		base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY;
+	else
+		base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
+
+	if (is_a0(dd))
+		/* turn off send-side job key checks - A0 erratum */
+		return base_sc_integrity &
+		       ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+	return base_sc_integrity;
+}
+
+static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
+{
+	u64 base_sdma_integrity =
+	SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+	| SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+	| SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+	| SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+	| SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+	| SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+	| SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+	| SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+	| SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
+	| SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+	| SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+	| SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
+	| SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
+	| SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+	| SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
+	| SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+	if (is_a0(dd))
+		/* turn off send-side job key checks - A0 erratum */
+		return base_sdma_integrity &
+		       ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+	return base_sdma_integrity;
+}
+
+/*
+ * hfi1_early_err is used (only!) to print early errors before devdata is
+ * allocated, or when dd->pcidev may not be valid, and at the tail end of
+ * cleanup when devdata may have been freed, etc.  hfi1_dev_porterr is
+ * the same as dd_dev_err, but is used when the message really needs
+ * the IB port# to be definitive as to what's happening..
+ */
+#define hfi1_early_err(dev, fmt, ...) \
+	dev_err(dev, fmt, ##__VA_ARGS__)
+
+#define hfi1_early_info(dev, fmt, ...) \
+	dev_info(dev, fmt, ##__VA_ARGS__)
+
+#define dd_dev_emerg(dd, fmt, ...) \
+	dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
+		  get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_err(dd, fmt, ...) \
+	dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
+			get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_warn(dd, fmt, ...) \
+	dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
+			get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_warn_ratelimited(dd, fmt, ...) \
+	dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
+			get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_info(dd, fmt, ...) \
+	dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
+			get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define hfi1_dev_porterr(dd, port, fmt, ...) \
+	dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
+			get_unit_name((dd)->unit), (dd)->unit, (port), \
+			##__VA_ARGS__)
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct hfi1_hwerror_msgs {
+	u64 mask;
+	const char *msg;
+	size_t sz;
+};
+
+/* in intr.c... */
+void hfi1_format_hwerrors(u64 hwerrs,
+			  const struct hfi1_hwerror_msgs *hwerrmsgs,
+			  size_t nhwerrmsgs, char *msg, size_t lmsg);
+
+#define USER_OPCODE_CHECK_VAL 0xC0
+#define USER_OPCODE_CHECK_MASK 0xC0
+#define OPCODE_CHECK_VAL_DISABLED 0x0
+#define OPCODE_CHECK_MASK_DISABLED 0x0
+
+static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd;
+	int i;
+
+	dd->z_int_counter = get_all_cpu_total(dd->int_counter);
+	dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
+
+	ppd = (struct hfi1_pportdata *)(dd + 1);
+	for (i = 0; i < dd->num_pports; i++, ppd++) {
+		ppd->ibport_data.z_rc_acks =
+			get_all_cpu_total(ppd->ibport_data.rc_acks);
+		ppd->ibport_data.z_rc_qacks =
+			get_all_cpu_total(ppd->ibport_data.rc_qacks);
+	}
+}
+
+/* Control LED state */
+static inline void setextled(struct hfi1_devdata *dd, u32 on)
+{
+	if (on)
+		write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
+	else
+		write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
+
+#endif                          /* _HFI1_KERNEL_H */
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
new file mode 100644
index 0000000..a877eda
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -0,0 +1,1722 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/hrtimer.h>
+
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "mad.h"
+#include "sdma.h"
+#include "debugfs.h"
+#include "verbs.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * min buffers we want to have per context, after driver
+ */
+#define HFI1_MIN_USER_CTXT_BUFCNT 7
+
+#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
+#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
+#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
+
+/*
+ * Number of user receive contexts we are configured to use (to allow for more
+ * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
+ */
+uint num_rcv_contexts;
+module_param_named(num_rcv_contexts, num_rcv_contexts, uint, S_IRUGO);
+MODULE_PARM_DESC(
+	num_rcv_contexts, "Set max number of user receive contexts to use");
+
+u8 krcvqs[RXE_NUM_DATA_VL];
+int krcvqsset;
+module_param_array(krcvqs, byte, &krcvqsset, S_IRUGO);
+MODULE_PARM_DESC(krcvqs, "Array of the number of kernel receive queues by VL");
+
+/* computed based on above array */
+unsigned n_krcvqs;
+
+static unsigned hfi1_rcvarr_split = 25;
+module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
+
+static uint eager_buffer_size = (2 << 20); /* 2MB */
+module_param(eager_buffer_size, uint, S_IRUGO);
+MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
+
+static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
+module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
+
+static uint hfi1_hdrq_entsize = 32;
+module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
+MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B");
+
+unsigned int user_credit_return_threshold = 33;	/* default is 33% */
+module_param(user_credit_return_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(user_credit_return_theshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
+
+static inline u64 encode_rcv_header_entry_size(u16);
+
+static struct idr hfi1_unit_table;
+u32 hfi1_cpulist_count;
+unsigned long *hfi1_cpulist;
+
+/*
+ * Common code for creating the receive context array.
+ */
+int hfi1_create_ctxts(struct hfi1_devdata *dd)
+{
+	unsigned i;
+	int ret;
+	int local_node_id = pcibus_to_node(dd->pcidev->bus);
+
+	if (local_node_id < 0)
+		local_node_id = numa_node_id();
+	dd->assigned_node_id = local_node_id;
+
+	dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL);
+	if (!dd->rcd) {
+		dd_dev_err(dd,
+			"Unable to allocate receive context array, failing\n");
+		goto nomem;
+	}
+
+	/* create one or more kernel contexts */
+	for (i = 0; i < dd->first_user_ctxt; ++i) {
+		struct hfi1_pportdata *ppd;
+		struct hfi1_ctxtdata *rcd;
+
+		ppd = dd->pport + (i % dd->num_pports);
+		rcd = hfi1_create_ctxtdata(ppd, i);
+		if (!rcd) {
+			dd_dev_err(dd,
+				"Unable to allocate kernel receive context, failing\n");
+			goto nomem;
+		}
+		/*
+		 * Set up the kernel context flags here and now because they
+		 * use default values for all receive side memories.  User
+		 * contexts will be handled as they are created.
+		 */
+		rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
+			HFI1_CAP_KGET(NODROP_RHQ_FULL) |
+			HFI1_CAP_KGET(NODROP_EGR_FULL) |
+			HFI1_CAP_KGET(DMA_RTAIL);
+		rcd->seq_cnt = 1;
+
+		rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
+		if (!rcd->sc) {
+			dd_dev_err(dd,
+				"Unable to allocate kernel send context, failing\n");
+			dd->rcd[rcd->ctxt] = NULL;
+			hfi1_free_ctxtdata(dd, rcd);
+			goto nomem;
+		}
+
+		ret = hfi1_init_ctxt(rcd->sc);
+		if (ret < 0) {
+			dd_dev_err(dd,
+				   "Failed to setup kernel receive context, failing\n");
+			sc_free(rcd->sc);
+			dd->rcd[rcd->ctxt] = NULL;
+			hfi1_free_ctxtdata(dd, rcd);
+			ret = -EFAULT;
+			goto bail;
+		}
+	}
+
+	return 0;
+nomem:
+	ret = -ENOMEM;
+bail:
+	kfree(dd->rcd);
+	dd->rcd = NULL;
+	return ret;
+}
+
+/*
+ * Common code for user and kernel context setup.
+ */
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	struct hfi1_ctxtdata *rcd;
+	unsigned kctxt_ngroups = 0;
+	u32 base;
+
+	if (dd->rcv_entries.nctxt_extra >
+	    dd->num_rcv_contexts - dd->first_user_ctxt)
+		kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
+				 (dd->num_rcv_contexts - dd->first_user_ctxt));
+	rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
+	if (rcd) {
+		u32 rcvtids, max_entries;
+
+		dd_dev_info(dd, "%s: setting up context %u\n", __func__, ctxt);
+
+		INIT_LIST_HEAD(&rcd->qp_wait_list);
+		rcd->ppd = ppd;
+		rcd->dd = dd;
+		rcd->cnt = 1;
+		rcd->ctxt = ctxt;
+		dd->rcd[ctxt] = rcd;
+		rcd->numa_id = numa_node_id();
+		rcd->rcv_array_groups = dd->rcv_entries.ngroups;
+
+		spin_lock_init(&rcd->exp_lock);
+
+		/*
+		 * Calculate the context's RcvArray entry starting point.
+		 * We do this here because we have to take into account all
+		 * the RcvArray entries that previous context would have
+		 * taken and we have to account for any extra groups
+		 * assigned to the kernel or user contexts.
+		 */
+		if (ctxt < dd->first_user_ctxt) {
+			if (ctxt < kctxt_ngroups) {
+				base = ctxt * (dd->rcv_entries.ngroups + 1);
+				rcd->rcv_array_groups++;
+			} else
+				base = kctxt_ngroups +
+					(ctxt * dd->rcv_entries.ngroups);
+		} else {
+			u16 ct = ctxt - dd->first_user_ctxt;
+
+			base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
+				kctxt_ngroups);
+			if (ct < dd->rcv_entries.nctxt_extra) {
+				base += ct * (dd->rcv_entries.ngroups + 1);
+				rcd->rcv_array_groups++;
+			} else
+				base += dd->rcv_entries.nctxt_extra +
+					(ct * dd->rcv_entries.ngroups);
+		}
+		rcd->eager_base = base * dd->rcv_entries.group_size;
+
+		/* Validate and initialize Rcv Hdr Q variables */
+		if (rcvhdrcnt % HDRQ_INCREMENT) {
+			dd_dev_err(dd,
+				   "ctxt%u: header queue count %d must be divisible by %d\n",
+				   rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
+			goto bail;
+		}
+		rcd->rcvhdrq_cnt = rcvhdrcnt;
+		rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
+		/*
+		 * Simple Eager buffer allocation: we have already pre-allocated
+		 * the number of RcvArray entry groups. Each ctxtdata structure
+		 * holds the number of groups for that context.
+		 *
+		 * To follow CSR requirements and maintain cacheline alignment,
+		 * make sure all sizes and bases are multiples of group_size.
+		 *
+		 * The expected entry count is what is left after assigning
+		 * eager.
+		 */
+		max_entries = rcd->rcv_array_groups *
+			dd->rcv_entries.group_size;
+		rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
+		rcd->egrbufs.count = round_down(rcvtids,
+						dd->rcv_entries.group_size);
+		if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
+			dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
+				   rcd->ctxt);
+			rcd->egrbufs.count = MAX_EAGER_ENTRIES;
+		}
+		dd_dev_info(dd, "ctxt%u: max Eager buffer RcvArray entries: %u\n",
+			    rcd->ctxt, rcd->egrbufs.count);
+
+		/*
+		 * Allocate array that will hold the eager buffer accounting
+		 * data.
+		 * This will allocate the maximum possible buffer count based
+		 * on the value of the RcvArray split parameter.
+		 * The resulting value will be rounded down to the closest
+		 * multiple of dd->rcv_entries.group_size.
+		 */
+		rcd->egrbufs.buffers = kzalloc(sizeof(*rcd->egrbufs.buffers) *
+					       rcd->egrbufs.count, GFP_KERNEL);
+		if (!rcd->egrbufs.buffers)
+			goto bail;
+		rcd->egrbufs.rcvtids = kzalloc(sizeof(*rcd->egrbufs.rcvtids) *
+					       rcd->egrbufs.count, GFP_KERNEL);
+		if (!rcd->egrbufs.rcvtids)
+			goto bail;
+		rcd->egrbufs.size = eager_buffer_size;
+		/*
+		 * The size of the buffers programmed into the RcvArray
+		 * entries needs to be big enough to handle the highest
+		 * MTU supported.
+		 */
+		if (rcd->egrbufs.size < hfi1_max_mtu) {
+			rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
+			dd_dev_info(dd,
+				    "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+				    rcd->ctxt, rcd->egrbufs.size);
+		}
+		rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
+
+		if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+			rcd->opstats = kzalloc(sizeof(*rcd->opstats),
+				GFP_KERNEL);
+			if (!rcd->opstats) {
+				dd_dev_err(dd,
+					   "ctxt%u: Unable to allocate per ctxt stats buffer\n",
+					   rcd->ctxt);
+				goto bail;
+			}
+		}
+	}
+	return rcd;
+bail:
+	kfree(rcd->opstats);
+	kfree(rcd->egrbufs.rcvtids);
+	kfree(rcd->egrbufs.buffers);
+	kfree(rcd);
+	return NULL;
+}
+
+/*
+ * Convert a receive header entry size that to the encoding used in the CSR.
+ *
+ * Return a zero if the given size is invalid.
+ */
+static inline u64 encode_rcv_header_entry_size(u16 size)
+{
+	/* there are only 3 valid receive header entry sizes */
+	if (size == 2)
+		return 1;
+	if (size == 16)
+		return 2;
+	else if (size == 32)
+		return 4;
+	return 0; /* invalid */
+}
+
+/*
+ * Select the largest ccti value over all SLs to determine the intra-
+ * packet gap for the link.
+ *
+ * called with cca_timer_lock held (to protect access to cca_timer
+ * array), and rcu_read_lock() (to protect access to cc_state).
+ */
+void set_link_ipg(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	struct cc_state *cc_state;
+	int i;
+	u16 cce, ccti_limit, max_ccti = 0;
+	u16 shift, mult;
+	u64 src;
+	u32 current_egress_rate; /* Mbits /sec */
+	u32 max_pkt_time;
+	/*
+	 * max_pkt_time is the maximum packet egress time in units
+	 * of the fabric clock period 1/(805 MHz).
+	 */
+
+	cc_state = get_cc_state(ppd);
+
+	if (cc_state == NULL)
+		/*
+		 * This should _never_ happen - rcu_read_lock() is held,
+		 * and set_link_ipg() should not be called if cc_state
+		 * is NULL.
+		 */
+		return;
+
+	for (i = 0; i < OPA_MAX_SLS; i++) {
+		u16 ccti = ppd->cca_timer[i].ccti;
+
+		if (ccti > max_ccti)
+			max_ccti = ccti;
+	}
+
+	ccti_limit = cc_state->cct.ccti_limit;
+	if (max_ccti > ccti_limit)
+		max_ccti = ccti_limit;
+
+	cce = cc_state->cct.entries[max_ccti].entry;
+	shift = (cce & 0xc000) >> 14;
+	mult = (cce & 0x3fff);
+
+	current_egress_rate = active_egress_rate(ppd);
+
+	max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
+
+	src = (max_pkt_time >> shift) * mult;
+
+	src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
+	src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
+
+	write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
+}
+
+static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
+{
+	struct cca_timer *cca_timer;
+	struct hfi1_pportdata *ppd;
+	int sl;
+	u16 ccti, ccti_timer, ccti_min;
+	struct cc_state *cc_state;
+
+	cca_timer = container_of(t, struct cca_timer, hrtimer);
+	ppd = cca_timer->ppd;
+	sl = cca_timer->sl;
+
+	rcu_read_lock();
+
+	cc_state = get_cc_state(ppd);
+
+	if (cc_state == NULL) {
+		rcu_read_unlock();
+		return HRTIMER_NORESTART;
+	}
+
+	/*
+	 * 1) decrement ccti for SL
+	 * 2) calculate IPG for link (set_link_ipg())
+	 * 3) restart timer, unless ccti is at min value
+	 */
+
+	ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
+	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+
+	spin_lock(&ppd->cca_timer_lock);
+
+	ccti = cca_timer->ccti;
+
+	if (ccti > ccti_min) {
+		cca_timer->ccti--;
+		set_link_ipg(ppd);
+	}
+
+	spin_unlock(&ppd->cca_timer_lock);
+
+	rcu_read_unlock();
+
+	if (ccti > ccti_min) {
+		unsigned long nsec = 1024 * ccti_timer;
+		/* ccti_timer is in units of 1.024 usec */
+		hrtimer_forward_now(t, ns_to_ktime(nsec));
+		return HRTIMER_RESTART;
+	}
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Common code for initializing the physical port structure.
+ */
+void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
+			 struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
+{
+	int i, size;
+	uint default_pkey_idx;
+
+	ppd->dd = dd;
+	ppd->hw_pidx = hw_pidx;
+	ppd->port = port; /* IB port number, not index */
+
+	default_pkey_idx = 1;
+
+	ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
+	if (loopback) {
+		hfi1_early_err(&pdev->dev,
+			       "Faking data partition 0x8001 in idx %u\n",
+			       !default_pkey_idx);
+		ppd->pkeys[!default_pkey_idx] = 0x8001;
+	}
+
+	INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
+	INIT_WORK(&ppd->link_up_work, handle_link_up);
+	INIT_WORK(&ppd->link_down_work, handle_link_down);
+	INIT_WORK(&ppd->freeze_work, handle_freeze);
+	INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
+	INIT_WORK(&ppd->sma_message_work, handle_sma_message);
+	INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
+	mutex_init(&ppd->hls_lock);
+	spin_lock_init(&ppd->sdma_alllock);
+	spin_lock_init(&ppd->qsfp_info.qsfp_lock);
+
+	ppd->sm_trap_qp = 0x0;
+	ppd->sa_qp = 0x1;
+
+	ppd->hfi1_wq = NULL;
+
+	spin_lock_init(&ppd->cca_timer_lock);
+
+	for (i = 0; i < OPA_MAX_SLS; i++) {
+		hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
+			     HRTIMER_MODE_REL);
+		ppd->cca_timer[i].ppd = ppd;
+		ppd->cca_timer[i].sl = i;
+		ppd->cca_timer[i].ccti = 0;
+		ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
+	}
+
+	ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
+
+	spin_lock_init(&ppd->cc_state_lock);
+	spin_lock_init(&ppd->cc_log_lock);
+	size = sizeof(struct cc_state);
+	RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
+	if (!rcu_dereference(ppd->cc_state))
+		goto bail;
+	return;
+
+bail:
+
+	hfi1_early_err(&pdev->dev,
+		       "Congestion Control Agent disabled for port %d\n", port);
+}
+
+/*
+ * Do initialization for device that is only needed on
+ * first detect, not on resets.
+ */
+static int loadtime_init(struct hfi1_devdata *dd)
+{
+	return 0;
+}
+
+/**
+ * init_after_reset - re-initialize after a reset
+ * @dd: the hfi1_ib device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed
+ */
+static int init_after_reset(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/*
+	 * Ensure chip does no sends or receives, tail updates, or
+	 * pioavail updates while we re-initialize.  This is mostly
+	 * for the driver data structures, not chip registers.
+	 */
+	for (i = 0; i < dd->num_rcv_contexts; i++)
+		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+				  HFI1_RCVCTRL_INTRAVAIL_DIS |
+				  HFI1_RCVCTRL_TAILUPD_DIS, i);
+	pio_send_control(dd, PSC_GLOBAL_DISABLE);
+	for (i = 0; i < dd->num_send_contexts; i++)
+		sc_disable(dd->send_contexts[i].sc);
+
+	return 0;
+}
+
+static void enable_chip(struct hfi1_devdata *dd)
+{
+	u32 rcvmask;
+	u32 i;
+
+	/* enable PIO send */
+	pio_send_control(dd, PSC_GLOBAL_ENABLE);
+
+	/*
+	 * Enable kernel ctxts' receive and receive interrupt.
+	 * Other ctxts done as user opens and initializes them.
+	 */
+	rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
+	for (i = 0; i < dd->first_user_ctxt; ++i) {
+		rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+		if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
+			rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+		if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
+			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+		if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
+			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+		hfi1_rcvctrl(dd, rcvmask, i);
+		sc_enable(dd->rcd[i]->sc);
+	}
+}
+
+/**
+ * create_workqueues - create per port workqueues
+ * @dd: the hfi1_ib device
+ */
+static int create_workqueues(struct hfi1_devdata *dd)
+{
+	int pidx;
+	struct hfi1_pportdata *ppd;
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (!ppd->hfi1_wq) {
+			char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */
+
+			snprintf(wq_name, sizeof(wq_name), "hfi%d_%d",
+				 dd->unit, pidx);
+			ppd->hfi1_wq =
+				create_singlethread_workqueue(wq_name);
+			if (!ppd->hfi1_wq)
+				goto wq_error;
+		}
+	}
+	return 0;
+wq_error:
+	pr_err("create_singlethread_workqueue failed for port %d\n",
+		pidx + 1);
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (ppd->hfi1_wq) {
+			destroy_workqueue(ppd->hfi1_wq);
+			ppd->hfi1_wq = NULL;
+		}
+	}
+	return -ENOMEM;
+}
+
+/**
+ * hfi1_init - do the actual initialization sequence on the chip
+ * @dd: the hfi1_ib device
+ * @reinit: re-initializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int hfi1_init(struct hfi1_devdata *dd, int reinit)
+{
+	int ret = 0, pidx, lastfail = 0;
+	unsigned i, len;
+	struct hfi1_ctxtdata *rcd;
+	struct hfi1_pportdata *ppd;
+
+	/* Set up recv low level handlers */
+	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
+						kdeth_process_expected;
+	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
+						kdeth_process_eager;
+	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
+	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
+						process_receive_error;
+	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
+						process_receive_bypass;
+	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
+						process_receive_invalid;
+	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
+						process_receive_invalid;
+	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
+						process_receive_invalid;
+	dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+
+	/* Set up send low level handlers */
+	dd->process_pio_send = hfi1_verbs_send_pio;
+	dd->process_dma_send = hfi1_verbs_send_dma;
+	dd->pio_inline_send = pio_copy;
+
+	if (is_a0(dd)) {
+		atomic_set(&dd->drop_packet, DROP_PACKET_ON);
+		dd->do_drop = 1;
+	} else {
+		atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
+		dd->do_drop = 0;
+	}
+
+	/* make sure the link is not "up" */
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		ppd->linkup = 0;
+	}
+
+	if (reinit)
+		ret = init_after_reset(dd);
+	else
+		ret = loadtime_init(dd);
+	if (ret)
+		goto done;
+
+	/* dd->rcd can be NULL if early initialization failed */
+	for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+		/*
+		 * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
+		 * re-init, the simplest way to handle this is to free
+		 * existing, and re-allocate.
+		 * Need to re-create rest of ctxt 0 ctxtdata as well.
+		 */
+		rcd = dd->rcd[i];
+		if (!rcd)
+			continue;
+
+		rcd->do_interrupt = &handle_receive_interrupt;
+
+		lastfail = hfi1_create_rcvhdrq(dd, rcd);
+		if (!lastfail)
+			lastfail = hfi1_setup_eagerbufs(rcd);
+		if (lastfail)
+			dd_dev_err(dd,
+				"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+	}
+	if (lastfail)
+		ret = lastfail;
+
+	/* Allocate enough memory for user event notification. */
+	len = ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
+		    sizeof(*dd->events), PAGE_SIZE);
+	dd->events = vmalloc_user(len);
+	if (!dd->events)
+		dd_dev_err(dd, "Failed to allocate user events page\n");
+	/*
+	 * Allocate a page for device and port status.
+	 * Page will be shared amongst all user processes.
+	 */
+	dd->status = vmalloc_user(PAGE_SIZE);
+	if (!dd->status)
+		dd_dev_err(dd, "Failed to allocate dev status page\n");
+	else
+		dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
+					     sizeof(dd->status->freezemsg));
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (dd->status)
+			/* Currently, we only have one port */
+			ppd->statusp = &dd->status->port;
+
+		set_mtu(ppd);
+	}
+
+	/* enable chip even if we have an error, so we can debug cause */
+	enable_chip(dd);
+
+	ret = hfi1_cq_init(dd);
+done:
+	/*
+	 * Set status even if port serdes is not initialized
+	 * so that diags will work.
+	 */
+	if (dd->status)
+		dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
+			HFI1_STATUS_INITTED;
+	if (!ret) {
+		/* enable all interrupts from the chip */
+		set_intr_state(dd, 1);
+
+		/* chip is OK for user apps; mark it as initialized */
+		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+			ppd = dd->pport + pidx;
+
+			/* initialize the qsfp if it exists
+			 * Requires interrupts to be enabled so we are notified
+			 * when the QSFP completes reset, and has
+			 * to be done before bringing up the SERDES
+			 */
+			init_qsfp(ppd);
+
+			/* start the serdes - must be after interrupts are
+			   enabled so we are notified when the link goes up */
+			lastfail = bringup_serdes(ppd);
+			if (lastfail)
+				dd_dev_info(dd,
+					"Failed to bring up port %u\n",
+					ppd->port);
+
+			/*
+			 * Set status even if port serdes is not initialized
+			 * so that diags will work.
+			 */
+			if (ppd->statusp)
+				*ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
+							HFI1_STATUS_INITTED;
+			if (!ppd->link_speed_enabled)
+				continue;
+		}
+	}
+
+	/* if ret is non-zero, we probably should do some cleanup here... */
+	return ret;
+}
+
+static inline struct hfi1_devdata *__hfi1_lookup(int unit)
+{
+	return idr_find(&hfi1_unit_table, unit);
+}
+
+struct hfi1_devdata *hfi1_lookup(int unit)
+{
+	struct hfi1_devdata *dd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&hfi1_devs_lock, flags);
+	dd = __hfi1_lookup(unit);
+	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+	return dd;
+}
+
+/*
+ * Stop the timers during unit shutdown, or after an error late
+ * in initialization.
+ */
+static void stop_timers(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd;
+	int pidx;
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (ppd->led_override_timer.data) {
+			del_timer_sync(&ppd->led_override_timer);
+			atomic_set(&ppd->led_override_timer_active, 0);
+		}
+	}
+}
+
+/**
+ * shutdown_device - shut down a device
+ * @dd: the hfi1_ib device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.   It does not free any data structures.
+ * Everything it does has to be setup again by hfi1_init(dd, 1)
+ */
+static void shutdown_device(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd;
+	unsigned pidx;
+	int i;
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+
+		ppd->linkup = 0;
+		if (ppd->statusp)
+			*ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+					   HFI1_STATUS_IB_READY);
+	}
+	dd->flags &= ~HFI1_INITTED;
+
+	/* mask interrupts, but not errors */
+	set_intr_state(dd, 0);
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		for (i = 0; i < dd->num_rcv_contexts; i++)
+			hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
+					  HFI1_RCVCTRL_CTXT_DIS |
+					  HFI1_RCVCTRL_INTRAVAIL_DIS |
+					  HFI1_RCVCTRL_PKEY_DIS |
+					  HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
+		/*
+		 * Gracefully stop all sends allowing any in progress to
+		 * trickle out first.
+		 */
+		for (i = 0; i < dd->num_send_contexts; i++)
+			sc_flush(dd->send_contexts[i].sc);
+	}
+
+	/*
+	 * Enough for anything that's going to trickle out to have actually
+	 * done so.
+	 */
+	udelay(20);
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+
+		/* disable all contexts */
+		for (i = 0; i < dd->num_send_contexts; i++)
+			sc_disable(dd->send_contexts[i].sc);
+		/* disable the send device */
+		pio_send_control(dd, PSC_GLOBAL_DISABLE);
+
+		/*
+		 * Clear SerdesEnable.
+		 * We can't count on interrupts since we are stopping.
+		 */
+		hfi1_quiet_serdes(ppd);
+
+		if (ppd->hfi1_wq) {
+			destroy_workqueue(ppd->hfi1_wq);
+			ppd->hfi1_wq = NULL;
+		}
+	}
+	sdma_exit(dd);
+}
+
+/**
+ * hfi1_free_ctxtdata - free a context's allocated data
+ * @dd: the hfi1_ib device
+ * @rcd: the ctxtdata structure
+ *
+ * free up any allocated data for a context
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of context data, because it is called after hfi1_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ */
+void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+	unsigned e;
+
+	if (!rcd)
+		return;
+
+	if (rcd->rcvhdrq) {
+		dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
+				  rcd->rcvhdrq, rcd->rcvhdrq_phys);
+		rcd->rcvhdrq = NULL;
+		if (rcd->rcvhdrtail_kvaddr) {
+			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+					  (void *)rcd->rcvhdrtail_kvaddr,
+					  rcd->rcvhdrqtailaddr_phys);
+			rcd->rcvhdrtail_kvaddr = NULL;
+		}
+	}
+
+	/* all the RcvArray entries should have been cleared by now */
+	kfree(rcd->egrbufs.rcvtids);
+
+	for (e = 0; e < rcd->egrbufs.alloced; e++) {
+		if (rcd->egrbufs.buffers[e].phys)
+			dma_free_coherent(&dd->pcidev->dev,
+					  rcd->egrbufs.buffers[e].len,
+					  rcd->egrbufs.buffers[e].addr,
+					  rcd->egrbufs.buffers[e].phys);
+	}
+	kfree(rcd->egrbufs.buffers);
+
+	sc_free(rcd->sc);
+	vfree(rcd->physshadow);
+	vfree(rcd->tid_pg_list);
+	vfree(rcd->user_event_mask);
+	vfree(rcd->subctxt_uregbase);
+	vfree(rcd->subctxt_rcvegrbuf);
+	vfree(rcd->subctxt_rcvhdr_base);
+	kfree(rcd->tidusemap);
+	kfree(rcd->opstats);
+	kfree(rcd);
+}
+
+void hfi1_free_devdata(struct hfi1_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&hfi1_devs_lock, flags);
+	idr_remove(&hfi1_unit_table, dd->unit);
+	list_del(&dd->list);
+	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	hfi1_dbg_ibdev_exit(&dd->verbs_dev);
+	rcu_barrier(); /* wait for rcu callbacks to complete */
+	free_percpu(dd->int_counter);
+	free_percpu(dd->rcv_limit);
+	ib_dealloc_device(&dd->verbs_dev.ibdev);
+}
+
+/*
+ * Allocate our primary per-unit data structure.  Must be done via verbs
+ * allocator, because the verbs cleanup process both does cleanup and
+ * free of the data structure.
+ * "extra" is for chip-specific data.
+ *
+ * Use the idr mechanism to get a unit number for this unit.
+ */
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+{
+	unsigned long flags;
+	struct hfi1_devdata *dd;
+	int ret;
+
+	dd = (struct hfi1_devdata *)ib_alloc_device(sizeof(*dd) + extra);
+	if (!dd)
+		return ERR_PTR(-ENOMEM);
+	/* extra is * number of ports */
+	dd->num_pports = extra / sizeof(struct hfi1_pportdata);
+	dd->pport = (struct hfi1_pportdata *)(dd + 1);
+
+	INIT_LIST_HEAD(&dd->list);
+	dd->node = dev_to_node(&pdev->dev);
+	if (dd->node < 0)
+		dd->node = 0;
+	idr_preload(GFP_KERNEL);
+	spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+	ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
+	if (ret >= 0) {
+		dd->unit = ret;
+		list_add(&dd->list, &hfi1_dev_list);
+	}
+
+	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	idr_preload_end();
+
+	if (ret < 0) {
+		hfi1_early_err(&pdev->dev,
+			       "Could not allocate unit ID: error %d\n", -ret);
+		goto bail;
+	}
+	/*
+	 * Initialize all locks for the device. This needs to be as early as
+	 * possible so locks are usable.
+	 */
+	spin_lock_init(&dd->sc_lock);
+	spin_lock_init(&dd->sendctrl_lock);
+	spin_lock_init(&dd->rcvctrl_lock);
+	spin_lock_init(&dd->uctxt_lock);
+	spin_lock_init(&dd->hfi1_diag_trans_lock);
+	spin_lock_init(&dd->sc_init_lock);
+	spin_lock_init(&dd->dc8051_lock);
+	spin_lock_init(&dd->dc8051_memlock);
+	mutex_init(&dd->qsfp_i2c_mutex);
+	seqlock_init(&dd->sc2vl_lock);
+	spin_lock_init(&dd->sde_map_lock);
+	init_waitqueue_head(&dd->event_queue);
+
+	dd->int_counter = alloc_percpu(u64);
+	if (!dd->int_counter) {
+		ret = -ENOMEM;
+		hfi1_early_err(&pdev->dev,
+			       "Could not allocate per-cpu int_counter\n");
+		goto bail;
+	}
+
+	dd->rcv_limit = alloc_percpu(u64);
+	if (!dd->rcv_limit) {
+		ret = -ENOMEM;
+		hfi1_early_err(&pdev->dev,
+			       "Could not allocate per-cpu rcv_limit\n");
+		goto bail;
+	}
+
+	if (!hfi1_cpulist_count) {
+		u32 count = num_online_cpus();
+
+		hfi1_cpulist = kzalloc(BITS_TO_LONGS(count) *
+				      sizeof(long), GFP_KERNEL);
+		if (hfi1_cpulist)
+			hfi1_cpulist_count = count;
+		else
+			hfi1_early_err(
+			&pdev->dev,
+			"Could not alloc cpulist info, cpu affinity might be wrong\n");
+	}
+	hfi1_dbg_ibdev_init(&dd->verbs_dev);
+	return dd;
+
+bail:
+	if (!list_empty(&dd->list))
+		list_del_init(&dd->list);
+	ib_dealloc_device(&dd->verbs_dev.ibdev);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Called from freeze mode handlers, and from PCI error
+ * reporting code.  Should be paranoid about state of
+ * system and data structures.
+ */
+void hfi1_disable_after_error(struct hfi1_devdata *dd)
+{
+	if (dd->flags & HFI1_INITTED) {
+		u32 pidx;
+
+		dd->flags &= ~HFI1_INITTED;
+		if (dd->pport)
+			for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+				struct hfi1_pportdata *ppd;
+
+				ppd = dd->pport + pidx;
+				if (dd->flags & HFI1_PRESENT)
+					set_link_state(ppd, HLS_DN_DISABLE);
+
+				if (ppd->statusp)
+					*ppd->statusp &= ~HFI1_STATUS_IB_READY;
+			}
+	}
+
+	/*
+	 * Mark as having had an error for driver, and also
+	 * for /sys and status word mapped to user programs.
+	 * This marks unit as not usable, until reset.
+	 */
+	if (dd->status)
+		dd->status->dev |= HFI1_STATUS_HWERROR;
+}
+
+static void remove_one(struct pci_dev *);
+static int init_one(struct pci_dev *, const struct pci_device_id *);
+
+#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
+#define PFX DRIVER_NAME ": "
+
+static const struct pci_device_id hfi1_pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
+
+static struct pci_driver hfi1_pci_driver = {
+	.name = DRIVER_NAME,
+	.probe = init_one,
+	.remove = remove_one,
+	.id_table = hfi1_pci_tbl,
+	.err_handler = &hfi1_pci_err_handler,
+};
+
+static void __init compute_krcvqs(void)
+{
+	int i;
+
+	for (i = 0; i < krcvqsset; i++)
+		n_krcvqs += krcvqs[i];
+}
+
+/*
+ * Do all the generic driver unit- and chip-independent memory
+ * allocation and initialization.
+ */
+static int __init hfi1_mod_init(void)
+{
+	int ret;
+
+	ret = dev_init();
+	if (ret)
+		goto bail;
+
+	/* validate max MTU before any devices start */
+	if (!valid_opa_max_mtu(hfi1_max_mtu)) {
+		pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
+		       hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
+		hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+	}
+	/* valid CUs run from 1-128 in powers of 2 */
+	if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
+		hfi1_cu = 1;
+	/* valid credit return threshold is 0-100, variable is unsigned */
+	if (user_credit_return_threshold > 100)
+		user_credit_return_threshold = 100;
+
+	compute_krcvqs();
+	/* sanitize receive interrupt count, time must wait until after
+	   the hardware type is known */
+	if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
+		rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
+	/* reject invalid combinations */
+	if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
+		pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
+		rcv_intr_count = 1;
+	}
+	if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
+		/*
+		 * Avoid indefinite packet delivery by requiring a timeout
+		 * if count is > 1.
+		 */
+		pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
+		rcv_intr_timeout = 1;
+	}
+	if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
+		/*
+		 * The dynamic algorithm expects a non-zero timeout
+		 * and a count > 1.
+		 */
+		pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
+		rcv_intr_dynamic = 0;
+	}
+
+	/* sanitize link CRC options */
+	link_crc_mask &= SUPPORTED_CRCS;
+
+	/*
+	 * These must be called before the driver is registered with
+	 * the PCI subsystem.
+	 */
+	idr_init(&hfi1_unit_table);
+
+	hfi1_dbg_init();
+	ret = pci_register_driver(&hfi1_pci_driver);
+	if (ret < 0) {
+		pr_err("Unable to register driver: error %d\n", -ret);
+		goto bail_dev;
+	}
+	goto bail; /* all OK */
+
+bail_dev:
+	hfi1_dbg_exit();
+	idr_destroy(&hfi1_unit_table);
+	dev_cleanup();
+bail:
+	return ret;
+}
+
+module_init(hfi1_mod_init);
+
+/*
+ * Do the non-unit driver cleanup, memory free, etc. at unload.
+ */
+static void __exit hfi1_mod_cleanup(void)
+{
+	pci_unregister_driver(&hfi1_pci_driver);
+	hfi1_dbg_exit();
+	hfi1_cpulist_count = 0;
+	kfree(hfi1_cpulist);
+
+	idr_destroy(&hfi1_unit_table);
+	dispose_firmware();	/* asymmetric with obtain_firmware() */
+	dev_cleanup();
+}
+
+module_exit(hfi1_mod_cleanup);
+
+/* this can only be called after a successful initialization */
+static void cleanup_device_data(struct hfi1_devdata *dd)
+{
+	int ctxt;
+	int pidx;
+	struct hfi1_ctxtdata **tmp;
+	unsigned long flags;
+
+	/* users can't do anything more with chip */
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		struct hfi1_pportdata *ppd = &dd->pport[pidx];
+		struct cc_state *cc_state;
+		int i;
+
+		if (ppd->statusp)
+			*ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
+
+		for (i = 0; i < OPA_MAX_SLS; i++)
+			hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
+
+		spin_lock(&ppd->cc_state_lock);
+		cc_state = get_cc_state(ppd);
+		rcu_assign_pointer(ppd->cc_state, NULL);
+		spin_unlock(&ppd->cc_state_lock);
+
+		if (cc_state)
+			call_rcu(&cc_state->rcu, cc_state_reclaim);
+	}
+
+	free_credit_return(dd);
+
+	/*
+	 * Free any resources still in use (usually just kernel contexts)
+	 * at unload; we do for ctxtcnt, because that's what we allocate.
+	 * We acquire lock to be really paranoid that rcd isn't being
+	 * accessed from some interrupt-related code (that should not happen,
+	 * but best to be sure).
+	 */
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	tmp = dd->rcd;
+	dd->rcd = NULL;
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+	for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
+		struct hfi1_ctxtdata *rcd = tmp[ctxt];
+
+		tmp[ctxt] = NULL; /* debugging paranoia */
+		if (rcd) {
+			hfi1_clear_tids(rcd);
+			hfi1_free_ctxtdata(dd, rcd);
+		}
+	}
+	kfree(tmp);
+	/* must follow rcv context free - need to remove rcv's hooks */
+	for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
+		sc_free(dd->send_contexts[ctxt].sc);
+	dd->num_send_contexts = 0;
+	kfree(dd->send_contexts);
+	dd->send_contexts = NULL;
+	kfree(dd->boardname);
+	vfree(dd->events);
+	vfree(dd->status);
+	hfi1_cq_exit(dd);
+}
+
+/*
+ * Clean up on unit shutdown, or error during unit load after
+ * successful initialization.
+ */
+static void postinit_cleanup(struct hfi1_devdata *dd)
+{
+	hfi1_start_cleanup(dd);
+
+	hfi1_pcie_ddcleanup(dd);
+	hfi1_pcie_cleanup(dd->pcidev);
+
+	cleanup_device_data(dd);
+
+	hfi1_free_devdata(dd);
+}
+
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int ret = 0, j, pidx, initfail;
+	struct hfi1_devdata *dd = NULL;
+
+	/* First, lock the non-writable module parameters */
+	HFI1_CAP_LOCK();
+
+	/* Validate some global module parameters */
+	if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
+		hfi1_early_err(&pdev->dev, "Header queue  count too small\n");
+		ret = -EINVAL;
+		goto bail;
+	}
+	/* use the encoding function as a sanitization check */
+	if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
+		hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
+			       hfi1_hdrq_entsize);
+		goto bail;
+	}
+
+	/* The receive eager buffer size must be set before the receive
+	 * contexts are created.
+	 *
+	 * Set the eager buffer size.  Validate that it falls in a range
+	 * allowed by the hardware - all powers of 2 between the min and
+	 * max.  The maximum valid MTU is within the eager buffer range
+	 * so we do not need to cap the max_mtu by an eager buffer size
+	 * setting.
+	 */
+	if (eager_buffer_size) {
+		if (!is_power_of_2(eager_buffer_size))
+			eager_buffer_size =
+				roundup_pow_of_two(eager_buffer_size);
+		eager_buffer_size =
+			clamp_val(eager_buffer_size,
+				  MIN_EAGER_BUFFER * 8,
+				  MAX_EAGER_BUFFER_TOTAL);
+		hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
+				eager_buffer_size);
+	} else {
+		hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* restrict value of hfi1_rcvarr_split */
+	hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
+
+	ret = hfi1_pcie_init(pdev, ent);
+	if (ret)
+		goto bail;
+
+	/*
+	 * Do device-specific initialization, function table setup, dd
+	 * allocation, etc.
+	 */
+	switch (ent->device) {
+	case PCI_DEVICE_ID_INTEL0:
+	case PCI_DEVICE_ID_INTEL1:
+		dd = hfi1_init_dd(pdev, ent);
+		break;
+	default:
+		hfi1_early_err(&pdev->dev,
+			       "Failing on unknown Intel deviceid 0x%x\n",
+			       ent->device);
+		ret = -ENODEV;
+	}
+
+	if (IS_ERR(dd))
+		ret = PTR_ERR(dd);
+	if (ret)
+		goto clean_bail; /* error already printed */
+
+	ret = create_workqueues(dd);
+	if (ret)
+		goto clean_bail;
+
+	/* do the generic initialization */
+	initfail = hfi1_init(dd, 0);
+
+	ret = hfi1_register_ib_device(dd);
+
+	/*
+	 * Now ready for use.  this should be cleared whenever we
+	 * detect a reset, or initiate one.  If earlier failure,
+	 * we still create devices, so diags, etc. can be used
+	 * to determine cause of problem.
+	 */
+	if (!initfail && !ret)
+		dd->flags |= HFI1_INITTED;
+
+	j = hfi1_device_create(dd);
+	if (j)
+		dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
+
+	if (initfail || ret) {
+		stop_timers(dd);
+		flush_workqueue(ib_wq);
+		for (pidx = 0; pidx < dd->num_pports; ++pidx)
+			hfi1_quiet_serdes(dd->pport + pidx);
+		if (!j)
+			hfi1_device_remove(dd);
+		if (!ret)
+			hfi1_unregister_ib_device(dd);
+		postinit_cleanup(dd);
+		if (initfail)
+			ret = initfail;
+		goto bail;	/* everything already cleaned */
+	}
+
+	sdma_start(dd);
+
+	return 0;
+
+clean_bail:
+	hfi1_pcie_cleanup(pdev);
+bail:
+	return ret;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+	/* unregister from IB core */
+	hfi1_unregister_ib_device(dd);
+
+	/*
+	 * Disable the IB link, disable interrupts on the device,
+	 * clear dma engines, etc.
+	 */
+	shutdown_device(dd);
+
+	stop_timers(dd);
+
+	/* wait until all of our (qsfp) queue_work() calls complete */
+	flush_workqueue(ib_wq);
+
+	hfi1_device_remove(dd);
+
+	postinit_cleanup(dd);
+}
+
+/**
+ * hfi1_create_rcvhdrq - create a receive header queue
+ * @dd: the hfi1_ib device
+ * @rcd: the context data
+ *
+ * This must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+	unsigned amt;
+	u64 reg;
+
+	if (!rcd->rcvhdrq) {
+		dma_addr_t phys_hdrqtail;
+		gfp_t gfp_flags;
+
+		/*
+		 * rcvhdrqentsize is in DWs, so we have to convert to bytes
+		 * (* sizeof(u32)).
+		 */
+		amt = ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
+			    sizeof(u32), PAGE_SIZE);
+
+		gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
+			GFP_USER : GFP_KERNEL;
+		rcd->rcvhdrq = dma_zalloc_coherent(
+			&dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
+			gfp_flags | __GFP_COMP);
+
+		if (!rcd->rcvhdrq) {
+			dd_dev_err(dd,
+				"attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
+				amt, rcd->ctxt);
+			goto bail;
+		}
+
+		/* Event mask is per device now and is in hfi1_devdata */
+		/*if (rcd->ctxt >= dd->first_user_ctxt) {
+			rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
+			if (!rcd->user_event_mask)
+				goto bail_free_hdrq;
+				}*/
+
+		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+			rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
+				&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+				gfp_flags);
+			if (!rcd->rcvhdrtail_kvaddr)
+				goto bail_free;
+			rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
+		}
+
+		rcd->rcvhdrq_size = amt;
+	}
+	/*
+	 * These values are per-context:
+	 *	RcvHdrCnt
+	 *	RcvHdrEntSize
+	 *	RcvHdrSize
+	 */
+	reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
+			& RCV_HDR_CNT_CNT_MASK)
+		<< RCV_HDR_CNT_CNT_SHIFT;
+	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
+	reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
+			& RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
+		<< RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
+	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
+	reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
+		<< RCV_HDR_SIZE_HDR_SIZE_SHIFT;
+	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
+	return 0;
+
+bail_free:
+	dd_dev_err(dd,
+		"attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+		rcd->ctxt);
+	vfree(rcd->user_event_mask);
+	rcd->user_event_mask = NULL;
+	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
+			  rcd->rcvhdrq_phys);
+	rcd->rcvhdrq = NULL;
+bail:
+	return -ENOMEM;
+}
+
+/**
+ * allocate eager buffers, both kernel and user contexts.
+ * @rcd: the context we are setting up.
+ *
+ * Allocate the eager TID buffers and program them into hip.
+ * They are no longer completely contiguous, we do multiple allocation
+ * calls.  Otherwise we get the OOM code involved, by asking for too
+ * much per call, with disastrous results on some kernels.
+ */
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_devdata *dd = rcd->dd;
+	u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
+	gfp_t gfp_flags;
+	u16 order;
+	int ret = 0;
+	u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
+
+	/*
+	 * GFP_USER, but without GFP_FS, so buffer cache can be
+	 * coalesced (we hope); otherwise, even at order 4,
+	 * heavy filesystem activity makes these fail, and we can
+	 * use compound pages.
+	 */
+	gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+	/*
+	 * The minimum size of the eager buffers is a groups of MTU-sized
+	 * buffers.
+	 * The global eager_buffer_size parameter is checked against the
+	 * theoretical lower limit of the value. Here, we check against the
+	 * MTU.
+	 */
+	if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
+		rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
+	/*
+	 * If using one-pkt-per-egr-buffer, lower the eager buffer
+	 * size to the max MTU (page-aligned).
+	 */
+	if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
+		rcd->egrbufs.rcvtid_size = round_mtu;
+
+	/*
+	 * Eager buffers sizes of 1MB or less require smaller TID sizes
+	 * to satisfy the "multiple of 8 RcvArray entries" requirement.
+	 */
+	if (rcd->egrbufs.size <= (1 << 20))
+		rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
+			rounddown_pow_of_two(rcd->egrbufs.size / 8));
+
+	while (alloced_bytes < rcd->egrbufs.size &&
+	       rcd->egrbufs.alloced < rcd->egrbufs.count) {
+		rcd->egrbufs.buffers[idx].addr =
+			dma_zalloc_coherent(&dd->pcidev->dev,
+					    rcd->egrbufs.rcvtid_size,
+					    &rcd->egrbufs.buffers[idx].phys,
+					    gfp_flags);
+		if (rcd->egrbufs.buffers[idx].addr) {
+			rcd->egrbufs.buffers[idx].len =
+				rcd->egrbufs.rcvtid_size;
+			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
+				rcd->egrbufs.buffers[idx].addr;
+			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
+				rcd->egrbufs.buffers[idx].phys;
+			rcd->egrbufs.alloced++;
+			alloced_bytes += rcd->egrbufs.rcvtid_size;
+			idx++;
+		} else {
+			u32 new_size, i, j;
+			u64 offset = 0;
+
+			/*
+			 * Fail the eager buffer allocation if:
+			 *   - we are already using the lowest acceptable size
+			 *   - we are using one-pkt-per-egr-buffer (this implies
+			 *     that we are accepting only one size)
+			 */
+			if (rcd->egrbufs.rcvtid_size == round_mtu ||
+			    !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
+				dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
+					rcd->ctxt);
+				goto bail_rcvegrbuf_phys;
+			}
+
+			new_size = rcd->egrbufs.rcvtid_size / 2;
+
+			/*
+			 * If the first attempt to allocate memory failed, don't
+			 * fail everything but continue with the next lower
+			 * size.
+			 */
+			if (idx == 0) {
+				rcd->egrbufs.rcvtid_size = new_size;
+				continue;
+			}
+
+			/*
+			 * Re-partition already allocated buffers to a smaller
+			 * size.
+			 */
+			rcd->egrbufs.alloced = 0;
+			for (i = 0, j = 0, offset = 0; j < idx; i++) {
+				if (i >= rcd->egrbufs.count)
+					break;
+				rcd->egrbufs.rcvtids[i].phys =
+					rcd->egrbufs.buffers[j].phys + offset;
+				rcd->egrbufs.rcvtids[i].addr =
+					rcd->egrbufs.buffers[j].addr + offset;
+				rcd->egrbufs.alloced++;
+				if ((rcd->egrbufs.buffers[j].phys + offset +
+				     new_size) ==
+				    (rcd->egrbufs.buffers[j].phys +
+				     rcd->egrbufs.buffers[j].len)) {
+					j++;
+					offset = 0;
+				} else
+					offset += new_size;
+			}
+			rcd->egrbufs.rcvtid_size = new_size;
+		}
+	}
+	rcd->egrbufs.numbufs = idx;
+	rcd->egrbufs.size = alloced_bytes;
+
+	dd_dev_info(dd, "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+		rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
+		rcd->egrbufs.size);
+
+	/*
+	 * Set the contexts rcv array head update threshold to the closest
+	 * power of 2 (so we can use a mask instead of modulo) below half
+	 * the allocated entries.
+	 */
+	rcd->egrbufs.threshold =
+		rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
+	/*
+	 * Compute the expected RcvArray entry base. This is done after
+	 * allocating the eager buffers in order to maximize the
+	 * expected RcvArray entries for the context.
+	 */
+	max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
+	egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
+	rcd->expected_count = max_entries - egrtop;
+	if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
+		rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
+
+	rcd->expected_base = rcd->eager_base + egrtop;
+	dd_dev_info(dd, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
+		    rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
+		    rcd->eager_base, rcd->expected_base);
+
+	if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
+		dd_dev_err(dd, "ctxt%u: current Eager buffer size is invalid %u\n",
+			   rcd->ctxt, rcd->egrbufs.rcvtid_size);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
+		hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
+			      rcd->egrbufs.rcvtids[idx].phys, order);
+		cond_resched();
+	}
+	goto bail;
+
+bail_rcvegrbuf_phys:
+	for (idx = 0; idx < rcd->egrbufs.alloced &&
+		     rcd->egrbufs.buffers[idx].addr;
+	     idx++) {
+		dma_free_coherent(&dd->pcidev->dev,
+				  rcd->egrbufs.buffers[idx].len,
+				  rcd->egrbufs.buffers[idx].addr,
+				  rcd->egrbufs.buffers[idx].phys);
+		rcd->egrbufs.buffers[idx].addr = NULL;
+		rcd->egrbufs.buffers[idx].phys = 0;
+		rcd->egrbufs.buffers[idx].len = 0;
+	}
+bail:
+	return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/staging/rdma/hfi1/intr.c
new file mode 100644
index 0000000..426582b
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/intr.c
@@ -0,0 +1,207 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "sdma.h"
+
+/**
+ * format_hwmsg - format a single hwerror message
+ * @msg message buffer
+ * @msgl length of message buffer
+ * @hwmsg message to add to message buffer
+ */
+static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+	strlcat(msg, "[", msgl);
+	strlcat(msg, hwmsg, msgl);
+	strlcat(msg, "]", msgl);
+}
+
+/**
+ * hfi1_format_hwerrors - format hardware error messages for display
+ * @hwerrs hardware errors bit vector
+ * @hwerrmsgs hardware error descriptions
+ * @nhwerrmsgs number of hwerrmsgs
+ * @msg message buffer
+ * @msgl message buffer length
+ */
+void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
+			  size_t nhwerrmsgs, char *msg, size_t msgl)
+{
+	int i;
+
+	for (i = 0; i < nhwerrmsgs; i++)
+		if (hwerrs & hwerrmsgs[i].mask)
+			format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+}
+
+static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
+{
+	struct ib_event event;
+	struct hfi1_devdata *dd = ppd->dd;
+
+	/*
+	 * Only call ib_dispatch_event() if the IB device has been
+	 * registered.  HFI1_INITED is set iff the driver has successfully
+	 * registered with the IB core.
+	 */
+	if (!(dd->flags & HFI1_INITTED))
+		return;
+	event.device = &dd->verbs_dev.ibdev;
+	event.element.port_num = ppd->port;
+	event.event = ev;
+	ib_dispatch_event(&event);
+}
+
+/*
+ * Handle a linkup or link down notification.
+ * This is called outside an interrupt.
+ */
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
+{
+	struct hfi1_pportdata *ppd = &dd->pport[0];
+	enum ib_event_type ev;
+
+	if (!(ppd->linkup ^ !!linkup))
+		return;	/* no change, nothing to do */
+
+	if (linkup) {
+		/*
+		 * Quick linkup and all link up on the simulator does not
+		 * trigger or implement:
+		 *	- VerifyCap interrupt
+		 *	- VerifyCap frames
+		 * But rather moves directly to LinkUp.
+		 *
+		 * Do the work of the VerifyCap interrupt handler,
+		 * handle_verify_cap(), but do not try moving the state to
+		 * LinkUp as we are already there.
+		 *
+		 * NOTE: This uses this device's vAU, vCU, and vl15_init for
+		 * the remote values.  Both sides must be using the values.
+		 */
+		if (quick_linkup
+			    || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+			set_up_vl15(dd, dd->vau, dd->vl15_init);
+			assign_remote_cm_au_table(dd, dd->vcu);
+			ppd->neighbor_guid =
+				read_csr(dd,
+					DC_DC8051_STS_REMOTE_GUID);
+			ppd->neighbor_type =
+				read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+					DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+			ppd->neighbor_port_number =
+				read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+					DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+			dd_dev_info(dd,
+				"Neighbor GUID: %llx Neighbor type %d\n",
+				ppd->neighbor_guid,
+				ppd->neighbor_type);
+		}
+
+		/* physical link went up */
+		ppd->linkup = 1;
+		ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
+
+		/* link widths are not available until the link is fully up */
+		get_linkup_link_widths(ppd);
+
+	} else {
+		/* physical link went down */
+		ppd->linkup = 0;
+
+		/* clear HW details of the previous connection */
+		reset_link_credits(dd);
+
+		/* freeze after a link down to guarantee a clean egress */
+		start_freeze_handling(ppd, FREEZE_SELF|FREEZE_LINK_DOWN);
+
+		ev = IB_EVENT_PORT_ERR;
+
+		hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
+
+		/* if we are down, the neighbor is down */
+		ppd->neighbor_normal = 0;
+
+		/* notify IB of the link change */
+		signal_ib_event(ppd, ev);
+	}
+
+
+}
+
+/*
+ * Handle receive or urgent interrupts for user contexts.  This means a user
+ * process was waiting for a packet to arrive, and didn't want to poll.
+ */
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_devdata *dd = rcd->dd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	if (!rcd->cnt)
+		goto done;
+
+	if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
+		wake_up_interruptible(&rcd->wait);
+		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
+	} else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
+							&rcd->event_flags)) {
+		rcd->urgent++;
+		wake_up_interruptible(&rcd->wait);
+	}
+done:
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+}
diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h
new file mode 100644
index 0000000..fa361b4
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/iowait.h
@@ -0,0 +1,186 @@
+#ifndef _HFI1_IOWAIT_H
+#define _HFI1_IOWAIT_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
+/*
+ * typedef (*restart_t)() - restart callback
+ * @work: pointer to work structure
+ */
+typedef void (*restart_t)(struct work_struct *work);
+
+struct sdma_txreq;
+struct sdma_engine;
+/**
+ * struct iowait - linkage for delayed progress/waiting
+ * @list: used to add/insert into QP/PQ wait lists
+ * @tx_head: overflow list of sdma_txreq's
+ * @sleep: no space callback
+ * @wakeup: space callback
+ * @iowork: workqueue overhead
+ * @wait_dma: wait for sdma_busy == 0
+ * @sdma_busy: # of packets in flight
+ * @count: total number of descriptors in tx_head'ed list
+ * @tx_limit: limit for overflow queuing
+ * @tx_count: number of tx entry's in tx_head'ed list
+ *
+ * This is to be embedded in user's state structure
+ * (QP or PQ).
+ *
+ * The sleep and wakeup members are a
+ * bit misnamed.   They do not strictly
+ * speaking sleep or wake up, but they
+ * are callbacks for the ULP to implement
+ * what ever queuing/dequeuing of
+ * the embedded iowait and its containing struct
+ * when a resource shortage like SDMA ring space is seen.
+ *
+ * Both potentially have locks help
+ * so sleeping is not allowed.
+ *
+ * The wait_dma member along with the iow
+ */
+
+struct iowait {
+	struct list_head list;
+	struct list_head tx_head;
+	int (*sleep)(
+		struct sdma_engine *sde,
+		struct iowait *wait,
+		struct sdma_txreq *tx,
+		unsigned seq);
+	void (*wakeup)(struct iowait *wait, int reason);
+	struct work_struct iowork;
+	wait_queue_head_t wait_dma;
+	atomic_t sdma_busy;
+	u32 count;
+	u32 tx_limit;
+	u32 tx_count;
+};
+
+#define SDMA_AVAIL_REASON 0
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @wakeup: wakeup function for no space
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+
+static inline void iowait_init(
+	struct iowait *wait,
+	u32 tx_limit,
+	void (*func)(struct work_struct *work),
+	int (*sleep)(
+		struct sdma_engine *sde,
+		struct iowait *wait,
+		struct sdma_txreq *tx,
+		unsigned seq),
+	void (*wakeup)(struct iowait *wait, int reason))
+{
+	wait->count = 0;
+	INIT_LIST_HEAD(&wait->list);
+	INIT_LIST_HEAD(&wait->tx_head);
+	INIT_WORK(&wait->iowork, func);
+	init_waitqueue_head(&wait->wait_dma);
+	atomic_set(&wait->sdma_busy, 0);
+	wait->tx_limit = tx_limit;
+	wait->sleep = sleep;
+	wait->wakeup = wakeup;
+}
+
+/**
+ * iowait_schedule() - initialize wait structure
+ * @wait: wait struct to schedule
+ * @wq: workqueue for schedule
+ */
+static inline void iowait_schedule(
+	struct iowait *wait,
+	struct workqueue_struct *wq)
+{
+	queue_work(wq, &wait->iowork);
+}
+
+/**
+ * iowait_sdma_drain() - wait for DMAs to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait sdmas have
+ * completed.
+ */
+static inline void iowait_sdma_drain(struct iowait *wait)
+{
+	wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
+}
+
+/**
+ * iowait_drain_wakeup() - trigger iowait_drain() waiter
+ *
+ * @wait: iowait structure
+ *
+ * This will trigger any waiters.
+ */
+static inline void iowait_drain_wakeup(struct iowait *wait)
+{
+	wake_up(&wait->wait_dma);
+}
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/keys.c b/drivers/staging/rdma/hfi1/keys.c
new file mode 100644
index 0000000..f6eff17
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/keys.c
@@ -0,0 +1,411 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/**
+ * hfi1_alloc_lkey - allocate an lkey
+ * @mr: memory region that this lkey protects
+ * @dma_region: 0->normal key, 1->restricted DMA key
+ *
+ * Returns 0 if successful, otherwise returns -errno.
+ *
+ * Increments mr reference count as required.
+ *
+ * Sets the lkey field mr for non-dma regions.
+ *
+ */
+
+int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region)
+{
+	unsigned long flags;
+	u32 r;
+	u32 n;
+	int ret = 0;
+	struct hfi1_ibdev *dev = to_idev(mr->pd->device);
+	struct hfi1_lkey_table *rkt = &dev->lk_table;
+
+	hfi1_get_mr(mr);
+	spin_lock_irqsave(&rkt->lock, flags);
+
+	/* special case for dma_mr lkey == 0 */
+	if (dma_region) {
+		struct hfi1_mregion *tmr;
+
+		tmr = rcu_access_pointer(dev->dma_mr);
+		if (!tmr) {
+			rcu_assign_pointer(dev->dma_mr, mr);
+			mr->lkey_published = 1;
+		} else {
+			hfi1_put_mr(mr);
+		}
+		goto success;
+	}
+
+	/* Find the next available LKEY */
+	r = rkt->next;
+	n = r;
+	for (;;) {
+		if (!rcu_access_pointer(rkt->table[r]))
+			break;
+		r = (r + 1) & (rkt->max - 1);
+		if (r == n)
+			goto bail;
+	}
+	rkt->next = (r + 1) & (rkt->max - 1);
+	/*
+	 * Make sure lkey is never zero which is reserved to indicate an
+	 * unrestricted LKEY.
+	 */
+	rkt->gen++;
+	/*
+	 * bits are capped in verbs.c to ensure enough bits for
+	 * generation number
+	 */
+	mr->lkey = (r << (32 - hfi1_lkey_table_size)) |
+		((((1 << (24 - hfi1_lkey_table_size)) - 1) & rkt->gen)
+		 << 8);
+	if (mr->lkey == 0) {
+		mr->lkey |= 1 << 8;
+		rkt->gen++;
+	}
+	rcu_assign_pointer(rkt->table[r], mr);
+	mr->lkey_published = 1;
+success:
+	spin_unlock_irqrestore(&rkt->lock, flags);
+out:
+	return ret;
+bail:
+	hfi1_put_mr(mr);
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	ret = -ENOMEM;
+	goto out;
+}
+
+/**
+ * hfi1_free_lkey - free an lkey
+ * @mr: mr to free from tables
+ */
+void hfi1_free_lkey(struct hfi1_mregion *mr)
+{
+	unsigned long flags;
+	u32 lkey = mr->lkey;
+	u32 r;
+	struct hfi1_ibdev *dev = to_idev(mr->pd->device);
+	struct hfi1_lkey_table *rkt = &dev->lk_table;
+	int freed = 0;
+
+	spin_lock_irqsave(&rkt->lock, flags);
+	if (!mr->lkey_published)
+		goto out;
+	if (lkey == 0)
+		RCU_INIT_POINTER(dev->dma_mr, NULL);
+	else {
+		r = lkey >> (32 - hfi1_lkey_table_size);
+		RCU_INIT_POINTER(rkt->table[r], NULL);
+	}
+	mr->lkey_published = 0;
+	freed++;
+out:
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	if (freed) {
+		synchronize_rcu();
+		hfi1_put_mr(mr);
+	}
+}
+
+/**
+ * hfi1_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @pd: protection domain
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * increments the reference count upon success
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd,
+		 struct hfi1_sge *isge, struct ib_sge *sge, int acc)
+{
+	struct hfi1_mregion *mr;
+	unsigned n, m;
+	size_t off;
+
+	/*
+	 * We use LKEY == zero for kernel virtual addresses
+	 * (see hfi1_get_dma_mr and dma.c).
+	 */
+	rcu_read_lock();
+	if (sge->lkey == 0) {
+		struct hfi1_ibdev *dev = to_idev(pd->ibpd.device);
+
+		if (pd->user)
+			goto bail;
+		mr = rcu_dereference(dev->dma_mr);
+		if (!mr)
+			goto bail;
+		atomic_inc(&mr->refcount);
+		rcu_read_unlock();
+
+		isge->mr = mr;
+		isge->vaddr = (void *) sge->addr;
+		isge->length = sge->length;
+		isge->sge_length = sge->length;
+		isge->m = 0;
+		isge->n = 0;
+		goto ok;
+	}
+	mr = rcu_dereference(
+		rkt->table[(sge->lkey >> (32 - hfi1_lkey_table_size))]);
+	if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+		goto bail;
+
+	off = sge->addr - mr->user_base;
+	if (unlikely(sge->addr < mr->user_base ||
+		     off + sge->length > mr->length ||
+		     (mr->access_flags & acc) != acc))
+		goto bail;
+	atomic_inc(&mr->refcount);
+	rcu_read_unlock();
+
+	off += mr->offset;
+	if (mr->page_shift) {
+		/*
+		page sizes are uniform power of 2 so no loop is necessary
+		entries_spanned_by_off is the number of times the loop below
+		would have executed.
+		*/
+		size_t entries_spanned_by_off;
+
+		entries_spanned_by_off = off >> mr->page_shift;
+		off -= (entries_spanned_by_off << mr->page_shift);
+		m = entries_spanned_by_off / HFI1_SEGSZ;
+		n = entries_spanned_by_off % HFI1_SEGSZ;
+	} else {
+		m = 0;
+		n = 0;
+		while (off >= mr->map[m]->segs[n].length) {
+			off -= mr->map[m]->segs[n].length;
+			n++;
+			if (n >= HFI1_SEGSZ) {
+				m++;
+				n = 0;
+			}
+		}
+	}
+	isge->mr = mr;
+	isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+	isge->length = mr->map[m]->segs[n].length - off;
+	isge->sge_length = sge->length;
+	isge->m = m;
+	isge->n = n;
+ok:
+	return 1;
+bail:
+	rcu_read_unlock();
+	return 0;
+}
+
+/**
+ * hfi1_rkey_ok - check the IB virtual address, length, and RKEY
+ * @qp: qp for validation
+ * @sge: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ *
+ * increments the reference count upon success
+ */
+int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge,
+		 u32 len, u64 vaddr, u32 rkey, int acc)
+{
+	struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+	struct hfi1_mregion *mr;
+	unsigned n, m;
+	size_t off;
+
+	/*
+	 * We use RKEY == zero for kernel virtual addresses
+	 * (see hfi1_get_dma_mr and dma.c).
+	 */
+	rcu_read_lock();
+	if (rkey == 0) {
+		struct hfi1_pd *pd = to_ipd(qp->ibqp.pd);
+		struct hfi1_ibdev *dev = to_idev(pd->ibpd.device);
+
+		if (pd->user)
+			goto bail;
+		mr = rcu_dereference(dev->dma_mr);
+		if (!mr)
+			goto bail;
+		atomic_inc(&mr->refcount);
+		rcu_read_unlock();
+
+		sge->mr = mr;
+		sge->vaddr = (void *) vaddr;
+		sge->length = len;
+		sge->sge_length = len;
+		sge->m = 0;
+		sge->n = 0;
+		goto ok;
+	}
+
+	mr = rcu_dereference(
+		rkt->table[(rkey >> (32 - hfi1_lkey_table_size))]);
+	if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+		goto bail;
+
+	off = vaddr - mr->iova;
+	if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+		     (mr->access_flags & acc) == 0))
+		goto bail;
+	atomic_inc(&mr->refcount);
+	rcu_read_unlock();
+
+	off += mr->offset;
+	if (mr->page_shift) {
+		/*
+		page sizes are uniform power of 2 so no loop is necessary
+		entries_spanned_by_off is the number of times the loop below
+		would have executed.
+		*/
+		size_t entries_spanned_by_off;
+
+		entries_spanned_by_off = off >> mr->page_shift;
+		off -= (entries_spanned_by_off << mr->page_shift);
+		m = entries_spanned_by_off / HFI1_SEGSZ;
+		n = entries_spanned_by_off % HFI1_SEGSZ;
+	} else {
+		m = 0;
+		n = 0;
+		while (off >= mr->map[m]->segs[n].length) {
+			off -= mr->map[m]->segs[n].length;
+			n++;
+			if (n >= HFI1_SEGSZ) {
+				m++;
+				n = 0;
+			}
+		}
+	}
+	sge->mr = mr;
+	sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+	sge->length = mr->map[m]->segs[n].length - off;
+	sge->sge_length = len;
+	sge->m = m;
+	sge->n = n;
+ok:
+	return 1;
+bail:
+	rcu_read_unlock();
+	return 0;
+}
+
+/*
+ * Initialize the memory region specified by the work request.
+ */
+int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr)
+{
+	struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+	struct hfi1_pd *pd = to_ipd(qp->ibqp.pd);
+	struct hfi1_mregion *mr;
+	u32 rkey = wr->wr.fast_reg.rkey;
+	unsigned i, n, m;
+	int ret = -EINVAL;
+	unsigned long flags;
+	u64 *page_list;
+	size_t ps;
+
+	spin_lock_irqsave(&rkt->lock, flags);
+	if (pd->user || rkey == 0)
+		goto bail;
+
+	mr = rcu_dereference_protected(
+		rkt->table[(rkey >> (32 - hfi1_lkey_table_size))],
+		lockdep_is_held(&rkt->lock));
+	if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd))
+		goto bail;
+
+	if (wr->wr.fast_reg.page_list_len > mr->max_segs)
+		goto bail;
+
+	ps = 1UL << wr->wr.fast_reg.page_shift;
+	if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len)
+		goto bail;
+
+	mr->user_base = wr->wr.fast_reg.iova_start;
+	mr->iova = wr->wr.fast_reg.iova_start;
+	mr->lkey = rkey;
+	mr->length = wr->wr.fast_reg.length;
+	mr->access_flags = wr->wr.fast_reg.access_flags;
+	page_list = wr->wr.fast_reg.page_list->page_list;
+	m = 0;
+	n = 0;
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+		mr->map[m]->segs[n].vaddr = (void *) page_list[i];
+		mr->map[m]->segs[n].length = ps;
+		if (++n == HFI1_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+
+	ret = 0;
+bail:
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c
new file mode 100644
index 0000000..0a18fee
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mad.c
@@ -0,0 +1,4257 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
+			/ (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+
+/* the reset value from the FM is supposed to be 0xffff, handle both */
+#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
+#define OPA_LINK_WIDTH_RESET 0xffff
+
+static int reply(struct ib_mad_hdr *smp)
+{
+	/*
+	 * The verbs framework will handle the directed/LID route
+	 * packet changes.
+	 */
+	smp->method = IB_MGMT_METHOD_GET_RESP;
+	if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		smp->status |= IB_SMP_DIRECTION;
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static inline void clear_opa_smp_data(struct opa_smp *smp)
+{
+	void *data = opa_get_smp_data(smp);
+	size_t size = opa_get_smp_data_size(smp);
+
+	memset(data, 0, size);
+}
+
+static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
+{
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_agent *agent;
+	struct ib_smp *smp;
+	int ret;
+	unsigned long flags;
+	unsigned long timeout;
+	int pkey_idx;
+	u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
+
+	agent = ibp->send_agent;
+	if (!agent)
+		return;
+
+	/* o14-3.2.1 */
+	if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
+		return;
+
+	/* o14-2 */
+	if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout))
+		return;
+
+	pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+	if (pkey_idx < 0) {
+		pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n",
+			__func__, hfi1_get_pkey(ibp, 1));
+		pkey_idx = 1;
+	}
+
+	send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
+				      IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				      GFP_ATOMIC, IB_MGMT_BASE_VERSION);
+	if (IS_ERR(send_buf))
+		return;
+
+	smp = send_buf->mad;
+	smp->base_version = IB_MGMT_BASE_VERSION;
+	smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	smp->class_version = 1;
+	smp->method = IB_MGMT_METHOD_TRAP;
+	ibp->tid++;
+	smp->tid = cpu_to_be64(ibp->tid);
+	smp->attr_id = IB_SMP_ATTR_NOTICE;
+	/* o14-1: smp->mkey = 0; */
+	memcpy(smp->data, data, len);
+
+	spin_lock_irqsave(&ibp->lock, flags);
+	if (!ibp->sm_ah) {
+		if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+			struct ib_ah *ah;
+
+			ah = hfi1_create_qp0_ah(ibp, ibp->sm_lid);
+			if (IS_ERR(ah))
+				ret = PTR_ERR(ah);
+			else {
+				send_buf->ah = ah;
+				ibp->sm_ah = to_iah(ah);
+				ret = 0;
+			}
+		} else
+			ret = -EINVAL;
+	} else {
+		send_buf->ah = &ibp->sm_ah->ibah;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&ibp->lock, flags);
+
+	if (!ret)
+		ret = ib_post_send_mad(send_buf, NULL);
+	if (!ret) {
+		/* 4.096 usec. */
+		timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000;
+		ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout);
+	} else {
+		ib_free_send_mad(send_buf);
+		ibp->trap_timeout = 0;
+	}
+}
+
+/*
+ * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+		    u32 qp1, u32 qp2, __be16 lid1, __be16 lid2)
+{
+	struct ib_mad_notice_attr data;
+
+	if (trap_num == IB_NOTICE_TRAP_BAD_PKEY)
+		ibp->pkey_violations++;
+	else
+		ibp->qkey_violations++;
+	ibp->n_pkt_drops++;
+
+	/* Send violation trap */
+	data.generic_type = IB_NOTICE_TYPE_SECURITY;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = trap_num;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof(data.details));
+	data.details.ntc_257_258.lid1 = lid1;
+	data.details.ntc_257_258.lid2 = lid2;
+	data.details.ntc_257_258.key = cpu_to_be32(key);
+	data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1);
+	data.details.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+	send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a bad M_Key trap (ch. 14.3.9).
+ */
+static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+		     __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
+{
+	struct ib_mad_notice_attr data;
+
+	/* Send violation trap */
+	data.generic_type = IB_NOTICE_TYPE_SECURITY;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = IB_NOTICE_TRAP_BAD_MKEY;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof(data.details));
+	data.details.ntc_256.lid = data.issuer_lid;
+	data.details.ntc_256.method = mad->method;
+	data.details.ntc_256.attr_id = mad->attr_id;
+	data.details.ntc_256.attr_mod = mad->attr_mod;
+	data.details.ntc_256.mkey = mkey;
+	if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+
+		data.details.ntc_256.dr_slid = (__force __be16)dr_slid;
+		data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
+		if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) {
+			data.details.ntc_256.dr_trunc_hop |=
+				IB_NOTICE_TRAP_DR_TRUNC;
+			hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path);
+		}
+		data.details.ntc_256.dr_trunc_hop |= hop_cnt;
+		memcpy(data.details.ntc_256.dr_rtn_path, return_path,
+		       hop_cnt);
+	}
+
+	send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Port Capability Mask Changed trap (ch. 14.3.11).
+ */
+void hfi1_cap_mask_chg(struct hfi1_ibport *ibp)
+{
+	struct ib_mad_notice_attr data;
+
+	data.generic_type = IB_NOTICE_TYPE_INFO;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof(data.details));
+	data.details.ntc_144.lid = data.issuer_lid;
+	data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+
+	send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a System Image GUID Changed trap (ch. 14.3.12).
+ */
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
+{
+	struct ib_mad_notice_attr data;
+
+	data.generic_type = IB_NOTICE_TYPE_INFO;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof(data.details));
+	data.details.ntc_145.lid = data.issuer_lid;
+	data.details.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
+
+	send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Node Description Changed trap (ch. 14.3.13).
+ */
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
+{
+	struct ib_mad_notice_attr data;
+
+	data.generic_type = IB_NOTICE_TYPE_INFO;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof(data.details));
+	data.details.ntc_144.lid = data.issuer_lid;
+	data.details.ntc_144.local_changes = 1;
+	data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG;
+
+	send_trap(ibp, &data, sizeof(data));
+}
+
+static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
+				   u8 *data, struct ib_device *ibdev,
+				   u8 port, u32 *resp_len)
+{
+	struct opa_node_description *nd;
+
+	if (am) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	nd = (struct opa_node_description *)data;
+
+	memcpy(nd->data, ibdev->node_desc, sizeof(nd->data));
+
+	if (resp_len)
+		*resp_len += sizeof(*nd);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct opa_node_info *ni;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
+
+	ni = (struct opa_node_info *)data;
+
+	/* GUID 0 is illegal */
+	if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+	ni->base_version = OPA_MGMT_BASE_VERSION;
+	ni->class_version = OPA_SMI_CLASS_VERSION;
+	ni->node_type = 1;     /* channel adapter */
+	ni->num_ports = ibdev->phys_port_cnt;
+	/* This is already in network order */
+	ni->system_image_guid = ib_hfi1_sys_image_guid;
+	/* Use first-port GUID as node */
+	ni->node_guid = cpu_to_be64(dd->pport->guid);
+	ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+	ni->device_id = cpu_to_be16(dd->pcidev->device);
+	ni->revision = cpu_to_be32(dd->minrev);
+	ni->local_port_num = port;
+	ni->vendor_id[0] = dd->oui1;
+	ni->vendor_id[1] = dd->oui2;
+	ni->vendor_id[2] = dd->oui3;
+
+	if (resp_len)
+		*resp_len += sizeof(*ni);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
+
+	/* GUID 0 is illegal */
+	if (smp->attr_mod || pidx >= dd->num_pports ||
+	    dd->pport[pidx].guid == 0)
+		smp->status |= IB_SMP_INVALID_FIELD;
+	else
+		nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+
+	nip->base_version = OPA_MGMT_BASE_VERSION;
+	nip->class_version = OPA_SMI_CLASS_VERSION;
+	nip->node_type = 1;     /* channel adapter */
+	nip->num_ports = ibdev->phys_port_cnt;
+	/* This is already in network order */
+	nip->sys_guid = ib_hfi1_sys_image_guid;
+	 /* Use first-port GUID as node */
+	nip->node_guid = cpu_to_be64(dd->pport->guid);
+	nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+	nip->device_id = cpu_to_be16(dd->pcidev->device);
+	nip->revision = cpu_to_be32(dd->minrev);
+	nip->local_port_num = port;
+	nip->vendor_id[0] = dd->oui1;
+	nip->vendor_id[1] = dd->oui2;
+	nip->vendor_id[2] = dd->oui3;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+	(void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+	(void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
+}
+
+static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
+{
+	(void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
+}
+
+static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+		      int mad_flags, __be64 mkey, __be32 dr_slid,
+		      u8 return_path[], u8 hop_cnt)
+{
+	int valid_mkey = 0;
+	int ret = 0;
+
+	/* Is the mkey in the process of expiring? */
+	if (ibp->mkey_lease_timeout &&
+	    time_after_eq(jiffies, ibp->mkey_lease_timeout)) {
+		/* Clear timeout and mkey protection field. */
+		ibp->mkey_lease_timeout = 0;
+		ibp->mkeyprot = 0;
+	}
+
+	if ((mad_flags & IB_MAD_IGNORE_MKEY) ||  ibp->mkey == 0 ||
+	    ibp->mkey == mkey)
+		valid_mkey = 1;
+
+	/* Unset lease timeout on any valid Get/Set/TrapRepress */
+	if (valid_mkey && ibp->mkey_lease_timeout &&
+	    (mad->method == IB_MGMT_METHOD_GET ||
+	     mad->method == IB_MGMT_METHOD_SET ||
+	     mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
+		ibp->mkey_lease_timeout = 0;
+
+	if (!valid_mkey) {
+		switch (mad->method) {
+		case IB_MGMT_METHOD_GET:
+			/* Bad mkey not a violation below level 2 */
+			if (ibp->mkeyprot < 2)
+				break;
+		case IB_MGMT_METHOD_SET:
+		case IB_MGMT_METHOD_TRAP_REPRESS:
+			if (ibp->mkey_violations != 0xFFFF)
+				++ibp->mkey_violations;
+			if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period)
+				ibp->mkey_lease_timeout = jiffies +
+					ibp->mkey_lease_period * HZ;
+			/* Generate a trap notice. */
+			bad_mkey(ibp, mad, mkey, dr_slid, return_path,
+				 hop_cnt);
+			ret = 1;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * The SMA caches reads from LCB registers in case the LCB is unavailable.
+ * (The LCB is unavailable in certain link states, for example.)
+ */
+struct lcb_datum {
+	u32 off;
+	u64 val;
+};
+
+static struct lcb_datum lcb_cache[] = {
+	{ DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
+};
+
+static int write_lcb_cache(u32 off, u64 val)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+		if (lcb_cache[i].off == off) {
+			lcb_cache[i].val = val;
+			return 0;
+		}
+	}
+
+	pr_warn("%s bad offset 0x%x\n", __func__, off);
+	return -1;
+}
+
+static int read_lcb_cache(u32 off, u64 *val)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+		if (lcb_cache[i].off == off) {
+			*val = lcb_cache[i].val;
+			return 0;
+		}
+	}
+
+	pr_warn("%s bad offset 0x%x\n", __func__, off);
+	return -1;
+}
+
+void read_ltp_rtt(struct hfi1_devdata *dd)
+{
+	u64 reg;
+
+	if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &reg))
+		dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
+	else
+		write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
+}
+
+static u8 __opa_porttype(struct hfi1_pportdata *ppd)
+{
+	if (qsfp_mod_present(ppd)) {
+		if (ppd->qsfp_info.cache_valid)
+			return OPA_PORT_TYPE_STANDARD;
+		return OPA_PORT_TYPE_DISCONNECTED;
+	}
+	return OPA_PORT_TYPE_UNKNOWN;
+}
+
+static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	int i;
+	struct hfi1_devdata *dd;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_ibport *ibp;
+	struct opa_port_info *pi = (struct opa_port_info *)data;
+	u8 mtu;
+	u8 credit_rate;
+	u32 state;
+	u32 num_ports = OPA_AM_NPORT(am);
+	u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+	u32 buffer_units;
+	u64 tmp = 0;
+
+	if (num_ports != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	dd = dd_from_ibdev(ibdev);
+	/* IB numbers ports from 1, hw from 0 */
+	ppd = dd->pport + (port - 1);
+	ibp = &ppd->ibport_data;
+
+	if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+		ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	pi->lid = cpu_to_be32(ppd->lid);
+
+	/* Only return the mkey if the protection field allows it. */
+	if (!(smp->method == IB_MGMT_METHOD_GET &&
+	      ibp->mkey != smp->mkey &&
+	      ibp->mkeyprot == 1))
+		pi->mkey = ibp->mkey;
+
+	pi->subnet_prefix = ibp->gid_prefix;
+	pi->sm_lid = cpu_to_be32(ibp->sm_lid);
+	pi->ib_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+	pi->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period);
+	pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
+	pi->sa_qp = cpu_to_be32(ppd->sa_qp);
+
+	pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
+	pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
+	pi->link_width.active = cpu_to_be16(ppd->link_width_active);
+
+	pi->link_width_downgrade.supported =
+			cpu_to_be16(ppd->link_width_downgrade_supported);
+	pi->link_width_downgrade.enabled =
+			cpu_to_be16(ppd->link_width_downgrade_enabled);
+	pi->link_width_downgrade.tx_active =
+			cpu_to_be16(ppd->link_width_downgrade_tx_active);
+	pi->link_width_downgrade.rx_active =
+			cpu_to_be16(ppd->link_width_downgrade_rx_active);
+
+	pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
+	pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
+	pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
+
+	state = driver_lstate(ppd);
+
+	if (start_of_sm_config && (state == IB_PORT_INIT))
+		ppd->is_sm_config_started = 1;
+
+	pi->port_phys_conf = __opa_porttype(ppd) & 0xf;
+
+#if PI_LED_ENABLE_SUP
+	pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+	pi->port_states.ledenable_offlinereason |=
+		ppd->is_sm_config_started << 5;
+	pi->port_states.ledenable_offlinereason |=
+		ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
+#else
+	pi->port_states.offline_reason = ppd->neighbor_normal << 4;
+	pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+	pi->port_states.offline_reason |= ppd->offline_disabled_reason &
+						OPA_PI_MASK_OFFLINE_REASON;
+#endif /* PI_LED_ENABLE_SUP */
+
+	pi->port_states.portphysstate_portstate =
+		(hfi1_ibphys_portstate(ppd) << 4) | state;
+
+	pi->mkeyprotect_lmc = (ibp->mkeyprot << 6) | ppd->lmc;
+
+	memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
+	for (i = 0; i < ppd->vls_supported; i++) {
+		mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
+		if ((i % 2) == 0)
+			pi->neigh_mtu.pvlx_to_mtu[i/2] |= (mtu << 4);
+		else
+			pi->neigh_mtu.pvlx_to_mtu[i/2] |= mtu;
+	}
+	/* don't forget VL 15 */
+	mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
+	pi->neigh_mtu.pvlx_to_mtu[15/2] |= mtu;
+	pi->smsl = ibp->sm_sl & OPA_PI_MASK_SMSL;
+	pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
+	pi->partenforce_filterraw |=
+		(ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
+	if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
+		pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
+	if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
+		pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
+	pi->mkey_violations = cpu_to_be16(ibp->mkey_violations);
+	/* P_KeyViolations are counted by hardware. */
+	pi->pkey_violations = cpu_to_be16(ibp->pkey_violations);
+	pi->qkey_violations = cpu_to_be16(ibp->qkey_violations);
+
+	pi->vl.cap = ppd->vls_supported;
+	pi->vl.high_limit = cpu_to_be16(ibp->vl_high_limit);
+	pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
+	pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
+
+	pi->clientrereg_subnettimeout = ibp->subnet_timeout;
+
+	pi->port_link_mode  = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
+					  OPA_PORT_LINK_MODE_OPA << 5 |
+					  OPA_PORT_LINK_MODE_OPA);
+
+	pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
+
+	pi->port_mode = cpu_to_be16(
+				ppd->is_active_optimize_enabled ?
+					OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
+
+	pi->port_packet_format.supported =
+		cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+	pi->port_packet_format.enabled =
+		cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+
+	/* flit_control.interleave is (OPA V1, version .76):
+	 * bits		use
+	 * ----		---
+	 * 2		res
+	 * 2		DistanceSupported
+	 * 2		DistanceEnabled
+	 * 5		MaxNextLevelTxEnabled
+	 * 5		MaxNestLevelRxSupported
+	 *
+	 * HFI supports only "distance mode 1" (see OPA V1, version .76,
+	 * section 9.6.2), so set DistanceSupported, DistanceEnabled
+	 * to 0x1.
+	 */
+	pi->flit_control.interleave = cpu_to_be16(0x1400);
+
+	pi->link_down_reason = ppd->local_link_down_reason.sma;
+	pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
+	pi->port_error_action = cpu_to_be32(ppd->port_error_action);
+	pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
+
+	/* 32.768 usec. response time (guessing) */
+	pi->resptimevalue = 3;
+
+	pi->local_port_num = port;
+
+	/* buffer info for FM */
+	pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
+
+	pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
+	pi->neigh_port_num = ppd->neighbor_port_number;
+	pi->port_neigh_mode =
+		(ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
+		(ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
+		(ppd->neighbor_fm_security ?
+			OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
+
+	/* HFIs shall always return VL15 credits to their
+	 * neighbor in a timely manner, without any credit return pacing.
+	 */
+	credit_rate = 0;
+	buffer_units  = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
+	buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
+	buffer_units |= (credit_rate << 6) &
+				OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
+	buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
+	pi->buffer_units = cpu_to_be32(buffer_units);
+
+	pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
+
+	/* HFI supports a replay buffer 128 LTPs in size */
+	pi->replay_depth.buffer = 0x80;
+	/* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+	read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
+
+	/* this counter is 16 bits wide, but the replay_depth.wire
+	 * variable is only 8 bits */
+	if (tmp > 0xff)
+		tmp = 0xff;
+	pi->replay_depth.wire = tmp;
+
+	if (resp_len)
+		*resp_len += sizeof(struct opa_port_info);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+/**
+ * get_pkeys - return the PKEY table
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+	struct hfi1_pportdata *ppd = dd->pport + port - 1;
+
+	memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys));
+
+	return 0;
+}
+
+static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+				    struct ib_device *ibdev, u8 port,
+				    u32 *resp_len)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	u32 n_blocks_req = OPA_AM_NBLK(am);
+	u32 start_block = am & 0x7ff;
+	__be16 *p;
+	u16 *q;
+	int i;
+	u16 n_blocks_avail;
+	unsigned npkeys = hfi1_get_npkeys(dd);
+	size_t size;
+
+	if (n_blocks_req == 0) {
+		pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+			port, start_block, n_blocks_req);
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	n_blocks_avail = (u16) (npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+	size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
+
+	if (start_block + n_blocks_req > n_blocks_avail ||
+	    n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+		pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; "
+			"avail 0x%x; blk/smp 0x%lx\n",
+			start_block, n_blocks_req, n_blocks_avail,
+			OPA_NUM_PKEY_BLOCKS_PER_SMP);
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	p = (__be16 *) data;
+	q = (u16 *)data;
+	/* get the real pkeys if we are requesting the first block */
+	if (start_block == 0) {
+		get_pkeys(dd, port, q);
+		for (i = 0; i < npkeys; i++)
+			p[i] = cpu_to_be16(q[i]);
+		if (resp_len)
+			*resp_len += size;
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+enum {
+	HFI_TRANSITION_DISALLOWED,
+	HFI_TRANSITION_IGNORED,
+	HFI_TRANSITION_ALLOWED,
+	HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * Use shortened names to improve readability of
+ * {logical,physical}_state_transitions
+ */
+enum {
+	__D = HFI_TRANSITION_DISALLOWED,
+	__I = HFI_TRANSITION_IGNORED,
+	__A = HFI_TRANSITION_ALLOWED,
+	__U = HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
+ * represented in physical_state_transitions.
+ */
+#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
+
+/*
+ * Within physical_state_transitions, rows represent "old" states,
+ * columns "new" states, and physical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 6-4).
+ */
+static const struct {
+	u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
+} physical_state_transitions = {
+	{
+		/* 2    3    4    5    6    7    8    9   10   11 */
+	/* 2 */	{ __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
+	/* 3 */	{ __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
+	/* 4 */	{ __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+	/* 5 */	{ __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
+	/* 6 */	{ __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+	/* 7 */	{ __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
+	/* 8 */	{ __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+	/* 9 */	{ __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
+	/*10 */	{ __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+	/*11 */	{ __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
+	}
+};
+
+/*
+ * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented
+ * logical_state_transitions
+ */
+
+#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
+
+/*
+ * Within logical_state_transitions rows represent "old" states,
+ * columns "new" states, and logical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 9-12).
+ */
+static const struct {
+	u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
+} logical_state_transitions = {
+	{
+		/* 1    2    3    4    5 */
+	/* 1 */	{ __I, __D, __D, __D, __U},
+	/* 2 */	{ __D, __I, __A, __D, __U},
+	/* 3 */	{ __D, __D, __I, __A, __U},
+	/* 4 */	{ __D, __D, __I, __I, __U},
+	/* 5 */	{ __U, __U, __U, __U, __U},
+	}
+};
+
+static int logical_transition_allowed(int old, int new)
+{
+	if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER ||
+	    new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) {
+		pr_warn("invalid logical state(s) (old %d new %d)\n",
+			old, new);
+		return HFI_TRANSITION_UNDEFINED;
+	}
+
+	if (new == IB_PORT_NOP)
+		return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+	/* adjust states for indexing into logical_state_transitions */
+	old -= IB_PORT_DOWN;
+	new -= IB_PORT_DOWN;
+
+	if (old < 0 || new < 0)
+		return HFI_TRANSITION_UNDEFINED;
+	return logical_state_transitions.allowed[old][new];
+}
+
+static int physical_transition_allowed(int old, int new)
+{
+	if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX ||
+	    new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) {
+		pr_warn("invalid physical state(s) (old %d new %d)\n",
+			old, new);
+		return HFI_TRANSITION_UNDEFINED;
+	}
+
+	if (new == IB_PORTPHYSSTATE_NOP)
+		return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+	/* adjust states for indexing into physical_state_transitions */
+	old -= IB_PORTPHYSSTATE_POLLING;
+	new -= IB_PORTPHYSSTATE_POLLING;
+
+	if (old < 0 || new < 0)
+		return HFI_TRANSITION_UNDEFINED;
+	return physical_state_transitions.allowed[old][new];
+}
+
+static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
+					  u32 logical_new, u32 physical_new)
+{
+	u32 physical_old = driver_physical_state(ppd);
+	u32 logical_old = driver_logical_state(ppd);
+	int ret, logical_allowed, physical_allowed;
+
+	logical_allowed = ret =
+		logical_transition_allowed(logical_old, logical_new);
+
+	if (ret == HFI_TRANSITION_DISALLOWED ||
+	    ret == HFI_TRANSITION_UNDEFINED) {
+		pr_warn("invalid logical state transition %s -> %s\n",
+			opa_lstate_name(logical_old),
+			opa_lstate_name(logical_new));
+		return ret;
+	}
+
+	physical_allowed = ret =
+		physical_transition_allowed(physical_old, physical_new);
+
+	if (ret == HFI_TRANSITION_DISALLOWED ||
+	    ret == HFI_TRANSITION_UNDEFINED) {
+		pr_warn("invalid physical state transition %s -> %s\n",
+			opa_pstate_name(physical_old),
+			opa_pstate_name(physical_new));
+		return ret;
+	}
+
+	if (logical_allowed == HFI_TRANSITION_IGNORED &&
+	    physical_allowed == HFI_TRANSITION_IGNORED)
+		return HFI_TRANSITION_IGNORED;
+
+	/*
+	 * Either physical_allowed or logical_allowed is
+	 * HFI_TRANSITION_ALLOWED.
+	 */
+	return HFI_TRANSITION_ALLOWED;
+}
+
+static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
+			   u32 logical_state, u32 phys_state,
+			   int suppress_idle_sma)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	u32 link_state;
+	int ret;
+
+	ret = port_states_transition_allowed(ppd, logical_state, phys_state);
+	if (ret == HFI_TRANSITION_DISALLOWED ||
+	    ret == HFI_TRANSITION_UNDEFINED) {
+		/* error message emitted above */
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return 0;
+	}
+
+	if (ret == HFI_TRANSITION_IGNORED)
+		return 0;
+
+	if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
+	    !(logical_state == IB_PORT_DOWN ||
+	      logical_state == IB_PORT_NOP)){
+		pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
+			logical_state, phys_state);
+		smp->status |= IB_SMP_INVALID_FIELD;
+	}
+
+	/*
+	 * Logical state changes are summarized in OPAv1g1 spec.,
+	 * Table 9-12; physical state changes are summarized in
+	 * OPAv1g1 spec., Table 6.4.
+	 */
+	switch (logical_state) {
+	case IB_PORT_NOP:
+		if (phys_state == IB_PORTPHYSSTATE_NOP)
+			break;
+		/* FALLTHROUGH */
+	case IB_PORT_DOWN:
+		if (phys_state == IB_PORTPHYSSTATE_NOP)
+			link_state = HLS_DN_DOWNDEF;
+		else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
+			link_state = HLS_DN_POLL;
+			set_link_down_reason(ppd,
+			     OPA_LINKDOWN_REASON_FM_BOUNCE, 0,
+			     OPA_LINKDOWN_REASON_FM_BOUNCE);
+		} else if (phys_state == IB_PORTPHYSSTATE_DISABLED)
+			link_state = HLS_DN_DISABLE;
+		else {
+			pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
+				phys_state);
+			smp->status |= IB_SMP_INVALID_FIELD;
+			break;
+		}
+
+		set_link_state(ppd, link_state);
+		if (link_state == HLS_DN_DISABLE &&
+		    (ppd->offline_disabled_reason >
+		     OPA_LINKDOWN_REASON_SMA_DISABLED ||
+		     ppd->offline_disabled_reason ==
+		     OPA_LINKDOWN_REASON_NONE))
+			ppd->offline_disabled_reason =
+			OPA_LINKDOWN_REASON_SMA_DISABLED;
+		/*
+		 * Don't send a reply if the response would be sent
+		 * through the disabled port.
+		 */
+		if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
+			return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+		break;
+	case IB_PORT_ARMED:
+		ret = set_link_state(ppd, HLS_UP_ARMED);
+		if ((ret == 0) && (suppress_idle_sma == 0))
+			send_idle_sma(dd, SMA_IDLE_ARM);
+		break;
+	case IB_PORT_ACTIVE:
+		if (ppd->neighbor_normal) {
+			ret = set_link_state(ppd, HLS_UP_ACTIVE);
+			if (ret == 0)
+				send_idle_sma(dd, SMA_IDLE_ACTIVE);
+		} else {
+			pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
+			smp->status |= IB_SMP_INVALID_FIELD;
+		}
+		break;
+	default:
+		pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
+			logical_state);
+		smp->status |= IB_SMP_INVALID_FIELD;
+	}
+
+	return 0;
+}
+
+/**
+ * subn_set_opa_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ */
+static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct opa_port_info *pi = (struct opa_port_info *)data;
+	struct ib_event event;
+	struct hfi1_devdata *dd;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_ibport *ibp;
+	u8 clientrereg;
+	unsigned long flags;
+	u32 smlid, opa_lid; /* tmp vars to hold LID values */
+	u16 lid;
+	u8 ls_old, ls_new, ps_new;
+	u8 vls;
+	u8 msl;
+	u8 crc_enabled;
+	u16 lse, lwe, mtu;
+	u32 num_ports = OPA_AM_NPORT(am);
+	u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+	int ret, i, invalid = 0, call_set_mtu = 0;
+	int call_link_downgrade_policy = 0;
+
+	if (num_ports != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	opa_lid = be32_to_cpu(pi->lid);
+	if (opa_lid & 0xFFFF0000) {
+		pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
+		smp->status |= IB_SMP_INVALID_FIELD;
+		goto get_only;
+	}
+
+	lid = (u16)(opa_lid & 0x0000FFFF);
+
+	smlid = be32_to_cpu(pi->sm_lid);
+	if (smlid & 0xFFFF0000) {
+		pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
+		smp->status |= IB_SMP_INVALID_FIELD;
+		goto get_only;
+	}
+	smlid &= 0x0000FFFF;
+
+	clientrereg = (pi->clientrereg_subnettimeout &
+			OPA_PI_MASK_CLIENT_REREGISTER);
+
+	dd = dd_from_ibdev(ibdev);
+	/* IB numbers ports from 1, hw from 0 */
+	ppd = dd->pport + (port - 1);
+	ibp = &ppd->ibport_data;
+	event.device = ibdev;
+	event.element.port_num = port;
+
+	ls_old = driver_lstate(ppd);
+
+	ibp->mkey = pi->mkey;
+	ibp->gid_prefix = pi->subnet_prefix;
+	ibp->mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
+
+	/* Must be a valid unicast LID address. */
+	if ((lid == 0 && ls_old > IB_PORT_INIT) ||
+	     lid >= HFI1_MULTICAST_LID_BASE) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
+			lid);
+	} else if (ppd->lid != lid ||
+		 ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
+		if (ppd->lid != lid)
+			hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
+		if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
+			hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
+		hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
+		event.event = IB_EVENT_LID_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	msl = pi->smsl & OPA_PI_MASK_SMSL;
+	if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
+		ppd->linkinit_reason =
+			(pi->partenforce_filterraw &
+			 OPA_PI_MASK_LINKINIT_REASON);
+	/* enable/disable SW pkey checking as per FM control */
+	if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
+		ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
+	else
+		ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
+
+	if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
+		ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
+	else
+		ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
+
+	/* Must be a valid unicast LID address. */
+	if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
+	     smlid >= HFI1_MULTICAST_LID_BASE) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
+	} else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) {
+		pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
+		spin_lock_irqsave(&ibp->lock, flags);
+		if (ibp->sm_ah) {
+			if (smlid != ibp->sm_lid)
+				ibp->sm_ah->attr.dlid = smlid;
+			if (msl != ibp->sm_sl)
+				ibp->sm_ah->attr.sl = msl;
+		}
+		spin_unlock_irqrestore(&ibp->lock, flags);
+		if (smlid != ibp->sm_lid)
+			ibp->sm_lid = smlid;
+		if (msl != ibp->sm_sl)
+			ibp->sm_sl = msl;
+		event.event = IB_EVENT_SM_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	if (pi->link_down_reason == 0) {
+		ppd->local_link_down_reason.sma = 0;
+		ppd->local_link_down_reason.latest = 0;
+	}
+
+	if (pi->neigh_link_down_reason == 0) {
+		ppd->neigh_link_down_reason.sma = 0;
+		ppd->neigh_link_down_reason.latest = 0;
+	}
+
+	ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
+	ppd->sa_qp = be32_to_cpu(pi->sa_qp);
+
+	ppd->port_error_action = be32_to_cpu(pi->port_error_action);
+	lwe = be16_to_cpu(pi->link_width.enabled);
+	if (lwe) {
+		if (lwe == OPA_LINK_WIDTH_RESET
+				|| lwe == OPA_LINK_WIDTH_RESET_OLD)
+			set_link_width_enabled(ppd, ppd->link_width_supported);
+		else if ((lwe & ~ppd->link_width_supported) == 0)
+			set_link_width_enabled(ppd, lwe);
+		else
+			smp->status |= IB_SMP_INVALID_FIELD;
+	}
+	lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
+	/* LWD.E is always applied - 0 means "disabled" */
+	if (lwe == OPA_LINK_WIDTH_RESET
+			|| lwe == OPA_LINK_WIDTH_RESET_OLD) {
+		set_link_width_downgrade_enabled(ppd,
+				ppd->link_width_downgrade_supported);
+	} else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
+		/* only set and apply if something changed */
+		if (lwe != ppd->link_width_downgrade_enabled) {
+			set_link_width_downgrade_enabled(ppd, lwe);
+			call_link_downgrade_policy = 1;
+		}
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	lse = be16_to_cpu(pi->link_speed.enabled);
+	if (lse) {
+		if (lse & be16_to_cpu(pi->link_speed.supported))
+			set_link_speed_enabled(ppd, lse);
+		else
+			smp->status |= IB_SMP_INVALID_FIELD;
+	}
+
+	ibp->mkeyprot = (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
+	ibp->vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
+	(void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
+				    ibp->vl_high_limit);
+
+	if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+		ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+	for (i = 0; i < ppd->vls_supported; i++) {
+		if ((i % 2) == 0)
+			mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i/2] >> 4)
+					  & 0xF);
+		else
+			mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i/2] & 0xF);
+		if (mtu == 0xffff) {
+			pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
+				mtu,
+				(pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF);
+			smp->status |= IB_SMP_INVALID_FIELD;
+			mtu = hfi1_max_mtu; /* use a valid MTU */
+		}
+		if (dd->vld[i].mtu != mtu) {
+			dd_dev_info(dd,
+				"MTU change on vl %d from %d to %d\n",
+				i, dd->vld[i].mtu, mtu);
+			dd->vld[i].mtu = mtu;
+			call_set_mtu++;
+		}
+	}
+	/* As per OPAV1 spec: VL15 must support and be configured
+	 * for operation with a 2048 or larger MTU.
+	 */
+	mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15/2] & 0xF);
+	if (mtu < 2048 || mtu == 0xffff)
+		mtu = 2048;
+	if (dd->vld[15].mtu != mtu) {
+		dd_dev_info(dd,
+			"MTU change on vl 15 from %d to %d\n",
+			dd->vld[15].mtu, mtu);
+		dd->vld[15].mtu = mtu;
+		call_set_mtu++;
+	}
+	if (call_set_mtu)
+		set_mtu(ppd);
+
+	/* Set operational VLs */
+	vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
+	if (vls) {
+		if (vls > ppd->vls_supported) {
+			pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
+				pi->operational_vls);
+			smp->status |= IB_SMP_INVALID_FIELD;
+		} else {
+			if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
+						vls) == -EINVAL)
+				smp->status |= IB_SMP_INVALID_FIELD;
+		}
+	}
+
+	if (pi->mkey_violations == 0)
+		ibp->mkey_violations = 0;
+
+	if (pi->pkey_violations == 0)
+		ibp->pkey_violations = 0;
+
+	if (pi->qkey_violations == 0)
+		ibp->qkey_violations = 0;
+
+	ibp->subnet_timeout =
+		pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
+
+	crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
+	crc_enabled >>= 4;
+	crc_enabled &= 0xf;
+
+	if (crc_enabled != 0)
+		ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
+
+	ppd->is_active_optimize_enabled =
+			!!(be16_to_cpu(pi->port_mode)
+					& OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
+
+	ls_new = pi->port_states.portphysstate_portstate &
+			OPA_PI_MASK_PORT_STATE;
+	ps_new = (pi->port_states.portphysstate_portstate &
+			OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
+
+	if (ls_old == IB_PORT_INIT) {
+		if (start_of_sm_config) {
+			if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+				ppd->is_sm_config_started = 1;
+		} else if (ls_new == IB_PORT_ARMED) {
+			if (ppd->is_sm_config_started == 0)
+				invalid = 1;
+		}
+	}
+
+	/* Handle CLIENT_REREGISTER event b/c SM asked us for it */
+	if (clientrereg) {
+		event.event = IB_EVENT_CLIENT_REREGISTER;
+		ib_dispatch_event(&event);
+	}
+
+	/*
+	 * Do the port state change now that the other link parameters
+	 * have been set.
+	 * Changing the port physical state only makes sense if the link
+	 * is down or is being set to down.
+	 */
+
+	ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+	if (ret)
+		return ret;
+
+	ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+
+	/* restore re-reg bit per o14-12.2.1 */
+	pi->clientrereg_subnettimeout |= clientrereg;
+
+	/*
+	 * Apply the new link downgrade policy.  This may result in a link
+	 * bounce.  Do this after everything else so things are settled.
+	 * Possible problem: if setting the port state above fails, then
+	 * the policy change is not applied.
+	 */
+	if (call_link_downgrade_policy)
+		apply_link_downgrade_policy(ppd, 0);
+
+	return ret;
+
+get_only:
+	return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+}
+
+/**
+ * set_pkeys - set the PKEY table for ctxt 0
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+	struct hfi1_pportdata *ppd;
+	int i;
+	int changed = 0;
+	int update_includes_mgmt_partition = 0;
+
+	/*
+	 * IB port one/two always maps to context zero/one,
+	 * always a kernel context, no locking needed
+	 * If we get here with ppd setup, no need to check
+	 * that rcd is valid.
+	 */
+	ppd = dd->pport + (port - 1);
+	/*
+	 * If the update does not include the management pkey, don't do it.
+	 */
+	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+		if (pkeys[i] == LIM_MGMT_P_KEY) {
+			update_includes_mgmt_partition = 1;
+			break;
+		}
+	}
+
+	if (!update_includes_mgmt_partition)
+		return 1;
+
+	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+		u16 key = pkeys[i];
+		u16 okey = ppd->pkeys[i];
+
+		if (key == okey)
+			continue;
+		/*
+		 * The SM gives us the complete PKey table. We have
+		 * to ensure that we put the PKeys in the matching
+		 * slots.
+		 */
+		ppd->pkeys[i] = key;
+		changed = 1;
+	}
+
+	if (changed) {
+		struct ib_event event;
+
+		(void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+
+		event.event = IB_EVENT_PKEY_CHANGE;
+		event.device = &dd->verbs_dev.ibdev;
+		event.element.port_num = port;
+		ib_dispatch_event(&event);
+	}
+	return 0;
+}
+
+static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+				    struct ib_device *ibdev, u8 port,
+				    u32 *resp_len)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	u32 n_blocks_sent = OPA_AM_NBLK(am);
+	u32 start_block = am & 0x7ff;
+	u16 *p = (u16 *) data;
+	__be16 *q = (__be16 *)data;
+	int i;
+	u16 n_blocks_avail;
+	unsigned npkeys = hfi1_get_npkeys(dd);
+
+	if (n_blocks_sent == 0) {
+		pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+			port, start_block, n_blocks_sent);
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	n_blocks_avail = (u16)(npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+	if (start_block + n_blocks_sent > n_blocks_avail ||
+	    n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+		pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
+			start_block, n_blocks_sent, n_blocks_avail,
+			OPA_NUM_PKEY_BLOCKS_PER_SMP);
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
+		p[i] = be16_to_cpu(q[i]);
+
+	if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
+}
+
+static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+	u64 *val = (u64 *)data;
+
+	*val++ = read_csr(dd, SEND_SC2VLT0);
+	*val++ = read_csr(dd, SEND_SC2VLT1);
+	*val++ = read_csr(dd, SEND_SC2VLT2);
+	*val++ = read_csr(dd, SEND_SC2VLT3);
+	return 0;
+}
+
+#define ILLEGAL_VL 12
+/*
+ * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except
+ * for SC15, which must map to VL15). If we don't remap things this
+ * way it is possible for VL15 counters to increment when we try to
+ * send on a SC which is mapped to an invalid VL.
+ */
+static void filter_sc2vlt(void *data)
+{
+	int i;
+	u8 *pd = (u8 *)data;
+
+	for (i = 0; i < OPA_MAX_SCS; i++) {
+		if (i == 15)
+			continue;
+		if ((pd[i] & 0x1f) == 0xf)
+			pd[i] = ILLEGAL_VL;
+	}
+}
+
+static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+	u64 *val = (u64 *)data;
+
+	filter_sc2vlt(data);
+
+	write_csr(dd, SEND_SC2VLT0, *val++);
+	write_csr(dd, SEND_SC2VLT1, *val++);
+	write_csr(dd, SEND_SC2VLT2, *val++);
+	write_csr(dd, SEND_SC2VLT3, *val++);
+	write_seqlock_irq(&dd->sc2vl_lock);
+	memcpy(dd->sc2vl, (u64 *)data, sizeof(dd->sc2vl));
+	write_sequnlock_irq(&dd->sc2vl_lock);
+	return 0;
+}
+
+static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	u8 *p = (u8 *)data;
+	size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
+	unsigned i;
+
+	if (am) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
+		*p++ = ibp->sl_to_sc[i];
+
+	if (resp_len)
+		*resp_len += size;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	u8 *p = (u8 *)data;
+	int i;
+
+	if (am) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	for (i = 0; i <  ARRAY_SIZE(ibp->sl_to_sc); i++)
+		ibp->sl_to_sc[i] = *p++;
+
+	return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	u8 *p = (u8 *)data;
+	size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
+	unsigned i;
+
+	if (am) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+		*p++ = ibp->sc_to_sl[i];
+
+	if (resp_len)
+		*resp_len += size;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	u8 *p = (u8 *)data;
+	int i;
+
+	if (am) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+		ibp->sc_to_sl[i] = *p++;
+
+	return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+				    struct ib_device *ibdev, u8 port,
+				    u32 *resp_len)
+{
+	u32 n_blocks = OPA_AM_NBLK(am);
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	void *vp = (void *) data;
+	size_t size = 4 * sizeof(u64);
+
+	if (n_blocks != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	get_sc2vlt_tables(dd, vp);
+
+	if (resp_len)
+		*resp_len += size;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+				    struct ib_device *ibdev, u8 port,
+				    u32 *resp_len)
+{
+	u32 n_blocks = OPA_AM_NBLK(am);
+	int async_update = OPA_AM_ASYNC(am);
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	void *vp = (void *) data;
+	struct hfi1_pportdata *ppd;
+	int lstate;
+
+	if (n_blocks != 1 || async_update) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	/* IB numbers ports from 1, hw from 0 */
+	ppd = dd->pport + (port - 1);
+	lstate = driver_lstate(ppd);
+	/* it's known that async_update is 0 by this point, but include
+	 * the explicit check for clarity */
+	if (!async_update &&
+	    (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	set_sc2vlt_tables(dd, vp);
+
+	return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+				     struct ib_device *ibdev, u8 port,
+				     u32 *resp_len)
+{
+	u32 n_blocks = OPA_AM_NPORT(am);
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_pportdata *ppd;
+	void *vp = (void *) data;
+	int size;
+
+	if (n_blocks != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	ppd = dd->pport + (port - 1);
+
+	size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
+
+	if (resp_len)
+		*resp_len += size;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+				     struct ib_device *ibdev, u8 port,
+				     u32 *resp_len)
+{
+	u32 n_blocks = OPA_AM_NPORT(am);
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_pportdata *ppd;
+	void *vp = (void *) data;
+	int lstate;
+
+	if (n_blocks != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	/* IB numbers ports from 1, hw from 0 */
+	ppd = dd->pport + (port - 1);
+	lstate = driver_lstate(ppd);
+	if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	ppd = dd->pport + (port - 1);
+
+	fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
+
+	return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+					 resp_len);
+}
+
+static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+			      struct ib_device *ibdev, u8 port,
+			      u32 *resp_len)
+{
+	u32 nports = OPA_AM_NPORT(am);
+	u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+	u32 lstate;
+	struct hfi1_ibport *ibp;
+	struct hfi1_pportdata *ppd;
+	struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
+
+	if (nports != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	ibp = to_iport(ibdev, port);
+	ppd = ppd_from_ibp(ibp);
+
+	lstate = driver_lstate(ppd);
+
+	if (start_of_sm_config && (lstate == IB_PORT_INIT))
+		ppd->is_sm_config_started = 1;
+
+#if PI_LED_ENABLE_SUP
+	psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+	psi->port_states.ledenable_offlinereason |=
+		ppd->is_sm_config_started << 5;
+	psi->port_states.ledenable_offlinereason |=
+		ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
+#else
+	psi->port_states.offline_reason = ppd->neighbor_normal << 4;
+	psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+	psi->port_states.offline_reason |= ppd->offline_disabled_reason &
+				OPA_PI_MASK_OFFLINE_REASON;
+#endif /* PI_LED_ENABLE_SUP */
+
+	psi->port_states.portphysstate_portstate =
+		(hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
+	psi->link_width_downgrade_tx_active =
+	  ppd->link_width_downgrade_tx_active;
+	psi->link_width_downgrade_rx_active =
+	  ppd->link_width_downgrade_rx_active;
+	if (resp_len)
+		*resp_len += sizeof(struct opa_port_state_info);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+			      struct ib_device *ibdev, u8 port,
+			      u32 *resp_len)
+{
+	u32 nports = OPA_AM_NPORT(am);
+	u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+	u32 ls_old;
+	u8 ls_new, ps_new;
+	struct hfi1_ibport *ibp;
+	struct hfi1_pportdata *ppd;
+	struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
+	int ret, invalid = 0;
+
+	if (nports != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	ibp = to_iport(ibdev, port);
+	ppd = ppd_from_ibp(ibp);
+
+	ls_old = driver_lstate(ppd);
+
+	ls_new = port_states_to_logical_state(&psi->port_states);
+	ps_new = port_states_to_phys_state(&psi->port_states);
+
+	if (ls_old == IB_PORT_INIT) {
+		if (start_of_sm_config) {
+			if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+				ppd->is_sm_config_started = 1;
+		} else if (ls_new == IB_PORT_ARMED) {
+			if (ppd->is_sm_config_started == 0)
+				invalid = 1;
+		}
+	}
+
+	ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+	if (ret)
+		return ret;
+
+	if (invalid)
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
+				     struct ib_device *ibdev, u8 port,
+				     u32 *resp_len)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	u32 addr = OPA_AM_CI_ADDR(am);
+	u32 len = OPA_AM_CI_LEN(am) + 1;
+	int ret;
+
+#define __CI_PAGE_SIZE (1 << 7) /* 128 bytes */
+#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
+#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
+
+	/* check that addr is within spec, and
+	 * addr and (addr + len - 1) are on the same "page" */
+	if (addr >= 4096 ||
+		(__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	ret = get_cable_info(dd, port, addr, len, data);
+
+	if (ret == -ENODEV) {
+		smp->status |= IB_SMP_UNSUP_METH_ATTR;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	/* The address range for the CableInfo SMA query is wider than the
+	 * memory available on the QSFP cable. We want to return a valid
+	 * response, albeit zeroed out, for address ranges beyond available
+	 * memory but that are within the CableInfo query spec
+	 */
+	if (ret < 0 && ret != -ERANGE) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	if (resp_len)
+		*resp_len += len;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+			      struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+	u32 num_ports = OPA_AM_NPORT(am);
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_pportdata *ppd;
+	struct buffer_control *p = (struct buffer_control *) data;
+	int size;
+
+	if (num_ports != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	ppd = dd->pport + (port - 1);
+	size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
+	trace_bct_get(dd, p);
+	if (resp_len)
+		*resp_len += size;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+			      struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+	u32 num_ports = OPA_AM_NPORT(am);
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_pportdata *ppd;
+	struct buffer_control *p = (struct buffer_control *) data;
+
+	if (num_ports != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+	ppd = dd->pport + (port - 1);
+	trace_bct_set(dd, p);
+	if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+				 struct ib_device *ibdev, u8 port,
+				 u32 *resp_len)
+{
+	struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+	u32 num_ports = OPA_AM_NPORT(am);
+	u8 section = (am & 0x00ff0000) >> 16;
+	u8 *p = data;
+	int size = 0;
+
+	if (num_ports != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	switch (section) {
+	case OPA_VLARB_LOW_ELEMENTS:
+		size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
+		break;
+	case OPA_VLARB_HIGH_ELEMENTS:
+		size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+		break;
+	case OPA_VLARB_PREEMPT_ELEMENTS:
+		size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
+		break;
+	case OPA_VLARB_PREEMPT_MATRIX:
+		size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
+		break;
+	default:
+		pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
+			be32_to_cpu(smp->attr_mod));
+		smp->status |= IB_SMP_INVALID_FIELD;
+		break;
+	}
+
+	if (size > 0 && resp_len)
+		*resp_len += size;
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+				 struct ib_device *ibdev, u8 port,
+				 u32 *resp_len)
+{
+	struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+	u32 num_ports = OPA_AM_NPORT(am);
+	u8 section = (am & 0x00ff0000) >> 16;
+	u8 *p = data;
+
+	if (num_ports != 1) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	switch (section) {
+	case OPA_VLARB_LOW_ELEMENTS:
+		(void) fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
+		break;
+	case OPA_VLARB_HIGH_ELEMENTS:
+		(void) fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+		break;
+	/* neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX
+	 * can be changed from the default values */
+	case OPA_VLARB_PREEMPT_ELEMENTS:
+		/* FALLTHROUGH */
+	case OPA_VLARB_PREEMPT_MATRIX:
+		smp->status |= IB_SMP_UNSUP_METH_ATTR;
+		break;
+	default:
+		pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
+			be32_to_cpu(smp->attr_mod));
+		smp->status |= IB_SMP_INVALID_FIELD;
+		break;
+	}
+
+	return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_pma_mad {
+	struct ib_mad_hdr mad_hdr;
+	u8 data[2024];
+} __packed;
+
+struct opa_class_port_info {
+	u8 base_version;
+	u8 class_version;
+	__be16 cap_mask;
+	__be32 cap_mask2_resp_time;
+
+	u8 redirect_gid[16];
+	__be32 redirect_tc_fl;
+	__be32 redirect_lid;
+	__be32 redirect_sl_qp;
+	__be32 redirect_qkey;
+
+	u8 trap_gid[16];
+	__be32 trap_tc_fl;
+	__be32 trap_lid;
+	__be32 trap_hl_qp;
+	__be32 trap_qkey;
+
+	__be16 trap_pkey;
+	__be16 redirect_pkey;
+
+	u8 trap_sl_rsvd;
+	u8 reserved[3];
+} __packed;
+
+struct opa_port_status_req {
+	__u8 port_num;
+	__u8 reserved[3];
+	__be32 vl_select_mask;
+};
+
+#define VL_MASK_ALL		0x000080ff
+
+struct opa_port_status_rsp {
+	__u8 port_num;
+	__u8 reserved[3];
+	__be32  vl_select_mask;
+
+	/* Data counters */
+	__be64 port_xmit_data;
+	__be64 port_rcv_data;
+	__be64 port_xmit_pkts;
+	__be64 port_rcv_pkts;
+	__be64 port_multicast_xmit_pkts;
+	__be64 port_multicast_rcv_pkts;
+	__be64 port_xmit_wait;
+	__be64 sw_port_congestion;
+	__be64 port_rcv_fecn;
+	__be64 port_rcv_becn;
+	__be64 port_xmit_time_cong;
+	__be64 port_xmit_wasted_bw;
+	__be64 port_xmit_wait_data;
+	__be64 port_rcv_bubble;
+	__be64 port_mark_fecn;
+	/* Error counters */
+	__be64 port_rcv_constraint_errors;
+	__be64 port_rcv_switch_relay_errors;
+	__be64 port_xmit_discards;
+	__be64 port_xmit_constraint_errors;
+	__be64 port_rcv_remote_physical_errors;
+	__be64 local_link_integrity_errors;
+	__be64 port_rcv_errors;
+	__be64 excessive_buffer_overruns;
+	__be64 fm_config_errors;
+	__be32 link_error_recovery;
+	__be32 link_downed;
+	u8 uncorrectable_errors;
+
+	u8 link_quality_indicator; /* 5res, 3bit */
+	u8 res2[6];
+	struct _vls_pctrs {
+		/* per-VL Data counters */
+		__be64 port_vl_xmit_data;
+		__be64 port_vl_rcv_data;
+		__be64 port_vl_xmit_pkts;
+		__be64 port_vl_rcv_pkts;
+		__be64 port_vl_xmit_wait;
+		__be64 sw_port_vl_congestion;
+		__be64 port_vl_rcv_fecn;
+		__be64 port_vl_rcv_becn;
+		__be64 port_xmit_time_cong;
+		__be64 port_vl_xmit_wasted_bw;
+		__be64 port_vl_xmit_wait_data;
+		__be64 port_vl_rcv_bubble;
+		__be64 port_vl_mark_fecn;
+		__be64 port_vl_xmit_discards;
+	} vls[0]; /* real array size defined by # bits set in vl_select_mask */
+};
+
+enum counter_selects {
+	CS_PORT_XMIT_DATA			= (1 << 31),
+	CS_PORT_RCV_DATA			= (1 << 30),
+	CS_PORT_XMIT_PKTS			= (1 << 29),
+	CS_PORT_RCV_PKTS			= (1 << 28),
+	CS_PORT_MCAST_XMIT_PKTS			= (1 << 27),
+	CS_PORT_MCAST_RCV_PKTS			= (1 << 26),
+	CS_PORT_XMIT_WAIT			= (1 << 25),
+	CS_SW_PORT_CONGESTION			= (1 << 24),
+	CS_PORT_RCV_FECN			= (1 << 23),
+	CS_PORT_RCV_BECN			= (1 << 22),
+	CS_PORT_XMIT_TIME_CONG			= (1 << 21),
+	CS_PORT_XMIT_WASTED_BW			= (1 << 20),
+	CS_PORT_XMIT_WAIT_DATA			= (1 << 19),
+	CS_PORT_RCV_BUBBLE			= (1 << 18),
+	CS_PORT_MARK_FECN			= (1 << 17),
+	CS_PORT_RCV_CONSTRAINT_ERRORS		= (1 << 16),
+	CS_PORT_RCV_SWITCH_RELAY_ERRORS		= (1 << 15),
+	CS_PORT_XMIT_DISCARDS			= (1 << 14),
+	CS_PORT_XMIT_CONSTRAINT_ERRORS		= (1 << 13),
+	CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS	= (1 << 12),
+	CS_LOCAL_LINK_INTEGRITY_ERRORS		= (1 << 11),
+	CS_PORT_RCV_ERRORS			= (1 << 10),
+	CS_EXCESSIVE_BUFFER_OVERRUNS		= (1 << 9),
+	CS_FM_CONFIG_ERRORS			= (1 << 8),
+	CS_LINK_ERROR_RECOVERY			= (1 << 7),
+	CS_LINK_DOWNED				= (1 << 6),
+	CS_UNCORRECTABLE_ERRORS			= (1 << 5),
+};
+
+struct opa_clear_port_status {
+	__be64 port_select_mask[4];
+	__be32 counter_select_mask;
+};
+
+struct opa_aggregate {
+	__be16 attr_id;
+	__be16 err_reqlength;	/* 1 bit, 8 res, 7 bit */
+	__be32 attr_mod;
+	u8 data[0];
+};
+
+/* Request contains first two fields, response contains those plus the rest */
+struct opa_port_data_counters_msg {
+	__be64 port_select_mask[4];
+	__be32 vl_select_mask;
+
+	/* Response fields follow */
+	__be32 reserved1;
+	struct _port_dctrs {
+		u8 port_number;
+		u8 reserved2[3];
+		__be32 link_quality_indicator; /* 29res, 3bit */
+
+		/* Data counters */
+		__be64 port_xmit_data;
+		__be64 port_rcv_data;
+		__be64 port_xmit_pkts;
+		__be64 port_rcv_pkts;
+		__be64 port_multicast_xmit_pkts;
+		__be64 port_multicast_rcv_pkts;
+		__be64 port_xmit_wait;
+		__be64 sw_port_congestion;
+		__be64 port_rcv_fecn;
+		__be64 port_rcv_becn;
+		__be64 port_xmit_time_cong;
+		__be64 port_xmit_wasted_bw;
+		__be64 port_xmit_wait_data;
+		__be64 port_rcv_bubble;
+		__be64 port_mark_fecn;
+
+		__be64 port_error_counter_summary;
+		/* Sum of error counts/port */
+
+		struct _vls_dctrs {
+			/* per-VL Data counters */
+			__be64 port_vl_xmit_data;
+			__be64 port_vl_rcv_data;
+			__be64 port_vl_xmit_pkts;
+			__be64 port_vl_rcv_pkts;
+			__be64 port_vl_xmit_wait;
+			__be64 sw_port_vl_congestion;
+			__be64 port_vl_rcv_fecn;
+			__be64 port_vl_rcv_becn;
+			__be64 port_xmit_time_cong;
+			__be64 port_vl_xmit_wasted_bw;
+			__be64 port_vl_xmit_wait_data;
+			__be64 port_vl_rcv_bubble;
+			__be64 port_vl_mark_fecn;
+		} vls[0];
+		/* array size defined by #bits set in vl_select_mask*/
+	} port[1]; /* array size defined by  #ports in attribute modifier */
+};
+
+struct opa_port_error_counters64_msg {
+	/* Request contains first two fields, response contains the
+	 * whole magilla */
+	__be64 port_select_mask[4];
+	__be32 vl_select_mask;
+
+	/* Response-only fields follow */
+	__be32 reserved1;
+	struct _port_ectrs {
+		u8 port_number;
+		u8 reserved2[7];
+		__be64 port_rcv_constraint_errors;
+		__be64 port_rcv_switch_relay_errors;
+		__be64 port_xmit_discards;
+		__be64 port_xmit_constraint_errors;
+		__be64 port_rcv_remote_physical_errors;
+		__be64 local_link_integrity_errors;
+		__be64 port_rcv_errors;
+		__be64 excessive_buffer_overruns;
+		__be64 fm_config_errors;
+		__be32 link_error_recovery;
+		__be32 link_downed;
+		u8 uncorrectable_errors;
+		u8 reserved3[7];
+		struct _vls_ectrs {
+			__be64 port_vl_xmit_discards;
+		} vls[0];
+		/* array size defined by #bits set in vl_select_mask */
+	} port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+struct opa_port_error_info_msg {
+	__be64 port_select_mask[4];
+	__be32 error_info_select_mask;
+	__be32 reserved1;
+	struct _port_ei {
+
+		u8 port_number;
+		u8 reserved2[7];
+
+		/* PortRcvErrorInfo */
+		struct {
+			u8 status_and_code;
+			union {
+				u8 raw[17];
+				struct {
+					/* EI1to12 format */
+					u8 packet_flit1[8];
+					u8 packet_flit2[8];
+					u8 remaining_flit_bits12;
+				} ei1to12;
+				struct {
+					u8 packet_bytes[8];
+					u8 remaining_flit_bits;
+				} ei13;
+			} ei;
+			u8 reserved3[6];
+		} __packed port_rcv_ei;
+
+		/* ExcessiveBufferOverrunInfo */
+		struct {
+			u8 status_and_sc;
+			u8 reserved4[7];
+		} __packed excessive_buffer_overrun_ei;
+
+		/* PortXmitConstraintErrorInfo */
+		struct {
+			u8 status;
+			u8 reserved5;
+			__be16 pkey;
+			__be32 slid;
+		} __packed port_xmit_constraint_ei;
+
+		/* PortRcvConstraintErrorInfo */
+		struct {
+			u8 status;
+			u8 reserved6;
+			__be16 pkey;
+			__be32 slid;
+		} __packed port_rcv_constraint_ei;
+
+		/* PortRcvSwitchRelayErrorInfo */
+		struct {
+			u8 status_and_code;
+			u8 reserved7[3];
+			__u32 error_info;
+		} __packed port_rcv_switch_relay_ei;
+
+		/* UncorrectableErrorInfo */
+		struct {
+			u8 status_and_code;
+			u8 reserved8;
+		} __packed uncorrectable_ei;
+
+		/* FMConfigErrorInfo */
+		struct {
+			u8 status_and_code;
+			u8 error_info;
+		} __packed fm_config_ei;
+		__u32 reserved9;
+	} port[1]; /* actual array size defined by #ports in attr modifier */
+};
+
+/* opa_port_error_info_msg error_info_select_mask bit definitions */
+enum error_info_selects {
+	ES_PORT_RCV_ERROR_INFO			= (1 << 31),
+	ES_EXCESSIVE_BUFFER_OVERRUN_INFO	= (1 << 30),
+	ES_PORT_XMIT_CONSTRAINT_ERROR_INFO	= (1 << 29),
+	ES_PORT_RCV_CONSTRAINT_ERROR_INFO	= (1 << 28),
+	ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO	= (1 << 27),
+	ES_UNCORRECTABLE_ERROR_INFO		= (1 << 26),
+	ES_FM_CONFIG_ERROR_INFO			= (1 << 25)
+};
+
+static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
+				struct ib_device *ibdev, u32 *resp_len)
+{
+	struct opa_class_port_info *p =
+		(struct opa_class_port_info *)pmp->data;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	if (pmp->mad_hdr.attr_mod != 0)
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	p->base_version = OPA_MGMT_BASE_VERSION;
+	p->class_version = OPA_SMI_CLASS_VERSION;
+	/*
+	 * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+	 */
+	p->cap_mask2_resp_time = cpu_to_be32(18);
+
+	if (resp_len)
+		*resp_len += sizeof(*p);
+
+	return reply((struct ib_mad_hdr *)pmp);
+}
+
+static void a0_portstatus(struct hfi1_pportdata *ppd,
+			  struct opa_port_status_rsp *rsp, u32 vl_select_mask)
+{
+	if (!is_bx(ppd->dd)) {
+		unsigned long vl;
+		int vfi = 0;
+		u64 max_vl_xmit_wait = 0, tmp;
+		u32 vl_all_mask = VL_MASK_ALL;
+		u64 rcv_data, rcv_bubble;
+
+		rcv_data = be64_to_cpu(rsp->port_rcv_data);
+		rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble);
+		/* In the measured time period, calculate the total number
+		 * of flits that were received. Subtract out one false
+		 * rcv_bubble increment for every 32 received flits but
+		 * don't let the number go negative.
+		 */
+		if (rcv_bubble >= (rcv_data>>5)) {
+			rcv_bubble -= (rcv_data>>5);
+			rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble);
+		}
+		for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+				 8 * sizeof(vl_select_mask)) {
+			rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data);
+			rcv_bubble =
+				be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble);
+			if (rcv_bubble >= (rcv_data>>5)) {
+				rcv_bubble -= (rcv_data>>5);
+				rsp->vls[vfi].port_vl_rcv_bubble =
+							cpu_to_be64(rcv_bubble);
+			}
+			vfi++;
+		}
+
+		for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
+				 8 * sizeof(vl_all_mask)) {
+			tmp = read_port_cntr(ppd, C_TX_WAIT_VL,
+					     idx_from_vl(vl));
+			if (tmp > max_vl_xmit_wait)
+				max_vl_xmit_wait = tmp;
+		}
+		rsp->port_xmit_wait = cpu_to_be64(max_vl_xmit_wait);
+	}
+}
+
+
+static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
+			struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+	struct opa_port_status_req *req =
+		(struct opa_port_status_req *)pmp->data;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct opa_port_status_rsp *rsp;
+	u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
+	unsigned long vl;
+	size_t response_data_size;
+	u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+	u8 port_num = req->port_num;
+	u8 num_vls = hweight32(vl_select_mask);
+	struct _vls_pctrs *vlinfo;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	int vfi;
+	u64 tmp, tmp2;
+
+	response_data_size = sizeof(struct opa_port_status_rsp) +
+				num_vls * sizeof(struct _vls_pctrs);
+	if (response_data_size > sizeof(pmp->data)) {
+		pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	if (nports != 1 || (port_num && port_num != port)
+	    || num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	rsp = (struct opa_port_status_rsp *)pmp->data;
+	if (port_num)
+		rsp->port_num = port_num;
+	else
+		rsp->port_num = port;
+
+	rsp->port_rcv_constraint_errors =
+		cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+					   CNTR_INVALID_VL));
+
+	hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
+
+	rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
+	rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+					  CNTR_INVALID_VL));
+	rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+					 CNTR_INVALID_VL));
+	rsp->port_rcv_bubble =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL));
+	rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+					  CNTR_INVALID_VL));
+	rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+					 CNTR_INVALID_VL));
+	rsp->port_multicast_xmit_pkts =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+					CNTR_INVALID_VL));
+	rsp->port_multicast_rcv_pkts =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+					  CNTR_INVALID_VL));
+	rsp->port_xmit_wait =
+		cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+	rsp->port_rcv_fecn =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+	rsp->port_rcv_becn =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+	rsp->port_xmit_discards =
+		cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+					   CNTR_INVALID_VL));
+	rsp->port_xmit_constraint_errors =
+		cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+					   CNTR_INVALID_VL));
+	rsp->port_rcv_remote_physical_errors =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+					  CNTR_INVALID_VL));
+	tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+	tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+	if (tmp2 < tmp) {
+		/* overflow/wrapped */
+		rsp->local_link_integrity_errors = cpu_to_be64(~0);
+	} else {
+		rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+	}
+	tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+	tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+					CNTR_INVALID_VL);
+	if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+		/* overflow/wrapped */
+		rsp->link_error_recovery = cpu_to_be32(~0);
+	} else {
+		rsp->link_error_recovery = cpu_to_be32(tmp2);
+	}
+	rsp->port_rcv_errors =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+	rsp->excessive_buffer_overruns =
+		cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+	rsp->fm_config_errors =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+					  CNTR_INVALID_VL));
+	rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+					  CNTR_INVALID_VL));
+
+	/* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
+	tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+	rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+	vlinfo = &(rsp->vls[0]);
+	vfi = 0;
+	/* The vl_select_mask has been checked above, and we know
+	 * that it contains only entries which represent valid VLs.
+	 * So in the for_each_set_bit() loop below, we don't need
+	 * any additional checks for vl.
+	 */
+	for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+			 8 * sizeof(vl_select_mask)) {
+		memset(vlinfo, 0, sizeof(*vlinfo));
+
+		tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
+		rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
+		rsp->vls[vfi].port_vl_rcv_bubble =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL,
+					idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_rcv_pkts =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+					idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_xmit_data =
+			cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+					idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_xmit_pkts =
+			cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+					idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_xmit_wait =
+			cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+					idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_rcv_fecn =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+					idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_rcv_becn =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+					idx_from_vl(vl)));
+
+		vlinfo++;
+		vfi++;
+	}
+
+	a0_portstatus(ppd, rsp, vl_select_mask);
+
+	if (resp_len)
+		*resp_len += response_data_size;
+
+	return reply((struct ib_mad_hdr *)pmp);
+}
+
+static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u64 error_counter_summary = 0, tmp;
+
+	error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+						CNTR_INVALID_VL);
+	/* port_rcv_switch_relay_errors is 0 for HFIs */
+	error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD,
+						CNTR_INVALID_VL);
+	error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+						CNTR_INVALID_VL);
+	error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+						CNTR_INVALID_VL);
+	error_counter_summary += read_dev_cntr(dd, C_DC_TX_REPLAY,
+						CNTR_INVALID_VL);
+	error_counter_summary += read_dev_cntr(dd, C_DC_RX_REPLAY,
+						CNTR_INVALID_VL);
+	error_counter_summary += read_dev_cntr(dd, C_DC_SEQ_CRC_CNT,
+						CNTR_INVALID_VL);
+	error_counter_summary += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+						CNTR_INVALID_VL);
+	error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR,
+						CNTR_INVALID_VL);
+	error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+	error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+						CNTR_INVALID_VL);
+	/* ppd->link_downed is a 32-bit value */
+	error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN,
+						CNTR_INVALID_VL);
+	tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+	/* this is an 8-bit quantity */
+	error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+	return error_counter_summary;
+}
+
+static void a0_datacounters(struct hfi1_devdata *dd, struct _port_dctrs *rsp,
+			    u32 vl_select_mask)
+{
+	if (!is_bx(dd)) {
+		unsigned long vl;
+		int vfi = 0;
+		u64 rcv_data, rcv_bubble, sum_vl_xmit_wait = 0;
+
+		rcv_data = be64_to_cpu(rsp->port_rcv_data);
+		rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble);
+		/* In the measured time period, calculate the total number
+		 * of flits that were received. Subtract out one false
+		 * rcv_bubble increment for every 32 received flits but
+		 * don't let the number go negative.
+		 */
+		if (rcv_bubble >= (rcv_data>>5)) {
+			rcv_bubble -= (rcv_data>>5);
+			rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble);
+		}
+		for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+				8 * sizeof(vl_select_mask)) {
+			rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data);
+			rcv_bubble =
+				be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble);
+			if (rcv_bubble >= (rcv_data>>5)) {
+				rcv_bubble -= (rcv_data>>5);
+				rsp->vls[vfi].port_vl_rcv_bubble =
+							cpu_to_be64(rcv_bubble);
+			}
+			vfi++;
+		}
+		vfi = 0;
+		for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+				8 * sizeof(vl_select_mask)) {
+			u64 tmp = sum_vl_xmit_wait +
+				be64_to_cpu(rsp->vls[vfi++].port_vl_xmit_wait);
+			if (tmp < sum_vl_xmit_wait) {
+				/* we wrapped */
+				sum_vl_xmit_wait = (u64) ~0;
+				break;
+			}
+			sum_vl_xmit_wait = tmp;
+		}
+		if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+			rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+	}
+}
+
+static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
+			struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+	struct opa_port_data_counters_msg *req =
+		(struct opa_port_data_counters_msg *)pmp->data;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct _port_dctrs *rsp;
+	struct _vls_dctrs *vlinfo;
+	size_t response_data_size;
+	u32 num_ports;
+	u8 num_pslm;
+	u8 lq, num_vls;
+	u64 port_mask;
+	unsigned long port_num;
+	unsigned long vl;
+	u32 vl_select_mask;
+	int vfi;
+
+	num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+	num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+	num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+	vl_select_mask = be32_to_cpu(req->vl_select_mask);
+
+	if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	/* Sanity check */
+	response_data_size = sizeof(struct opa_port_data_counters_msg) +
+				num_vls * sizeof(struct _vls_dctrs);
+
+	if (response_data_size > sizeof(pmp->data)) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	/*
+	 * The bit set in the mask needs to be consistent with the
+	 * port the request came in on.
+	 */
+	port_mask = be64_to_cpu(req->port_select_mask[3]);
+	port_num = find_first_bit((unsigned long *)&port_mask,
+				  sizeof(port_mask));
+
+	if ((u8)port_num != port) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	rsp = (struct _port_dctrs *)&(req->port[0]);
+	memset(rsp, 0, sizeof(*rsp));
+
+	rsp->port_number = port;
+	/*
+	 * Note that link_quality_indicator is a 32 bit quantity in
+	 * 'datacounters' queries (as opposed to 'portinfo' queries,
+	 * where it's a byte).
+	 */
+	hfi1_read_link_quality(dd, &lq);
+	rsp->link_quality_indicator = cpu_to_be32((u32)lq);
+
+	/* rsp->sw_port_congestion is 0 for HFIs */
+	/* rsp->port_xmit_time_cong is 0 for HFIs */
+	/* rsp->port_xmit_wasted_bw ??? */
+	/* rsp->port_xmit_wait_data ??? */
+	/* rsp->port_mark_fecn is 0 for HFIs */
+
+	rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+						CNTR_INVALID_VL));
+	rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+						CNTR_INVALID_VL));
+	rsp->port_rcv_bubble =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL));
+	rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+						CNTR_INVALID_VL));
+	rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+						CNTR_INVALID_VL));
+	rsp->port_multicast_xmit_pkts =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+						CNTR_INVALID_VL));
+	rsp->port_multicast_rcv_pkts =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+						CNTR_INVALID_VL));
+	rsp->port_xmit_wait =
+		cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+	rsp->port_rcv_fecn =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+	rsp->port_rcv_becn =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+
+	rsp->port_error_counter_summary =
+		cpu_to_be64(get_error_counter_summary(ibdev, port));
+
+	vlinfo = &(rsp->vls[0]);
+	vfi = 0;
+	/* The vl_select_mask has been checked above, and we know
+	 * that it contains only entries which represent valid VLs.
+	 * So in the for_each_set_bit() loop below, we don't need
+	 * any additional checks for vl.
+	 */
+	for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+		 8 * sizeof(req->vl_select_mask)) {
+		memset(vlinfo, 0, sizeof(*vlinfo));
+
+		rsp->vls[vfi].port_vl_xmit_data =
+			cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+							idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_rcv_data =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
+							idx_from_vl(vl)));
+		rsp->vls[vfi].port_vl_rcv_bubble =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL,
+					idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_xmit_pkts =
+			cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+							idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_rcv_pkts =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+							idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_xmit_wait =
+			cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+							idx_from_vl(vl)));
+
+		rsp->vls[vfi].port_vl_rcv_fecn =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+							idx_from_vl(vl)));
+		rsp->vls[vfi].port_vl_rcv_becn =
+			cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+							idx_from_vl(vl)));
+
+		/* rsp->port_vl_xmit_time_cong is 0 for HFIs */
+		/* rsp->port_vl_xmit_wasted_bw ??? */
+		/* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
+		 * does this differ from rsp->vls[vfi].port_vl_xmit_wait */
+		/*rsp->vls[vfi].port_vl_mark_fecn =
+			cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
+				+ offset));
+		*/
+		vlinfo++;
+		vfi++;
+	}
+
+	a0_datacounters(dd, rsp, vl_select_mask);
+
+	if (resp_len)
+		*resp_len += response_data_size;
+
+	return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
+			struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+	size_t response_data_size;
+	struct _port_ectrs *rsp;
+	unsigned long port_num;
+	struct opa_port_error_counters64_msg *req;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	u32 num_ports;
+	u8 num_pslm;
+	u8 num_vls;
+	struct hfi1_ibport *ibp;
+	struct hfi1_pportdata *ppd;
+	struct _vls_ectrs *vlinfo;
+	unsigned long vl;
+	u64 port_mask, tmp, tmp2;
+	u32 vl_select_mask;
+	int vfi;
+
+	req = (struct opa_port_error_counters64_msg *)pmp->data;
+
+	num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+
+	num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+	num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+
+	if (num_ports != 1 || num_ports != num_pslm) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	response_data_size = sizeof(struct opa_port_error_counters64_msg) +
+				num_vls * sizeof(struct _vls_ectrs);
+
+	if (response_data_size > sizeof(pmp->data)) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+	/*
+	 * The bit set in the mask needs to be consistent with the
+	 * port the request came in on.
+	 */
+	port_mask = be64_to_cpu(req->port_select_mask[3]);
+	port_num = find_first_bit((unsigned long *)&port_mask,
+					sizeof(port_mask));
+
+	if ((u8)port_num != port) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	rsp = (struct _port_ectrs *)&(req->port[0]);
+
+	ibp = to_iport(ibdev, port_num);
+	ppd = ppd_from_ibp(ibp);
+
+	memset(rsp, 0, sizeof(*rsp));
+	rsp->port_number = (u8)port_num;
+
+	rsp->port_rcv_constraint_errors =
+		cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+					   CNTR_INVALID_VL));
+	/* port_rcv_switch_relay_errors is 0 for HFIs */
+	rsp->port_xmit_discards =
+		cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+						CNTR_INVALID_VL));
+	rsp->port_rcv_remote_physical_errors =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+						CNTR_INVALID_VL));
+	tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+	tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+	if (tmp2 < tmp) {
+		/* overflow/wrapped */
+		rsp->local_link_integrity_errors = cpu_to_be64(~0);
+	} else {
+		rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+	}
+	tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+	tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+					CNTR_INVALID_VL);
+	if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+		/* overflow/wrapped */
+		rsp->link_error_recovery = cpu_to_be32(~0);
+	} else {
+		rsp->link_error_recovery = cpu_to_be32(tmp2);
+	}
+	rsp->port_xmit_constraint_errors =
+		cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+					   CNTR_INVALID_VL));
+	rsp->excessive_buffer_overruns =
+		cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+	rsp->fm_config_errors =
+		cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+						CNTR_INVALID_VL));
+	rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+						CNTR_INVALID_VL));
+	tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+	rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+	vlinfo = (struct _vls_ectrs *)&(rsp->vls[0]);
+	vfi = 0;
+	vl_select_mask = be32_to_cpu(req->vl_select_mask);
+	for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+			 8 * sizeof(req->vl_select_mask)) {
+		memset(vlinfo, 0, sizeof(*vlinfo));
+		/* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
+		vlinfo += 1;
+		vfi++;
+	}
+
+	if (resp_len)
+		*resp_len += response_data_size;
+
+	return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
+			struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+	size_t response_data_size;
+	struct _port_ei *rsp;
+	struct opa_port_error_info_msg *req;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	u64 port_mask;
+	u32 num_ports;
+	unsigned long port_num;
+	u8 num_pslm;
+	u64 reg;
+
+	req = (struct opa_port_error_info_msg *)pmp->data;
+	rsp = (struct _port_ei *)&(req->port[0]);
+
+	num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+	num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+	memset(rsp, 0, sizeof(*rsp));
+
+	if (num_ports != 1 || num_ports != num_pslm) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	/* Sanity check */
+	response_data_size = sizeof(struct opa_port_error_info_msg);
+
+	if (response_data_size > sizeof(pmp->data)) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	/*
+	 * The bit set in the mask needs to be consistent with the port
+	 * the request came in on.
+	 */
+	port_mask = be64_to_cpu(req->port_select_mask[3]);
+	port_num = find_first_bit((unsigned long *)&port_mask,
+				  sizeof(port_mask));
+
+	if ((u8)port_num != port) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	/* PortRcvErrorInfo */
+	rsp->port_rcv_ei.status_and_code =
+		dd->err_info_rcvport.status_and_code;
+	memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
+		&dd->err_info_rcvport.packet_flit1, sizeof(u64));
+	memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
+		&dd->err_info_rcvport.packet_flit2, sizeof(u64));
+
+	/* ExcessiverBufferOverrunInfo */
+	reg = read_csr(dd, RCV_ERR_INFO);
+	if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
+		/* if the RcvExcessBufferOverrun bit is set, save SC of
+		 * first pkt that encountered an excess buffer overrun */
+		u8 tmp = (u8)reg;
+
+		tmp &=  RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
+		tmp <<= 2;
+		rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
+		/* set the status bit */
+		rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
+	}
+
+	rsp->port_xmit_constraint_ei.status =
+		dd->err_info_xmit_constraint.status;
+	rsp->port_xmit_constraint_ei.pkey =
+		cpu_to_be16(dd->err_info_xmit_constraint.pkey);
+	rsp->port_xmit_constraint_ei.slid =
+		cpu_to_be32(dd->err_info_xmit_constraint.slid);
+
+	rsp->port_rcv_constraint_ei.status =
+		dd->err_info_rcv_constraint.status;
+	rsp->port_rcv_constraint_ei.pkey =
+		cpu_to_be16(dd->err_info_rcv_constraint.pkey);
+	rsp->port_rcv_constraint_ei.slid =
+		cpu_to_be32(dd->err_info_rcv_constraint.slid);
+
+	/* UncorrectableErrorInfo */
+	rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
+
+	/* FMConfigErrorInfo */
+	rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
+
+	if (resp_len)
+		*resp_len += response_data_size;
+
+	return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
+			struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+	struct opa_clear_port_status *req =
+		(struct opa_clear_port_status *)pmp->data;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+	u64 portn = be64_to_cpu(req->port_select_mask[3]);
+	u32 counter_select = be32_to_cpu(req->counter_select_mask);
+	u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
+	unsigned long vl;
+
+	if ((nports != 1) || (portn != 1 << port)) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+	/*
+	 * only counters returned by pma_get_opa_portstatus() are
+	 * handled, so when pma_get_opa_portstatus() gets a fix,
+	 * the corresponding change should be made here as well.
+	 */
+
+	if (counter_select & CS_PORT_XMIT_DATA)
+		write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_RCV_DATA)
+		write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_XMIT_PKTS)
+		write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_RCV_PKTS)
+		write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
+		write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_MCAST_RCV_PKTS)
+		write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_XMIT_WAIT)
+		write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
+
+	/* ignore cs_sw_portCongestion for HFIs */
+
+	if (counter_select & CS_PORT_RCV_FECN)
+		write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_RCV_BECN)
+		write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
+
+	/* ignore cs_port_xmit_time_cong for HFIs */
+	/* ignore cs_port_xmit_wasted_bw for now */
+	/* ignore cs_port_xmit_wait_data for now */
+	if (counter_select & CS_PORT_RCV_BUBBLE)
+		write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
+
+	/* Only applicable for switch */
+	/*if (counter_select & CS_PORT_MARK_FECN)
+		write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);*/
+
+	if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
+		write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+	/* ignore cs_port_rcv_switch_relay_errors for HFIs */
+	if (counter_select & CS_PORT_XMIT_DISCARDS)
+		write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
+		write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
+		write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
+		write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+		write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+	}
+
+	if (counter_select & CS_LINK_ERROR_RECOVERY) {
+		write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+		write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+						CNTR_INVALID_VL, 0);
+	}
+
+	if (counter_select & CS_PORT_RCV_ERRORS)
+		write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
+		write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+		dd->rcv_ovfl_cnt = 0;
+	}
+
+	if (counter_select & CS_FM_CONFIG_ERRORS)
+		write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_LINK_DOWNED)
+		write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
+
+	if (counter_select & CS_UNCORRECTABLE_ERRORS)
+		write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
+
+	for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+			 8 * sizeof(vl_select_mask)) {
+
+		if (counter_select & CS_PORT_XMIT_DATA)
+			write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
+
+		if (counter_select & CS_PORT_RCV_DATA)
+			write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
+
+		if (counter_select & CS_PORT_XMIT_PKTS)
+			write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
+
+		if (counter_select & CS_PORT_RCV_PKTS)
+			write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
+
+		if (counter_select & CS_PORT_XMIT_WAIT)
+			write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
+
+		/* sw_port_vl_congestion is 0 for HFIs */
+		if (counter_select & CS_PORT_RCV_FECN)
+			write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
+
+		if (counter_select & CS_PORT_RCV_BECN)
+			write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
+
+		/* port_vl_xmit_time_cong is 0 for HFIs */
+		/* port_vl_xmit_wasted_bw ??? */
+		/* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
+		if (counter_select & CS_PORT_RCV_BUBBLE)
+			write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
+
+		/*if (counter_select & CS_PORT_MARK_FECN)
+		     write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
+		*/
+		/* port_vl_xmit_discards ??? */
+	}
+
+	if (resp_len)
+		*resp_len += sizeof(*req);
+
+	return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
+			struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+	struct _port_ei *rsp;
+	struct opa_port_error_info_msg *req;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	u64 port_mask;
+	u32 num_ports;
+	unsigned long port_num;
+	u8 num_pslm;
+	u32 error_info_select;
+
+	req = (struct opa_port_error_info_msg *)pmp->data;
+	rsp = (struct _port_ei *)&(req->port[0]);
+
+	num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+	num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+	memset(rsp, 0, sizeof(*rsp));
+
+	if (num_ports != 1 || num_ports != num_pslm) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	/*
+	 * The bit set in the mask needs to be consistent with the port
+	 * the request came in on.
+	 */
+	port_mask = be64_to_cpu(req->port_select_mask[3]);
+	port_num = find_first_bit((unsigned long *)&port_mask,
+				  sizeof(port_mask));
+
+	if ((u8)port_num != port) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	error_info_select = be32_to_cpu(req->error_info_select_mask);
+
+	/* PortRcvErrorInfo */
+	if (error_info_select & ES_PORT_RCV_ERROR_INFO)
+		/* turn off status bit */
+		dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+
+	/* ExcessiverBufferOverrunInfo */
+	if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
+		/* status bit is essentially kept in the h/w - bit 5 of
+		 * RCV_ERR_INFO */
+		write_csr(dd, RCV_ERR_INFO,
+			  RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+
+	if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
+		dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+	if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
+		dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+	/* UncorrectableErrorInfo */
+	if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
+		/* turn off status bit */
+		dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
+
+	/* FMConfigErrorInfo */
+	if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
+		/* turn off status bit */
+		dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
+
+	if (resp_len)
+		*resp_len += sizeof(*req);
+
+	return reply((struct ib_mad_hdr *)pmp);
+}
+
+struct opa_congestion_info_attr {
+	__be16 congestion_info;
+	u8 control_table_cap;	/* Multiple of 64 entry unit CCTs */
+	u8 congestion_log_length;
+} __packed;
+
+static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
+				    struct ib_device *ibdev, u8 port,
+				    u32 *resp_len)
+{
+	struct opa_congestion_info_attr *p =
+		(struct opa_congestion_info_attr *)data;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+	p->congestion_info = 0;
+	p->control_table_cap = ppd->cc_max_table_entries;
+	p->congestion_log_length = OPA_CONG_LOG_ELEMS;
+
+	if (resp_len)
+		*resp_len += sizeof(*p);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
+					     u8 *data,
+					     struct ib_device *ibdev,
+					     u8 port, u32 *resp_len)
+{
+	int i;
+	struct opa_congestion_setting_attr *p =
+		(struct opa_congestion_setting_attr *) data;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct opa_congestion_setting_entry_shadow *entries;
+	struct cc_state *cc_state;
+
+	rcu_read_lock();
+
+	cc_state = get_cc_state(ppd);
+
+	if (cc_state == NULL) {
+		rcu_read_unlock();
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	entries = cc_state->cong_setting.entries;
+	p->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
+	p->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
+	for (i = 0; i < OPA_MAX_SLS; i++) {
+		p->entries[i].ccti_increase = entries[i].ccti_increase;
+		p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
+		p->entries[i].trigger_threshold =
+			entries[i].trigger_threshold;
+		p->entries[i].ccti_min = entries[i].ccti_min;
+	}
+
+	rcu_read_unlock();
+
+	if (resp_len)
+		*resp_len += sizeof(*p);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
+				       struct ib_device *ibdev, u8 port,
+				       u32 *resp_len)
+{
+	struct opa_congestion_setting_attr *p =
+		(struct opa_congestion_setting_attr *) data;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct opa_congestion_setting_entry_shadow *entries;
+	int i;
+
+	ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
+
+	entries = ppd->congestion_entries;
+	for (i = 0; i < OPA_MAX_SLS; i++) {
+		entries[i].ccti_increase = p->entries[i].ccti_increase;
+		entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer);
+		entries[i].trigger_threshold =
+			p->entries[i].trigger_threshold;
+		entries[i].ccti_min = p->entries[i].ccti_min;
+	}
+
+	return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
+					   resp_len);
+}
+
+static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
+					u8 *data, struct ib_device *ibdev,
+					u8 port, u32 *resp_len)
+{
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
+	s64 ts;
+	int i;
+
+	if (am != 0) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	spin_lock(&ppd->cc_log_lock);
+
+	cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
+	cong_log->congestion_flags = 0;
+	cong_log->threshold_event_counter =
+		cpu_to_be16(ppd->threshold_event_counter);
+	memcpy(cong_log->threshold_cong_event_map,
+	       ppd->threshold_cong_event_map,
+	       sizeof(cong_log->threshold_cong_event_map));
+	/* keep timestamp in units of 1.024 usec */
+	ts = ktime_to_ns(ktime_get()) / 1024;
+	cong_log->current_time_stamp = cpu_to_be32(ts);
+	for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
+		struct opa_hfi1_cong_log_event_internal *cce =
+			&ppd->cc_events[ppd->cc_mad_idx++];
+		if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
+			ppd->cc_mad_idx = 0;
+		/*
+		 * Entries which are older than twice the time
+		 * required to wrap the counter are supposed to
+		 * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
+		 */
+		if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX))
+			continue;
+		memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
+		memcpy(cong_log->events[i].remote_qp_number_cn_entry,
+			&cce->rqpn, 3);
+		cong_log->events[i].sl_svc_type_cn_entry =
+			((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
+		cong_log->events[i].remote_lid_cn_entry =
+			cpu_to_be32(cce->rlid);
+		cong_log->events[i].timestamp_cn_entry =
+			cpu_to_be32(cce->timestamp);
+	}
+
+	/*
+	 * Reset threshold_cong_event_map, and threshold_event_counter
+	 * to 0 when log is read.
+	 */
+	memset(ppd->threshold_cong_event_map, 0x0,
+	       sizeof(ppd->threshold_cong_event_map));
+	ppd->threshold_event_counter = 0;
+
+	spin_unlock(&ppd->cc_log_lock);
+
+	if (resp_len)
+		*resp_len += sizeof(struct opa_hfi1_cong_log);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct ib_cc_table_attr *cc_table_attr =
+		(struct ib_cc_table_attr *) data;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u32 start_block = OPA_AM_START_BLK(am);
+	u32 n_blocks = OPA_AM_NBLK(am);
+	struct ib_cc_table_entry_shadow *entries;
+	int i, j;
+	u32 sentry, eentry;
+	struct cc_state *cc_state;
+
+	/* sanity check n_blocks, start_block */
+	if (n_blocks == 0 ||
+	    start_block + n_blocks > ppd->cc_max_table_entries) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	rcu_read_lock();
+
+	cc_state = get_cc_state(ppd);
+
+	if (cc_state == NULL) {
+		rcu_read_unlock();
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	sentry = start_block * IB_CCT_ENTRIES;
+	eentry = sentry + (IB_CCT_ENTRIES * n_blocks);
+
+	cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
+
+	entries = cc_state->cct.entries;
+
+	/* return n_blocks, though the last block may not be full */
+	for (j = 0, i = sentry; i < eentry; j++, i++)
+		cc_table_attr->ccti_entries[j].entry =
+			cpu_to_be16(entries[i].entry);
+
+	rcu_read_unlock();
+
+	if (resp_len)
+		*resp_len += sizeof(u16)*(IB_CCT_ENTRIES * n_blocks + 1);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+void cc_state_reclaim(struct rcu_head *rcu)
+{
+	struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu);
+
+	kfree(cc_state);
+}
+
+static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct ib_cc_table_attr *p = (struct ib_cc_table_attr *) data;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u32 start_block = OPA_AM_START_BLK(am);
+	u32 n_blocks = OPA_AM_NBLK(am);
+	struct ib_cc_table_entry_shadow *entries;
+	int i, j;
+	u32 sentry, eentry;
+	u16 ccti_limit;
+	struct cc_state *old_cc_state, *new_cc_state;
+
+	/* sanity check n_blocks, start_block */
+	if (n_blocks == 0 ||
+	    start_block + n_blocks > ppd->cc_max_table_entries) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	sentry = start_block * IB_CCT_ENTRIES;
+	eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) +
+		 (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1;
+
+	/* sanity check ccti_limit */
+	ccti_limit = be16_to_cpu(p->ccti_limit);
+	if (ccti_limit + 1 > eentry) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
+	if (new_cc_state == NULL)
+		goto getit;
+
+	spin_lock(&ppd->cc_state_lock);
+
+	old_cc_state = get_cc_state(ppd);
+
+	if (old_cc_state == NULL) {
+		spin_unlock(&ppd->cc_state_lock);
+		kfree(new_cc_state);
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	*new_cc_state = *old_cc_state;
+
+	new_cc_state->cct.ccti_limit = ccti_limit;
+
+	entries = ppd->ccti_entries;
+	ppd->total_cct_entry = ccti_limit + 1;
+
+	for (j = 0, i = sentry; i < eentry; j++, i++)
+		entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
+
+	memcpy(new_cc_state->cct.entries, entries,
+	       eentry * sizeof(struct ib_cc_table_entry));
+
+	new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
+	new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
+	memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
+	       OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
+
+	rcu_assign_pointer(ppd->cc_state, new_cc_state);
+
+	spin_unlock(&ppd->cc_state_lock);
+
+	call_rcu(&old_cc_state->rcu, cc_state_reclaim);
+
+getit:
+	return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_led_info {
+	__be32 rsvd_led_mask;
+	__be32 rsvd;
+};
+
+#define OPA_LED_SHIFT	31
+#define OPA_LED_MASK	(1 << OPA_LED_SHIFT)
+
+static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct opa_led_info *p = (struct opa_led_info *) data;
+	u32 nport = OPA_AM_NPORT(am);
+	u64 reg;
+
+	if (nport != 1 || OPA_AM_PORTNUM(am)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	reg = read_csr(dd, DCC_CFG_LED_CNTRL);
+	if ((reg & DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK) &&
+		((reg & DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK) == 0xf))
+			p->rsvd_led_mask = cpu_to_be32(OPA_LED_MASK);
+
+	if (resp_len)
+		*resp_len += sizeof(struct opa_led_info);
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+				   struct ib_device *ibdev, u8 port,
+				   u32 *resp_len)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct opa_led_info *p = (struct opa_led_info *) data;
+	u32 nport = OPA_AM_NPORT(am);
+	int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
+
+	if (nport != 1 || OPA_AM_PORTNUM(am)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	setextled(dd, on);
+
+	return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
+}
+
+static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+			    u8 *data, struct ib_device *ibdev, u8 port,
+			    u32 *resp_len)
+{
+	int ret;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+	switch (attr_id) {
+	case IB_SMP_ATTR_NODE_DESC:
+		ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case IB_SMP_ATTR_NODE_INFO:
+		ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case IB_SMP_ATTR_PORT_INFO:
+		ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case IB_SMP_ATTR_PKEY_TABLE:
+		ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
+					       resp_len);
+		break;
+	case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+		ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+		ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+		ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
+					       resp_len);
+		break;
+	case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+		ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+					       resp_len);
+		break;
+	case OPA_ATTRIB_ID_PORT_STATE_INFO:
+		ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
+					 resp_len);
+		break;
+	case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+		ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
+					 resp_len);
+		break;
+	case OPA_ATTRIB_ID_CABLE_INFO:
+		ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
+						resp_len);
+		break;
+	case IB_SMP_ATTR_VL_ARB_TABLE:
+		ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
+					    resp_len);
+		break;
+	case OPA_ATTRIB_ID_CONGESTION_INFO:
+		ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
+					       resp_len);
+		break;
+	case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+		ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
+						  port, resp_len);
+		break;
+	case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
+		ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
+						   port, resp_len);
+		break;
+	case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+		ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case IB_SMP_ATTR_LED_INFO:
+		ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case IB_SMP_ATTR_SM_INFO:
+		if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
+			return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+		if (ibp->port_cap_flags & IB_PORT_SM)
+			return IB_MAD_RESULT_SUCCESS;
+		/* FALLTHROUGH */
+	default:
+		smp->status |= IB_SMP_UNSUP_METH_ATTR;
+		ret = reply((struct ib_mad_hdr *)smp);
+		break;
+	}
+	return ret;
+}
+
+static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+			    u8 *data, struct ib_device *ibdev, u8 port,
+			    u32 *resp_len)
+{
+	int ret;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+	switch (attr_id) {
+	case IB_SMP_ATTR_PORT_INFO:
+		ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case IB_SMP_ATTR_PKEY_TABLE:
+		ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
+					       resp_len);
+		break;
+	case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+		ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+		ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+		ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
+					       resp_len);
+		break;
+	case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+		ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+					       resp_len);
+		break;
+	case OPA_ATTRIB_ID_PORT_STATE_INFO:
+		ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
+					 resp_len);
+		break;
+	case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+		ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
+					 resp_len);
+		break;
+	case IB_SMP_ATTR_VL_ARB_TABLE:
+		ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
+					    resp_len);
+		break;
+	case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+		ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
+						  port, resp_len);
+		break;
+	case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+		ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case IB_SMP_ATTR_LED_INFO:
+		ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
+					      resp_len);
+		break;
+	case IB_SMP_ATTR_SM_INFO:
+		if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
+			return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+		if (ibp->port_cap_flags & IB_PORT_SM)
+			return IB_MAD_RESULT_SUCCESS;
+		/* FALLTHROUGH */
+	default:
+		smp->status |= IB_SMP_UNSUP_METH_ATTR;
+		ret = reply((struct ib_mad_hdr *)smp);
+		break;
+	}
+	return ret;
+}
+
+static inline void set_aggr_error(struct opa_aggregate *ag)
+{
+	ag->err_reqlength |= cpu_to_be16(0x8000);
+}
+
+static int subn_get_opa_aggregate(struct opa_smp *smp,
+				  struct ib_device *ibdev, u8 port,
+				  u32 *resp_len)
+{
+	int i;
+	u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+	u8 *next_smp = opa_get_smp_data(smp);
+
+	if (num_attr < 1 || num_attr > 117) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	for (i = 0; i < num_attr; i++) {
+		struct opa_aggregate *agg;
+		size_t agg_data_len;
+		size_t agg_size;
+		u32 am;
+
+		agg = (struct opa_aggregate *)next_smp;
+		agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+		agg_size = sizeof(*agg) + agg_data_len;
+		am = be32_to_cpu(agg->attr_mod);
+
+		*resp_len += agg_size;
+
+		if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+			smp->status |= IB_SMP_INVALID_FIELD;
+			return reply((struct ib_mad_hdr *)smp);
+		}
+
+		/* zero the payload for this segment */
+		memset(next_smp + sizeof(*agg), 0, agg_data_len);
+
+		(void) subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
+					ibdev, port, NULL);
+		if (smp->status & ~IB_SMP_DIRECTION) {
+			set_aggr_error(agg);
+			return reply((struct ib_mad_hdr *)smp);
+		}
+		next_smp += agg_size;
+
+	}
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_set_opa_aggregate(struct opa_smp *smp,
+				  struct ib_device *ibdev, u8 port,
+				  u32 *resp_len)
+{
+	int i;
+	u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+	u8 *next_smp = opa_get_smp_data(smp);
+
+	if (num_attr < 1 || num_attr > 117) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
+	for (i = 0; i < num_attr; i++) {
+		struct opa_aggregate *agg;
+		size_t agg_data_len;
+		size_t agg_size;
+		u32 am;
+
+		agg = (struct opa_aggregate *)next_smp;
+		agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+		agg_size = sizeof(*agg) + agg_data_len;
+		am = be32_to_cpu(agg->attr_mod);
+
+		*resp_len += agg_size;
+
+		if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+			smp->status |= IB_SMP_INVALID_FIELD;
+			return reply((struct ib_mad_hdr *)smp);
+		}
+
+		(void) subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
+					ibdev, port, NULL);
+		if (smp->status & ~IB_SMP_DIRECTION) {
+			set_aggr_error(agg);
+			return reply((struct ib_mad_hdr *)smp);
+		}
+		next_smp += agg_size;
+
+	}
+
+	return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * OPAv1 specifies that, on the transition to link up, these counters
+ * are cleared:
+ *   PortRcvErrors [*]
+ *   LinkErrorRecovery
+ *   LocalLinkIntegrityErrors
+ *   ExcessiveBufferOverruns [*]
+ *
+ * [*] Error info associated with these counters is retained, but the
+ * error info status is reset to 0.
+ */
+void clear_linkup_counters(struct hfi1_devdata *dd)
+{
+	/* PortRcvErrors */
+	write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+	dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+	/* LinkErrorRecovery */
+	write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+	write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
+	/* LocalLinkIntegrityErrors */
+	write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+	write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+	/* ExcessiveBufferOverruns */
+	write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+	dd->rcv_ovfl_cnt = 0;
+	dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+}
+
+/*
+ * is_local_mad() returns 1 if 'mad' is sent from, and destined to the
+ * local node, 0 otherwise.
+ */
+static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
+			const struct ib_wc *in_wc)
+{
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	const struct opa_smp *smp = (const struct opa_smp *)mad;
+
+	if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		return (smp->hop_cnt == 0 &&
+			smp->route.dr.dr_slid == OPA_LID_PERMISSIVE &&
+			smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE);
+	}
+
+	return (in_wc->slid == ppd->lid);
+}
+
+/*
+ * opa_local_smp_check() should only be called on MADs for which
+ * is_local_mad() returns true. It applies the SMP checks that are
+ * specific to SMPs which are sent from, and destined to this node.
+ * opa_local_smp_check() returns 0 if the SMP passes its checks, 1
+ * otherwise.
+ *
+ * SMPs which arrive from other nodes are instead checked by
+ * opa_smp_check().
+ */
+static int opa_local_smp_check(struct hfi1_ibport *ibp,
+			       const struct ib_wc *in_wc)
+{
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u16 slid = in_wc->slid;
+	u16 pkey;
+
+	if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
+		return 1;
+
+	pkey = ppd->pkeys[in_wc->pkey_index];
+	/*
+	 * We need to do the "node-local" checks specified in OPAv1,
+	 * rev 0.90, section 9.10.26, which are:
+	 *   - pkey is 0x7fff, or 0xffff
+	 *   - Source QPN == 0 || Destination QPN == 0
+	 *   - the MAD header's management class is either
+	 *     IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
+	 *     IB_MGMT_CLASS_SUBN_LID_ROUTED
+	 *   - SLID != 0
+	 *
+	 * However, we know (and so don't need to check again) that,
+	 * for local SMPs, the MAD stack passes MADs with:
+	 *   - Source QPN of 0
+	 *   - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+	 *   - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
+	 *     our own port's lid
+	 *
+	 */
+	if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
+		return 0;
+	ingress_pkey_table_fail(ppd, pkey, slid);
+	return 1;
+}
+
+static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
+			    u8 port, const struct opa_mad *in_mad,
+			    struct opa_mad *out_mad,
+			    u32 *resp_len)
+{
+	struct opa_smp *smp = (struct opa_smp *)out_mad;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	u8 *data;
+	u32 am;
+	__be16 attr_id;
+	int ret;
+
+	*out_mad = *in_mad;
+	data = opa_get_smp_data(smp);
+
+	am = be32_to_cpu(smp->attr_mod);
+	attr_id = smp->attr_id;
+	if (smp->class_version != OPA_SMI_CLASS_VERSION) {
+		smp->status |= IB_SMP_UNSUP_VERSION;
+		ret = reply((struct ib_mad_hdr *)smp);
+		goto bail;
+	}
+	ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
+			 smp->route.dr.dr_slid, smp->route.dr.return_path,
+			 smp->hop_cnt);
+	if (ret) {
+		u32 port_num = be32_to_cpu(smp->attr_mod);
+
+		/*
+		 * If this is a get/set portinfo, we already check the
+		 * M_Key if the MAD is for another port and the M_Key
+		 * is OK on the receiving port. This check is needed
+		 * to increment the error counters when the M_Key
+		 * fails to match on *both* ports.
+		 */
+		if (attr_id == IB_SMP_ATTR_PORT_INFO &&
+		    (smp->method == IB_MGMT_METHOD_GET ||
+		     smp->method == IB_MGMT_METHOD_SET) &&
+		    port_num && port_num <= ibdev->phys_port_cnt &&
+		    port != port_num)
+			(void) check_mkey(to_iport(ibdev, port_num),
+					  (struct ib_mad_hdr *)smp, 0,
+					  smp->mkey, smp->route.dr.dr_slid,
+					  smp->route.dr.return_path,
+					  smp->hop_cnt);
+		ret = IB_MAD_RESULT_FAILURE;
+		goto bail;
+	}
+
+	*resp_len = opa_get_smp_header_size(smp);
+
+	switch (smp->method) {
+	case IB_MGMT_METHOD_GET:
+		switch (attr_id) {
+		default:
+			clear_opa_smp_data(smp);
+			ret = subn_get_opa_sma(attr_id, smp, am, data,
+					       ibdev, port, resp_len);
+			goto bail;
+		case OPA_ATTRIB_ID_AGGREGATE:
+			ret = subn_get_opa_aggregate(smp, ibdev, port,
+						     resp_len);
+			goto bail;
+		}
+	case IB_MGMT_METHOD_SET:
+		switch (attr_id) {
+		default:
+			ret = subn_set_opa_sma(attr_id, smp, am, data,
+					       ibdev, port, resp_len);
+			goto bail;
+		case OPA_ATTRIB_ID_AGGREGATE:
+			ret = subn_set_opa_aggregate(smp, ibdev, port,
+						     resp_len);
+			goto bail;
+		}
+	case IB_MGMT_METHOD_TRAP:
+	case IB_MGMT_METHOD_REPORT:
+	case IB_MGMT_METHOD_REPORT_RESP:
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+	default:
+		smp->status |= IB_SMP_UNSUP_METHOD;
+		ret = reply((struct ib_mad_hdr *)smp);
+	}
+
+bail:
+	return ret;
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+			u8 port, const struct ib_mad *in_mad,
+			struct ib_mad *out_mad)
+{
+	struct ib_smp *smp = (struct ib_smp *)out_mad;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	int ret;
+
+	*out_mad = *in_mad;
+	if (smp->class_version != 1) {
+		smp->status |= IB_SMP_UNSUP_VERSION;
+		ret = reply((struct ib_mad_hdr *)smp);
+		goto bail;
+	}
+
+	ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
+			 smp->mkey, (__force __be32)smp->dr_slid,
+			 smp->return_path, smp->hop_cnt);
+	if (ret) {
+		u32 port_num = be32_to_cpu(smp->attr_mod);
+
+		/*
+		 * If this is a get/set portinfo, we already check the
+		 * M_Key if the MAD is for another port and the M_Key
+		 * is OK on the receiving port. This check is needed
+		 * to increment the error counters when the M_Key
+		 * fails to match on *both* ports.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+		    (smp->method == IB_MGMT_METHOD_GET ||
+		     smp->method == IB_MGMT_METHOD_SET) &&
+		    port_num && port_num <= ibdev->phys_port_cnt &&
+		    port != port_num)
+			(void) check_mkey(to_iport(ibdev, port_num),
+					  (struct ib_mad_hdr *)smp, 0,
+					  smp->mkey,
+					  (__force __be32)smp->dr_slid,
+					  smp->return_path, smp->hop_cnt);
+		ret = IB_MAD_RESULT_FAILURE;
+		goto bail;
+	}
+
+	switch (smp->method) {
+	case IB_MGMT_METHOD_GET:
+		switch (smp->attr_id) {
+		case IB_SMP_ATTR_NODE_INFO:
+			ret = subn_get_nodeinfo(smp, ibdev, port);
+			goto bail;
+		default:
+			smp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_mad_hdr *)smp);
+			goto bail;
+		}
+	}
+
+bail:
+	return ret;
+}
+
+static int process_perf_opa(struct ib_device *ibdev, u8 port,
+			    const struct opa_mad *in_mad,
+			    struct opa_mad *out_mad, u32 *resp_len)
+{
+	struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
+	int ret;
+
+	*out_mad = *in_mad;
+
+	if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+		return reply((struct ib_mad_hdr *)pmp);
+	}
+
+	*resp_len = sizeof(pmp->mad_hdr);
+
+	switch (pmp->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+		switch (pmp->mad_hdr.attr_id) {
+		case IB_PMA_CLASS_PORT_INFO:
+			ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
+			goto bail;
+		case OPA_PM_ATTRIB_ID_PORT_STATUS:
+			ret = pma_get_opa_portstatus(pmp, ibdev, port,
+								resp_len);
+			goto bail;
+		case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
+			ret = pma_get_opa_datacounters(pmp, ibdev, port,
+								resp_len);
+			goto bail;
+		case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
+			ret = pma_get_opa_porterrors(pmp, ibdev, port,
+								resp_len);
+			goto bail;
+		case OPA_PM_ATTRIB_ID_ERROR_INFO:
+			ret = pma_get_opa_errorinfo(pmp, ibdev, port,
+								resp_len);
+			goto bail;
+		default:
+			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_mad_hdr *)pmp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_SET:
+		switch (pmp->mad_hdr.attr_id) {
+		case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
+			ret = pma_set_opa_portstatus(pmp, ibdev, port,
+								resp_len);
+			goto bail;
+		case OPA_PM_ATTRIB_ID_ERROR_INFO:
+			ret = pma_set_opa_errorinfo(pmp, ibdev, port,
+								resp_len);
+			goto bail;
+		default:
+			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_mad_hdr *)pmp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_TRAP:
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+
+	default:
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+		ret = reply((struct ib_mad_hdr *)pmp);
+	}
+
+bail:
+	return ret;
+}
+
+static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
+			        u8 port, const struct ib_wc *in_wc,
+			        const struct ib_grh *in_grh,
+			        const struct opa_mad *in_mad,
+			        struct opa_mad *out_mad, size_t *out_mad_size,
+			        u16 *out_mad_pkey_index)
+{
+	int ret;
+	int pkey_idx;
+	u32 resp_len = 0;
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+	pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+	if (pkey_idx < 0) {
+		pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
+			hfi1_get_pkey(ibp, 1));
+		pkey_idx = 1;
+	}
+	*out_mad_pkey_index = (u16)pkey_idx;
+
+	switch (in_mad->mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+		if (is_local_mad(ibp, in_mad, in_wc)) {
+			ret = opa_local_smp_check(ibp, in_wc);
+			if (ret)
+				return IB_MAD_RESULT_FAILURE;
+		}
+		ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
+				       out_mad, &resp_len);
+		goto bail;
+	case IB_MGMT_CLASS_PERF_MGMT:
+		ret = process_perf_opa(ibdev, port, in_mad, out_mad,
+				       &resp_len);
+		goto bail;
+
+	default:
+		ret = IB_MAD_RESULT_SUCCESS;
+	}
+
+bail:
+	if (ret & IB_MAD_RESULT_REPLY)
+		*out_mad_size = round_up(resp_len, 8);
+	else if (ret & IB_MAD_RESULT_SUCCESS)
+		*out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
+
+	return ret;
+}
+
+static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+			       const struct ib_wc *in_wc,
+			       const struct ib_grh *in_grh,
+			       const struct ib_mad *in_mad,
+			       struct ib_mad *out_mad)
+{
+	int ret;
+
+	switch (in_mad->mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+		ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+		goto bail;
+	default:
+		ret = IB_MAD_RESULT_SUCCESS;
+	}
+
+bail:
+	return ret;
+}
+
+/**
+ * hfi1_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+		     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		     const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+		     struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+		     u16 *out_mad_pkey_index)
+{
+	switch (in_mad->base_version) {
+	case OPA_MGMT_BASE_VERSION:
+		if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
+			dev_err(ibdev->dma_device, "invalid in_mad_size\n");
+			return IB_MAD_RESULT_FAILURE;
+		}
+		return hfi1_process_opa_mad(ibdev, mad_flags, port,
+					    in_wc, in_grh,
+					    (struct opa_mad *)in_mad,
+					    (struct opa_mad *)out_mad,
+					    out_mad_size,
+					    out_mad_pkey_index);
+	case IB_MGMT_BASE_VERSION:
+		return hfi1_process_ib_mad(ibdev, mad_flags, port,
+					  in_wc, in_grh,
+					  (const struct ib_mad *)in_mad,
+					  (struct ib_mad *)out_mad);
+	default:
+		break;
+	}
+
+	return IB_MAD_RESULT_FAILURE;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int hfi1_create_agents(struct hfi1_ibdev *dev)
+{
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+	struct ib_mad_agent *agent;
+	struct hfi1_ibport *ibp;
+	int p;
+	int ret;
+
+	for (p = 0; p < dd->num_pports; p++) {
+		ibp = &dd->pport[p].ibport_data;
+		agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI,
+					      NULL, 0, send_handler,
+					      NULL, NULL, 0);
+		if (IS_ERR(agent)) {
+			ret = PTR_ERR(agent);
+			goto err;
+		}
+
+		ibp->send_agent = agent;
+	}
+
+	return 0;
+
+err:
+	for (p = 0; p < dd->num_pports; p++) {
+		ibp = &dd->pport[p].ibport_data;
+		if (ibp->send_agent) {
+			agent = ibp->send_agent;
+			ibp->send_agent = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+	}
+
+	return ret;
+}
+
+void hfi1_free_agents(struct hfi1_ibdev *dev)
+{
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+	struct ib_mad_agent *agent;
+	struct hfi1_ibport *ibp;
+	int p;
+
+	for (p = 0; p < dd->num_pports; p++) {
+		ibp = &dd->pport[p].ibport_data;
+		if (ibp->send_agent) {
+			agent = ibp->send_agent;
+			ibp->send_agent = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+		if (ibp->sm_ah) {
+			ib_destroy_ah(&ibp->sm_ah->ibah);
+			ibp->sm_ah = NULL;
+		}
+	}
+}
diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/staging/rdma/hfi1/mad.h
new file mode 100644
index 0000000..4745750
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mad.h
@@ -0,0 +1,325 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MAD_H
+#define _HFI1_MAD_H
+
+#include <rdma/ib_pma.h>
+#define USE_PI_LED_ENABLE	1 /* use led enabled bit in struct
+				   * opa_port_states, if available */
+#include <rdma/opa_smi.h>
+#include <rdma/opa_port_info.h>
+#ifndef PI_LED_ENABLE_SUP
+#define PI_LED_ENABLE_SUP 0
+#endif
+#include "opa_compat.h"
+
+
+
+#define IB_VLARB_LOWPRI_0_31    1
+#define IB_VLARB_LOWPRI_32_63   2
+#define IB_VLARB_HIGHPRI_0_31   3
+#define IB_VLARB_HIGHPRI_32_63  4
+
+#define OPA_MAX_PREEMPT_CAP         32
+#define OPA_VLARB_LOW_ELEMENTS       0
+#define OPA_VLARB_HIGH_ELEMENTS      1
+#define OPA_VLARB_PREEMPT_ELEMENTS   2
+#define OPA_VLARB_PREEMPT_MATRIX     3
+
+#define IB_PMA_PORT_COUNTERS_CONG       cpu_to_be16(0xFF00)
+
+struct ib_pma_portcounters_cong {
+	u8 reserved;
+	u8 reserved1;
+	__be16 port_check_rate;
+	__be16 symbol_error_counter;
+	u8 link_error_recovery_counter;
+	u8 link_downed_counter;
+	__be16 port_rcv_errors;
+	__be16 port_rcv_remphys_errors;
+	__be16 port_rcv_switch_relay_errors;
+	__be16 port_xmit_discards;
+	u8 port_xmit_constraint_errors;
+	u8 port_rcv_constraint_errors;
+	u8 reserved2;
+	u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+	__be16 reserved3;
+	__be16 vl15_dropped;
+	__be64 port_xmit_data;
+	__be64 port_rcv_data;
+	__be64 port_xmit_packets;
+	__be64 port_rcv_packets;
+	__be64 port_xmit_wait;
+	__be64 port_adr_events;
+} __packed;
+
+#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
+
+#define OPA_MAX_PREEMPT_CAP         32
+#define OPA_VLARB_LOW_ELEMENTS       0
+#define OPA_VLARB_HIGH_ELEMENTS      1
+#define OPA_VLARB_PREEMPT_ELEMENTS   2
+#define OPA_VLARB_PREEMPT_MATRIX     3
+
+#define HFI1_XMIT_RATE_UNSUPPORTED               0x0
+#define HFI1_XMIT_RATE_PICO                      0x7
+/* number of 4nsec cycles equaling 2secs */
+#define HFI1_CONG_TIMER_PSINTERVAL               0x1DCD64EC
+
+#define IB_CC_SVCTYPE_RC 0x0
+#define IB_CC_SVCTYPE_UC 0x1
+#define IB_CC_SVCTYPE_RD 0x2
+#define IB_CC_SVCTYPE_UD 0x3
+
+
+/*
+ * There should be an equivalent IB #define for the following, but
+ * I cannot find it.
+ */
+#define OPA_CC_LOG_TYPE_HFI	2
+
+struct opa_hfi1_cong_log_event_internal {
+	u32 lqpn;
+	u32 rqpn;
+	u8 sl;
+	u8 svc_type;
+	u32 rlid;
+	s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
+};
+
+struct opa_hfi1_cong_log_event {
+	u8 local_qp_cn_entry[3];
+	u8 remote_qp_number_cn_entry[3];
+	u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
+	u8 reserved;
+	__be32 remote_lid_cn_entry;
+	__be32 timestamp_cn_entry;
+} __packed;
+
+#define OPA_CONG_LOG_ELEMS	96
+
+struct opa_hfi1_cong_log {
+	u8 log_type;
+	u8 congestion_flags;
+	__be16 threshold_event_counter;
+	__be32 current_time_stamp;
+	u8 threshold_cong_event_map[OPA_MAX_SLS/8];
+	struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
+} __packed;
+
+#define IB_CC_TABLE_CAP_DEFAULT 31
+
+/* Port control flags */
+#define IB_CC_CCS_PC_SL_BASED 0x01
+
+struct opa_congestion_setting_entry {
+	u8 ccti_increase;
+	u8 reserved;
+	__be16 ccti_timer;
+	u8 trigger_threshold;
+	u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_entry_shadow {
+	u8 ccti_increase;
+	u8 reserved;
+	u16 ccti_timer;
+	u8 trigger_threshold;
+	u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_attr {
+	__be32 control_map;
+	__be16 port_control;
+	struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
+} __packed;
+
+struct opa_congestion_setting_attr_shadow {
+	u32 control_map;
+	u16 port_control;
+	struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
+} __packed;
+
+#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
+#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
+
+/* 64 Congestion Control table entries in a single MAD */
+#define IB_CCT_ENTRIES 64
+#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
+
+struct ib_cc_table_entry {
+	__be16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_entry_shadow {
+	u16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_attr {
+	__be16 ccti_limit; /* max CCTI for cc table */
+	struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+struct ib_cc_table_attr_shadow {
+	u16 ccti_limit; /* max CCTI for cc table */
+	struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+#define CC_TABLE_SHADOW_MAX \
+	(IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
+
+struct cc_table_shadow {
+	u16 ccti_limit; /* max CCTI for cc table */
+	struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
+} __packed;
+
+/*
+ * struct cc_state combines the (active) per-port congestion control
+ * table, and the (active) per-SL congestion settings. cc_state data
+ * may need to be read in code paths that we want to be fast, so it
+ * is an RCU protected structure.
+ */
+struct cc_state {
+	struct rcu_head rcu;
+	struct cc_table_shadow cct;
+	struct opa_congestion_setting_attr_shadow cong_setting;
+};
+
+/*
+ * OPA BufferControl MAD
+ */
+
+/* attribute modifier macros */
+#define OPA_AM_NPORT_SHIFT	24
+#define OPA_AM_NPORT_MASK	0xff
+#define OPA_AM_NPORT_SMASK	(OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
+#define OPA_AM_NPORT(am)	(((am) >> OPA_AM_NPORT_SHIFT) & \
+					OPA_AM_NPORT_MASK)
+
+#define OPA_AM_NBLK_SHIFT	24
+#define OPA_AM_NBLK_MASK	0xff
+#define OPA_AM_NBLK_SMASK	(OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
+#define OPA_AM_NBLK(am)		(((am) >> OPA_AM_NBLK_SHIFT) & \
+					OPA_AM_NBLK_MASK)
+
+#define OPA_AM_START_BLK_SHIFT	0
+#define OPA_AM_START_BLK_MASK	0xff
+#define OPA_AM_START_BLK_SMASK	(OPA_AM_START_BLK_MASK << \
+					OPA_AM_START_BLK_SHIFT)
+#define OPA_AM_START_BLK(am)	(((am) >> OPA_AM_START_BLK_SHIFT) & \
+					OPA_AM_START_BLK_MASK)
+
+#define OPA_AM_PORTNUM_SHIFT	0
+#define OPA_AM_PORTNUM_MASK	0xff
+#define OPA_AM_PORTNUM_SMASK	(OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
+#define OPA_AM_PORTNUM(am)	(((am) >> OPA_AM_PORTNUM_SHIFT) & \
+					OPA_AM_PORTNUM_MASK)
+
+#define OPA_AM_ASYNC_SHIFT	12
+#define OPA_AM_ASYNC_MASK	0x1
+#define OPA_AM_ASYNC_SMASK	(OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
+#define OPA_AM_ASYNC(am)	(((am) >> OPA_AM_ASYNC_SHIFT) & \
+					OPA_AM_ASYNC_MASK)
+
+#define OPA_AM_START_SM_CFG_SHIFT	9
+#define OPA_AM_START_SM_CFG_MASK	0x1
+#define OPA_AM_START_SM_CFG_SMASK	(OPA_AM_START_SM_CFG_MASK << \
+						OPA_AM_START_SM_CFG_SHIFT)
+#define OPA_AM_START_SM_CFG(am)		(((am) >> OPA_AM_START_SM_CFG_SHIFT) \
+						& OPA_AM_START_SM_CFG_MASK)
+
+#define OPA_AM_CI_ADDR_SHIFT	19
+#define OPA_AM_CI_ADDR_MASK	0xfff
+#define OPA_AM_CI_ADDR_SMASK	(OPA_AM_CI_ADDR_MASK << OPA_CI_ADDR_SHIFT)
+#define OPA_AM_CI_ADDR(am)	(((am) >> OPA_AM_CI_ADDR_SHIFT) & \
+					OPA_AM_CI_ADDR_MASK)
+
+#define OPA_AM_CI_LEN_SHIFT	13
+#define OPA_AM_CI_LEN_MASK	0x3f
+#define OPA_AM_CI_LEN_SMASK	(OPA_AM_CI_LEN_MASK << OPA_CI_LEN_SHIFT)
+#define OPA_AM_CI_LEN(am)	(((am) >> OPA_AM_CI_LEN_SHIFT) & \
+					OPA_AM_CI_LEN_MASK)
+
+/* error info macros */
+#define OPA_EI_STATUS_SMASK	0x80
+#define OPA_EI_CODE_SMASK	0x0f
+
+struct vl_limit {
+	__be16 dedicated;
+	__be16 shared;
+};
+
+struct buffer_control {
+	__be16 reserved;
+	__be16 overall_shared_limit;
+	struct vl_limit vl[OPA_MAX_VLS];
+};
+
+struct sc2vlnt {
+	u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
+};
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 \
+	cpu_to_be32(COUNTER_MASK(1, 0) | \
+		    COUNTER_MASK(1, 1) | \
+		    COUNTER_MASK(1, 2) | \
+		    COUNTER_MASK(1, 3) | \
+		    COUNTER_MASK(1, 4))
+
+#endif				/* _HFI1_MAD_H */
diff --git a/drivers/staging/rdma/hfi1/mmap.c b/drivers/staging/rdma/hfi1/mmap.c
new file mode 100644
index 0000000..5173b1c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mmap.c
@@ -0,0 +1,192 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "verbs.h"
+
+/**
+ * hfi1_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct hfi1_mmap_info
+ */
+void hfi1_release_mmap_info(struct kref *ref)
+{
+	struct hfi1_mmap_info *ip =
+		container_of(ref, struct hfi1_mmap_info, ref);
+	struct hfi1_ibdev *dev = to_idev(ip->context->device);
+
+	spin_lock_irq(&dev->pending_lock);
+	list_del(&ip->pending_mmaps);
+	spin_unlock_irq(&dev->pending_lock);
+
+	vfree(ip->obj);
+	kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the CQ is mapped,
+ * to avoid releasing it.
+ */
+static void hfi1_vma_open(struct vm_area_struct *vma)
+{
+	struct hfi1_mmap_info *ip = vma->vm_private_data;
+
+	kref_get(&ip->ref);
+}
+
+static void hfi1_vma_close(struct vm_area_struct *vma)
+{
+	struct hfi1_mmap_info *ip = vma->vm_private_data;
+
+	kref_put(&ip->ref, hfi1_release_mmap_info);
+}
+
+static struct vm_operations_struct hfi1_vm_ops = {
+	.open =     hfi1_vma_open,
+	.close =    hfi1_vma_close,
+};
+
+/**
+ * hfi1_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	struct hfi1_ibdev *dev = to_idev(context->device);
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct hfi1_mmap_info *ip, *pp;
+	int ret = -EINVAL;
+
+	/*
+	 * Search the device's list of objects waiting for a mmap call.
+	 * Normally, this list is very short since a call to create a
+	 * CQ, QP, or SRQ is soon followed by a call to mmap().
+	 */
+	spin_lock_irq(&dev->pending_lock);
+	list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+				 pending_mmaps) {
+		/* Only the creator is allowed to mmap the object */
+		if (context != ip->context || (__u64) offset != ip->offset)
+			continue;
+		/* Don't allow a mmap larger than the object. */
+		if (size > ip->size)
+			break;
+
+		list_del_init(&ip->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+
+		ret = remap_vmalloc_range(vma, ip->obj, 0);
+		if (ret)
+			goto done;
+		vma->vm_ops = &hfi1_vm_ops;
+		vma->vm_private_data = ip;
+		hfi1_vma_open(vma);
+		goto done;
+	}
+	spin_unlock_irq(&dev->pending_lock);
+done:
+	return ret;
+}
+
+/*
+ * Allocate information for hfi1_mmap
+ */
+struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev,
+					     u32 size,
+					     struct ib_ucontext *context,
+					     void *obj) {
+	struct hfi1_mmap_info *ip;
+
+	ip = kmalloc(sizeof(*ip), GFP_KERNEL);
+	if (!ip)
+		goto bail;
+
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irq(&dev->mmap_offset_lock);
+	if (dev->mmap_offset == 0)
+		dev->mmap_offset = PAGE_SIZE;
+	ip->offset = dev->mmap_offset;
+	dev->mmap_offset += size;
+	spin_unlock_irq(&dev->mmap_offset_lock);
+
+	INIT_LIST_HEAD(&ip->pending_mmaps);
+	ip->size = size;
+	ip->context = context;
+	ip->obj = obj;
+	kref_init(&ip->ref);
+
+bail:
+	return ip;
+}
+
+void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip,
+			   u32 size, void *obj)
+{
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irq(&dev->mmap_offset_lock);
+	if (dev->mmap_offset == 0)
+		dev->mmap_offset = PAGE_SIZE;
+	ip->offset = dev->mmap_offset;
+	dev->mmap_offset += size;
+	spin_unlock_irq(&dev->mmap_offset_lock);
+
+	ip->size = size;
+	ip->obj = obj;
+}
diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c
new file mode 100644
index 0000000..23567f8
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mr.c
@@ -0,0 +1,546 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+
+/* Fast memory region */
+struct hfi1_fmr {
+	struct ib_fmr ibfmr;
+	struct hfi1_mregion mr;        /* must be last */
+};
+
+static inline struct hfi1_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct hfi1_fmr, ibfmr);
+}
+
+static int init_mregion(struct hfi1_mregion *mr, struct ib_pd *pd,
+			int count)
+{
+	int m, i = 0;
+	int rval = 0;
+
+	m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+	for (; i < m; i++) {
+		mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
+		if (!mr->map[i])
+			goto bail;
+	}
+	mr->mapsz = m;
+	init_completion(&mr->comp);
+	/* count returning the ptr to user */
+	atomic_set(&mr->refcount, 1);
+	mr->pd = pd;
+	mr->max_segs = count;
+out:
+	return rval;
+bail:
+	while (i)
+		kfree(mr->map[--i]);
+	rval = -ENOMEM;
+	goto out;
+}
+
+static void deinit_mregion(struct hfi1_mregion *mr)
+{
+	int i = mr->mapsz;
+
+	mr->mapsz = 0;
+	while (i)
+		kfree(mr->map[--i]);
+}
+
+
+/**
+ * hfi1_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see dma.c).
+ */
+struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct hfi1_mr *mr = NULL;
+	struct ib_mr *ret;
+	int rval;
+
+	if (to_ipd(pd)->user) {
+		ret = ERR_PTR(-EPERM);
+		goto bail;
+	}
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	rval = init_mregion(&mr->mr, pd, 0);
+	if (rval) {
+		ret = ERR_PTR(rval);
+		goto bail;
+	}
+
+
+	rval = hfi1_alloc_lkey(&mr->mr, 1);
+	if (rval) {
+		ret = ERR_PTR(rval);
+		goto bail_mregion;
+	}
+
+	mr->mr.access_flags = acc;
+	ret = &mr->ibmr;
+done:
+	return ret;
+
+bail_mregion:
+	deinit_mregion(&mr->mr);
+bail:
+	kfree(mr);
+	goto done;
+}
+
+static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd)
+{
+	struct hfi1_mr *mr;
+	int rval = -ENOMEM;
+	int m;
+
+	/* Allocate struct plus pointers to first level page tables. */
+	m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+	mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL);
+	if (!mr)
+		goto bail;
+
+	rval = init_mregion(&mr->mr, pd, count);
+	if (rval)
+		goto bail;
+	/*
+	 * ib_reg_phys_mr() will initialize mr->ibmr except for
+	 * lkey and rkey.
+	 */
+	rval = hfi1_alloc_lkey(&mr->mr, 0);
+	if (rval)
+		goto bail_mregion;
+	mr->ibmr.lkey = mr->mr.lkey;
+	mr->ibmr.rkey = mr->mr.lkey;
+done:
+	return mr;
+
+bail_mregion:
+	deinit_mregion(&mr->mr);
+bail:
+	kfree(mr);
+	mr = ERR_PTR(rval);
+	goto done;
+}
+
+/**
+ * hfi1_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
+			       struct ib_phys_buf *buffer_list,
+			       int num_phys_buf, int acc, u64 *iova_start)
+{
+	struct hfi1_mr *mr;
+	int n, m, i;
+	struct ib_mr *ret;
+
+	mr = alloc_mr(num_phys_buf, pd);
+	if (IS_ERR(mr)) {
+		ret = (struct ib_mr *)mr;
+		goto bail;
+	}
+
+	mr->mr.user_base = *iova_start;
+	mr->mr.iova = *iova_start;
+	mr->mr.access_flags = acc;
+
+	m = 0;
+	n = 0;
+	for (i = 0; i < num_phys_buf; i++) {
+		mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+		mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+		mr->mr.length += buffer_list[i].size;
+		n++;
+		if (n == HFI1_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+/**
+ * hfi1_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			       u64 virt_addr, int mr_access_flags,
+			       struct ib_udata *udata)
+{
+	struct hfi1_mr *mr;
+	struct ib_umem *umem;
+	struct scatterlist *sg;
+	int n, m, entry;
+	struct ib_mr *ret;
+
+	if (length == 0) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	umem = ib_umem_get(pd->uobject->context, start, length,
+			   mr_access_flags, 0);
+	if (IS_ERR(umem))
+		return (void *) umem;
+
+	n = umem->nmap;
+
+	mr = alloc_mr(n, pd);
+	if (IS_ERR(mr)) {
+		ret = (struct ib_mr *)mr;
+		ib_umem_release(umem);
+		goto bail;
+	}
+
+	mr->mr.user_base = start;
+	mr->mr.iova = virt_addr;
+	mr->mr.length = length;
+	mr->mr.offset = ib_umem_offset(umem);
+	mr->mr.access_flags = mr_access_flags;
+	mr->umem = umem;
+
+	if (is_power_of_2(umem->page_size))
+		mr->mr.page_shift = ilog2(umem->page_size);
+	m = 0;
+	n = 0;
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+			void *vaddr;
+
+			vaddr = page_address(sg_page(sg));
+			if (!vaddr) {
+				ret = ERR_PTR(-EINVAL);
+				goto bail;
+			}
+			mr->mr.map[m]->segs[n].vaddr = vaddr;
+			mr->mr.map[m]->segs[n].length = umem->page_size;
+			n++;
+			if (n == HFI1_SEGSZ) {
+				m++;
+				n = 0;
+			}
+	}
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+/**
+ * hfi1_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by hfi1_get_dma_mr()
+ * or hfi1_reg_user_mr().
+ */
+int hfi1_dereg_mr(struct ib_mr *ibmr)
+{
+	struct hfi1_mr *mr = to_imr(ibmr);
+	int ret = 0;
+	unsigned long timeout;
+
+	hfi1_free_lkey(&mr->mr);
+
+	hfi1_put_mr(&mr->mr); /* will set completion if last */
+	timeout = wait_for_completion_timeout(&mr->mr.comp,
+		5 * HZ);
+	if (!timeout) {
+		dd_dev_err(
+			dd_from_ibdev(mr->mr.pd->device),
+			"hfi1_dereg_mr timeout mr %p pd %p refcount %u\n",
+			mr, mr->mr.pd, atomic_read(&mr->mr.refcount));
+		hfi1_get_mr(&mr->mr);
+		ret = -EBUSY;
+		goto out;
+	}
+	deinit_mregion(&mr->mr);
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+	kfree(mr);
+out:
+	return ret;
+}
+
+/*
+ * Allocate a memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ *
+ * Return the memory region on success, otherwise return an errno.
+ */
+struct ib_mr *hfi1_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct hfi1_mr *mr;
+
+	mr = alloc_mr(max_page_list_len, pd);
+	if (IS_ERR(mr))
+		return (struct ib_mr *)mr;
+
+	return &mr->ibmr;
+}
+
+struct ib_fast_reg_page_list *
+hfi1_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len)
+{
+	unsigned size = page_list_len * sizeof(u64);
+	struct ib_fast_reg_page_list *pl;
+
+	if (size > PAGE_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	pl = kzalloc(sizeof(*pl), GFP_KERNEL);
+	if (!pl)
+		return ERR_PTR(-ENOMEM);
+
+	pl->page_list = kzalloc(size, GFP_KERNEL);
+	if (!pl->page_list)
+		goto err_free;
+
+	return pl;
+
+err_free:
+	kfree(pl);
+	return ERR_PTR(-ENOMEM);
+}
+
+void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl)
+{
+	kfree(pl->page_list);
+	kfree(pl);
+}
+
+/**
+ * hfi1_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			      struct ib_fmr_attr *fmr_attr)
+{
+	struct hfi1_fmr *fmr;
+	int m;
+	struct ib_fmr *ret;
+	int rval = -ENOMEM;
+
+	/* Allocate struct plus pointers to first level page tables. */
+	m = (fmr_attr->max_pages + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+	fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL);
+	if (!fmr)
+		goto bail;
+
+	rval = init_mregion(&fmr->mr, pd, fmr_attr->max_pages);
+	if (rval)
+		goto bail;
+
+	/*
+	 * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+	 * rkey.
+	 */
+	rval = hfi1_alloc_lkey(&fmr->mr, 0);
+	if (rval)
+		goto bail_mregion;
+	fmr->ibfmr.rkey = fmr->mr.lkey;
+	fmr->ibfmr.lkey = fmr->mr.lkey;
+	/*
+	 * Resources are allocated but no valid mapping (RKEY can't be
+	 * used).
+	 */
+	fmr->mr.access_flags = mr_access_flags;
+	fmr->mr.max_segs = fmr_attr->max_pages;
+	fmr->mr.page_shift = fmr_attr->page_shift;
+
+	ret = &fmr->ibfmr;
+done:
+	return ret;
+
+bail_mregion:
+	deinit_mregion(&fmr->mr);
+bail:
+	kfree(fmr);
+	ret = ERR_PTR(rval);
+	goto done;
+}
+
+/**
+ * hfi1_map_phys_fmr - set up a fast memory region
+ * @ibmfr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+
+int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		      int list_len, u64 iova)
+{
+	struct hfi1_fmr *fmr = to_ifmr(ibfmr);
+	struct hfi1_lkey_table *rkt;
+	unsigned long flags;
+	int m, n, i;
+	u32 ps;
+	int ret;
+
+	i = atomic_read(&fmr->mr.refcount);
+	if (i > 2)
+		return -EBUSY;
+
+	if (list_len > fmr->mr.max_segs) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	rkt = &to_idev(ibfmr->device)->lk_table;
+	spin_lock_irqsave(&rkt->lock, flags);
+	fmr->mr.user_base = iova;
+	fmr->mr.iova = iova;
+	ps = 1 << fmr->mr.page_shift;
+	fmr->mr.length = list_len * ps;
+	m = 0;
+	n = 0;
+	for (i = 0; i < list_len; i++) {
+		fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+		fmr->mr.map[m]->segs[n].length = ps;
+		if (++n == HFI1_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * hfi1_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int hfi1_unmap_fmr(struct list_head *fmr_list)
+{
+	struct hfi1_fmr *fmr;
+	struct hfi1_lkey_table *rkt;
+	unsigned long flags;
+
+	list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+		rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+		spin_lock_irqsave(&rkt->lock, flags);
+		fmr->mr.user_base = 0;
+		fmr->mr.iova = 0;
+		fmr->mr.length = 0;
+		spin_unlock_irqrestore(&rkt->lock, flags);
+	}
+	return 0;
+}
+
+/**
+ * hfi1_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int hfi1_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+	struct hfi1_fmr *fmr = to_ifmr(ibfmr);
+	int ret = 0;
+	unsigned long timeout;
+
+	hfi1_free_lkey(&fmr->mr);
+	hfi1_put_mr(&fmr->mr); /* will set completion if last */
+	timeout = wait_for_completion_timeout(&fmr->mr.comp,
+		5 * HZ);
+	if (!timeout) {
+		hfi1_get_mr(&fmr->mr);
+		ret = -EBUSY;
+		goto out;
+	}
+	deinit_mregion(&fmr->mr);
+	kfree(fmr);
+out:
+	return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/staging/rdma/hfi1/opa_compat.h
new file mode 100644
index 0000000..f64eec1
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/opa_compat.h
@@ -0,0 +1,129 @@
+#ifndef _LINUX_H
+#define _LINUX_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This header file is for OPA-specific definitions which are
+ * required by the HFI driver, and which aren't yet in the Linux
+ * IB core. We'll collect these all here, then merge them into
+ * the kernel when that's convenient.
+ */
+
+/* OPA SMA attribute IDs */
+#define OPA_ATTRIB_ID_CONGESTION_INFO		cpu_to_be16(0x008b)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG	cpu_to_be16(0x008f)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING	cpu_to_be16(0x0090)
+#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE	cpu_to_be16(0x0091)
+
+/* OPA PMA attribute IDs */
+#define OPA_PM_ATTRIB_ID_PORT_STATUS		cpu_to_be16(0x0040)
+#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS	cpu_to_be16(0x0041)
+#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS	cpu_to_be16(0x0042)
+#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS	cpu_to_be16(0x0043)
+#define OPA_PM_ATTRIB_ID_ERROR_INFO		cpu_to_be16(0x0044)
+
+/* OPA status codes */
+#define OPA_PM_STATUS_REQUEST_TOO_LARGE		cpu_to_be16(0x100)
+
+static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
+{
+	return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
+}
+
+static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
+{
+	return ((ps->portphysstate_portstate &
+		  OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf;
+}
+
+/*
+ * OPA port physical states
+ * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
+ * values.
+ *
+ * When writing, only values 0-3 are valid, other values are ignored.
+ * When reading, 0 is reserved.
+ *
+ * Returned by the ibphys_portstate() routine.
+ */
+enum opa_port_phys_state {
+	IB_PORTPHYSSTATE_NOP = 0,
+	/* 1 is reserved */
+	IB_PORTPHYSSTATE_POLLING = 2,
+	IB_PORTPHYSSTATE_DISABLED = 3,
+	IB_PORTPHYSSTATE_TRAINING = 4,
+	IB_PORTPHYSSTATE_LINKUP = 5,
+	IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
+	IB_PORTPHYSSTATE_PHY_TEST = 7,
+	/* 8 is reserved */
+	OPA_PORTPHYSSTATE_OFFLINE = 9,
+	OPA_PORTPHYSSTATE_GANGED = 10,
+	OPA_PORTPHYSSTATE_TEST = 11,
+	OPA_PORTPHYSSTATE_MAX = 11,
+	/* values 12-15 are reserved/ignored */
+};
+
+/* OPA_PORT_TYPE_* definitions - these belong in opa_port_info.h */
+#define OPA_PORT_TYPE_UNKNOWN          0
+#define OPA_PORT_TYPE_DISCONNECTED     1
+/* port is not currently usable, CableInfo not available */
+#define OPA_PORT_TYPE_FIXED            2
+/* A fixed backplane port in a director class switch. All OPA ASICS */
+#define OPA_PORT_TYPE_VARIABLE         3
+/* A backplane port in a blade system, possibly mixed configuration */
+#define OPA_PORT_TYPE_STANDARD         4
+/* implies a SFF-8636 defined format for CableInfo (QSFP) */
+#define OPA_PORT_TYPE_SI_PHOTONICS      5
+/* A silicon photonics module implies TBD defined format for CableInfo
+ * as defined by Intel SFO group */
+/* 6 - 15 are reserved */
+
+#endif /* _LINUX_H */
diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/staging/rdma/hfi1/pcie.c
new file mode 100644
index 0000000..ac5653c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pcie.c
@@ -0,0 +1,1253 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/aer.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "chip_registers.h"
+
+/* link speed vector for Gen3 speed - not in Linux headers */
+#define GEN1_SPEED_VECTOR 0x1
+#define GEN2_SPEED_VECTOR 0x2
+#define GEN3_SPEED_VECTOR 0x3
+
+/*
+ * This file contains PCIe utility routines.
+ */
+
+/*
+ * Code to adjust PCIe capabilities.
+ */
+static void tune_pcie_caps(struct hfi1_devdata *);
+
+/*
+ * Do all the common PCIe setup and initialization.
+ * devdata is not yet allocated, and is not allocated until after this
+ * routine returns success.  Therefore dd_dev_err() can't be used for error
+ * printing.
+ */
+int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int ret;
+
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		/*
+		 * This can happen (in theory) iff:
+		 * We did a chip reset, and then failed to reprogram the
+		 * BAR, or the chip reset due to an internal error.  We then
+		 * unloaded the driver and reloaded it.
+		 *
+		 * Both reset cases set the BAR back to initial state.  For
+		 * the latter case, the AER sticky error bit at offset 0x718
+		 * should be set, but the Linux kernel doesn't yet know
+		 * about that, it appears.  If the original BAR was retained
+		 * in the kernel data structures, this may be OK.
+		 */
+		hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
+			       -ret);
+		goto done;
+	}
+
+	ret = pci_request_regions(pdev, DRIVER_NAME);
+	if (ret) {
+		hfi1_early_err(&pdev->dev,
+			       "pci_request_regions fails: err %d\n", -ret);
+		goto bail;
+	}
+
+	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (ret) {
+		/*
+		 * If the 64 bit setup fails, try 32 bit.  Some systems
+		 * do not setup 64 bit maps on systems with 2GB or less
+		 * memory installed.
+		 */
+		ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (ret) {
+			hfi1_early_err(&pdev->dev,
+				       "Unable to set DMA mask: %d\n", ret);
+			goto bail;
+		}
+		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+	} else
+		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (ret) {
+		hfi1_early_err(&pdev->dev,
+			       "Unable to set DMA consistent mask: %d\n", ret);
+		goto bail;
+	}
+
+	pci_set_master(pdev);
+	ret = pci_enable_pcie_error_reporting(pdev);
+	if (ret) {
+		hfi1_early_err(&pdev->dev,
+			       "Unable to enable pcie error reporting: %d\n",
+			      ret);
+		ret = 0;
+	}
+	goto done;
+
+bail:
+	hfi1_pcie_cleanup(pdev);
+done:
+	return ret;
+}
+
+/*
+ * Clean what was done in hfi1_pcie_init()
+ */
+void hfi1_pcie_cleanup(struct pci_dev *pdev)
+{
+	pci_disable_device(pdev);
+	/*
+	 * Release regions should be called after the disable. OK to
+	 * call if request regions has not been called or failed.
+	 */
+	pci_release_regions(pdev);
+}
+
+/*
+ * Do remaining PCIe setup, once dd is allocated, and save away
+ * fields required to re-initialize after a chip reset, or for
+ * various other purposes
+ */
+int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
+		     const struct pci_device_id *ent)
+{
+	unsigned long len;
+	resource_size_t addr;
+
+	dd->pcidev = pdev;
+	pci_set_drvdata(pdev, dd);
+
+	addr = pci_resource_start(pdev, 0);
+	len = pci_resource_len(pdev, 0);
+
+	/*
+	 * The TXE PIO buffers are at the tail end of the chip space.
+	 * Cut them off and map them separately.
+	 */
+
+	/* sanity check vs expectations */
+	if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
+		dd_dev_err(dd, "chip PIO range does not match\n");
+		return -EINVAL;
+	}
+
+	dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
+	if (!dd->kregbase)
+		return -ENOMEM;
+
+	dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
+	if (!dd->piobase) {
+		iounmap(dd->kregbase);
+		return -ENOMEM;
+	}
+
+	dd->flags |= HFI1_PRESENT;	/* now register routines work */
+
+	dd->kregend = dd->kregbase + TXE_PIO_SEND;
+	dd->physaddr = addr;        /* used for io_remap, etc. */
+
+	/*
+	 * Re-map the chip's RcvArray as write-combining to allow us
+	 * to write an entire cacheline worth of entries in one shot.
+	 * If this re-map fails, just continue - the RcvArray programming
+	 * function will handle both cases.
+	 */
+	dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
+	dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
+				     dd->chip_rcv_array_count * 8);
+	dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
+	/*
+	 * Save BARs and command to rewrite after device reset.
+	 */
+	dd->pcibar0 = addr;
+	dd->pcibar1 = addr >> 32;
+	pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
+	pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
+	pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
+	pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
+	pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
+							&dd->pcie_devctl2);
+	pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
+	pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
+							&dd->pci_lnkctl3);
+	pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
+
+	return 0;
+}
+
+/*
+ * Do PCIe cleanup related to dd, after chip-specific cleanup, etc.  Just prior
+ * to releasing the dd memory.
+ * Void because all of the core pcie cleanup functions are void.
+ */
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
+{
+	u64 __iomem *base = (void __iomem *) dd->kregbase;
+
+	dd->flags &= ~HFI1_PRESENT;
+	dd->kregbase = NULL;
+	iounmap(base);
+	if (dd->rcvarray_wc)
+		iounmap(dd->rcvarray_wc);
+	if (dd->piobase)
+		iounmap(dd->piobase);
+
+	pci_set_drvdata(dd->pcidev, NULL);
+}
+
+/*
+ * Do a Function Level Reset (FLR) on the device.
+ * Based on static function drivers/pci/pci.c:pcie_flr().
+ */
+void hfi1_pcie_flr(struct hfi1_devdata *dd)
+{
+	int i;
+	u16 status;
+
+	/* no need to check for the capability - we know the device has it */
+
+	/* wait for Transaction Pending bit to clear, at most a few ms */
+	for (i = 0; i < 4; i++) {
+		if (i)
+			msleep((1 << (i - 1)) * 100);
+
+		pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status);
+		if (!(status & PCI_EXP_DEVSTA_TRPND))
+			goto clear;
+	}
+
+	dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
+
+clear:
+	pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
+						PCI_EXP_DEVCTL_BCR_FLR);
+	/* PCIe spec requires the function to be back within 100ms */
+	msleep(100);
+}
+
+static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
+		       struct hfi1_msix_entry *hfi1_msix_entry)
+{
+	int ret;
+	int nvec = *msixcnt;
+	struct msix_entry *msix_entry;
+	int i;
+
+	/* We can't pass hfi1_msix_entry array to msix_setup
+	 * so use a dummy msix_entry array and copy the allocated
+	 * irq back to the hfi1_msix_entry array. */
+	msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
+	if (!msix_entry) {
+		ret = -ENOMEM;
+		goto do_intx;
+	}
+
+	for (i = 0; i < nvec; i++)
+		msix_entry[i] = hfi1_msix_entry[i].msix;
+
+	ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
+	if (ret < 0)
+		goto free_msix_entry;
+	nvec = ret;
+
+	for (i = 0; i < nvec; i++)
+		hfi1_msix_entry[i].msix = msix_entry[i];
+
+	kfree(msix_entry);
+	*msixcnt = nvec;
+	return;
+
+free_msix_entry:
+	kfree(msix_entry);
+
+do_intx:
+	dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
+		   nvec, ret);
+	*msixcnt = 0;
+	hfi1_enable_intx(dd->pcidev);
+
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_speed(u16 linkstat)
+{
+	u32 speed;
+
+	switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+	default: /* not defined, assume Gen1 */
+	case PCI_EXP_LNKSTA_CLS_2_5GB:
+		speed = 2500; /* Gen 1, 2.5GHz */
+		break;
+	case PCI_EXP_LNKSTA_CLS_5_0GB:
+		speed = 5000; /* Gen 2, 5GHz */
+		break;
+	case GEN3_SPEED_VECTOR:
+		speed = 8000; /* Gen 3, 8GHz */
+		break;
+	}
+	return speed;
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_width(u16 linkstat)
+{
+	return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
+}
+
+/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */
+static void update_lbus_info(struct hfi1_devdata *dd)
+{
+	u16 linkstat;
+
+	pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
+	dd->lbus_width = extract_width(linkstat);
+	dd->lbus_speed = extract_speed(linkstat);
+	snprintf(dd->lbus_info, sizeof(dd->lbus_info),
+		 "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width);
+}
+
+/*
+ * Read in the current PCIe link width and speed.  Find if the link is
+ * Gen3 capable.
+ */
+int pcie_speeds(struct hfi1_devdata *dd)
+{
+	u32 linkcap;
+
+	if (!pci_is_pcie(dd->pcidev)) {
+		dd_dev_err(dd, "Can't find PCI Express capability!\n");
+		return -EINVAL;
+	}
+
+	/* find if our max speed is Gen3 and parent supports Gen3 speeds */
+	dd->link_gen3_capable = 1;
+
+	pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
+	if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
+		dd_dev_info(dd,
+			"This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
+			linkcap & PCI_EXP_LNKCAP_SLS);
+		dd->link_gen3_capable = 0;
+	}
+
+	/*
+	 * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
+	 */
+	if (dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+		dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
+		dd->link_gen3_capable = 0;
+	}
+
+	/* obtain the link width and current speed */
+	update_lbus_info(dd);
+
+	/* check against expected pcie width and complain if "wrong" */
+	if (dd->lbus_width < 16)
+		dd_dev_err(dd, "PCIe width %u (x16 HFI)\n", dd->lbus_width);
+
+	return 0;
+}
+
+/*
+ * Returns in *nent:
+ *	- actual number of interrupts allocated
+ *	- 0 if fell back to INTx.
+ */
+void request_msix(struct hfi1_devdata *dd, u32 *nent,
+		  struct hfi1_msix_entry *entry)
+{
+	int pos;
+
+	pos = dd->pcidev->msix_cap;
+	if (*nent && pos) {
+		msix_setup(dd, pos, nent, entry);
+		/* did it, either MSI-X or INTx */
+	} else {
+		*nent = 0;
+		hfi1_enable_intx(dd->pcidev);
+	}
+
+	tune_pcie_caps(dd);
+}
+
+/*
+ * Disable MSI-X.
+ */
+void hfi1_nomsix(struct hfi1_devdata *dd)
+{
+	pci_disable_msix(dd->pcidev);
+}
+
+void hfi1_enable_intx(struct pci_dev *pdev)
+{
+	/* first, turn on INTx */
+	pci_intx(pdev, 1);
+	/* then turn off MSI-X */
+	pci_disable_msix(pdev);
+}
+
+/* restore command and BARs after a reset has wiped them out */
+void restore_pci_variables(struct hfi1_devdata *dd)
+{
+	pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
+	pci_write_config_dword(dd->pcidev,
+				PCI_BASE_ADDRESS_0, dd->pcibar0);
+	pci_write_config_dword(dd->pcidev,
+				PCI_BASE_ADDRESS_1, dd->pcibar1);
+	pci_write_config_dword(dd->pcidev,
+				PCI_ROM_ADDRESS, dd->pci_rom);
+	pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
+	pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
+	pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
+							dd->pcie_devctl2);
+	pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
+	pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
+							dd->pci_lnkctl3);
+	pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
+}
+
+
+/*
+ * BIOS may not set PCIe bus-utilization parameters for best performance.
+ * Check and optionally adjust them to maximize our throughput.
+ */
+static int hfi1_pcie_caps;
+module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
+
+static void tune_pcie_caps(struct hfi1_devdata *dd)
+{
+	struct pci_dev *parent;
+	u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
+	u16 rc_mrrs, ep_mrrs, max_mrrs;
+
+	/* Find out supported and configured values for parent (root) */
+	parent = dd->pcidev->bus->self;
+	if (!pci_is_root_bus(parent->bus)) {
+		dd_dev_info(dd, "Parent not root\n");
+		return;
+	}
+
+	if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
+		return;
+	rc_mpss = parent->pcie_mpss;
+	rc_mps = ffs(pcie_get_mps(parent)) - 8;
+	/* Find out supported and configured values for endpoint (us) */
+	ep_mpss = dd->pcidev->pcie_mpss;
+	ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
+
+	/* Find max payload supported by root, endpoint */
+	if (rc_mpss > ep_mpss)
+		rc_mpss = ep_mpss;
+
+	/* If Supported greater than limit in module param, limit it */
+	if (rc_mpss > (hfi1_pcie_caps & 7))
+		rc_mpss = hfi1_pcie_caps & 7;
+	/* If less than (allowed, supported), bump root payload */
+	if (rc_mpss > rc_mps) {
+		rc_mps = rc_mpss;
+		pcie_set_mps(parent, 128 << rc_mps);
+	}
+	/* If less than (allowed, supported), bump endpoint payload */
+	if (rc_mpss > ep_mps) {
+		ep_mps = rc_mpss;
+		pcie_set_mps(dd->pcidev, 128 << ep_mps);
+	}
+
+	/*
+	 * Now the Read Request size.
+	 * No field for max supported, but PCIe spec limits it to 4096,
+	 * which is code '5' (log2(4096) - 7)
+	 */
+	max_mrrs = 5;
+	if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7))
+		max_mrrs = (hfi1_pcie_caps >> 4) & 7;
+
+	max_mrrs = 128 << max_mrrs;
+	rc_mrrs = pcie_get_readrq(parent);
+	ep_mrrs = pcie_get_readrq(dd->pcidev);
+
+	if (max_mrrs > rc_mrrs) {
+		rc_mrrs = max_mrrs;
+		pcie_set_readrq(parent, rc_mrrs);
+	}
+	if (max_mrrs > ep_mrrs) {
+		ep_mrrs = max_mrrs;
+		pcie_set_readrq(dd->pcidev, ep_mrrs);
+	}
+}
+/* End of PCIe capability tuning */
+
+/*
+ * From here through hfi1_pci_err_handler definition is invoked via
+ * PCI error infrastructure, registered via pci
+ */
+static pci_ers_result_t
+pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+	pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+	switch (state) {
+	case pci_channel_io_normal:
+		dd_dev_info(dd, "State Normal, ignoring\n");
+		break;
+
+	case pci_channel_io_frozen:
+		dd_dev_info(dd, "State Frozen, requesting reset\n");
+		pci_disable_device(pdev);
+		ret = PCI_ERS_RESULT_NEED_RESET;
+		break;
+
+	case pci_channel_io_perm_failure:
+		if (dd) {
+			dd_dev_info(dd, "State Permanent Failure, disabling\n");
+			/* no more register accesses! */
+			dd->flags &= ~HFI1_PRESENT;
+			hfi1_disable_after_error(dd);
+		}
+		 /* else early, or other problem */
+		ret =  PCI_ERS_RESULT_DISCONNECT;
+		break;
+
+	default: /* shouldn't happen */
+		dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
+			    state);
+		break;
+	}
+	return ret;
+}
+
+static pci_ers_result_t
+pci_mmio_enabled(struct pci_dev *pdev)
+{
+	u64 words = 0U;
+	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+	pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+	if (dd && dd->pport) {
+		words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
+		if (words == ~0ULL)
+			ret = PCI_ERS_RESULT_NEED_RESET;
+		dd_dev_info(dd,
+			    "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n",
+			    words, ret);
+	}
+	return  ret;
+}
+
+static pci_ers_result_t
+pci_slot_reset(struct pci_dev *pdev)
+{
+	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+	dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
+	return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static pci_ers_result_t
+pci_link_reset(struct pci_dev *pdev)
+{
+	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+	dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
+	return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static void
+pci_resume(struct pci_dev *pdev)
+{
+	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+	dd_dev_info(dd, "HFI1 resume function called\n");
+	pci_cleanup_aer_uncorrect_error_status(pdev);
+	/*
+	 * Running jobs will fail, since it's asynchronous
+	 * unlike sysfs-requested reset.   Better than
+	 * doing nothing.
+	 */
+	hfi1_init(dd, 1); /* same as re-init after reset */
+}
+
+const struct pci_error_handlers hfi1_pci_err_handler = {
+	.error_detected = pci_error_detected,
+	.mmio_enabled = pci_mmio_enabled,
+	.link_reset = pci_link_reset,
+	.slot_reset = pci_slot_reset,
+	.resume = pci_resume,
+};
+
+/*============================================================================*/
+/* PCIe Gen3 support */
+
+/*
+ * This code is separated out because it is expected to be removed in the
+ * final shipping product.  If not, then it will be revisited and items
+ * will be moved to more standard locations.
+ */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
+#define DL_STATUS_HFI0 0x1	/* hfi0 firmware download complete */
+#define DL_STATUS_HFI1 0x2	/* hfi1 firmware download complete */
+#define DL_STATUS_BOTH 0x3	/* hfi0 and hfi1 firmware download complete */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
+#define DL_ERR_NONE		0x0	/* no error */
+#define DL_ERR_SWAP_PARITY	0x1	/* parity error in SerDes interrupt */
+					/*   or response data */
+#define DL_ERR_DISABLED	0x2	/* hfi disabled */
+#define DL_ERR_SECURITY	0x3	/* security check failed */
+#define DL_ERR_SBUS		0x4	/* SBus status error */
+#define DL_ERR_XFR_PARITY	0x5	/* parity error during ROM transfer*/
+
+/* gasket block secondary bus reset delay */
+#define SBR_DELAY_US 200000	/* 200ms */
+
+/* mask for PCIe capability register lnkctl2 target link speed */
+#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
+
+static uint pcie_target = 3;
+module_param(pcie_target, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
+
+static uint pcie_force;
+module_param(pcie_force, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
+
+static uint pcie_retry = 5;
+module_param(pcie_retry, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
+
+#define UNSET_PSET 255
+#define DEFAULT_DISCRETE_PSET 2	/* discrete HFI */
+#define DEFAULT_MCP_PSET 4	/* MCP HFI */
+static uint pcie_pset = UNSET_PSET;
+module_param(pcie_pset, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
+
+/* equalization columns */
+#define PREC 0
+#define ATTN 1
+#define POST 2
+
+/* discrete silicon preliminary equalization values */
+static const u8 discrete_preliminary_eq[11][3] = {
+	/* prec   attn   post */
+	{  0x00,  0x00,  0x12 },	/* p0 */
+	{  0x00,  0x00,  0x0c },	/* p1 */
+	{  0x00,  0x00,  0x0f },	/* p2 */
+	{  0x00,  0x00,  0x09 },	/* p3 */
+	{  0x00,  0x00,  0x00 },	/* p4 */
+	{  0x06,  0x00,  0x00 },	/* p5 */
+	{  0x09,  0x00,  0x00 },	/* p6 */
+	{  0x06,  0x00,  0x0f },	/* p7 */
+	{  0x09,  0x00,  0x09 },	/* p8 */
+	{  0x0c,  0x00,  0x00 },	/* p9 */
+	{  0x00,  0x00,  0x18 },	/* p10 */
+};
+
+/* integrated silicon preliminary equalization values */
+static const u8 integrated_preliminary_eq[11][3] = {
+	/* prec   attn   post */
+	{  0x00,  0x1e,  0x07 },	/* p0 */
+	{  0x00,  0x1e,  0x05 },	/* p1 */
+	{  0x00,  0x1e,  0x06 },	/* p2 */
+	{  0x00,  0x1e,  0x04 },	/* p3 */
+	{  0x00,  0x1e,  0x00 },	/* p4 */
+	{  0x03,  0x1e,  0x00 },	/* p5 */
+	{  0x04,  0x1e,  0x00 },	/* p6 */
+	{  0x03,  0x1e,  0x06 },	/* p7 */
+	{  0x03,  0x1e,  0x04 },	/* p8 */
+	{  0x05,  0x1e,  0x00 },	/* p9 */
+	{  0x00,  0x1e,  0x0a },	/* p10 */
+};
+
+/* helper to format the value to write to hardware */
+#define eq_value(pre, curr, post) \
+	((((u32)(pre)) << \
+			PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \
+	| (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \
+	| (((u32)(post)) << \
+		PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
+
+/*
+ * Load the given EQ preset table into the PCIe hardware.
+ */
+static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
+			 u8 div)
+{
+	struct pci_dev *pdev = dd->pcidev;
+	u32 hit_error = 0;
+	u32 violation;
+	u32 i;
+	u8 c_minus1, c0, c_plus1;
+
+	for (i = 0; i < 11; i++) {
+		/* set index */
+		pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i);
+		/* write the value */
+		c_minus1 = eq[i][PREC] / div;
+		c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div);
+		c_plus1 = eq[i][POST] / div;
+		pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
+			eq_value(c_minus1, c0, c_plus1));
+		/* check if these coefficients violate EQ rules */
+		pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
+								&violation);
+		if (violation
+		    & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){
+			if (hit_error == 0) {
+				dd_dev_err(dd,
+					"Gen3 EQ Table Coefficient rule violations\n");
+				dd_dev_err(dd, "         prec   attn   post\n");
+			}
+			dd_dev_err(dd, "   p%02d:   %02x     %02x     %02x\n",
+				i, (u32)eq[i][0], (u32)eq[i][1], (u32)eq[i][2]);
+			dd_dev_err(dd, "            %02x     %02x     %02x\n",
+				(u32)c_minus1, (u32)c0, (u32)c_plus1);
+			hit_error = 1;
+		}
+	}
+	if (hit_error)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Steps to be done after the PCIe firmware is downloaded and
+ * before the SBR for the Pcie Gen3.
+ * The hardware mutex is already being held.
+ */
+static void pcie_post_steps(struct hfi1_devdata *dd)
+{
+	int i;
+
+	set_sbus_fast_mode(dd);
+	/*
+	 * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
+	 * This avoids a spurious framing error that can otherwise be
+	 * generated by the MAC layer.
+	 *
+	 * Use individual addresses since no broadcast is set up.
+	 */
+	for (i = 0; i < NUM_PCIE_SERDES; i++) {
+		sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i],
+			     0x03, WRITE_SBUS_RECEIVER, 0x00022132);
+	}
+
+	clear_sbus_fast_mode(dd);
+}
+
+/*
+ * Trigger a secondary bus reset (SBR) on ourselves using our parent.
+ *
+ * Based on pci_parent_bus_reset() which is not exported by the
+ * kernel core.
+ */
+static int trigger_sbr(struct hfi1_devdata *dd)
+{
+	struct pci_dev *dev = dd->pcidev;
+	struct pci_dev *pdev;
+
+	/* need a parent */
+	if (!dev->bus->self) {
+		dd_dev_err(dd, "%s: no parent device\n", __func__);
+		return -ENOTTY;
+	}
+
+	/* should not be anyone else on the bus */
+	list_for_each_entry(pdev, &dev->bus->devices, bus_list)
+		if (pdev != dev) {
+			dd_dev_err(dd,
+				"%s: another device is on the same bus\n",
+				__func__);
+			return -ENOTTY;
+		}
+
+	/*
+	 * A secondary bus reset (SBR) issues a hot reset to our device.
+	 * The following routine does a 1s wait after the reset is dropped
+	 * per PCI Trhfa (recovery time).  PCIe 3.0 section 6.6.1 -
+	 * Conventional Reset, paragraph 3, line 35 also says that a 1s
+	 * delay after a reset is required.  Per spec requirements,
+	 * the link is either working or not after that point.
+	 */
+	pci_reset_bridge_secondary_bus(dev->bus->self);
+
+	return 0;
+}
+
+/*
+ * Write the given gasket interrupt register.
+ */
+static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
+				   u16 code, u16 data)
+{
+	write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8),
+	    (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT)
+	    |((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
+}
+
+/*
+ * Tell the gasket logic how to react to the reset.
+ */
+static void arm_gasket_logic(struct hfi1_devdata *dd)
+{
+	u64 reg;
+
+	reg = (((u64)1 << dd->hfi1_id)
+			<< ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT)
+		| ((u64)pcie_serdes_broadcast[dd->hfi1_id]
+			<< ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT
+		| ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK
+		| ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK)
+			<< ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT
+		);
+	write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg);
+	/* read back to push the write */
+	read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
+}
+
+/*
+ * Do all the steps needed to transition the PCIe link to Gen3 speed.
+ */
+int do_pcie_gen3_transition(struct hfi1_devdata *dd)
+{
+	struct pci_dev *parent;
+	u64 fw_ctrl;
+	u64 reg, therm;
+	u32 reg32, fs, lf;
+	u32 status, err;
+	int ret;
+	int do_retry, retry_count = 0;
+	uint default_pset;
+	u16 target_vector, target_speed;
+	u16 lnkctl, lnkctl2, vendor;
+	u8 nsbr = 1;
+	u8 div;
+	const u8 (*eq)[3];
+	int return_error = 0;
+
+	/* PCIe Gen3 is for the ASIC only */
+	if (dd->icode != ICODE_RTL_SILICON)
+		return 0;
+
+	if (pcie_target == 1) {			/* target Gen1 */
+		target_vector = GEN1_SPEED_VECTOR;
+		target_speed = 2500;
+	} else if (pcie_target == 2) {		/* target Gen2 */
+		target_vector = GEN2_SPEED_VECTOR;
+		target_speed = 5000;
+	} else if (pcie_target == 3) {		/* target Gen3 */
+		target_vector = GEN3_SPEED_VECTOR;
+		target_speed = 8000;
+	} else {
+		/* off or invalid target - skip */
+		dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
+		return 0;
+	}
+
+	/* if already at target speed, done (unless forced) */
+	if (dd->lbus_speed == target_speed) {
+		dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
+			pcie_target,
+			pcie_force ? "re-doing anyway" : "skipping");
+		if (!pcie_force)
+			return 0;
+	}
+
+	/*
+	 * A0 needs an additional SBR
+	 */
+	if (is_a0(dd))
+		nsbr++;
+
+	/*
+	 * Do the Gen3 transition.  Steps are those of the PCIe Gen3
+	 * recipe.
+	 */
+
+	/* step 1: pcie link working in gen1/gen2 */
+
+	/* step 2: if either side is not capable of Gen3, done */
+	if (pcie_target == 3 && !dd->link_gen3_capable) {
+		dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
+		ret = -ENOSYS;
+		goto done_no_mutex;
+	}
+
+	/* hold the HW mutex across the firmware download and SBR */
+	ret = acquire_hw_mutex(dd);
+	if (ret)
+		return ret;
+
+	/* make sure thermal polling is not causing interrupts */
+	therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
+	if (therm) {
+		write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+		msleep(100);
+		dd_dev_info(dd, "%s: Disabled therm polling\n",
+			    __func__);
+	}
+
+	/* step 3: download SBus Master firmware */
+	/* step 4: download PCIe Gen3 SerDes firmware */
+retry:
+	dd_dev_info(dd, "%s: downloading firmware\n", __func__);
+	ret = load_pcie_firmware(dd);
+	if (ret)
+		goto done;
+
+	/* step 5: set up device parameter settings */
+	dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
+
+	/*
+	 * PcieCfgSpcie1 - Link Control 3
+	 * Leave at reset value.  No need to set PerfEq - link equalization
+	 * will be performed automatically after the SBR when the target
+	 * speed is 8GT/s.
+	 */
+
+	/* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
+	pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
+
+	/* step 5a: Set Synopsys Port Logic registers */
+
+	/*
+	 * PcieCfgRegPl2 - Port Force Link
+	 *
+	 * Set the low power field to 0x10 to avoid unnecessary power
+	 * management messages.  All other fields are zero.
+	 */
+	reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
+	pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
+
+	/*
+	 * PcieCfgRegPl100 - Gen3 Control
+	 *
+	 * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
+	 * turn on PcieCfgRegPl100.EqEieosCnt (erratum)
+	 * Everything else zero.
+	 */
+	reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
+	pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
+
+	/*
+	 * PcieCfgRegPl101 - Gen3 EQ FS and LF
+	 * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
+	 * PcieCfgRegPl103 - Gen3 EQ Preset Index
+	 * PcieCfgRegPl105 - Gen3 EQ Status
+	 *
+	 * Give initial EQ settings.
+	 */
+	if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
+		/* 1000mV, FS=24, LF = 8 */
+		fs = 24;
+		lf = 8;
+		div = 3;
+		eq = discrete_preliminary_eq;
+		default_pset = DEFAULT_DISCRETE_PSET;
+	} else {
+		/* 400mV, FS=29, LF = 9 */
+		fs = 29;
+		lf = 9;
+		div = 1;
+		eq = integrated_preliminary_eq;
+		default_pset = DEFAULT_MCP_PSET;
+	}
+	pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
+		(fs << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT)
+		| (lf << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
+	ret = load_eq_table(dd, eq, fs, div);
+	if (ret)
+		goto done;
+
+	/*
+	 * PcieCfgRegPl106 - Gen3 EQ Control
+	 *
+	 * Set Gen3EqPsetReqVec, leave other fields 0.
+	 */
+	if (pcie_pset == UNSET_PSET)
+		pcie_pset = default_pset;
+	if (pcie_pset > 10) {	/* valid range is 0-10, inclusive */
+		dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
+			__func__, pcie_pset, default_pset);
+		pcie_pset = default_pset;
+	}
+	dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
+	pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
+		((1 << pcie_pset)
+			<< PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT)
+		| PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK
+		| PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
+
+	/*
+	 * step 5b: Do post firmware download steps via SBus
+	 */
+	dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
+	pcie_post_steps(dd);
+
+	/*
+	 * step 5c: Program gasket interrupts
+	 */
+	/* set the Rx Bit Rate to REFCLK ratio */
+	write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
+	/* disable pCal for PCIe Gen3 RX equalization */
+	write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
+	/*
+	 * Enable iCal for PCIe Gen3 RX equalization, and set which
+	 * evaluation of RX_EQ_EVAL will launch the iCal procedure.
+	 */
+	write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
+	/* terminate list */
+	write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
+
+	/*
+	 * step 5d: program XMT margin
+	 * Right now, leave the default alone.  To change, do a
+	 * read-modify-write of:
+	 *	CcePcieCtrl.XmtMargin
+	 *	CcePcieCtrl.XmitMarginOverwriteEnable
+	 */
+
+	/* step 5e: disable active state power management (ASPM) */
+	dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
+	pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &lnkctl);
+	lnkctl &= ~PCI_EXP_LNKCTL_ASPMC;
+	pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, lnkctl);
+
+	/*
+	 * step 5f: clear DirectSpeedChange
+	 * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
+	 * change in the speed target from starting before we are ready.
+	 * This field defaults to 0 and we are not changing it, so nothing
+	 * needs to be done.
+	 */
+
+	/* step 5g: Set target link speed */
+	/*
+	 * Set target link speed to be target on both device and parent.
+	 * On setting the parent: Some system BIOSs "helpfully" set the
+	 * parent target speed to Gen2 to match the ASIC's initial speed.
+	 * We can set the target Gen3 because we have already checked
+	 * that it is Gen3 capable earlier.
+	 */
+	dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
+	parent = dd->pcidev->bus->self;
+	pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
+	dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+		(u32)lnkctl2);
+	/* only write to parent if target is not as high as ours */
+	if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
+		lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+		lnkctl2 |= target_vector;
+		dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+			(u32)lnkctl2);
+		pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
+	} else {
+		dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
+	}
+
+	dd_dev_info(dd, "%s: setting target link speed\n", __func__);
+	pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
+	dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+		(u32)lnkctl2);
+	lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+	lnkctl2 |= target_vector;
+	dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+		(u32)lnkctl2);
+	pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
+
+	/* step 5h: arm gasket logic */
+	/* hold DC in reset across the SBR */
+	write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+	(void) read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
+	/* save firmware control across the SBR */
+	fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
+
+	dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
+	arm_gasket_logic(dd);
+
+	/*
+	 * step 6: quiesce PCIe link
+	 * The chip has already been reset, so there will be no traffic
+	 * from the chip.  Linux has no easy way to enforce that it will
+	 * not try to access the device, so we just need to hope it doesn't
+	 * do it while we are doing the reset.
+	 */
+
+	/*
+	 * step 7: initiate the secondary bus reset (SBR)
+	 * step 8: hardware brings the links back up
+	 * step 9: wait for link speed transition to be complete
+	 */
+	dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
+	ret = trigger_sbr(dd);
+	if (ret)
+		goto done;
+
+	/* step 10: decide what to do next */
+
+	/* check if we can read PCI space */
+	ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
+	if (ret) {
+		dd_dev_info(dd,
+			"%s: read of VendorID failed after SBR, err %d\n",
+			__func__, ret);
+		return_error = 1;
+		goto done;
+	}
+	if (vendor == 0xffff) {
+		dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
+		return_error = 1;
+		ret = -EIO;
+		goto done;
+	}
+
+	/* restore PCI space registers we know were reset */
+	dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
+	restore_pci_variables(dd);
+	/* restore firmware control */
+	write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
+
+	/*
+	 * Check the gasket block status.
+	 *
+	 * This is the first CSR read after the SBR.  If the read returns
+	 * all 1s (fails), the link did not make it back.
+	 *
+	 * Once we're sure we can read and write, clear the DC reset after
+	 * the SBR.  Then check for any per-lane errors. Then look over
+	 * the status.
+	 */
+	reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
+	dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
+	if (reg == ~0ull) {	/* PCIe read failed/timeout */
+		dd_dev_err(dd, "SBR failed - unable to read from device\n");
+		return_error = 1;
+		ret = -ENOSYS;
+		goto done;
+	}
+
+	/* clear the DC reset */
+	write_csr(dd, CCE_DC_CTRL, 0);
+	/* Set the LED off */
+	if (is_a0(dd))
+		setextled(dd, 0);
+
+	/* check for any per-lane errors */
+	pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
+	dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
+
+	/* extract status, look for our HFI */
+	status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
+			& ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
+	if ((status & (1 << dd->hfi1_id)) == 0) {
+		dd_dev_err(dd,
+			"%s: gasket status 0x%x, expecting 0x%x\n",
+			__func__, status, 1 << dd->hfi1_id);
+		ret = -EIO;
+		goto done;
+	}
+
+	/* extract error */
+	err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
+		& ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
+	if (err) {
+		dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
+		ret = -EIO;
+		goto done;
+	}
+
+	/* update our link information cache */
+	update_lbus_info(dd);
+	dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
+		dd->lbus_info);
+
+	if (dd->lbus_speed != target_speed) { /* not target */
+		/* maybe retry */
+		do_retry = retry_count < pcie_retry;
+		dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
+			pcie_target, do_retry ? ", retrying" : "");
+		retry_count++;
+		if (do_retry) {
+			msleep(100); /* allow time to settle */
+			goto retry;
+		}
+		ret = -EIO;
+	}
+
+done:
+	if (therm) {
+		write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+		msleep(100);
+		dd_dev_info(dd, "%s: Re-enable therm polling\n",
+			    __func__);
+	}
+	release_hw_mutex(dd);
+done_no_mutex:
+	/* return no error if it is OK to be at current speed */
+	if (ret && !return_error) {
+		dd_dev_err(dd, "Proceeding at current speed PCIe speed\n");
+		ret = 0;
+	}
+
+	dd_dev_info(dd, "%s: done\n", __func__);
+	return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c
new file mode 100644
index 0000000..9991814
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio.c
@@ -0,0 +1,1771 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+
+#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
+
+#define SC(name) SEND_CTXT_##name
+/*
+ * Send Context functions
+ */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
+
+/*
+ * Set the CM reset bit and wait for it to clear.  Use the provided
+ * sendctrl register.  This routine has no locking.
+ */
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
+{
+	write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
+	while (1) {
+		udelay(1);
+		sendctrl = read_csr(dd, SEND_CTRL);
+		if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
+			break;
+	}
+}
+
+/* defined in header release 48 and higher */
+#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+		<< SEND_CTRL_UNSUPPORTED_VL_SHIFT)
+#endif
+
+/* global control of PIO send */
+void pio_send_control(struct hfi1_devdata *dd, int op)
+{
+	u64 reg, mask;
+	unsigned long flags;
+	int write = 1;	/* write sendctrl back */
+	int flush = 0;	/* re-read sendctrl to make sure it is flushed */
+
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+	reg = read_csr(dd, SEND_CTRL);
+	switch (op) {
+	case PSC_GLOBAL_ENABLE:
+		reg |= SEND_CTRL_SEND_ENABLE_SMASK;
+	/* Fall through */
+	case PSC_DATA_VL_ENABLE:
+		/* Disallow sending on VLs not enabled */
+		mask = (((~0ull)<<num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK)<<
+				SEND_CTRL_UNSUPPORTED_VL_SHIFT;
+		reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
+		break;
+	case PSC_GLOBAL_DISABLE:
+		reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
+		break;
+	case PSC_GLOBAL_VLARB_ENABLE:
+		reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+		break;
+	case PSC_GLOBAL_VLARB_DISABLE:
+		reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+		break;
+	case PSC_CM_RESET:
+		__cm_reset(dd, reg);
+		write = 0; /* CSR already written (and flushed) */
+		break;
+	case PSC_DATA_VL_DISABLE:
+		reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
+		flush = 1;
+		break;
+	default:
+		dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
+		break;
+	}
+
+	if (write) {
+		write_csr(dd, SEND_CTRL, reg);
+		if (flush)
+			(void) read_csr(dd, SEND_CTRL); /* flush write */
+	}
+
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/* number of send context memory pools */
+#define NUM_SC_POOLS 2
+
+/* Send Context Size (SCS) wildcards */
+#define SCS_POOL_0 -1
+#define SCS_POOL_1 -2
+/* Send Context Count (SCC) wildcards */
+#define SCC_PER_VL -1
+#define SCC_PER_CPU  -2
+
+#define SCC_PER_KRCVQ  -3
+#define SCC_ACK_CREDITS  32
+
+#define PIO_WAIT_BATCH_SIZE 5
+
+/* default send context sizes */
+static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
+	[SC_KERNEL] = { .size  = SCS_POOL_0,	/* even divide, pool 0 */
+			.count = SCC_PER_VL },/* one per NUMA */
+	[SC_ACK]    = { .size  = SCC_ACK_CREDITS,
+			.count = SCC_PER_KRCVQ },
+	[SC_USER]   = { .size  = SCS_POOL_0,	/* even divide, pool 0 */
+			.count = SCC_PER_CPU },	/* one per CPU */
+
+};
+
+/* send context memory pool configuration */
+struct mem_pool_config {
+	int centipercent;	/* % of memory, in 100ths of 1% */
+	int absolute_blocks;	/* absolute block count */
+};
+
+/* default memory pool configuration: 100% in pool 0 */
+static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
+	/* centi%, abs blocks */
+	{  10000,     -1 },		/* pool 0 */
+	{      0,     -1 },		/* pool 1 */
+};
+
+/* memory pool information, used when calculating final sizes */
+struct mem_pool_info {
+	int centipercent;	/* 100th of 1% of memory to use, -1 if blocks
+				   already set */
+	int count;		/* count of contexts in the pool */
+	int blocks;		/* block size of the pool */
+	int size;		/* context size, in blocks */
+};
+
+/*
+ * Convert a pool wildcard to a valid pool index.  The wildcards
+ * start at -1 and increase negatively.  Map them as:
+ *	-1 => 0
+ *	-2 => 1
+ *	etc.
+ *
+ * Return -1 on non-wildcard input, otherwise convert to a pool number.
+ */
+static int wildcard_to_pool(int wc)
+{
+	if (wc >= 0)
+		return -1;	/* non-wildcard */
+	return -wc - 1;
+}
+
+static const char *sc_type_names[SC_MAX] = {
+	"kernel",
+	"ack",
+	"user"
+};
+
+static const char *sc_type_name(int index)
+{
+	if (index < 0 || index >= SC_MAX)
+		return "unknown";
+	return sc_type_names[index];
+}
+
+/*
+ * Read the send context memory pool configuration and send context
+ * size configuration.  Replace any wildcards and come up with final
+ * counts and sizes for the send context types.
+ */
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
+{
+	struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
+	int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
+	int total_contexts = 0;
+	int fixed_blocks;
+	int pool_blocks;
+	int used_blocks;
+	int cp_total;		/* centipercent total */
+	int ab_total;		/* absolute block total */
+	int extra;
+	int i;
+
+	/*
+	 * Step 0:
+	 *	- copy the centipercents/absolute sizes from the pool config
+	 *	- sanity check these values
+	 *	- add up centipercents, then later check for full value
+	 *	- add up absolute blocks, then later check for over-commit
+	 */
+	cp_total = 0;
+	ab_total = 0;
+	for (i = 0; i < NUM_SC_POOLS; i++) {
+		int cp = sc_mem_pool_config[i].centipercent;
+		int ab = sc_mem_pool_config[i].absolute_blocks;
+
+		/*
+		 * A negative value is "unused" or "invalid".  Both *can*
+		 * be valid, but centipercent wins, so check that first
+		 */
+		if (cp >= 0) {			/* centipercent valid */
+			cp_total += cp;
+		} else if (ab >= 0) {		/* absolute blocks valid */
+			ab_total += ab;
+		} else {			/* neither valid */
+			dd_dev_err(
+				dd,
+				"Send context memory pool %d: both the block count and centipercent are invalid\n",
+				i);
+			return -EINVAL;
+		}
+
+		mem_pool_info[i].centipercent = cp;
+		mem_pool_info[i].blocks = ab;
+	}
+
+	/* do not use both % and absolute blocks for different pools */
+	if (cp_total != 0 && ab_total != 0) {
+		dd_dev_err(
+			dd,
+			"All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
+		return -EINVAL;
+	}
+
+	/* if any percentages are present, they must add up to 100% x 100 */
+	if (cp_total != 0 && cp_total != 10000) {
+		dd_dev_err(
+			dd,
+			"Send context memory pool centipercent is %d, expecting 10000\n",
+			cp_total);
+		return -EINVAL;
+	}
+
+	/* the absolute pool total cannot be more than the mem total */
+	if (ab_total > total_blocks) {
+		dd_dev_err(
+			dd,
+			"Send context memory pool absolute block count %d is larger than the memory size %d\n",
+			ab_total, total_blocks);
+		return -EINVAL;
+	}
+
+	/*
+	 * Step 2:
+	 *	- copy from the context size config
+	 *	- replace context type wildcard counts with real values
+	 *	- add up non-memory pool block sizes
+	 *	- add up memory pool user counts
+	 */
+	fixed_blocks = 0;
+	for (i = 0; i < SC_MAX; i++) {
+		int count = sc_config_sizes[i].count;
+		int size = sc_config_sizes[i].size;
+		int pool;
+
+		/*
+		 * Sanity check count: Either a positive value or
+		 * one of the expected wildcards is valid.  The positive
+		 * value is checked later when we compare against total
+		 * memory available.
+		 */
+		if (i == SC_ACK) {
+			count = dd->n_krcv_queues;
+		} else if (i == SC_KERNEL) {
+			count = num_vls + 1 /* VL15 */;
+		} else if (count == SCC_PER_CPU) {
+			count = dd->num_rcv_contexts - dd->n_krcv_queues;
+		} else if (count < 0) {
+			dd_dev_err(
+				dd,
+				"%s send context invalid count wildcard %d\n",
+				sc_type_name(i), count);
+			return -EINVAL;
+		}
+		if (total_contexts + count > dd->chip_send_contexts)
+			count = dd->chip_send_contexts - total_contexts;
+
+		total_contexts += count;
+
+		/*
+		 * Sanity check pool: The conversion will return a pool
+		 * number or -1 if a fixed (non-negative) value.  The fixed
+		 * value is checked later when we compare against
+		 * total memory available.
+		 */
+		pool = wildcard_to_pool(size);
+		if (pool == -1) {			/* non-wildcard */
+			fixed_blocks += size * count;
+		} else if (pool < NUM_SC_POOLS) {	/* valid wildcard */
+			mem_pool_info[pool].count += count;
+		} else {				/* invalid wildcard */
+			dd_dev_err(
+				dd,
+				"%s send context invalid pool wildcard %d\n",
+				sc_type_name(i), size);
+			return -EINVAL;
+		}
+
+		dd->sc_sizes[i].count = count;
+		dd->sc_sizes[i].size = size;
+	}
+	if (fixed_blocks > total_blocks) {
+		dd_dev_err(
+			dd,
+			"Send context fixed block count, %u, larger than total block count %u\n",
+			fixed_blocks, total_blocks);
+		return -EINVAL;
+	}
+
+	/* step 3: calculate the blocks in the pools, and pool context sizes */
+	pool_blocks = total_blocks - fixed_blocks;
+	if (ab_total > pool_blocks) {
+		dd_dev_err(
+			dd,
+			"Send context fixed pool sizes, %u, larger than pool block count %u\n",
+			ab_total, pool_blocks);
+		return -EINVAL;
+	}
+	/* subtract off the fixed pool blocks */
+	pool_blocks -= ab_total;
+
+	for (i = 0; i < NUM_SC_POOLS; i++) {
+		struct mem_pool_info *pi = &mem_pool_info[i];
+
+		/* % beats absolute blocks */
+		if (pi->centipercent >= 0)
+			pi->blocks = (pool_blocks * pi->centipercent) / 10000;
+
+		if (pi->blocks == 0 && pi->count != 0) {
+			dd_dev_err(
+				dd,
+				"Send context memory pool %d has %u contexts, but no blocks\n",
+				i, pi->count);
+			return -EINVAL;
+		}
+		if (pi->count == 0) {
+			/* warn about wasted blocks */
+			if (pi->blocks != 0)
+				dd_dev_err(
+					dd,
+					"Send context memory pool %d has %u blocks, but zero contexts\n",
+					i, pi->blocks);
+			pi->size = 0;
+		} else {
+			pi->size = pi->blocks / pi->count;
+		}
+	}
+
+	/* step 4: fill in the context type sizes from the pool sizes */
+	used_blocks = 0;
+	for (i = 0; i < SC_MAX; i++) {
+		if (dd->sc_sizes[i].size < 0) {
+			unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
+
+			WARN_ON_ONCE(pool >= NUM_SC_POOLS);
+			dd->sc_sizes[i].size = mem_pool_info[pool].size;
+		}
+		/* make sure we are not larger than what is allowed by the HW */
+#define PIO_MAX_BLOCKS 1024
+		if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
+			dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
+
+		/* calculate our total usage */
+		used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
+	}
+	extra = total_blocks - used_blocks;
+	if (extra != 0)
+		dd_dev_info(dd, "unused send context blocks: %d\n", extra);
+
+	return total_contexts;
+}
+
+int init_send_contexts(struct hfi1_devdata *dd)
+{
+	u16 base;
+	int ret, i, j, context;
+
+	ret = init_credit_return(dd);
+	if (ret)
+		return ret;
+
+	dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
+					GFP_KERNEL);
+	dd->send_contexts = kcalloc(dd->num_send_contexts,
+					sizeof(struct send_context_info),
+					GFP_KERNEL);
+	if (!dd->send_contexts || !dd->hw_to_sw) {
+		dd_dev_err(dd, "Unable to allocate send context arrays\n");
+		kfree(dd->hw_to_sw);
+		kfree(dd->send_contexts);
+		free_credit_return(dd);
+		return -ENOMEM;
+	}
+
+	/* hardware context map starts with invalid send context indices */
+	for (i = 0; i < TXE_NUM_CONTEXTS; i++)
+		dd->hw_to_sw[i] = INVALID_SCI;
+
+	/*
+	 * All send contexts have their credit sizes.  Allocate credits
+	 * for each context one after another from the global space.
+	 */
+	context = 0;
+	base = 1;
+	for (i = 0; i < SC_MAX; i++) {
+		struct sc_config_sizes *scs = &dd->sc_sizes[i];
+
+		for (j = 0; j < scs->count; j++) {
+			struct send_context_info *sci =
+						&dd->send_contexts[context];
+			sci->type = i;
+			sci->base = base;
+			sci->credits = scs->size;
+
+			context++;
+			base += scs->size;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a software index and hardware context of the given type.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
+		       u32 *hw_context)
+{
+	struct send_context_info *sci;
+	u32 index;
+	u32 context;
+
+	for (index = 0, sci = &dd->send_contexts[0];
+			index < dd->num_send_contexts; index++, sci++) {
+		if (sci->type == type && sci->allocated == 0) {
+			sci->allocated = 1;
+			/* use a 1:1 mapping, but make them non-equal */
+			context = dd->chip_send_contexts - index - 1;
+			dd->hw_to_sw[context] = index;
+			*sw_index = index;
+			*hw_context = context;
+			return 0; /* success */
+		}
+	}
+	dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
+	return -ENOSPC;
+}
+
+/*
+ * Free the send context given by its software index.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
+{
+	struct send_context_info *sci;
+
+	sci = &dd->send_contexts[sw_index];
+	if (!sci->allocated) {
+		dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
+			__func__, sw_index, hw_context);
+	}
+	sci->allocated = 0;
+	dd->hw_to_sw[hw_context] = INVALID_SCI;
+}
+
+/* return the base context of a context in a group */
+static inline u32 group_context(u32 context, u32 group)
+{
+	return (context >> group) << group;
+}
+
+/* return the size of a group */
+static inline u32 group_size(u32 group)
+{
+	return 1 << group;
+}
+
+/*
+ * Obtain the credit return addresses, kernel virtual and physical, for the
+ * given sc.
+ *
+ * To understand this routine:
+ * o va and pa are arrays of struct credit_return.  One for each physical
+ *   send context, per NUMA.
+ * o Each send context always looks in its relative location in a struct
+ *   credit_return for its credit return.
+ * o Each send context in a group must have its return address CSR programmed
+ *   with the same value.  Use the address of the first send context in the
+ *   group.
+ */
+static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
+{
+	u32 gc = group_context(sc->hw_context, sc->group);
+	u32 index = sc->hw_context & 0x7;
+
+	sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
+	*pa = (unsigned long)
+	       &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
+}
+
+/*
+ * Work queue function triggered in error interrupt routine for
+ * kernel contexts.
+ */
+static void sc_halted(struct work_struct *work)
+{
+	struct send_context *sc;
+
+	sc = container_of(work, struct send_context, halt_work);
+	sc_restart(sc);
+}
+
+/*
+ * Calculate PIO block threshold for this send context using the given MTU.
+ * Trigger a return when one MTU plus optional header of credits remain.
+ *
+ * Parameter mtu is in bytes.
+ * Parameter hdrqentsize is in DWORDs.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
+{
+	u32 release_credits;
+	u32 threshold;
+
+	/* add in the header size, then divide by the PIO block size */
+	mtu += hdrqentsize << 2;
+	release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
+
+	/* check against this context's credits */
+	if (sc->credits <= release_credits)
+		threshold = 1;
+	else
+		threshold = sc->credits - release_credits;
+
+	return threshold;
+}
+
+/*
+ * Calculate credit threshold in terms of percent of the allocated credits.
+ * Trigger when unreturned credits equal or exceed the percentage of the whole.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+{
+	return (sc->credits * percent) / 100;
+}
+
+/*
+ * Set the credit return threshold.
+ */
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
+{
+	unsigned long flags;
+	u32 old_threshold;
+	int force_return = 0;
+
+	spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+
+	old_threshold = (sc->credit_ctrl >>
+				SC(CREDIT_CTRL_THRESHOLD_SHIFT))
+			 & SC(CREDIT_CTRL_THRESHOLD_MASK);
+
+	if (new_threshold != old_threshold) {
+		sc->credit_ctrl =
+			(sc->credit_ctrl
+				& ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
+			| ((new_threshold
+				& SC(CREDIT_CTRL_THRESHOLD_MASK))
+			   << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
+		write_kctxt_csr(sc->dd, sc->hw_context,
+			SC(CREDIT_CTRL), sc->credit_ctrl);
+
+		/* force a credit return on change to avoid a possible stall */
+		force_return = 1;
+	}
+
+	spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+
+	if (force_return)
+		sc_return_credits(sc);
+}
+
+/*
+ * set_pio_integrity
+ *
+ * Set the CHECK_ENABLE register for the send context 'sc'.
+ */
+void set_pio_integrity(struct send_context *sc)
+{
+	struct hfi1_devdata *dd = sc->dd;
+	u64 reg = 0;
+	u32 hw_context = sc->hw_context;
+	int type = sc->type;
+
+	/*
+	 * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
+	 * we're snooping.
+	 */
+	if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
+	    dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
+		reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
+
+	write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
+}
+
+/*
+ * Allocate a NUMA relative send context structure of the given type along
+ * with a HW context.
+ */
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+			      uint hdrqentsize, int numa)
+{
+	struct send_context_info *sci;
+	struct send_context *sc;
+	dma_addr_t pa;
+	unsigned long flags;
+	u64 reg;
+	u32 thresh;
+	u32 sw_index;
+	u32 hw_context;
+	int ret;
+	u8 opval, opmask;
+
+	/* do not allocate while frozen */
+	if (dd->flags & HFI1_FROZEN)
+		return NULL;
+
+	sc = kzalloc_node(sizeof(struct send_context), GFP_KERNEL, numa);
+	if (!sc) {
+		dd_dev_err(dd, "Cannot allocate send context structure\n");
+		return NULL;
+	}
+
+	spin_lock_irqsave(&dd->sc_lock, flags);
+	ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
+	if (ret) {
+		spin_unlock_irqrestore(&dd->sc_lock, flags);
+		kfree(sc);
+		return NULL;
+	}
+
+	sci = &dd->send_contexts[sw_index];
+	sci->sc = sc;
+
+	sc->dd = dd;
+	sc->node = numa;
+	sc->type = type;
+	spin_lock_init(&sc->alloc_lock);
+	spin_lock_init(&sc->release_lock);
+	spin_lock_init(&sc->credit_ctrl_lock);
+	INIT_LIST_HEAD(&sc->piowait);
+	INIT_WORK(&sc->halt_work, sc_halted);
+	atomic_set(&sc->buffers_allocated, 0);
+	init_waitqueue_head(&sc->halt_wait);
+
+	/* grouping is always single context for now */
+	sc->group = 0;
+
+	sc->sw_index = sw_index;
+	sc->hw_context = hw_context;
+	cr_group_addresses(sc, &pa);
+	sc->credits = sci->credits;
+
+/* PIO Send Memory Address details */
+#define PIO_ADDR_CONTEXT_MASK 0xfful
+#define PIO_ADDR_CONTEXT_SHIFT 16
+	sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
+					<< PIO_ADDR_CONTEXT_SHIFT);
+
+	/* set base and credits */
+	reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
+					<< SC(CTRL_CTXT_DEPTH_SHIFT))
+		| ((sci->base & SC(CTRL_CTXT_BASE_MASK))
+					<< SC(CTRL_CTXT_BASE_SHIFT));
+	write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
+
+	set_pio_integrity(sc);
+
+	/* unmask all errors */
+	write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
+
+	/* set the default partition key */
+	write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
+		(DEFAULT_PKEY &
+			SC(CHECK_PARTITION_KEY_VALUE_MASK))
+		    << SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
+
+	/* per context type checks */
+	if (type == SC_USER) {
+		opval = USER_OPCODE_CHECK_VAL;
+		opmask = USER_OPCODE_CHECK_MASK;
+	} else {
+		opval = OPCODE_CHECK_VAL_DISABLED;
+		opmask = OPCODE_CHECK_MASK_DISABLED;
+	}
+
+	/* set the send context check opcode mask and value */
+	write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
+		((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
+		((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
+
+	/* set up credit return */
+	reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
+	write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
+
+	/*
+	 * Calculate the initial credit return threshold.
+	 *
+	 * For Ack contexts, set a threshold for half the credits.
+	 * For User contexts use the given percentage.  This has been
+	 * sanitized on driver start-up.
+	 * For Kernel contexts, use the default MTU plus a header.
+	 */
+	if (type == SC_ACK) {
+		thresh = sc_percent_to_threshold(sc, 50);
+	} else if (type == SC_USER) {
+		thresh = sc_percent_to_threshold(sc,
+				user_credit_return_threshold);
+	} else { /* kernel */
+		thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
+	}
+	reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
+	/* add in early return */
+	if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
+		reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+	else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
+		reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+
+	/* set up write-through credit_ctrl */
+	sc->credit_ctrl = reg;
+	write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
+
+	/* User send contexts should not allow sending on VL15 */
+	if (type == SC_USER) {
+		reg = 1ULL << 15;
+		write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
+	}
+
+	spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+	/*
+	 * Allocate shadow ring to track outstanding PIO buffers _after_
+	 * unlocking.  We don't know the size until the lock is held and
+	 * we can't allocate while the lock is held.  No one is using
+	 * the context yet, so allocate it now.
+	 *
+	 * User contexts do not get a shadow ring.
+	 */
+	if (type != SC_USER) {
+		/*
+		 * Size the shadow ring 1 larger than the number of credits
+		 * so head == tail can mean empty.
+		 */
+		sc->sr_size = sci->credits + 1;
+		sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
+				sc->sr_size, GFP_KERNEL, numa);
+		if (!sc->sr) {
+			dd_dev_err(dd,
+				"Cannot allocate send context shadow ring structure\n");
+			sc_free(sc);
+			return NULL;
+		}
+	}
+
+	dd_dev_info(dd,
+		"Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
+		sw_index,
+		hw_context,
+		sc_type_name(type),
+		sc->group,
+		sc->credits,
+		sc->credit_ctrl,
+		thresh);
+
+	return sc;
+}
+
+/* free a per-NUMA send context structure */
+void sc_free(struct send_context *sc)
+{
+	struct hfi1_devdata *dd;
+	unsigned long flags;
+	u32 sw_index;
+	u32 hw_context;
+
+	if (!sc)
+		return;
+
+	sc->flags |= SCF_IN_FREE;	/* ensure no restarts */
+	dd = sc->dd;
+	if (!list_empty(&sc->piowait))
+		dd_dev_err(dd, "piowait list not empty!\n");
+	sw_index = sc->sw_index;
+	hw_context = sc->hw_context;
+	sc_disable(sc);	/* make sure the HW is disabled */
+	flush_work(&sc->halt_work);
+
+	spin_lock_irqsave(&dd->sc_lock, flags);
+	dd->send_contexts[sw_index].sc = NULL;
+
+	/* clear/disable all registers set in sc_alloc */
+	write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
+	write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
+	write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
+	write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
+	write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
+	write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
+	write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
+
+	/* release the index and context for re-use */
+	sc_hw_free(dd, sw_index, hw_context);
+	spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+	kfree(sc->sr);
+	kfree(sc);
+}
+
+/* disable the context */
+void sc_disable(struct send_context *sc)
+{
+	u64 reg;
+	unsigned long flags;
+	struct pio_buf *pbuf;
+
+	if (!sc)
+		return;
+
+	/* do all steps, even if already disabled */
+	spin_lock_irqsave(&sc->alloc_lock, flags);
+	reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
+	reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
+	sc->flags &= ~SCF_ENABLED;
+	sc_wait_for_packet_egress(sc, 1);
+	write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
+	spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+	/*
+	 * Flush any waiters.  Once the context is disabled,
+	 * credit return interrupts are stopped (although there
+	 * could be one in-process when the context is disabled).
+	 * Wait one microsecond for any lingering interrupts, then
+	 * proceed with the flush.
+	 */
+	udelay(1);
+	spin_lock_irqsave(&sc->release_lock, flags);
+	if (sc->sr) {	/* this context has a shadow ring */
+		while (sc->sr_tail != sc->sr_head) {
+			pbuf = &sc->sr[sc->sr_tail].pbuf;
+			if (pbuf->cb)
+				(*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
+			sc->sr_tail++;
+			if (sc->sr_tail >= sc->sr_size)
+				sc->sr_tail = 0;
+		}
+	}
+	spin_unlock_irqrestore(&sc->release_lock, flags);
+}
+
+/* return SendEgressCtxtStatus.PacketOccupancy */
+#define packet_occupancy(r) \
+	(((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
+	>> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
+
+/* is egress halted on the context? */
+#define egress_halted(r) \
+	((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
+
+/* wait for packet egress, optionally pause for credit return  */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
+{
+	struct hfi1_devdata *dd = sc->dd;
+	u64 reg;
+	u32 loop = 0;
+
+	while (1) {
+		reg = read_csr(dd, sc->hw_context * 8 +
+			       SEND_EGRESS_CTXT_STATUS);
+		/* done if egress is stopped */
+		if (egress_halted(reg))
+			break;
+		reg = packet_occupancy(reg);
+		if (reg == 0)
+			break;
+		if (loop > 100) {
+			dd_dev_err(dd,
+				"%s: context %u(%u) timeout waiting for packets to egress, remaining count %u\n",
+				__func__, sc->sw_index,
+				sc->hw_context, (u32)reg);
+			break;
+		}
+		loop++;
+		udelay(1);
+	}
+
+	if (pause)
+		/* Add additional delay to ensure chip returns all credits */
+		pause_for_credit_return(dd);
+}
+
+void sc_wait(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < dd->num_send_contexts; i++) {
+		struct send_context *sc = dd->send_contexts[i].sc;
+
+		if (!sc)
+			continue;
+		sc_wait_for_packet_egress(sc, 0);
+	}
+}
+
+/*
+ * Restart a context after it has been halted due to error.
+ *
+ * If the first step fails - wait for the halt to be asserted, return early.
+ * Otherwise complain about timeouts but keep going.
+ *
+ * It is expected that allocations (enabled flag bit) have been shut off
+ * already (only applies to kernel contexts).
+ */
+int sc_restart(struct send_context *sc)
+{
+	struct hfi1_devdata *dd = sc->dd;
+	u64 reg;
+	u32 loop;
+	int count;
+
+	/* bounce off if not halted, or being free'd */
+	if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
+		return -EINVAL;
+
+	dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
+		sc->hw_context);
+
+	/*
+	 * Step 1: Wait for the context to actually halt.
+	 *
+	 * The error interrupt is asynchronous to actually setting halt
+	 * on the context.
+	 */
+	loop = 0;
+	while (1) {
+		reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
+		if (reg & SC(STATUS_CTXT_HALTED_SMASK))
+			break;
+		if (loop > 100) {
+			dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
+				__func__, sc->sw_index, sc->hw_context);
+			return -ETIME;
+		}
+		loop++;
+		udelay(1);
+	}
+
+	/*
+	 * Step 2: Ensure no users are still trying to write to PIO.
+	 *
+	 * For kernel contexts, we have already turned off buffer allocation.
+	 * Now wait for the buffer count to go to zero.
+	 *
+	 * For user contexts, the user handling code has cut off write access
+	 * to the context's PIO pages before calling this routine and will
+	 * restore write access after this routine returns.
+	 */
+	if (sc->type != SC_USER) {
+		/* kernel context */
+		loop = 0;
+		while (1) {
+			count = atomic_read(&sc->buffers_allocated);
+			if (count == 0)
+				break;
+			if (loop > 100) {
+				dd_dev_err(dd,
+					"%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
+					__func__, sc->sw_index,
+					sc->hw_context, count);
+			}
+			loop++;
+			udelay(1);
+		}
+	}
+
+	/*
+	 * Step 3: Wait for all packets to egress.
+	 * This is done while disabling the send context
+	 *
+	 * Step 4: Disable the context
+	 *
+	 * This is a superset of the halt.  After the disable, the
+	 * errors can be cleared.
+	 */
+	sc_disable(sc);
+
+	/*
+	 * Step 5: Enable the context
+	 *
+	 * This enable will clear the halted flag and per-send context
+	 * error flags.
+	 */
+	return sc_enable(sc);
+}
+
+/*
+ * PIO freeze processing.  To be called after the TXE block is fully frozen.
+ * Go through all frozen send contexts and disable them.  The contexts are
+ * already stopped by the freeze.
+ */
+void pio_freeze(struct hfi1_devdata *dd)
+{
+	struct send_context *sc;
+	int i;
+
+	for (i = 0; i < dd->num_send_contexts; i++) {
+		sc = dd->send_contexts[i].sc;
+		/*
+		 * Don't disable unallocated, unfrozen, or user send contexts.
+		 * User send contexts will be disabled when the process
+		 * calls into the driver to reset its context.
+		 */
+		if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+			continue;
+
+		/* only need to disable, the context is already stopped */
+		sc_disable(sc);
+	}
+}
+
+/*
+ * Unfreeze PIO for kernel send contexts.  The precondition for calling this
+ * is that all PIO send contexts have been disabled and the SPC freeze has
+ * been cleared.  Now perform the last step and re-enable each kernel context.
+ * User (PSM) processing will occur when PSM calls into the kernel to
+ * acknowledge the freeze.
+ */
+void pio_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+	struct send_context *sc;
+	int i;
+
+	for (i = 0; i < dd->num_send_contexts; i++) {
+		sc = dd->send_contexts[i].sc;
+		if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+			continue;
+
+		sc_enable(sc);	/* will clear the sc frozen flag */
+	}
+}
+
+/*
+ * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
+ * Returns:
+ *	-ETIMEDOUT - if we wait too long
+ *	-EIO	   - if there was an error
+ */
+static int pio_init_wait_progress(struct hfi1_devdata *dd)
+{
+	u64 reg;
+	int max, count = 0;
+
+	/* max is the longest possible HW init time / delay */
+	max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
+	while (1) {
+		reg = read_csr(dd, SEND_PIO_INIT_CTXT);
+		if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
+			break;
+		if (count >= max)
+			return -ETIMEDOUT;
+		udelay(5);
+		count++;
+	}
+
+	return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
+}
+
+/*
+ * Reset all of the send contexts to their power-on state.  Used
+ * only during manual init - no lock against sc_enable needed.
+ */
+void pio_reset_all(struct hfi1_devdata *dd)
+{
+	int ret;
+
+	/* make sure the init engine is not busy */
+	ret = pio_init_wait_progress(dd);
+	/* ignore any timeout */
+	if (ret == -EIO) {
+		/* clear the error */
+		write_csr(dd, SEND_PIO_ERR_CLEAR,
+			SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
+	}
+
+	/* reset init all */
+	write_csr(dd, SEND_PIO_INIT_CTXT,
+			SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
+	udelay(2);
+	ret = pio_init_wait_progress(dd);
+	if (ret < 0) {
+		dd_dev_err(dd,
+			"PIO send context init %s while initializing all PIO blocks\n",
+			ret == -ETIMEDOUT ? "is stuck" : "had an error");
+	}
+}
+
+/* enable the context */
+int sc_enable(struct send_context *sc)
+{
+	u64 sc_ctrl, reg, pio;
+	struct hfi1_devdata *dd;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!sc)
+		return -EINVAL;
+	dd = sc->dd;
+
+	/*
+	 * Obtain the allocator lock to guard against any allocation
+	 * attempts (which should not happen prior to context being
+	 * enabled). On the release/disable side we don't need to
+	 * worry about locking since the releaser will not do anything
+	 * if the context accounting values have not changed.
+	 */
+	spin_lock_irqsave(&sc->alloc_lock, flags);
+	sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+	if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
+		goto unlock; /* already enabled */
+
+	/* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
+
+	*sc->hw_free = 0;
+	sc->free = 0;
+	sc->alloc_free = 0;
+	sc->fill = 0;
+	sc->sr_head = 0;
+	sc->sr_tail = 0;
+	sc->flags = 0;
+	atomic_set(&sc->buffers_allocated, 0);
+
+	/*
+	 * Clear all per-context errors.  Some of these will be set when
+	 * we are re-enabling after a context halt.  Now that the context
+	 * is disabled, the halt will not clear until after the PIO init
+	 * engine runs below.
+	 */
+	reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
+	if (reg)
+		write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR),
+			reg);
+
+	/*
+	 * The HW PIO initialization engine can handle only one init
+	 * request at a time. Serialize access to each device's engine.
+	 */
+	spin_lock(&dd->sc_init_lock);
+	/*
+	 * Since access to this code block is serialized and
+	 * each access waits for the initialization to complete
+	 * before releasing the lock, the PIO initialization engine
+	 * should not be in use, so we don't have to wait for the
+	 * InProgress bit to go down.
+	 */
+	pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
+	       SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
+		SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
+	write_csr(dd, SEND_PIO_INIT_CTXT, pio);
+	/*
+	 * Wait until the engine is done.  Give the chip the required time
+	 * so, hopefully, we read the register just once.
+	 */
+	udelay(2);
+	ret = pio_init_wait_progress(dd);
+	spin_unlock(&dd->sc_init_lock);
+	if (ret) {
+		dd_dev_err(dd,
+			   "sctxt%u(%u): Context not enabled due to init failure %d\n",
+			   sc->sw_index, sc->hw_context, ret);
+		goto unlock;
+	}
+
+	/*
+	 * All is well. Enable the context.
+	 */
+	sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
+	write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
+	/*
+	 * Read SendCtxtCtrl to force the write out and prevent a timing
+	 * hazard where a PIO write may reach the context before the enable.
+	 */
+	read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+	sc->flags |= SCF_ENABLED;
+
+unlock:
+	spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+	return ret;
+}
+
+/* force a credit return on the context */
+void sc_return_credits(struct send_context *sc)
+{
+	if (!sc)
+		return;
+
+	/* a 0->1 transition schedules a credit return */
+	write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
+		SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
+	/*
+	 * Ensure that the write is flushed and the credit return is
+	 * scheduled. We care more about the 0 -> 1 transition.
+	 */
+	read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
+	/* set back to 0 for next time */
+	write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
+}
+
+/* allow all in-flight packets to drain on the context */
+void sc_flush(struct send_context *sc)
+{
+	if (!sc)
+		return;
+
+	sc_wait_for_packet_egress(sc, 1);
+}
+
+/* drop all packets on the context, no waiting until they are sent */
+void sc_drop(struct send_context *sc)
+{
+	if (!sc)
+		return;
+
+	dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
+			__func__, sc->sw_index, sc->hw_context);
+}
+
+/*
+ * Start the software reaction to a context halt or SPC freeze:
+ *	- mark the context as halted or frozen
+ *	- stop buffer allocations
+ *
+ * Called from the error interrupt.  Other work is deferred until
+ * out of the interrupt.
+ */
+void sc_stop(struct send_context *sc, int flag)
+{
+	unsigned long flags;
+
+	/* mark the context */
+	sc->flags |= flag;
+
+	/* stop buffer allocations */
+	spin_lock_irqsave(&sc->alloc_lock, flags);
+	sc->flags &= ~SCF_ENABLED;
+	spin_unlock_irqrestore(&sc->alloc_lock, flags);
+	wake_up(&sc->halt_wait);
+}
+
+#define BLOCK_DWORDS (PIO_BLOCK_SIZE/sizeof(u32))
+#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
+
+/*
+ * The send context buffer "allocator".
+ *
+ * @sc: the PIO send context we are allocating from
+ * @len: length of whole packet - including PBC - in dwords
+ * @cb: optional callback to call when the buffer is finished sending
+ * @arg: argument for cb
+ *
+ * Return a pointer to a PIO buffer if successful, NULL if not enough room.
+ */
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+				pio_release_cb cb, void *arg)
+{
+	struct pio_buf *pbuf = NULL;
+	unsigned long flags;
+	unsigned long avail;
+	unsigned long blocks = dwords_to_blocks(dw_len);
+	unsigned long start_fill;
+	int trycount = 0;
+	u32 head, next;
+
+	spin_lock_irqsave(&sc->alloc_lock, flags);
+	if (!(sc->flags & SCF_ENABLED)) {
+		spin_unlock_irqrestore(&sc->alloc_lock, flags);
+		goto done;
+	}
+
+retry:
+	avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
+	if (blocks > avail) {
+		/* not enough room */
+		if (unlikely(trycount))	{ /* already tried to get more room */
+			spin_unlock_irqrestore(&sc->alloc_lock, flags);
+			goto done;
+		}
+		/* copy from receiver cache line and recalculate */
+		sc->alloc_free = ACCESS_ONCE(sc->free);
+		avail =
+			(unsigned long)sc->credits -
+			(sc->fill - sc->alloc_free);
+		if (blocks > avail) {
+			/* still no room, actively update */
+			spin_unlock_irqrestore(&sc->alloc_lock, flags);
+			sc_release_update(sc);
+			spin_lock_irqsave(&sc->alloc_lock, flags);
+			sc->alloc_free = ACCESS_ONCE(sc->free);
+			trycount++;
+			goto retry;
+		}
+	}
+
+	/* there is enough room */
+
+	atomic_inc(&sc->buffers_allocated);
+
+	/* read this once */
+	head = sc->sr_head;
+
+	/* "allocate" the buffer */
+	start_fill = sc->fill;
+	sc->fill += blocks;
+
+	/*
+	 * Fill the parts that the releaser looks at before moving the head.
+	 * The only necessary piece is the sent_at field.  The credits
+	 * we have just allocated cannot have been returned yet, so the
+	 * cb and arg will not be looked at for a "while".  Put them
+	 * on this side of the memory barrier anyway.
+	 */
+	pbuf = &sc->sr[head].pbuf;
+	pbuf->sent_at = sc->fill;
+	pbuf->cb = cb;
+	pbuf->arg = arg;
+	pbuf->sc = sc;	/* could be filled in at sc->sr init time */
+	/* make sure this is in memory before updating the head */
+
+	/* calculate next head index, do not store */
+	next = head + 1;
+	if (next >= sc->sr_size)
+		next = 0;
+	/* update the head - must be last! - the releaser can look at fields
+	   in pbuf once we move the head */
+	smp_wmb();
+	sc->sr_head = next;
+	spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+	/* finish filling in the buffer outside the lock */
+	pbuf->start = sc->base_addr + ((start_fill % sc->credits)
+							* PIO_BLOCK_SIZE);
+	pbuf->size = sc->credits * PIO_BLOCK_SIZE;
+	pbuf->end = sc->base_addr + pbuf->size;
+	pbuf->block_count = blocks;
+	pbuf->qw_written = 0;
+	pbuf->carry_bytes = 0;
+	pbuf->carry.val64 = 0;
+done:
+	return pbuf;
+}
+
+/*
+ * There are at least two entities that can turn on credit return
+ * interrupts and they can overlap.  Avoid problems by implementing
+ * a count scheme that is enforced by a lock.  The lock is needed because
+ * the count and CSR write must be paired.
+ */
+
+/*
+ * Start credit return interrupts.  This is managed by a count.  If already
+ * on, just increment the count.
+ */
+void sc_add_credit_return_intr(struct send_context *sc)
+{
+	unsigned long flags;
+
+	/* lock must surround both the count change and the CSR update */
+	spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+	if (sc->credit_intr_count == 0) {
+		sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+		write_kctxt_csr(sc->dd, sc->hw_context,
+			SC(CREDIT_CTRL), sc->credit_ctrl);
+	}
+	sc->credit_intr_count++;
+	spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * Stop credit return interrupts.  This is managed by a count.  Decrement the
+ * count, if the last user, then turn the credit interrupts off.
+ */
+void sc_del_credit_return_intr(struct send_context *sc)
+{
+	unsigned long flags;
+
+	WARN_ON(sc->credit_intr_count == 0);
+
+	/* lock must surround both the count change and the CSR update */
+	spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+	sc->credit_intr_count--;
+	if (sc->credit_intr_count == 0) {
+		sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+		write_kctxt_csr(sc->dd, sc->hw_context,
+			SC(CREDIT_CTRL), sc->credit_ctrl);
+	}
+	spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * The caller must be careful when calling this.  All needint calls
+ * must be paired with !needint.
+ */
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
+{
+	if (needint)
+		sc_add_credit_return_intr(sc);
+	else
+		sc_del_credit_return_intr(sc);
+	trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
+	if (needint) {
+		mmiowb();
+		sc_return_credits(sc);
+	}
+}
+
+/**
+ * sc_piobufavail - callback when a PIO buffer is available
+ * @sc: the send context
+ *
+ * This is called from the interrupt handler when a PIO buffer is
+ * available after hfi1_verbs_send() returned an error that no buffers were
+ * available. Disable the interrupt if there are no more QPs waiting.
+ */
+static void sc_piobufavail(struct send_context *sc)
+{
+	struct hfi1_devdata *dd = sc->dd;
+	struct hfi1_ibdev *dev = &dd->verbs_dev;
+	struct list_head *list;
+	struct hfi1_qp *qps[PIO_WAIT_BATCH_SIZE];
+	struct hfi1_qp *qp;
+	unsigned long flags;
+	unsigned i, n = 0;
+
+	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
+		return;
+	list = &sc->piowait;
+	/*
+	 * Note: checking that the piowait list is empty and clearing
+	 * the buffer available interrupt needs to be atomic or we
+	 * could end up with QPs on the wait list with the interrupt
+	 * disabled.
+	 */
+	write_seqlock_irqsave(&dev->iowait_lock, flags);
+	while (!list_empty(list)) {
+		struct iowait *wait;
+
+		if (n == ARRAY_SIZE(qps))
+			goto full;
+		wait = list_first_entry(list, struct iowait, list);
+		qp = container_of(wait, struct hfi1_qp, s_iowait);
+		list_del_init(&qp->s_iowait.list);
+		/* refcount held until actual wake up */
+		qps[n++] = qp;
+	}
+	/*
+	 * Counting: only call wantpiobuf_intr() if there were waiters and they
+	 * are now all gone.
+	 */
+	if (n)
+		hfi1_sc_wantpiobuf_intr(sc, 0);
+full:
+	write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+	for (i = 0; i < n; i++)
+		hfi1_qp_wakeup(qps[i], HFI1_S_WAIT_PIO);
+}
+
+/* translate a send credit update to a bit code of reasons */
+static inline int fill_code(u64 hw_free)
+{
+	int code = 0;
+
+	if (hw_free & CR_STATUS_SMASK)
+		code |= PRC_STATUS_ERR;
+	if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
+		code |= PRC_PBC;
+	if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
+		code |= PRC_THRESHOLD;
+	if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
+		code |= PRC_FILL_ERR;
+	if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
+		code |= PRC_SC_DISABLE;
+	return code;
+}
+
+/* use the jiffies compare to get the wrap right */
+#define sent_before(a, b) time_before(a, b)	/* a < b */
+
+/*
+ * The send context buffer "releaser".
+ */
+void sc_release_update(struct send_context *sc)
+{
+	struct pio_buf *pbuf;
+	u64 hw_free;
+	u32 head, tail;
+	unsigned long old_free;
+	unsigned long extra;
+	unsigned long flags;
+	int code;
+
+	if (!sc)
+		return;
+
+	spin_lock_irqsave(&sc->release_lock, flags);
+	/* update free */
+	hw_free = le64_to_cpu(*sc->hw_free);		/* volatile read */
+	old_free = sc->free;
+	extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
+			- (old_free & CR_COUNTER_MASK))
+				& CR_COUNTER_MASK;
+	sc->free = old_free + extra;
+	trace_hfi1_piofree(sc, extra);
+
+	/* call sent buffer callbacks */
+	code = -1;				/* code not yet set */
+	head = ACCESS_ONCE(sc->sr_head);	/* snapshot the head */
+	tail = sc->sr_tail;
+	while (head != tail) {
+		pbuf = &sc->sr[tail].pbuf;
+
+		if (sent_before(sc->free, pbuf->sent_at)) {
+			/* not sent yet */
+			break;
+		}
+		if (pbuf->cb) {
+			if (code < 0) /* fill in code on first user */
+				code = fill_code(hw_free);
+			(*pbuf->cb)(pbuf->arg, code);
+		}
+
+		tail++;
+		if (tail >= sc->sr_size)
+			tail = 0;
+	}
+	/* update tail, in case we moved it */
+	sc->sr_tail = tail;
+	spin_unlock_irqrestore(&sc->release_lock, flags);
+	sc_piobufavail(sc);
+}
+
+/*
+ * Send context group releaser.  Argument is the send context that caused
+ * the interrupt.  Called from the send context interrupt handler.
+ *
+ * Call release on all contexts in the group.
+ *
+ * This routine takes the sc_lock without an irqsave because it is only
+ * called from an interrupt handler.  Adjust if that changes.
+ */
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
+{
+	struct send_context *sc;
+	u32 sw_index;
+	u32 gc, gc_end;
+
+	spin_lock(&dd->sc_lock);
+	sw_index = dd->hw_to_sw[hw_context];
+	if (unlikely(sw_index >= dd->num_send_contexts)) {
+		dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
+			__func__, hw_context, sw_index);
+		goto done;
+	}
+	sc = dd->send_contexts[sw_index].sc;
+	if (unlikely(!sc))
+		goto done;
+
+	gc = group_context(hw_context, sc->group);
+	gc_end = gc + group_size(sc->group);
+	for (; gc < gc_end; gc++) {
+		sw_index = dd->hw_to_sw[gc];
+		if (unlikely(sw_index >= dd->num_send_contexts)) {
+			dd_dev_err(dd,
+				"%s: invalid hw (%u) to sw (%u) mapping\n",
+				__func__, hw_context, sw_index);
+			continue;
+		}
+		sc_release_update(dd->send_contexts[sw_index].sc);
+	}
+done:
+	spin_unlock(&dd->sc_lock);
+}
+
+int init_pervl_scs(struct hfi1_devdata *dd)
+{
+	int i;
+	u64 mask, all_vl_mask = (u64) 0x80ff; /* VLs 0-7, 15 */
+	u32 ctxt;
+
+	dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
+				  dd->rcd[0]->rcvhdrqentsize, dd->node);
+	if (!dd->vld[15].sc)
+		goto nomem;
+	hfi1_init_ctxt(dd->vld[15].sc);
+	dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
+	for (i = 0; i < num_vls; i++) {
+		/*
+		 * Since this function does not deal with a specific
+		 * receive context but we need the RcvHdrQ entry size,
+		 * use the size from rcd[0]. It is guaranteed to be
+		 * valid at this point and will remain the same for all
+		 * receive contexts.
+		 */
+		dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
+					 dd->rcd[0]->rcvhdrqentsize, dd->node);
+		if (!dd->vld[i].sc)
+			goto nomem;
+
+		hfi1_init_ctxt(dd->vld[i].sc);
+
+		/* non VL15 start with the max MTU */
+		dd->vld[i].mtu = hfi1_max_mtu;
+	}
+	sc_enable(dd->vld[15].sc);
+	ctxt = dd->vld[15].sc->hw_context;
+	mask = all_vl_mask & ~(1LL << 15);
+	write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+	dd_dev_info(dd,
+		    "Using send context %u(%u) for VL15\n",
+		    dd->vld[15].sc->sw_index, ctxt);
+	for (i = 0; i < num_vls; i++) {
+		sc_enable(dd->vld[i].sc);
+		ctxt = dd->vld[i].sc->hw_context;
+		mask = all_vl_mask & ~(1LL << i);
+		write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+	}
+	return 0;
+nomem:
+	sc_free(dd->vld[15].sc);
+	for (i = 0; i < num_vls; i++)
+		sc_free(dd->vld[i].sc);
+	return -ENOMEM;
+}
+
+int init_credit_return(struct hfi1_devdata *dd)
+{
+	int ret;
+	int num_numa;
+	int i;
+
+	num_numa = num_online_nodes();
+	/* enforce the expectation that the numas are compact */
+	for (i = 0; i < num_numa; i++) {
+		if (!node_online(i)) {
+			dd_dev_err(dd, "NUMA nodes are not compact\n");
+			ret = -EINVAL;
+			goto done;
+		}
+	}
+
+	dd->cr_base = kcalloc(
+		num_numa,
+		sizeof(struct credit_return_base),
+		GFP_KERNEL);
+	if (!dd->cr_base) {
+		dd_dev_err(dd, "Unable to allocate credit return base\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	for (i = 0; i < num_numa; i++) {
+		int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
+
+		set_dev_node(&dd->pcidev->dev, i);
+		dd->cr_base[i].va = dma_zalloc_coherent(
+					&dd->pcidev->dev,
+					bytes,
+					&dd->cr_base[i].pa,
+					GFP_KERNEL);
+		if (dd->cr_base[i].va == NULL) {
+			set_dev_node(&dd->pcidev->dev, dd->node);
+			dd_dev_err(dd,
+				"Unable to allocate credit return DMA range for NUMA %d\n",
+				i);
+			ret = -ENOMEM;
+			goto done;
+		}
+	}
+	set_dev_node(&dd->pcidev->dev, dd->node);
+
+	ret = 0;
+done:
+	return ret;
+}
+
+void free_credit_return(struct hfi1_devdata *dd)
+{
+	int num_numa;
+	int i;
+
+	if (!dd->cr_base)
+		return;
+
+	num_numa = num_online_nodes();
+	for (i = 0; i < num_numa; i++) {
+		if (dd->cr_base[i].va) {
+			dma_free_coherent(&dd->pcidev->dev,
+				TXE_NUM_CONTEXTS
+					* sizeof(struct credit_return),
+				dd->cr_base[i].va,
+				dd->cr_base[i].pa);
+		}
+	}
+	kfree(dd->cr_base);
+	dd->cr_base = NULL;
+}
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h
new file mode 100644
index 0000000..0bb885c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio.h
@@ -0,0 +1,224 @@
+#ifndef _PIO_H
+#define _PIO_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+/* send context types */
+#define SC_KERNEL 0
+#define SC_ACK    1
+#define SC_USER   2
+#define SC_MAX    3
+
+/* invalid send context index */
+#define INVALID_SCI 0xff
+
+/* PIO buffer release callback function */
+typedef void (*pio_release_cb)(void *arg, int code);
+
+/* PIO release codes - in bits, as there could more than one that apply */
+#define PRC_OK		0	/* no known error */
+#define PRC_STATUS_ERR	0x01	/* credit return due to status error */
+#define PRC_PBC		0x02	/* credit return due to PBC */
+#define PRC_THRESHOLD	0x04	/* credit return due to threshold */
+#define PRC_FILL_ERR	0x08	/* credit return due fill error */
+#define PRC_FORCE	0x10	/* credit return due credit force */
+#define PRC_SC_DISABLE	0x20	/* clean-up after a context disable */
+
+/* byte helper */
+union mix {
+	u64 val64;
+	u32 val32[2];
+	u8  val8[8];
+};
+
+/* an allocated PIO buffer */
+struct pio_buf {
+	struct send_context *sc;/* back pointer to owning send context */
+	pio_release_cb cb;	/* called when the buffer is released */
+	void *arg;		/* argument for cb */
+	void __iomem *start;	/* buffer start address */
+	void __iomem *end;	/* context end address */
+	unsigned long size;	/* context size, in bytes */
+	unsigned long sent_at;	/* buffer is sent when <= free */
+	u32 block_count;	/* size of buffer, in blocks */
+	u32 qw_written;		/* QW written so far */
+	u32 carry_bytes;	/* number of valid bytes in carry */
+	union mix carry;	/* pending unwritten bytes */
+};
+
+/* cache line aligned pio buffer array */
+union pio_shadow_ring {
+	struct pio_buf pbuf;
+	u64 unused[16];		/* cache line spacer */
+} ____cacheline_aligned;
+
+/* per-NUMA send context */
+struct send_context {
+	/* read-only after init */
+	struct hfi1_devdata *dd;		/* device */
+	void __iomem *base_addr;	/* start of PIO memory */
+	union pio_shadow_ring *sr;	/* shadow ring */
+	volatile __le64 *hw_free;	/* HW free counter */
+	struct work_struct halt_work;	/* halted context work queue entry */
+	unsigned long flags;		/* flags */
+	int node;			/* context home node */
+	int type;			/* context type */
+	u32 sw_index;			/* software index number */
+	u32 hw_context;			/* hardware context number */
+	u32 credits;			/* number of blocks in context */
+	u32 sr_size;			/* size of the shadow ring */
+	u32 group;			/* credit return group */
+	/* allocator fields */
+	spinlock_t alloc_lock ____cacheline_aligned_in_smp;
+	unsigned long fill;		/* official alloc count */
+	unsigned long alloc_free;	/* copy of free (less cache thrash) */
+	u32 sr_head;			/* shadow ring head */
+	/* releaser fields */
+	spinlock_t release_lock ____cacheline_aligned_in_smp;
+	unsigned long free;		/* official free count */
+	u32 sr_tail;			/* shadow ring tail */
+	/* list for PIO waiters */
+	struct list_head piowait  ____cacheline_aligned_in_smp;
+	spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
+	u64 credit_ctrl;		/* cache for credit control */
+	u32 credit_intr_count;		/* count of credit intr users */
+	atomic_t buffers_allocated;	/* count of buffers allocated */
+	wait_queue_head_t halt_wait;    /* wait until kernel sees interrupt */
+};
+
+/* send context flags */
+#define SCF_ENABLED 0x01
+#define SCF_IN_FREE 0x02
+#define SCF_HALTED  0x04
+#define SCF_FROZEN  0x08
+
+struct send_context_info {
+	struct send_context *sc;	/* allocated working context */
+	u16 allocated;			/* has this been allocated? */
+	u16 type;			/* context type */
+	u16 base;			/* base in PIO array */
+	u16 credits;			/* size in PIO array */
+};
+
+/* DMA credit return, index is always (context & 0x7) */
+struct credit_return {
+	volatile __le64 cr[8];
+};
+
+/* NUMA indexed credit return array */
+struct credit_return_base {
+	struct credit_return *va;
+	dma_addr_t pa;
+};
+
+/* send context configuration sizes (one per type) */
+struct sc_config_sizes {
+	short int size;
+	short int count;
+};
+
+/* send context functions */
+int init_credit_return(struct hfi1_devdata *dd);
+void free_credit_return(struct hfi1_devdata *dd);
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
+int init_send_contexts(struct hfi1_devdata *dd);
+int init_credit_return(struct hfi1_devdata *dd);
+int init_pervl_scs(struct hfi1_devdata *dd);
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+			      uint hdrqentsize, int numa);
+void sc_free(struct send_context *sc);
+int sc_enable(struct send_context *sc);
+void sc_disable(struct send_context *sc);
+int sc_restart(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_flush(struct send_context *sc);
+void sc_drop(struct send_context *sc);
+void sc_stop(struct send_context *sc, int bit);
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+			pio_release_cb cb, void *arg);
+void sc_release_update(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
+void sc_add_credit_return_intr(struct send_context *sc);
+void sc_del_credit_return_intr(struct send_context *sc);
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
+void sc_wait(struct hfi1_devdata *dd);
+void set_pio_integrity(struct send_context *sc);
+
+/* support functions */
+void pio_reset_all(struct hfi1_devdata *dd);
+void pio_freeze(struct hfi1_devdata *dd);
+void pio_kernel_unfreeze(struct hfi1_devdata *dd);
+
+/* global PIO send control operations */
+#define PSC_GLOBAL_ENABLE 0
+#define PSC_GLOBAL_DISABLE 1
+#define PSC_GLOBAL_VLARB_ENABLE 2
+#define PSC_GLOBAL_VLARB_DISABLE 3
+#define PSC_CM_RESET 4
+#define PSC_DATA_VL_ENABLE 5
+#define PSC_DATA_VL_DISABLE 6
+
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
+void pio_send_control(struct hfi1_devdata *dd, int op);
+
+
+/* PIO copy routines */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+	      const void *from, size_t count);
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+					const void *from, size_t nbytes);
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
+void seg_pio_copy_end(struct pio_buf *pbuf);
+
+#endif /* _PIO_H */
diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/staging/rdma/hfi1/pio_copy.c
new file mode 100644
index 0000000..8972bbc
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio_copy.c
@@ -0,0 +1,858 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/* additive distance between non-SOP and SOP space */
+#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
+#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE-1)
+/* number of QUADWORDs in a block */
+#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE/sizeof(u64))
+
+/**
+ * pio_copy - copy data block to MMIO space
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @pbc: PBC to send
+ * @from: source, must be 8 byte aligned
+ * @count: number of DWORD (32-bit) quantities to copy from source
+ *
+ * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
+ * Must always write full BLOCK_SIZE bytes blocks.  The first block must
+ * be written to the corresponding SOP=1 address.
+ *
+ * Known:
+ * o pbuf->start always starts on a block boundary
+ * o pbuf can wrap only at a block boundary
+ */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+	      const void *from, size_t count)
+{
+	void __iomem *dest = pbuf->start + SOP_DISTANCE;
+	void __iomem *send = dest + PIO_BLOCK_SIZE;
+	void __iomem *dend;			/* 8-byte data end */
+
+	/* write the PBC */
+	writeq(pbc, dest);
+	dest += sizeof(u64);
+
+	/* calculate where the QWORD data ends - in SOP=1 space */
+	dend = dest + ((count>>1) * sizeof(u64));
+
+	if (dend < send) {
+		/* all QWORD data is within the SOP block, does *not*
+		   reach the end of the SOP block */
+
+		while (dest < dend) {
+			writeq(*(u64 *)from, dest);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+		/*
+		 * No boundary checks are needed here:
+		 * 0. We're not on the SOP block boundary
+		 * 1. The possible DWORD dangle will still be within
+		 *    the SOP block
+		 * 2. We cannot wrap except on a block boundary.
+		 */
+	} else {
+		/* QWORD data extends _to_ or beyond the SOP block */
+
+		/* write 8-byte SOP chunk data */
+		while (dest < send) {
+			writeq(*(u64 *)from, dest);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+		/* drop out of the SOP range */
+		dest -= SOP_DISTANCE;
+		dend -= SOP_DISTANCE;
+
+		/*
+		 * If the wrap comes before or matches the data end,
+		 * copy until until the wrap, then wrap.
+		 *
+		 * If the data ends at the end of the SOP above and
+		 * the buffer wraps, then pbuf->end == dend == dest
+		 * and nothing will get written, but we will wrap in
+		 * case there is a dangling DWORD.
+		 */
+		if (pbuf->end <= dend) {
+			while (dest < pbuf->end) {
+				writeq(*(u64 *)from, dest);
+				from += sizeof(u64);
+				dest += sizeof(u64);
+			}
+
+			dest -= pbuf->size;
+			dend -= pbuf->size;
+		}
+
+		/* write 8-byte non-SOP, non-wrap chunk data */
+		while (dest < dend) {
+			writeq(*(u64 *)from, dest);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+	}
+	/* at this point we have wrapped if we are going to wrap */
+
+	/* write dangling u32, if any */
+	if (count & 1) {
+		union mix val;
+
+		val.val64 = 0;
+		val.val32[0] = *(u32 *)from;
+		writeq(val.val64, dest);
+		dest += sizeof(u64);
+	}
+	/* fill in rest of block, no need to check pbuf->end
+	   as we only wrap on a block boundary */
+	while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+		writeq(0, dest);
+		dest += sizeof(u64);
+	}
+
+	/* finished with this buffer */
+	atomic_dec(&pbuf->sc->buffers_allocated);
+}
+
+/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
+#define USE_SHIFTS 1
+#ifdef USE_SHIFTS
+/*
+ * Handle carry bytes using shifts and masks.
+ *
+ * NOTE: the value the unused portion of carry is expected to always be zero.
+ */
+
+/*
+ * "zero" shift - bit shift used to zero out upper bytes.  Input is
+ * the count of LSB bytes to preserve.
+ */
+#define zshift(x) (8 * (8-(x)))
+
+/*
+ * "merge" shift - bit shift used to merge with carry bytes.  Input is
+ * the LSB byte count to move beyond.
+ */
+#define mshift(x) (8 * (x))
+
+/*
+ * Read nbytes bytes from "from" and return them in the LSB bytes
+ * of pbuf->carry.  Other bytes are zeroed.  Any previous value
+ * pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from from if nbytes is zero
+ * o from may _not_ be u64 aligned
+ * o nbytes must not span a QW boundary
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+							unsigned int nbytes)
+{
+	unsigned long off;
+
+	if (nbytes == 0) {
+		pbuf->carry.val64 = 0;
+	} else {
+		/* align our pointer */
+		off = (unsigned long)from & 0x7;
+		from = (void *)((unsigned long)from & ~0x7l);
+		pbuf->carry.val64 = ((*(u64 *)from)
+				<< zshift(nbytes + off))/* zero upper bytes */
+				>> zshift(nbytes);	/* place at bottom */
+	}
+	pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the next significant bytes
+ * of pbuf->carry.  Unused bytes are zeroed.  It is expected that the extra
+ * read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+					const void *from, unsigned int nbytes)
+{
+	unsigned long off = (unsigned long)from & 0x7;
+	unsigned int room, xbytes;
+
+	/* align our pointer */
+	from = (void *)((unsigned long)from & ~0x7l);
+
+	/* check count first - don't read anything if count is zero */
+	while (nbytes) {
+		/* find the number of bytes in this u64 */
+		room = 8 - off;	/* this u64 has room for this many bytes */
+		xbytes = nbytes > room ? room : nbytes;
+
+		/*
+		 * shift down to zero lower bytes, shift up to zero upper
+		 * bytes, shift back down to move into place
+		 */
+		pbuf->carry.val64 |= (((*(u64 *)from)
+					>> mshift(off))
+					<< zshift(xbytes))
+					>> zshift(xbytes+pbuf->carry_bytes);
+		off = 0;
+		pbuf->carry_bytes += xbytes;
+		nbytes -= xbytes;
+		from += sizeof(u64);
+	}
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+	unsigned int remaining;
+
+	if (zbytes == 0)	/* nothing to do */
+		return;
+
+	remaining = pbuf->carry_bytes - zbytes;	/* remaining bytes */
+
+	/* NOTE: zshift only guaranteed to work if remaining != 0 */
+	if (remaining)
+		pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining))
+					>> zshift(remaining);
+	else
+		pbuf->carry.val64 = 0;
+	pbuf->carry_bytes = remaining;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the LSB bytes of
+ * pbuf->carry with the upper bytes zeroed..
+ *
+ * NOTES:
+ * o result must keep unused bytes zeroed
+ * o src must be u64 aligned
+ */
+static inline void merge_write8(
+	struct pio_buf *pbuf,
+	void __iomem *dest,
+	const void *src)
+{
+	u64 new, temp;
+
+	new = *(u64 *)src;
+	temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
+	writeq(temp, dest);
+	pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void __iomem *dest)
+{
+	writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry.  If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
+{
+	if (pbuf->carry_bytes) {
+		/* unused bytes are always kept zeroed, so just write */
+		writeq(pbuf->carry.val64, dest);
+		return 1;
+	}
+
+	return 0;
+}
+
+#else /* USE_SHIFTS */
+/*
+ * Handle carry bytes using byte copies.
+ *
+ * NOTE: the value the unused portion of carry is left uninitialized.
+ */
+
+/*
+ * Jump copy - no-loop copy for < 8 bytes.
+ */
+static inline void jcopy(u8 *dest, const u8 *src, u32 n)
+{
+	switch (n) {
+	case 7:
+		*dest++ = *src++;
+	case 6:
+		*dest++ = *src++;
+	case 5:
+		*dest++ = *src++;
+	case 4:
+		*dest++ = *src++;
+	case 3:
+		*dest++ = *src++;
+	case 2:
+		*dest++ = *src++;
+	case 1:
+		*dest++ = *src++;
+	}
+}
+
+/*
+ * Read nbytes from "from" and and place them in the low bytes
+ * of pbuf->carry.  Other bytes are left as-is.  Any previous
+ * value in pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from from if nbytes is zero
+ * o from may _not_ be u64 aligned.
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+							unsigned int nbytes)
+{
+	jcopy(&pbuf->carry.val8[0], from, nbytes);
+	pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
+ * It is expected that the extra read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+					const void *from, unsigned int nbytes)
+{
+	jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
+	pbuf->carry_bytes += nbytes;
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * We do not care about the value of unused bytes in carry, so just
+ * reduce the byte count.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+	pbuf->carry_bytes -= zbytes;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the low bytes of
+ * pbuf->carry.
+ */
+static inline void merge_write8(
+	struct pio_buf *pbuf,
+	void *dest,
+	const void *src)
+{
+	u32 remainder = 8 - pbuf->carry_bytes;
+
+	jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
+	writeq(pbuf->carry.val64, dest);
+	jcopy(&pbuf->carry.val8[0], src+remainder, pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void *dest)
+{
+	writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry.  If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void *dest)
+{
+	if (pbuf->carry_bytes) {
+		u64 zero = 0;
+
+		jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
+						8 - pbuf->carry_bytes);
+		writeq(pbuf->carry.val64, dest);
+		return 1;
+	}
+
+	return 0;
+}
+#endif /* USE_SHIFTS */
+
+/*
+ * Segmented PIO Copy - start
+ *
+ * Start a PIO copy.
+ *
+ * @pbuf: destination buffer
+ * @pbc: the PBC for the PIO buffer
+ * @from: data source, QWORD aligned
+ * @nbytes: bytes to copy
+ */
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+				const void *from, size_t nbytes)
+{
+	void __iomem *dest = pbuf->start + SOP_DISTANCE;
+	void __iomem *send = dest + PIO_BLOCK_SIZE;
+	void __iomem *dend;			/* 8-byte data end */
+
+	writeq(pbc, dest);
+	dest += sizeof(u64);
+
+	/* calculate where the QWORD data ends - in SOP=1 space */
+	dend = dest + ((nbytes>>3) * sizeof(u64));
+
+	if (dend < send) {
+		/* all QWORD data is within the SOP block, does *not*
+		   reach the end of the SOP block */
+
+		while (dest < dend) {
+			writeq(*(u64 *)from, dest);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+		/*
+		 * No boundary checks are needed here:
+		 * 0. We're not on the SOP block boundary
+		 * 1. The possible DWORD dangle will still be within
+		 *    the SOP block
+		 * 2. We cannot wrap except on a block boundary.
+		 */
+	} else {
+		/* QWORD data extends _to_ or beyond the SOP block */
+
+		/* write 8-byte SOP chunk data */
+		while (dest < send) {
+			writeq(*(u64 *)from, dest);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+		/* drop out of the SOP range */
+		dest -= SOP_DISTANCE;
+		dend -= SOP_DISTANCE;
+
+		/*
+		 * If the wrap comes before or matches the data end,
+		 * copy until until the wrap, then wrap.
+		 *
+		 * If the data ends at the end of the SOP above and
+		 * the buffer wraps, then pbuf->end == dend == dest
+		 * and nothing will get written, but we will wrap in
+		 * case there is a dangling DWORD.
+		 */
+		if (pbuf->end <= dend) {
+			while (dest < pbuf->end) {
+				writeq(*(u64 *)from, dest);
+				from += sizeof(u64);
+				dest += sizeof(u64);
+			}
+
+			dest -= pbuf->size;
+			dend -= pbuf->size;
+		}
+
+		/* write 8-byte non-SOP, non-wrap chunk data */
+		while (dest < dend) {
+			writeq(*(u64 *)from, dest);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+	}
+	/* at this point we have wrapped if we are going to wrap */
+
+	/* ...but it doesn't matter as we're done writing */
+
+	/* save dangling bytes, if any */
+	read_low_bytes(pbuf, from, nbytes & 0x7);
+
+	pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
+}
+
+/*
+ * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
+ * bytes are non-zero.
+ *
+ * Whole u64s must be written to the chip, so bytes must be manually merged.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned.
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+	void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+	void __iomem *dend;			/* 8-byte data end */
+	unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
+	unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
+
+	/* calculate 8-byte data end */
+	dend = dest + (qw_to_write * sizeof(u64));
+
+	if (pbuf->qw_written < PIO_BLOCK_QWS) {
+		/*
+		 * Still within SOP block.  We don't need to check for
+		 * wrap because we are still in the first block and
+		 * can only wrap on block boundaries.
+		 */
+		void __iomem *send;		/* SOP end */
+		void __iomem *xend;
+
+		/* calculate the end of data or end of block, whichever
+		   comes first */
+		send = pbuf->start + PIO_BLOCK_SIZE;
+		xend = send < dend ? send : dend;
+
+		/* shift up to SOP=1 space */
+		dest += SOP_DISTANCE;
+		xend += SOP_DISTANCE;
+
+		/* write 8-byte chunk data */
+		while (dest < xend) {
+			merge_write8(pbuf, dest, from);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+
+		/* shift down to SOP=0 space */
+		dest -= SOP_DISTANCE;
+	}
+	/*
+	 * At this point dest could be (either, both, or neither):
+	 * - at dend
+	 * - at the wrap
+	 */
+
+	/*
+	 * If the wrap comes before or matches the data end,
+	 * copy until until the wrap, then wrap.
+	 *
+	 * If dest is at the wrap, we will fall into the if,
+	 * not do the loop, when wrap.
+	 *
+	 * If the data ends at the end of the SOP above and
+	 * the buffer wraps, then pbuf->end == dend == dest
+	 * and nothing will get written.
+	 */
+	if (pbuf->end <= dend) {
+		while (dest < pbuf->end) {
+			merge_write8(pbuf, dest, from);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+
+		dest -= pbuf->size;
+		dend -= pbuf->size;
+	}
+
+	/* write 8-byte non-SOP, non-wrap chunk data */
+	while (dest < dend) {
+		merge_write8(pbuf, dest, from);
+		from += sizeof(u64);
+		dest += sizeof(u64);
+	}
+
+	/* adjust carry */
+	if (pbuf->carry_bytes < bytes_left) {
+		/* need to read more */
+		read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
+	} else {
+		/* remove invalid bytes */
+		zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
+	}
+
+	pbuf->qw_written += qw_to_write;
+}
+
+/*
+ * Mid copy helper, "straight case" - source pointer is 64-bit aligned
+ * with no carry bytes.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_straight(struct pio_buf *pbuf,
+						const void *from, size_t nbytes)
+{
+	void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+	void __iomem *dend;			/* 8-byte data end */
+
+	/* calculate 8-byte data end */
+	dend = dest + ((nbytes>>3) * sizeof(u64));
+
+	if (pbuf->qw_written < PIO_BLOCK_QWS) {
+		/*
+		 * Still within SOP block.  We don't need to check for
+		 * wrap because we are still in the first block and
+		 * can only wrap on block boundaries.
+		 */
+		void __iomem *send;		/* SOP end */
+		void __iomem *xend;
+
+		/* calculate the end of data or end of block, whichever
+		   comes first */
+		send = pbuf->start + PIO_BLOCK_SIZE;
+		xend = send < dend ? send : dend;
+
+		/* shift up to SOP=1 space */
+		dest += SOP_DISTANCE;
+		xend += SOP_DISTANCE;
+
+		/* write 8-byte chunk data */
+		while (dest < xend) {
+			writeq(*(u64 *)from, dest);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+
+		/* shift down to SOP=0 space */
+		dest -= SOP_DISTANCE;
+	}
+	/*
+	 * At this point dest could be (either, both, or neither):
+	 * - at dend
+	 * - at the wrap
+	 */
+
+	/*
+	 * If the wrap comes before or matches the data end,
+	 * copy until until the wrap, then wrap.
+	 *
+	 * If dest is at the wrap, we will fall into the if,
+	 * not do the loop, when wrap.
+	 *
+	 * If the data ends at the end of the SOP above and
+	 * the buffer wraps, then pbuf->end == dend == dest
+	 * and nothing will get written.
+	 */
+	if (pbuf->end <= dend) {
+		while (dest < pbuf->end) {
+			writeq(*(u64 *)from, dest);
+			from += sizeof(u64);
+			dest += sizeof(u64);
+		}
+
+		dest -= pbuf->size;
+		dend -= pbuf->size;
+	}
+
+	/* write 8-byte non-SOP, non-wrap chunk data */
+	while (dest < dend) {
+		writeq(*(u64 *)from, dest);
+		from += sizeof(u64);
+		dest += sizeof(u64);
+	}
+
+	/* we know carry_bytes was zero on entry to this routine */
+	read_low_bytes(pbuf, from, nbytes & 0x7);
+
+	pbuf->qw_written += nbytes>>3;
+}
+
+/*
+ * Segmented PIO Copy - middle
+ *
+ * Must handle any aligned tail and any aligned source with any byte count.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @from: data source
+ * @nbytes: number of bytes to copy
+ */
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+	unsigned long from_align = (unsigned long)from & 0x7;
+
+	if (pbuf->carry_bytes + nbytes < 8) {
+		/* not enough bytes to fill a QW */
+		read_extra_bytes(pbuf, from, nbytes);
+		return;
+	}
+
+	if (from_align) {
+		/* misaligned source pointer - align it */
+		unsigned long to_align;
+
+		/* bytes to read to align "from" */
+		to_align = 8 - from_align;
+
+		/*
+		 * In the advance-to-alignment logic below, we do not need
+		 * to check if we are using more than nbytes.  This is because
+		 * if we are here, we already know that carry+nbytes will
+		 * fill at least one QW.
+		 */
+		if (pbuf->carry_bytes + to_align < 8) {
+			/* not enough align bytes to fill a QW */
+			read_extra_bytes(pbuf, from, to_align);
+			from += to_align;
+			nbytes -= to_align;
+		} else {
+			/* bytes to fill carry */
+			unsigned long to_fill = 8 - pbuf->carry_bytes;
+			/* bytes left over to be read */
+			unsigned long extra = to_align - to_fill;
+			void __iomem *dest;
+
+			/* fill carry... */
+			read_extra_bytes(pbuf, from, to_fill);
+			from += to_fill;
+			nbytes -= to_fill;
+
+			/* ...now write carry */
+			dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+			/*
+			 * The two checks immediately below cannot both be
+			 * true, hence the else.  If we have wrapped, we
+			 * cannot still be within the first block.
+			 * Conversely, if we are still in the first block, we
+			 * cannot have wrapped.  We do the wrap check first
+			 * as that is more likely.
+			 */
+			/* adjust if we've wrapped */
+			if (dest >= pbuf->end)
+				dest -= pbuf->size;
+			/* jump to SOP range if within the first block */
+			else if (pbuf->qw_written < PIO_BLOCK_QWS)
+				dest += SOP_DISTANCE;
+
+			carry8_write8(pbuf->carry, dest);
+			pbuf->qw_written++;
+
+			/* read any extra bytes to do final alignment */
+			/* this will overwrite anything in pbuf->carry */
+			read_low_bytes(pbuf, from, extra);
+			from += extra;
+			nbytes -= extra;
+		}
+
+		/* at this point, from is QW aligned */
+	}
+
+	if (pbuf->carry_bytes)
+		mid_copy_mix(pbuf, from, nbytes);
+	else
+		mid_copy_straight(pbuf, from, nbytes);
+}
+
+/*
+ * Segmented PIO Copy - end
+ *
+ * Write any remainder (in pbuf->carry) and finish writing the whole block.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ */
+void seg_pio_copy_end(struct pio_buf *pbuf)
+{
+	void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+	/*
+	 * The two checks immediately below cannot both be true, hence the
+	 * else.  If we have wrapped, we cannot still be within the first
+	 * block.  Conversely, if we are still in the first block, we
+	 * cannot have wrapped.  We do the wrap check first as that is
+	 * more likely.
+	 */
+	/* adjust if we have wrapped */
+	if (dest >= pbuf->end)
+		dest -= pbuf->size;
+	/* jump to the SOP range if within the first block */
+	else if (pbuf->qw_written < PIO_BLOCK_QWS)
+		dest += SOP_DISTANCE;
+
+	/* write final bytes, if any */
+	if (carry_write8(pbuf, dest)) {
+		dest += sizeof(u64);
+		/*
+		 * NOTE: We do not need to recalculate whether dest needs
+		 * SOP_DISTANCE or not.
+		 *
+		 * If we are in the first block and the dangle write
+		 * keeps us in the same block, dest will need
+		 * to retain SOP_DISTANCE in the loop below.
+		 *
+		 * If we are in the first block and the dangle write pushes
+		 * us to the next block, then loop below will not run
+		 * and dest is not used.  Hence we do not need to update
+		 * it.
+		 *
+		 * If we are past the first block, then SOP_DISTANCE
+		 * was never added, so there is nothing to do.
+		 */
+	}
+
+	/* fill in rest of block */
+	while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+		writeq(0, dest);
+		dest += sizeof(u64);
+	}
+
+	/* finished with this buffer */
+	atomic_dec(&pbuf->sc->buffers_allocated);
+}
diff --git a/drivers/staging/rdma/hfi1/platform_config.h b/drivers/staging/rdma/hfi1/platform_config.h
new file mode 100644
index 0000000..8a94a83
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/platform_config.h
@@ -0,0 +1,286 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef __PLATFORM_CONFIG_H
+#define __PLATFORM_CONFIG_H
+
+#define METADATA_TABLE_FIELD_START_SHIFT		0
+#define METADATA_TABLE_FIELD_START_LEN_BITS		15
+#define METADATA_TABLE_FIELD_LEN_SHIFT			16
+#define METADATA_TABLE_FIELD_LEN_LEN_BITS		16
+
+/* Header structure */
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT			0
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS		6
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT		16
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS		12
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT			28
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS		4
+
+enum platform_config_table_type_encoding {
+	PLATFORM_CONFIG_TABLE_RESERVED,
+	PLATFORM_CONFIG_SYSTEM_TABLE,
+	PLATFORM_CONFIG_PORT_TABLE,
+	PLATFORM_CONFIG_RX_PRESET_TABLE,
+	PLATFORM_CONFIG_TX_PRESET_TABLE,
+	PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
+	PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
+	PLATFORM_CONFIG_TABLE_MAX
+};
+
+enum platform_config_system_table_fields {
+	SYSTEM_TABLE_RESERVED,
+	SYSTEM_TABLE_NODE_STRING,
+	SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
+	SYSTEM_TABLE_NODE_GUID,
+	SYSTEM_TABLE_REVISION,
+	SYSTEM_TABLE_VENDOR_OUI,
+	SYSTEM_TABLE_META_VERSION,
+	SYSTEM_TABLE_DEVICE_ID,
+	SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
+	SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+	SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
+	SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+	SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
+	SYSTEM_TABLE_MAX
+};
+
+enum platform_config_port_table_fields {
+	PORT_TABLE_RESERVED,
+	PORT_TABLE_PORT_TYPE,
+	PORT_TABLE_ATTENUATION_12G,
+	PORT_TABLE_ATTENUATION_25G,
+	PORT_TABLE_LINK_SPEED_SUPPORTED,
+	PORT_TABLE_LINK_WIDTH_SUPPORTED,
+	PORT_TABLE_VL_CAP,
+	PORT_TABLE_MTU_CAP,
+	PORT_TABLE_TX_LANE_ENABLE_MASK,
+	PORT_TABLE_LOCAL_MAX_TIMEOUT,
+	PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
+	PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
+	PORT_TABLE_TX_PRESET_IDX_PASSIVE_CU,
+	PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
+	PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
+	PORT_TABLE_RX_PRESET_IDX,
+	PORT_TABLE_CABLE_REACH_CLASS,
+	PORT_TABLE_MAX
+};
+
+enum platform_config_rx_preset_table_fields {
+	RX_PRESET_TABLE_RESERVED,
+	RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+	RX_PRESET_TABLE_QSFP_RX_EQ_APPLY,
+	RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+	RX_PRESET_TABLE_QSFP_RX_CDR,
+	RX_PRESET_TABLE_QSFP_RX_EQ,
+	RX_PRESET_TABLE_QSFP_RX_AMP,
+	RX_PRESET_TABLE_MAX
+};
+
+enum platform_config_tx_preset_table_fields {
+	TX_PRESET_TABLE_RESERVED,
+	TX_PRESET_TABLE_PRECUR,
+	TX_PRESET_TABLE_ATTN,
+	TX_PRESET_TABLE_POSTCUR,
+	TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
+	TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+	TX_PRESET_TABLE_QSFP_TX_CDR,
+	TX_PRESET_TABLE_QSFP_TX_EQ,
+	TX_PRESET_TABLE_MAX
+};
+
+enum platform_config_qsfp_attn_table_fields {
+	QSFP_ATTEN_TABLE_RESERVED,
+	QSFP_ATTEN_TABLE_TX_PRESET_IDX,
+	QSFP_ATTEN_TABLE_RX_PRESET_IDX,
+	QSFP_ATTEN_TABLE_MAX
+};
+
+enum platform_config_variable_settings_table_fields {
+	VARIABLE_SETTINGS_TABLE_RESERVED,
+	VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
+	VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
+	VARIABLE_SETTINGS_TABLE_MAX
+};
+
+struct platform_config_data {
+	u32 *table;
+	u32 *table_metadata;
+	u32 num_table;
+};
+
+/*
+ * This struct acts as a quick reference into the platform_data binary image
+ * and is populated by parse_platform_config(...) depending on the specific
+ * META_VERSION
+ */
+struct platform_config_cache {
+	u8  cache_valid;
+	struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
+};
+
+static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
+	0,
+	SYSTEM_TABLE_MAX,
+	PORT_TABLE_MAX,
+	RX_PRESET_TABLE_MAX,
+	TX_PRESET_TABLE_MAX,
+	QSFP_ATTEN_TABLE_MAX,
+	VARIABLE_SETTINGS_TABLE_MAX
+};
+
+/* This section defines default values and encodings for the
+ * fields defined for each table above
+ */
+
+/*=====================================================
+ *  System table encodings
+ *====================================================*/
+#define PLATFORM_CONFIG_MAGIC_NUM		0x3d4f5041
+#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN	4
+
+/*
+ * These power classes are the same as defined in SFF 8636 spec rev 2.4
+ * describing byte 129 in table 6-16, except enumerated in a different order
+ */
+enum platform_config_qsfp_power_class_encoding {
+	QSFP_POWER_CLASS_1 = 1,
+	QSFP_POWER_CLASS_2,
+	QSFP_POWER_CLASS_3,
+	QSFP_POWER_CLASS_4,
+	QSFP_POWER_CLASS_5,
+	QSFP_POWER_CLASS_6,
+	QSFP_POWER_CLASS_7
+};
+
+
+/*=====================================================
+ *  Port table encodings
+ *==================================================== */
+enum platform_config_port_type_encoding {
+	PORT_TYPE_RESERVED,
+	PORT_TYPE_DISCONNECTED,
+	PORT_TYPE_FIXED,
+	PORT_TYPE_VARIABLE,
+	PORT_TYPE_QSFP,
+	PORT_TYPE_MAX
+};
+
+enum platform_config_link_speed_supported_encoding {
+	LINK_SPEED_SUPP_12G = 1,
+	LINK_SPEED_SUPP_25G,
+	LINK_SPEED_SUPP_12G_25G,
+	LINK_SPEED_SUPP_MAX
+};
+
+/*
+ * This is a subset (not strict) of the link downgrades
+ * supported. The link downgrades supported are expected
+ * to be supplied to the driver by another entity such as
+ * the fabric manager
+ */
+enum platform_config_link_width_supported_encoding {
+	LINK_WIDTH_SUPP_1X = 1,
+	LINK_WIDTH_SUPP_2X,
+	LINK_WIDTH_SUPP_2X_1X,
+	LINK_WIDTH_SUPP_3X,
+	LINK_WIDTH_SUPP_3X_1X,
+	LINK_WIDTH_SUPP_3X_2X,
+	LINK_WIDTH_SUPP_3X_2X_1X,
+	LINK_WIDTH_SUPP_4X,
+	LINK_WIDTH_SUPP_4X_1X,
+	LINK_WIDTH_SUPP_4X_2X,
+	LINK_WIDTH_SUPP_4X_2X_1X,
+	LINK_WIDTH_SUPP_4X_3X,
+	LINK_WIDTH_SUPP_4X_3X_1X,
+	LINK_WIDTH_SUPP_4X_3X_2X,
+	LINK_WIDTH_SUPP_4X_3X_2X_1X,
+	LINK_WIDTH_SUPP_MAX
+};
+
+enum platform_config_virtual_lane_capability_encoding {
+	VL_CAP_VL0 = 1,
+	VL_CAP_VL0_1,
+	VL_CAP_VL0_2,
+	VL_CAP_VL0_3,
+	VL_CAP_VL0_4,
+	VL_CAP_VL0_5,
+	VL_CAP_VL0_6,
+	VL_CAP_VL0_7,
+	VL_CAP_VL0_8,
+	VL_CAP_VL0_9,
+	VL_CAP_VL0_10,
+	VL_CAP_VL0_11,
+	VL_CAP_VL0_12,
+	VL_CAP_VL0_13,
+	VL_CAP_VL0_14,
+	VL_CAP_MAX
+};
+
+/* Max MTU */
+enum platform_config_mtu_capability_encoding {
+	MTU_CAP_256   = 1,
+	MTU_CAP_512   = 2,
+	MTU_CAP_1024  = 3,
+	MTU_CAP_2048  = 4,
+	MTU_CAP_4096  = 5,
+	MTU_CAP_8192  = 6,
+	MTU_CAP_10240 = 7
+};
+
+enum platform_config_local_max_timeout_encoding {
+	LOCAL_MAX_TIMEOUT_10_MS = 1,
+	LOCAL_MAX_TIMEOUT_100_MS,
+	LOCAL_MAX_TIMEOUT_1_S,
+	LOCAL_MAX_TIMEOUT_10_S,
+	LOCAL_MAX_TIMEOUT_100_S,
+	LOCAL_MAX_TIMEOUT_1000_S
+};
+
+#endif			/*__PLATFORM_CONFIG_H*/
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
new file mode 100644
index 0000000..df1fa56
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -0,0 +1,1687 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+#include "sdma.h"
+
+#define BITS_PER_PAGE           (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK      (BITS_PER_PAGE-1)
+
+static unsigned int hfi1_qp_table_size = 256;
+module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+static void flush_tx_list(struct hfi1_qp *qp);
+static int iowait_sleep(
+	struct sdma_engine *sde,
+	struct iowait *wait,
+	struct sdma_txreq *stx,
+	unsigned seq);
+static void iowait_wakeup(struct iowait *wait, int reason);
+
+static inline unsigned mk_qpn(struct hfi1_qpn_table *qpt,
+			      struct qpn_map *map, unsigned off)
+{
+	return (map - qpt->map) * BITS_PER_PAGE + off;
+}
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static const u16 credit_table[31] = {
+	0,                      /* 0 */
+	1,                      /* 1 */
+	2,                      /* 2 */
+	3,                      /* 3 */
+	4,                      /* 4 */
+	6,                      /* 5 */
+	8,                      /* 6 */
+	12,                     /* 7 */
+	16,                     /* 8 */
+	24,                     /* 9 */
+	32,                     /* A */
+	48,                     /* B */
+	64,                     /* C */
+	96,                     /* D */
+	128,                    /* E */
+	192,                    /* F */
+	256,                    /* 10 */
+	384,                    /* 11 */
+	512,                    /* 12 */
+	768,                    /* 13 */
+	1024,                   /* 14 */
+	1536,                   /* 15 */
+	2048,                   /* 16 */
+	3072,                   /* 17 */
+	4096,                   /* 18 */
+	6144,                   /* 19 */
+	8192,                   /* 1A */
+	12288,                  /* 1B */
+	16384,                  /* 1C */
+	24576,                  /* 1D */
+	32768                   /* 1E */
+};
+
+static void get_map_page(struct hfi1_qpn_table *qpt, struct qpn_map *map)
+{
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+	/*
+	 * Free the page if someone raced with us installing it.
+	 */
+
+	spin_lock(&qpt->lock);
+	if (map->page)
+		free_page(page);
+	else
+		map->page = (void *)page;
+	spin_unlock(&qpt->lock);
+}
+
+/*
+ * Allocate the next available QPN or
+ * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
+ */
+static int alloc_qpn(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt,
+		     enum ib_qp_type type, u8 port)
+{
+	u32 i, offset, max_scan, qpn;
+	struct qpn_map *map;
+	u32 ret;
+
+	if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
+		unsigned n;
+
+		ret = type == IB_QPT_GSI;
+		n = 1 << (ret + 2 * (port - 1));
+		spin_lock(&qpt->lock);
+		if (qpt->flags & n)
+			ret = -EINVAL;
+		else
+			qpt->flags |= n;
+		spin_unlock(&qpt->lock);
+		goto bail;
+	}
+
+	qpn = qpt->last + qpt->incr;
+	if (qpn >= QPN_MAX)
+		qpn = qpt->incr | ((qpt->last & 1) ^ 1);
+	/* offset carries bit 0 */
+	offset = qpn & BITS_PER_PAGE_MASK;
+	map = &qpt->map[qpn / BITS_PER_PAGE];
+	max_scan = qpt->nmaps - !offset;
+	for (i = 0;;) {
+		if (unlikely(!map->page)) {
+			get_map_page(qpt, map);
+			if (unlikely(!map->page))
+				break;
+		}
+		do {
+			if (!test_and_set_bit(offset, map->page)) {
+				qpt->last = qpn;
+				ret = qpn;
+				goto bail;
+			}
+			offset += qpt->incr;
+			/*
+			 * This qpn might be bogus if offset >= BITS_PER_PAGE.
+			 * That is OK.   It gets re-assigned below
+			 */
+			qpn = mk_qpn(qpt, map, offset);
+		} while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+		/*
+		 * In order to keep the number of pages allocated to a
+		 * minimum, we scan the all existing pages before increasing
+		 * the size of the bitmap table.
+		 */
+		if (++i > max_scan) {
+			if (qpt->nmaps == QPNMAP_ENTRIES)
+				break;
+			map = &qpt->map[qpt->nmaps++];
+			/* start at incr with current bit 0 */
+			offset = qpt->incr | (offset & 1);
+		} else if (map < &qpt->map[qpt->nmaps]) {
+			++map;
+			/* start at incr with current bit 0 */
+			offset = qpt->incr | (offset & 1);
+		} else {
+			map = &qpt->map[0];
+			/* wrap to first map page, invert bit 0 */
+			offset = qpt->incr | ((offset & 1) ^ 1);
+		}
+		/* there can be no bits at shift and below */
+		WARN_ON(offset & (dd->qos_shift - 1));
+		qpn = mk_qpn(qpt, map, offset);
+	}
+
+	ret = -ENOMEM;
+
+bail:
+	return ret;
+}
+
+static void free_qpn(struct hfi1_qpn_table *qpt, u32 qpn)
+{
+	struct qpn_map *map;
+
+	map = qpt->map + qpn / BITS_PER_PAGE;
+	if (map->page)
+		clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+}
+
+/*
+ * Put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static void insert_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	unsigned long flags;
+
+	atomic_inc(&qp->refcount);
+	spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+
+	if (qp->ibqp.qp_num <= 1) {
+		rcu_assign_pointer(ibp->qp[qp->ibqp.qp_num], qp);
+	} else {
+		u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num);
+
+		qp->next = dev->qp_dev->qp_table[n];
+		rcu_assign_pointer(dev->qp_dev->qp_table[n], qp);
+		trace_hfi1_qpinsert(qp, n);
+	}
+
+	spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+}
+
+/*
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void remove_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num);
+	unsigned long flags;
+	int removed = 1;
+
+	spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+
+	if (rcu_dereference_protected(ibp->qp[0],
+			lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) {
+		RCU_INIT_POINTER(ibp->qp[0], NULL);
+	} else if (rcu_dereference_protected(ibp->qp[1],
+			lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) {
+		RCU_INIT_POINTER(ibp->qp[1], NULL);
+	} else {
+		struct hfi1_qp *q;
+		struct hfi1_qp __rcu **qpp;
+
+		removed = 0;
+		qpp = &dev->qp_dev->qp_table[n];
+		for (; (q = rcu_dereference_protected(*qpp,
+				lockdep_is_held(&dev->qp_dev->qpt_lock)))
+					!= NULL;
+				qpp = &q->next)
+			if (q == qp) {
+				RCU_INIT_POINTER(*qpp,
+				 rcu_dereference_protected(qp->next,
+				 lockdep_is_held(&dev->qp_dev->qpt_lock)));
+				removed = 1;
+				trace_hfi1_qpremove(qp, n);
+				break;
+			}
+	}
+
+	spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+	if (removed) {
+		synchronize_rcu();
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+}
+
+/**
+ * free_all_qps - check for QPs still in use
+ * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ */
+static unsigned free_all_qps(struct hfi1_devdata *dd)
+{
+	struct hfi1_ibdev *dev = &dd->verbs_dev;
+	unsigned long flags;
+	struct hfi1_qp *qp;
+	unsigned n, qp_inuse = 0;
+
+	for (n = 0; n < dd->num_pports; n++) {
+		struct hfi1_ibport *ibp = &dd->pport[n].ibport_data;
+
+		if (!hfi1_mcast_tree_empty(ibp))
+			qp_inuse++;
+		rcu_read_lock();
+		if (rcu_dereference(ibp->qp[0]))
+			qp_inuse++;
+		if (rcu_dereference(ibp->qp[1]))
+			qp_inuse++;
+		rcu_read_unlock();
+	}
+
+	if (!dev->qp_dev)
+		goto bail;
+	spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+	for (n = 0; n < dev->qp_dev->qp_table_size; n++) {
+		qp = rcu_dereference_protected(dev->qp_dev->qp_table[n],
+			lockdep_is_held(&dev->qp_dev->qpt_lock));
+		RCU_INIT_POINTER(dev->qp_dev->qp_table[n], NULL);
+
+		for (; qp; qp = rcu_dereference_protected(qp->next,
+				lockdep_is_held(&dev->qp_dev->qpt_lock)))
+			qp_inuse++;
+	}
+	spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+	synchronize_rcu();
+bail:
+	return qp_inuse;
+}
+
+/**
+ * reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void reset_qp(struct hfi1_qp *qp, enum ib_qp_type type)
+{
+	qp->remote_qpn = 0;
+	qp->qkey = 0;
+	qp->qp_access_flags = 0;
+	iowait_init(
+		&qp->s_iowait,
+		1,
+		hfi1_do_send,
+		iowait_sleep,
+		iowait_wakeup);
+	qp->s_flags &= HFI1_S_SIGNAL_REQ_WR;
+	qp->s_hdrwords = 0;
+	qp->s_wqe = NULL;
+	qp->s_draining = 0;
+	qp->s_next_psn = 0;
+	qp->s_last_psn = 0;
+	qp->s_sending_psn = 0;
+	qp->s_sending_hpsn = 0;
+	qp->s_psn = 0;
+	qp->r_psn = 0;
+	qp->r_msn = 0;
+	if (type == IB_QPT_RC) {
+		qp->s_state = IB_OPCODE_RC_SEND_LAST;
+		qp->r_state = IB_OPCODE_RC_SEND_LAST;
+	} else {
+		qp->s_state = IB_OPCODE_UC_SEND_LAST;
+		qp->r_state = IB_OPCODE_UC_SEND_LAST;
+	}
+	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+	qp->r_nak_state = 0;
+	qp->r_aflags = 0;
+	qp->r_flags = 0;
+	qp->s_head = 0;
+	qp->s_tail = 0;
+	qp->s_cur = 0;
+	qp->s_acked = 0;
+	qp->s_last = 0;
+	qp->s_ssn = 1;
+	qp->s_lsn = 0;
+	clear_ahg(qp);
+	qp->s_mig_state = IB_MIG_MIGRATED;
+	memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+	qp->r_head_ack_queue = 0;
+	qp->s_tail_ack_queue = 0;
+	qp->s_num_rd_atomic = 0;
+	if (qp->r_rq.wq) {
+		qp->r_rq.wq->head = 0;
+		qp->r_rq.wq->tail = 0;
+	}
+	qp->r_sge.num_sge = 0;
+}
+
+static void clear_mr_refs(struct hfi1_qp *qp, int clr_sends)
+{
+	unsigned n;
+
+	if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+		hfi1_put_ss(&qp->s_rdma_read_sge);
+
+	hfi1_put_ss(&qp->r_sge);
+
+	if (clr_sends) {
+		while (qp->s_last != qp->s_head) {
+			struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+			unsigned i;
+
+			for (i = 0; i < wqe->wr.num_sge; i++) {
+				struct hfi1_sge *sge = &wqe->sg_list[i];
+
+				hfi1_put_mr(sge->mr);
+			}
+			if (qp->ibqp.qp_type == IB_QPT_UD ||
+			    qp->ibqp.qp_type == IB_QPT_SMI ||
+			    qp->ibqp.qp_type == IB_QPT_GSI)
+				atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+			if (++qp->s_last >= qp->s_size)
+				qp->s_last = 0;
+		}
+		if (qp->s_rdma_mr) {
+			hfi1_put_mr(qp->s_rdma_mr);
+			qp->s_rdma_mr = NULL;
+		}
+	}
+
+	if (qp->ibqp.qp_type != IB_QPT_RC)
+		return;
+
+	for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+		struct hfi1_ack_entry *e = &qp->s_ack_queue[n];
+
+		if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
+		    e->rdma_sge.mr) {
+			hfi1_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+	}
+}
+
+/**
+ * hfi1_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err)
+{
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ib_wc wc;
+	int ret = 0;
+
+	if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
+		goto bail;
+
+	qp->state = IB_QPS_ERR;
+
+	if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+		qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+		del_timer(&qp->s_timer);
+	}
+
+	if (qp->s_flags & HFI1_S_ANY_WAIT_SEND)
+		qp->s_flags &= ~HFI1_S_ANY_WAIT_SEND;
+
+	write_seqlock(&dev->iowait_lock);
+	if (!list_empty(&qp->s_iowait.list) && !(qp->s_flags & HFI1_S_BUSY)) {
+		qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
+		list_del_init(&qp->s_iowait.list);
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+	write_sequnlock(&dev->iowait_lock);
+
+	if (!(qp->s_flags & HFI1_S_BUSY)) {
+		qp->s_hdrwords = 0;
+		if (qp->s_rdma_mr) {
+			hfi1_put_mr(qp->s_rdma_mr);
+			qp->s_rdma_mr = NULL;
+		}
+		flush_tx_list(qp);
+	}
+
+	/* Schedule the sending tasklet to drain the send work queue. */
+	if (qp->s_last != qp->s_head)
+		hfi1_schedule_send(qp);
+
+	clear_mr_refs(qp, 0);
+
+	memset(&wc, 0, sizeof(wc));
+	wc.qp = &qp->ibqp;
+	wc.opcode = IB_WC_RECV;
+
+	if (test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) {
+		wc.wr_id = qp->r_wr_id;
+		wc.status = err;
+		hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+	}
+	wc.status = IB_WC_WR_FLUSH_ERR;
+
+	if (qp->r_rq.wq) {
+		struct hfi1_rwq *wq;
+		u32 head;
+		u32 tail;
+
+		spin_lock(&qp->r_rq.lock);
+
+		/* sanity check pointers before trusting them */
+		wq = qp->r_rq.wq;
+		head = wq->head;
+		if (head >= qp->r_rq.size)
+			head = 0;
+		tail = wq->tail;
+		if (tail >= qp->r_rq.size)
+			tail = 0;
+		while (tail != head) {
+			wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+			if (++tail >= qp->r_rq.size)
+				tail = 0;
+			hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+		}
+		wq->tail = tail;
+
+		spin_unlock(&qp->r_rq.lock);
+	} else if (qp->ibqp.event_handler)
+		ret = 1;
+
+bail:
+	return ret;
+}
+
+static void flush_tx_list(struct hfi1_qp *qp)
+{
+	while (!list_empty(&qp->s_iowait.tx_head)) {
+		struct sdma_txreq *tx;
+
+		tx = list_first_entry(
+			&qp->s_iowait.tx_head,
+			struct sdma_txreq,
+			list);
+		list_del_init(&tx->list);
+		hfi1_put_txreq(
+			container_of(tx, struct verbs_txreq, txreq));
+	}
+}
+
+static void flush_iowait(struct hfi1_qp *qp)
+{
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	unsigned long flags;
+
+	write_seqlock_irqsave(&dev->iowait_lock, flags);
+	if (!list_empty(&qp->s_iowait.list)) {
+		list_del_init(&qp->s_iowait.list);
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+	write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+}
+
+static inline int opa_mtu_enum_to_int(int mtu)
+{
+	switch (mtu) {
+	case OPA_MTU_8192:  return 8192;
+	case OPA_MTU_10240: return 10240;
+	default:            return -1;
+	}
+}
+
+/**
+ * This function is what we would push to the core layer if we wanted to be a
+ * "first class citizen".  Instead we hide this here and rely on Verbs ULPs
+ * to blindly pass the MTU enum value from the PathRecord to us.
+ *
+ * The actual flag used to determine "8k MTU" will change and is currently
+ * unknown.
+ */
+static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
+{
+	int val = opa_mtu_enum_to_int((int)mtu);
+
+	if (val > 0)
+		return val;
+	return ib_mtu_enum_to_int(mtu);
+}
+
+
+/**
+ * hfi1_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		   int attr_mask, struct ib_udata *udata)
+{
+	struct hfi1_ibdev *dev = to_idev(ibqp->device);
+	struct hfi1_qp *qp = to_iqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	struct ib_event ev;
+	int lastwqe = 0;
+	int mig = 0;
+	int ret;
+	u32 pmtu = 0; /* for gcc warning only */
+	struct hfi1_devdata *dd;
+
+	spin_lock_irq(&qp->r_lock);
+	spin_lock(&qp->s_lock);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ?
+		attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+		goto inval;
+
+	if (attr_mask & IB_QP_AV) {
+		if (attr->ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
+			goto inval;
+		if (hfi1_check_ah(qp->ibqp.device, &attr->ah_attr))
+			goto inval;
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		if (attr->alt_ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
+			goto inval;
+		if (hfi1_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
+			goto inval;
+		if (attr->alt_pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
+			goto inval;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		if (attr->pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
+			goto inval;
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		if (attr->min_rnr_timer > 31)
+			goto inval;
+
+	if (attr_mask & IB_QP_PORT)
+		if (qp->ibqp.qp_type == IB_QPT_SMI ||
+		    qp->ibqp.qp_type == IB_QPT_GSI ||
+		    attr->port_num == 0 ||
+		    attr->port_num > ibqp->device->phys_port_cnt)
+			goto inval;
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		if (attr->dest_qp_num > HFI1_QPN_MASK)
+			goto inval;
+
+	if (attr_mask & IB_QP_RETRY_CNT)
+		if (attr->retry_cnt > 7)
+			goto inval;
+
+	if (attr_mask & IB_QP_RNR_RETRY)
+		if (attr->rnr_retry > 7)
+			goto inval;
+
+	/*
+	 * Don't allow invalid path_mtu values.  OK to set greater
+	 * than the active mtu (or even the max_cap, if we have tuned
+	 * that to a small mtu.  We'll set qp->path_mtu
+	 * to the lesser of requested attribute mtu and active,
+	 * for packetizing messages.
+	 * Note that the QP port has to be set in INIT and MTU in RTR.
+	 */
+	if (attr_mask & IB_QP_PATH_MTU) {
+		int mtu, pidx = qp->port_num - 1;
+
+		dd = dd_from_dev(dev);
+		mtu = verbs_mtu_enum_to_int(ibqp->device, attr->path_mtu);
+		if (mtu == -1)
+			goto inval;
+
+		if (mtu > dd->pport[pidx].ibmtu)
+			pmtu = mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048);
+		else
+			pmtu = attr->path_mtu;
+	}
+
+	if (attr_mask & IB_QP_PATH_MIG_STATE) {
+		if (attr->path_mig_state == IB_MIG_REARM) {
+			if (qp->s_mig_state == IB_MIG_ARMED)
+				goto inval;
+			if (new_state != IB_QPS_RTS)
+				goto inval;
+		} else if (attr->path_mig_state == IB_MIG_MIGRATED) {
+			if (qp->s_mig_state == IB_MIG_REARM)
+				goto inval;
+			if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
+				goto inval;
+			if (qp->s_mig_state == IB_MIG_ARMED)
+				mig = 1;
+		} else
+			goto inval;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		if (attr->max_dest_rd_atomic > HFI1_MAX_RDMA_ATOMIC)
+			goto inval;
+
+	switch (new_state) {
+	case IB_QPS_RESET:
+		if (qp->state != IB_QPS_RESET) {
+			qp->state = IB_QPS_RESET;
+			flush_iowait(qp);
+			qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT);
+			spin_unlock(&qp->s_lock);
+			spin_unlock_irq(&qp->r_lock);
+			/* Stop the sending work queue and retry timer */
+			cancel_work_sync(&qp->s_iowait.iowork);
+			del_timer_sync(&qp->s_timer);
+			iowait_sdma_drain(&qp->s_iowait);
+			flush_tx_list(qp);
+			remove_qp(dev, qp);
+			wait_event(qp->wait, !atomic_read(&qp->refcount));
+			spin_lock_irq(&qp->r_lock);
+			spin_lock(&qp->s_lock);
+			clear_mr_refs(qp, 1);
+			clear_ahg(qp);
+			reset_qp(qp, ibqp->qp_type);
+		}
+		break;
+
+	case IB_QPS_RTR:
+		/* Allow event to re-trigger if QP set to RTR more than once */
+		qp->r_flags &= ~HFI1_R_COMM_EST;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_SQD:
+		qp->s_draining = qp->s_last != qp->s_cur;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_SQE:
+		if (qp->ibqp.qp_type == IB_QPT_RC)
+			goto inval;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_ERR:
+		lastwqe = hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		break;
+
+	default:
+		qp->state = new_state;
+		break;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		qp->s_pkey_index = attr->pkey_index;
+
+	if (attr_mask & IB_QP_PORT)
+		qp->port_num = attr->port_num;
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		qp->remote_qpn = attr->dest_qp_num;
+
+	if (attr_mask & IB_QP_SQ_PSN) {
+		qp->s_next_psn = attr->sq_psn & PSN_MODIFY_MASK;
+		qp->s_psn = qp->s_next_psn;
+		qp->s_sending_psn = qp->s_next_psn;
+		qp->s_last_psn = qp->s_next_psn - 1;
+		qp->s_sending_hpsn = qp->s_last_psn;
+	}
+
+	if (attr_mask & IB_QP_RQ_PSN)
+		qp->r_psn = attr->rq_psn & PSN_MODIFY_MASK;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->qp_access_flags = attr->qp_access_flags;
+
+	if (attr_mask & IB_QP_AV) {
+		qp->remote_ah_attr = attr->ah_attr;
+		qp->s_srate = attr->ah_attr.static_rate;
+		qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		qp->alt_ah_attr = attr->alt_ah_attr;
+		qp->s_alt_pkey_index = attr->alt_pkey_index;
+	}
+
+	if (attr_mask & IB_QP_PATH_MIG_STATE) {
+		qp->s_mig_state = attr->path_mig_state;
+		if (mig) {
+			qp->remote_ah_attr = qp->alt_ah_attr;
+			qp->port_num = qp->alt_ah_attr.port_num;
+			qp->s_pkey_index = qp->s_alt_pkey_index;
+			qp->s_flags |= HFI1_S_AHG_CLEAR;
+		}
+	}
+
+	if (attr_mask & IB_QP_PATH_MTU) {
+		struct hfi1_ibport *ibp;
+		u8 sc, vl;
+		u32 mtu;
+
+		dd = dd_from_dev(dev);
+		ibp = &dd->pport[qp->port_num - 1].ibport_data;
+
+		sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+		vl = sc_to_vlt(dd, sc);
+
+		mtu = verbs_mtu_enum_to_int(ibqp->device, pmtu);
+		if (vl < PER_VL_SEND_CONTEXTS)
+			mtu = min_t(u32, mtu, dd->vld[vl].mtu);
+		pmtu = mtu_to_enum(mtu, OPA_MTU_8192);
+
+		qp->path_mtu = pmtu;
+		qp->pmtu = mtu;
+	}
+
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		qp->s_retry_cnt = attr->retry_cnt;
+		qp->s_retry = attr->retry_cnt;
+	}
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		qp->s_rnr_retry_cnt = attr->rnr_retry;
+		qp->s_rnr_retry = attr->rnr_retry;
+	}
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+	if (attr_mask & IB_QP_TIMEOUT) {
+		qp->timeout = attr->timeout;
+		qp->timeout_jiffies =
+			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+				1000UL);
+	}
+
+	if (attr_mask & IB_QP_QKEY)
+		qp->qkey = attr->qkey;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+		qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irq(&qp->r_lock);
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		insert_qp(dev, qp);
+
+	if (lastwqe) {
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+	if (mig) {
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_PATH_MIG;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+	ret = 0;
+	goto bail;
+
+inval:
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irq(&qp->r_lock);
+	ret = -EINVAL;
+
+bail:
+	return ret;
+}
+
+int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		  int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+	struct hfi1_qp *qp = to_iqp(ibqp);
+
+	attr->qp_state = qp->state;
+	attr->cur_qp_state = attr->qp_state;
+	attr->path_mtu = qp->path_mtu;
+	attr->path_mig_state = qp->s_mig_state;
+	attr->qkey = qp->qkey;
+	attr->rq_psn = mask_psn(qp->r_psn);
+	attr->sq_psn = mask_psn(qp->s_next_psn);
+	attr->dest_qp_num = qp->remote_qpn;
+	attr->qp_access_flags = qp->qp_access_flags;
+	attr->cap.max_send_wr = qp->s_size - 1;
+	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+	attr->cap.max_send_sge = qp->s_max_sge;
+	attr->cap.max_recv_sge = qp->r_rq.max_sge;
+	attr->cap.max_inline_data = 0;
+	attr->ah_attr = qp->remote_ah_attr;
+	attr->alt_ah_attr = qp->alt_ah_attr;
+	attr->pkey_index = qp->s_pkey_index;
+	attr->alt_pkey_index = qp->s_alt_pkey_index;
+	attr->en_sqd_async_notify = 0;
+	attr->sq_draining = qp->s_draining;
+	attr->max_rd_atomic = qp->s_max_rd_atomic;
+	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+	attr->min_rnr_timer = qp->r_min_rnr_timer;
+	attr->port_num = qp->port_num;
+	attr->timeout = qp->timeout;
+	attr->retry_cnt = qp->s_retry_cnt;
+	attr->rnr_retry = qp->s_rnr_retry_cnt;
+	attr->alt_port_num = qp->alt_ah_attr.port_num;
+	attr->alt_timeout = qp->alt_timeout;
+
+	init_attr->event_handler = qp->ibqp.event_handler;
+	init_attr->qp_context = qp->ibqp.qp_context;
+	init_attr->send_cq = qp->ibqp.send_cq;
+	init_attr->recv_cq = qp->ibqp.recv_cq;
+	init_attr->srq = qp->ibqp.srq;
+	init_attr->cap = attr->cap;
+	if (qp->s_flags & HFI1_S_SIGNAL_REQ_WR)
+		init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	else
+		init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr->qp_type = qp->ibqp.qp_type;
+	init_attr->port_num = qp->port_num;
+	return 0;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct hfi1_qp *qp)
+{
+	u32 aeth = qp->r_msn & HFI1_MSN_MASK;
+
+	if (qp->ibqp.srq) {
+		/*
+		 * Shared receive queues don't generate credits.
+		 * Set the credit field to the invalid value.
+		 */
+		aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
+	} else {
+		u32 min, max, x;
+		u32 credits;
+		struct hfi1_rwq *wq = qp->r_rq.wq;
+		u32 head;
+		u32 tail;
+
+		/* sanity check pointers before trusting them */
+		head = wq->head;
+		if (head >= qp->r_rq.size)
+			head = 0;
+		tail = wq->tail;
+		if (tail >= qp->r_rq.size)
+			tail = 0;
+		/*
+		 * Compute the number of credits available (RWQEs).
+		 * There is a small chance that the pair of reads are
+		 * not atomic, which is OK, since the fuzziness is
+		 * resolved as further ACKs go out.
+		 */
+		credits = head - tail;
+		if ((int)credits < 0)
+			credits += qp->r_rq.size;
+		/*
+		 * Binary search the credit table to find the code to
+		 * use.
+		 */
+		min = 0;
+		max = 31;
+		for (;;) {
+			x = (min + max) / 2;
+			if (credit_table[x] == credits)
+				break;
+			if (credit_table[x] > credits)
+				max = x;
+			else if (min == x)
+				break;
+			else
+				min = x;
+		}
+		aeth |= x << HFI1_AETH_CREDIT_SHIFT;
+	}
+	return cpu_to_be32(aeth);
+}
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+			     struct ib_qp_init_attr *init_attr,
+			     struct ib_udata *udata)
+{
+	struct hfi1_qp *qp;
+	int err;
+	struct hfi1_swqe *swq = NULL;
+	struct hfi1_ibdev *dev;
+	struct hfi1_devdata *dd;
+	size_t sz;
+	size_t sg_list_sz;
+	struct ib_qp *ret;
+
+	if (init_attr->cap.max_send_sge > hfi1_max_sges ||
+	    init_attr->cap.max_send_wr > hfi1_max_qp_wrs ||
+	    init_attr->create_flags) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	/* Check receive queue parameters if no SRQ is specified. */
+	if (!init_attr->srq) {
+		if (init_attr->cap.max_recv_sge > hfi1_max_sges ||
+		    init_attr->cap.max_recv_wr > hfi1_max_qp_wrs) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+		if (init_attr->cap.max_send_sge +
+		    init_attr->cap.max_send_wr +
+		    init_attr->cap.max_recv_sge +
+		    init_attr->cap.max_recv_wr == 0) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+	}
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		if (init_attr->port_num == 0 ||
+		    init_attr->port_num > ibpd->device->phys_port_cnt) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+	case IB_QPT_UC:
+	case IB_QPT_RC:
+	case IB_QPT_UD:
+		sz = sizeof(struct hfi1_sge) *
+			init_attr->cap.max_send_sge +
+			sizeof(struct hfi1_swqe);
+		swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+		if (swq == NULL) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail;
+		}
+		sz = sizeof(*qp);
+		sg_list_sz = 0;
+		if (init_attr->srq) {
+			struct hfi1_srq *srq = to_isrq(init_attr->srq);
+
+			if (srq->rq.max_sge > 1)
+				sg_list_sz = sizeof(*qp->r_sg_list) *
+					(srq->rq.max_sge - 1);
+		} else if (init_attr->cap.max_recv_sge > 1)
+			sg_list_sz = sizeof(*qp->r_sg_list) *
+				(init_attr->cap.max_recv_sge - 1);
+		qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
+		if (!qp) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_swq;
+		}
+		RCU_INIT_POINTER(qp->next, NULL);
+		qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL);
+		if (!qp->s_hdr) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_qp;
+		}
+		qp->timeout_jiffies =
+			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+				1000UL);
+		if (init_attr->srq)
+			sz = 0;
+		else {
+			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+				sizeof(struct hfi1_rwqe);
+			qp->r_rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) +
+						   qp->r_rq.size * sz);
+			if (!qp->r_rq.wq) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_qp;
+			}
+		}
+
+		/*
+		 * ib_create_qp() will initialize qp->ibqp
+		 * except for qp->ibqp.qp_num.
+		 */
+		spin_lock_init(&qp->r_lock);
+		spin_lock_init(&qp->s_lock);
+		spin_lock_init(&qp->r_rq.lock);
+		atomic_set(&qp->refcount, 0);
+		init_waitqueue_head(&qp->wait);
+		init_timer(&qp->s_timer);
+		qp->s_timer.data = (unsigned long)qp;
+		INIT_LIST_HEAD(&qp->rspwait);
+		qp->state = IB_QPS_RESET;
+		qp->s_wq = swq;
+		qp->s_size = init_attr->cap.max_send_wr + 1;
+		qp->s_max_sge = init_attr->cap.max_send_sge;
+		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+			qp->s_flags = HFI1_S_SIGNAL_REQ_WR;
+		dev = to_idev(ibpd->device);
+		dd = dd_from_dev(dev);
+		err = alloc_qpn(dd, &dev->qp_dev->qpn_table, init_attr->qp_type,
+				init_attr->port_num);
+		if (err < 0) {
+			ret = ERR_PTR(err);
+			vfree(qp->r_rq.wq);
+			goto bail_qp;
+		}
+		qp->ibqp.qp_num = err;
+		qp->port_num = init_attr->port_num;
+		reset_qp(qp, init_attr->qp_type);
+
+		break;
+
+	default:
+		/* Don't support raw QPs */
+		ret = ERR_PTR(-ENOSYS);
+		goto bail;
+	}
+
+	init_attr->cap.max_inline_data = 0;
+
+	/*
+	 * Return the address of the RWQ as the offset to mmap.
+	 * See hfi1_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		if (!qp->r_rq.wq) {
+			__u64 offset = 0;
+
+			err = ib_copy_to_udata(udata, &offset,
+					       sizeof(offset));
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_ip;
+			}
+		} else {
+			u32 s = sizeof(struct hfi1_rwq) + qp->r_rq.size * sz;
+
+			qp->ip = hfi1_create_mmap_info(dev, s,
+						      ibpd->uobject->context,
+						      qp->r_rq.wq);
+			if (!qp->ip) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_ip;
+			}
+
+			err = ib_copy_to_udata(udata, &(qp->ip->offset),
+					       sizeof(qp->ip->offset));
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_ip;
+			}
+		}
+	}
+
+	spin_lock(&dev->n_qps_lock);
+	if (dev->n_qps_allocated == hfi1_max_qps) {
+		spin_unlock(&dev->n_qps_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_qps_allocated++;
+	spin_unlock(&dev->n_qps_lock);
+
+	if (qp->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = &qp->ibqp;
+
+	/*
+	 * We have our QP and its good, now keep track of what types of opcodes
+	 * can be processed on this QP. We do this by keeping track of what the
+	 * 3 high order bits of the opcode are.
+	 */
+	switch (init_attr->qp_type) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	case IB_QPT_UD:
+		qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & OPCODE_QP_MASK;
+		break;
+	case IB_QPT_RC:
+		qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & OPCODE_QP_MASK;
+		break;
+	case IB_QPT_UC:
+		qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & OPCODE_QP_MASK;
+		break;
+	default:
+		ret = ERR_PTR(-EINVAL);
+		goto bail_ip;
+	}
+
+	goto bail;
+
+bail_ip:
+	if (qp->ip)
+		kref_put(&qp->ip->ref, hfi1_release_mmap_info);
+	else
+		vfree(qp->r_rq.wq);
+	free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num);
+bail_qp:
+	kfree(qp->s_hdr);
+	kfree(qp);
+bail_swq:
+	vfree(swq);
+bail:
+	return ret;
+}
+
+/**
+ * hfi1_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int hfi1_destroy_qp(struct ib_qp *ibqp)
+{
+	struct hfi1_qp *qp = to_iqp(ibqp);
+	struct hfi1_ibdev *dev = to_idev(ibqp->device);
+
+	/* Make sure HW and driver activity is stopped. */
+	spin_lock_irq(&qp->r_lock);
+	spin_lock(&qp->s_lock);
+	if (qp->state != IB_QPS_RESET) {
+		qp->state = IB_QPS_RESET;
+		flush_iowait(qp);
+		qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT);
+		spin_unlock(&qp->s_lock);
+		spin_unlock_irq(&qp->r_lock);
+		cancel_work_sync(&qp->s_iowait.iowork);
+		del_timer_sync(&qp->s_timer);
+		iowait_sdma_drain(&qp->s_iowait);
+		flush_tx_list(qp);
+		remove_qp(dev, qp);
+		wait_event(qp->wait, !atomic_read(&qp->refcount));
+		spin_lock_irq(&qp->r_lock);
+		spin_lock(&qp->s_lock);
+		clear_mr_refs(qp, 1);
+		clear_ahg(qp);
+	}
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irq(&qp->r_lock);
+
+	/* all user's cleaned up, mark it available */
+	free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num);
+	spin_lock(&dev->n_qps_lock);
+	dev->n_qps_allocated--;
+	spin_unlock(&dev->n_qps_lock);
+
+	if (qp->ip)
+		kref_put(&qp->ip->ref, hfi1_release_mmap_info);
+	else
+		vfree(qp->r_rq.wq);
+	vfree(qp->s_wq);
+	kfree(qp->s_hdr);
+	kfree(qp);
+	return 0;
+}
+
+/**
+ * init_qpn_table - initialize the QP number table for a device
+ * @qpt: the QPN table
+ */
+static int init_qpn_table(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt)
+{
+	u32 offset, qpn, i;
+	struct qpn_map *map;
+	int ret = 0;
+
+	spin_lock_init(&qpt->lock);
+
+	qpt->last = 0;
+	qpt->incr = 1 << dd->qos_shift;
+
+	/* insure we don't assign QPs from KDETH 64K window */
+	qpn = kdeth_qp << 16;
+	qpt->nmaps = qpn / BITS_PER_PAGE;
+	/* This should always be zero */
+	offset = qpn & BITS_PER_PAGE_MASK;
+	map = &qpt->map[qpt->nmaps];
+	dd_dev_info(dd, "Reserving QPNs for KDETH window from 0x%x to 0x%x\n",
+		qpn, qpn + 65535);
+	for (i = 0; i < 65536; i++) {
+		if (!map->page) {
+			get_map_page(qpt, map);
+			if (!map->page) {
+				ret = -ENOMEM;
+				break;
+			}
+		}
+		set_bit(offset, map->page);
+		offset++;
+		if (offset == BITS_PER_PAGE) {
+			/* next page */
+			qpt->nmaps++;
+			map++;
+			offset = 0;
+		}
+	}
+	return ret;
+}
+
+/**
+ * free_qpn_table - free the QP number table for a device
+ * @qpt: the QPN table
+ */
+static void free_qpn_table(struct hfi1_qpn_table *qpt)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
+		free_page((unsigned long) qpt->map[i].page);
+}
+
+/**
+ * hfi1_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth)
+{
+	u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
+
+	/*
+	 * If the credit is invalid, we can send
+	 * as many packets as we like.  Otherwise, we have to
+	 * honor the credit field.
+	 */
+	if (credit == HFI1_AETH_CREDIT_INVAL) {
+		if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) {
+			qp->s_flags |= HFI1_S_UNLIMITED_CREDIT;
+			if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) {
+				qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT;
+				hfi1_schedule_send(qp);
+			}
+		}
+	} else if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) {
+		/* Compute new LSN (i.e., MSN + credit) */
+		credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
+		if (cmp_msn(credit, qp->s_lsn) > 0) {
+			qp->s_lsn = credit;
+			if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) {
+				qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT;
+				hfi1_schedule_send(qp);
+			}
+		}
+	}
+}
+
+void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (qp->s_flags & flag) {
+		qp->s_flags &= ~flag;
+		trace_hfi1_qpwakeup(qp, flag);
+		hfi1_schedule_send(qp);
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	/* Notify hfi1_destroy_qp() if it is waiting. */
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+static int iowait_sleep(
+	struct sdma_engine *sde,
+	struct iowait *wait,
+	struct sdma_txreq *stx,
+	unsigned seq)
+{
+	struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
+	struct hfi1_qp *qp;
+	unsigned long flags;
+	int ret = 0;
+	struct hfi1_ibdev *dev;
+
+	qp = tx->qp;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+
+		/*
+		 * If we couldn't queue the DMA request, save the info
+		 * and try again later rather than destroying the
+		 * buffer and undoing the side effects of the copy.
+		 */
+		/* Make a common routine? */
+		dev = &sde->dd->verbs_dev;
+		list_add_tail(&stx->list, &wait->tx_head);
+		write_seqlock(&dev->iowait_lock);
+		if (sdma_progress(sde, seq, stx))
+			goto eagain;
+		if (list_empty(&qp->s_iowait.list)) {
+			struct hfi1_ibport *ibp =
+				to_iport(qp->ibqp.device, qp->port_num);
+
+			ibp->n_dmawait++;
+			qp->s_flags |= HFI1_S_WAIT_DMA_DESC;
+			list_add_tail(&qp->s_iowait.list, &sde->dmawait);
+			trace_hfi1_qpsleep(qp, HFI1_S_WAIT_DMA_DESC);
+			atomic_inc(&qp->refcount);
+		}
+		write_sequnlock(&dev->iowait_lock);
+		qp->s_flags &= ~HFI1_S_BUSY;
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		ret = -EBUSY;
+	} else {
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		hfi1_put_txreq(tx);
+	}
+	return ret;
+eagain:
+	write_sequnlock(&dev->iowait_lock);
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	list_del_init(&stx->list);
+	return -EAGAIN;
+}
+
+static void iowait_wakeup(struct iowait *wait, int reason)
+{
+	struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait);
+
+	WARN_ON(reason != SDMA_AVAIL_REASON);
+	hfi1_qp_wakeup(qp, HFI1_S_WAIT_DMA_DESC);
+}
+
+int hfi1_qp_init(struct hfi1_ibdev *dev)
+{
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+	int i;
+	int ret = -ENOMEM;
+
+	/* allocate parent object */
+	dev->qp_dev = kzalloc(sizeof(*dev->qp_dev), GFP_KERNEL);
+	if (!dev->qp_dev)
+		goto nomem;
+	/* allocate hash table */
+	dev->qp_dev->qp_table_size = hfi1_qp_table_size;
+	dev->qp_dev->qp_table_bits = ilog2(hfi1_qp_table_size);
+	dev->qp_dev->qp_table =
+		kmalloc(dev->qp_dev->qp_table_size *
+				sizeof(*dev->qp_dev->qp_table),
+			GFP_KERNEL);
+	if (!dev->qp_dev->qp_table)
+		goto nomem;
+	for (i = 0; i < dev->qp_dev->qp_table_size; i++)
+		RCU_INIT_POINTER(dev->qp_dev->qp_table[i], NULL);
+	spin_lock_init(&dev->qp_dev->qpt_lock);
+	/* initialize qpn map */
+	ret = init_qpn_table(dd, &dev->qp_dev->qpn_table);
+	if (ret)
+		goto nomem;
+	return ret;
+nomem:
+	if (dev->qp_dev) {
+		kfree(dev->qp_dev->qp_table);
+		free_qpn_table(&dev->qp_dev->qpn_table);
+		kfree(dev->qp_dev);
+	}
+	return ret;
+}
+
+void hfi1_qp_exit(struct hfi1_ibdev *dev)
+{
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+	u32 qps_inuse;
+
+	qps_inuse = free_all_qps(dd);
+	if (qps_inuse)
+		dd_dev_err(dd, "QP memory leak! %u still in use\n",
+			   qps_inuse);
+	if (dev->qp_dev) {
+		kfree(dev->qp_dev->qp_table);
+		free_qpn_table(&dev->qp_dev->qpn_table);
+		kfree(dev->qp_dev);
+	}
+}
+
+/**
+ *
+ * qp_to_sdma_engine - map a qp to a send engine
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * Return:
+ * A send engine for the qp or NULL for SMI type qp.
+ */
+struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	struct sdma_engine *sde;
+
+	if (!(dd->flags & HFI1_HAS_SEND_DMA))
+		return NULL;
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_UC:
+	case IB_QPT_RC:
+		break;
+	case IB_QPT_SMI:
+		return NULL;
+	default:
+		break;
+	}
+	sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5);
+	return sde;
+}
+
+struct qp_iter {
+	struct hfi1_ibdev *dev;
+	struct hfi1_qp *qp;
+	int specials;
+	int n;
+};
+
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
+{
+	struct qp_iter *iter;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	iter->dev = dev;
+	iter->specials = dev->ibdev.phys_port_cnt * 2;
+	if (qp_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+int qp_iter_next(struct qp_iter *iter)
+{
+	struct hfi1_ibdev *dev = iter->dev;
+	int n = iter->n;
+	int ret = 1;
+	struct hfi1_qp *pqp = iter->qp;
+	struct hfi1_qp *qp;
+
+	/*
+	 * The approach is to consider the special qps
+	 * as an additional table entries before the
+	 * real hash table.  Since the qp code sets
+	 * the qp->next hash link to NULL, this works just fine.
+	 *
+	 * iter->specials is 2 * # ports
+	 *
+	 * n = 0..iter->specials is the special qp indices
+	 *
+	 * n = iter->specials..dev->qp_dev->qp_table_size+iter->specials are
+	 * the potential hash bucket entries
+	 *
+	 */
+	for (; n <  dev->qp_dev->qp_table_size + iter->specials; n++) {
+		if (pqp) {
+			qp = rcu_dereference(pqp->next);
+		} else {
+			if (n < iter->specials) {
+				struct hfi1_pportdata *ppd;
+				struct hfi1_ibport *ibp;
+				int pidx;
+
+				pidx = n % dev->ibdev.phys_port_cnt;
+				ppd = &dd_from_dev(dev)->pport[pidx];
+				ibp = &ppd->ibport_data;
+
+				if (!(n & 1))
+					qp = rcu_dereference(ibp->qp[0]);
+				else
+					qp = rcu_dereference(ibp->qp[1]);
+			} else {
+				qp = rcu_dereference(
+					dev->qp_dev->qp_table[
+						(n - iter->specials)]);
+			}
+		}
+		pqp = qp;
+		if (qp) {
+			iter->qp = qp;
+			iter->n = n;
+			return 0;
+		}
+	}
+	return ret;
+}
+
+static const char * const qp_type_str[] = {
+	"SMI", "GSI", "RC", "UC", "UD",
+};
+
+static int qp_idle(struct hfi1_qp *qp)
+{
+	return
+		qp->s_last == qp->s_acked &&
+		qp->s_acked == qp->s_cur &&
+		qp->s_cur == qp->s_tail &&
+		qp->s_tail == qp->s_head;
+}
+
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
+{
+	struct hfi1_swqe *wqe;
+	struct hfi1_qp *qp = iter->qp;
+	struct sdma_engine *sde;
+
+	sde = qp_to_sdma_engine(qp, qp->s_sc);
+	wqe = get_swqe_ptr(qp, qp->s_last);
+	seq_printf(s,
+		   "N %d %s QP%u R %u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x SL %u MTU %d %u %u %u SDE %p,%u\n",
+		   iter->n,
+		   qp_idle(qp) ? "I" : "B",
+		   qp->ibqp.qp_num,
+		   atomic_read(&qp->refcount),
+		   qp_type_str[qp->ibqp.qp_type],
+		   qp->state,
+		   wqe ? wqe->wr.opcode : 0,
+		   qp->s_hdrwords,
+		   qp->s_flags,
+		   atomic_read(&qp->s_iowait.sdma_busy),
+		   !list_empty(&qp->s_iowait.list),
+		   qp->timeout,
+		   wqe ? wqe->ssn : 0,
+		   qp->s_lsn,
+		   qp->s_last_psn,
+		   qp->s_psn, qp->s_next_psn,
+		   qp->s_sending_psn, qp->s_sending_hpsn,
+		   qp->s_last, qp->s_acked, qp->s_cur,
+		   qp->s_tail, qp->s_head, qp->s_size,
+		   qp->remote_qpn,
+		   qp->remote_ah_attr.dlid,
+		   qp->remote_ah_attr.sl,
+		   qp->pmtu,
+		   qp->s_retry_cnt,
+		   qp->timeout,
+		   qp->s_rnr_retry_cnt,
+		   sde,
+		   sde ? sde->this_idx : 0);
+}
+
+void qp_comm_est(struct hfi1_qp *qp)
+{
+	qp->r_flags |= HFI1_R_COMM_EST;
+	if (qp->ibqp.event_handler) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_COMM_EST;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+}
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
new file mode 100644
index 0000000..6b50585
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qp.h
@@ -0,0 +1,235 @@
+#ifndef _QP_H
+#define _QP_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/hash.h>
+#include "verbs.h"
+
+#define QPN_MAX                 (1 << 24)
+#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+	void *page;
+};
+
+struct hfi1_qpn_table {
+	spinlock_t lock; /* protect changes in this struct */
+	unsigned flags;         /* flags for QP0/1 allocated for each port */
+	u32 last;               /* last QP number allocated */
+	u32 nmaps;              /* size of the map table */
+	u16 limit;
+	u8  incr;
+	/* bit map of free QP numbers other than 0/1 */
+	struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct hfi1_qp_ibdev {
+	u32 qp_table_size;
+	u32 qp_table_bits;
+	struct hfi1_qp __rcu **qp_table;
+	spinlock_t qpt_lock;
+	struct hfi1_qpn_table qpn_table;
+};
+
+static inline u32 qpn_hash(struct hfi1_qp_ibdev *dev, u32 qpn)
+{
+	return hash_32(qpn, dev->qp_table_bits);
+}
+
+/**
+ * hfi1_lookup_qpn - return the QP with the given QPN
+ * @ibp: the ibport
+ * @qpn: the QP number to look up
+ *
+ * The caller must hold the rcu_read_lock(), and keep the lock until
+ * the returned qp is no longer in use.
+ */
+static inline struct hfi1_qp *hfi1_lookup_qpn(struct hfi1_ibport *ibp,
+				u32 qpn) __must_hold(RCU)
+{
+	struct hfi1_qp *qp = NULL;
+
+	if (unlikely(qpn <= 1)) {
+		qp = rcu_dereference(ibp->qp[qpn]);
+	} else {
+		struct hfi1_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev;
+		u32 n = qpn_hash(dev->qp_dev, qpn);
+
+		for (qp = rcu_dereference(dev->qp_dev->qp_table[n]); qp;
+			qp = rcu_dereference(qp->next))
+			if (qp->ibqp.qp_num == qpn)
+				break;
+	}
+	return qp;
+}
+
+/**
+ * hfi1_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err);
+
+/**
+ * hfi1_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		   int attr_mask, struct ib_udata *udata);
+
+int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		  int attr_mask, struct ib_qp_init_attr *init_attr);
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct hfi1_qp *qp);
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+			     struct ib_qp_init_attr *init_attr,
+			     struct ib_udata *udata);
+/**
+ * hfi1_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int hfi1_destroy_qp(struct ib_qp *ibqp);
+
+/**
+ * hfi1_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth);
+
+/**
+ * hfi1_qp_init - allocate QP tables
+ * @dev: a pointer to the hfi1_ibdev
+ */
+int hfi1_qp_init(struct hfi1_ibdev *dev);
+
+/**
+ * hfi1_qp_exit - free the QP related structures
+ * @dev: a pointer to the hfi1_ibdev
+ */
+void hfi1_qp_exit(struct hfi1_ibdev *dev);
+
+/**
+ * hfi1_qp_waitup - wake up on the indicated event
+ * @qp: the QP
+ * @flag: flag the qp on which the qp is stalled
+ */
+void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag);
+
+struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5);
+
+struct qp_iter;
+
+/**
+ * qp_iter_init - wake up on the indicated event
+ * @dev: the hfi1_ibdev
+ */
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
+
+/**
+ * qp_iter_next - wakeup on the indicated event
+ * @iter: the iterator for the qp hash list
+ */
+int qp_iter_next(struct qp_iter *iter);
+
+/**
+ * qp_iter_next - wake up on the indicated event
+ * @s: the seq_file to emit the qp information on
+ * @iter: the iterator for the qp hash list
+ */
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
+
+/**
+ * qp_comm_est - handle trap with QP established
+ * @qp: the QP
+ */
+void qp_comm_est(struct hfi1_qp *qp);
+
+#endif /* _QP_H */
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
new file mode 100644
index 0000000..3138936
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qsfp.c
@@ -0,0 +1,546 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
+ * in twsi.c
+ */
+#define I2C_MAX_RETRY 4
+
+/*
+ * Unlocked i2c write.  Must hold dd->qsfp_i2c_mutex.
+ */
+static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+		       int offset, void *bp, int len)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	int ret, cnt;
+	u8 *buff = bp;
+
+	/* Make sure TWSI bus is in sane state. */
+	ret = hfi1_twsi_reset(dd, target);
+	if (ret) {
+		hfi1_dev_porterr(dd, ppd->port,
+				 "I2C interface Reset for write failed\n");
+		return -EIO;
+	}
+
+	cnt = 0;
+	while (cnt < len) {
+		int wlen = len - cnt;
+
+		ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
+				       buff + cnt, wlen);
+		if (ret) {
+			/* hfi1_twsi_blk_wr() 1 for error, else 0 */
+			return -EIO;
+		}
+		offset += wlen;
+		cnt += wlen;
+	}
+
+	/* Must wait min 20us between qsfp i2c transactions */
+	udelay(20);
+
+	return cnt;
+}
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+	      void *bp, int len)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	int ret;
+
+	ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex);
+	if (!ret) {
+		ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+		mutex_unlock(&dd->qsfp_i2c_mutex);
+	}
+
+	return ret;
+}
+
+/*
+ * Unlocked i2c read.  Must hold dd->qsfp_i2c_mutex.
+ */
+static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+		      int offset, void *bp, int len)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	int ret, cnt, pass = 0;
+	int stuck = 0;
+	u8 *buff = bp;
+
+	/* Make sure TWSI bus is in sane state. */
+	ret = hfi1_twsi_reset(dd, target);
+	if (ret) {
+		hfi1_dev_porterr(dd, ppd->port,
+				 "I2C interface Reset for read failed\n");
+		ret = -EIO;
+		stuck = 1;
+		goto exit;
+	}
+
+	cnt = 0;
+	while (cnt < len) {
+		int rlen = len - cnt;
+
+		ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
+				       buff + cnt, rlen);
+		/* Some QSFP's fail first try. Retry as experiment */
+		if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
+			continue;
+		if (ret) {
+			/* hfi1_twsi_blk_rd() 1 for error, else 0 */
+			ret = -EIO;
+			goto exit;
+		}
+		offset += rlen;
+		cnt += rlen;
+	}
+
+	ret = cnt;
+
+exit:
+	if (stuck)
+		dd_dev_err(dd, "I2C interface bus stuck non-idle\n");
+
+	if (pass >= I2C_MAX_RETRY && ret)
+		hfi1_dev_porterr(dd, ppd->port,
+				 "I2C failed even retrying\n");
+	else if (pass)
+		hfi1_dev_porterr(dd, ppd->port, "I2C retries: %d\n", pass);
+
+	/* Must wait min 20us between qsfp i2c transactions */
+	udelay(20);
+
+	return ret;
+}
+
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+	     void *bp, int len)
+{
+	struct hfi1_devdata *dd = ppd->dd;
+	int ret;
+
+	ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex);
+	if (!ret) {
+		ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+		mutex_unlock(&dd->qsfp_i2c_mutex);
+	}
+
+	return ret;
+}
+
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+	       int len)
+{
+	int count = 0;
+	int offset;
+	int nwrite;
+	int ret;
+	u8 page;
+
+	ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex);
+	if (ret)
+		return ret;
+
+	while (count < len) {
+		/*
+		 * Set the qsfp page based on a zero-based addresss
+		 * and a page size of QSFP_PAGESIZE bytes.
+		 */
+		page = (u8)(addr / QSFP_PAGESIZE);
+
+		ret = __i2c_write(ppd, target, QSFP_DEV,
+					QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+		if (ret != 1) {
+			hfi1_dev_porterr(
+			ppd->dd,
+			ppd->port,
+			"can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret);
+			ret = -EIO;
+			break;
+		}
+
+		/* truncate write to end of page if crossing page boundary */
+		offset = addr % QSFP_PAGESIZE;
+		nwrite = len - count;
+		if ((offset + nwrite) > QSFP_PAGESIZE)
+			nwrite = QSFP_PAGESIZE - offset;
+
+		ret = __i2c_write(ppd, target, QSFP_DEV, offset, bp + count,
+					nwrite);
+		if (ret <= 0)	/* stop on error or nothing read */
+			break;
+
+		count += ret;
+		addr += ret;
+	}
+
+	mutex_unlock(&ppd->dd->qsfp_i2c_mutex);
+
+	if (ret < 0)
+		return ret;
+	return count;
+}
+
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+	      int len)
+{
+	int count = 0;
+	int offset;
+	int nread;
+	int ret;
+	u8 page;
+
+	ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex);
+	if (ret)
+		return ret;
+
+	while (count < len) {
+		/*
+		 * Set the qsfp page based on a zero-based address
+		 * and a page size of QSFP_PAGESIZE bytes.
+		 */
+		page = (u8)(addr / QSFP_PAGESIZE);
+		ret = __i2c_write(ppd, target, QSFP_DEV,
+					QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+		if (ret != 1) {
+			hfi1_dev_porterr(
+			ppd->dd,
+			ppd->port,
+			"can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret);
+			ret = -EIO;
+			break;
+		}
+
+		/* truncate read to end of page if crossing page boundary */
+		offset = addr % QSFP_PAGESIZE;
+		nread = len - count;
+		if ((offset + nread) > QSFP_PAGESIZE)
+			nread = QSFP_PAGESIZE - offset;
+
+		ret = __i2c_read(ppd, target, QSFP_DEV, offset, bp + count,
+					nread);
+		if (ret <= 0)	/* stop on error or nothing read */
+			break;
+
+		count += ret;
+		addr += ret;
+	}
+
+	mutex_unlock(&ppd->dd->qsfp_i2c_mutex);
+
+	if (ret < 0)
+		return ret;
+	return count;
+}
+
+/*
+ * This function caches the QSFP memory range in 128 byte chunks.
+ * As an example, the next byte after address 255 is byte 128 from
+ * upper page 01H (if existing) rather than byte 0 from lower page 00H.
+ */
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
+{
+	u32 target = ppd->dd->hfi1_id;
+	int ret;
+	unsigned long flags;
+	u8 *cache = &cp->cache[0];
+
+	/* ensure sane contents on invalid reads, for cable swaps */
+	memset(cache, 0, (QSFP_MAX_NUM_PAGES*128));
+	dd_dev_info(ppd->dd, "%s: called\n", __func__);
+	if (!qsfp_mod_present(ppd)) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	ret = qsfp_read(ppd, target, 0, cache, 256);
+	if (ret != 256) {
+		dd_dev_info(ppd->dd,
+			"%s: Read of pages 00H failed, expected 256, got %d\n",
+			__func__, ret);
+		goto bail;
+	}
+
+	if (cache[0] != 0x0C && cache[0] != 0x0D)
+		goto bail;
+
+	/* Is paging enabled? */
+	if (!(cache[2] & 4)) {
+
+		/* Paging enabled, page 03 required */
+		if ((cache[195] & 0xC0) == 0xC0) {
+			/* all */
+			ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+			if (ret <= 0 || ret != 128) {
+				dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+				goto bail;
+			}
+			ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+			if (ret <= 0 || ret != 128) {
+				dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+				goto bail;
+			}
+			ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+			if (ret <= 0 || ret != 128) {
+				dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+				goto bail;
+			}
+		} else if ((cache[195] & 0x80) == 0x80) {
+			/* only page 2 and 3 */
+			ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+			if (ret <= 0 || ret != 128) {
+				dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+				goto bail;
+			}
+			ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+			if (ret <= 0 || ret != 128) {
+				dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+				goto bail;
+			}
+		} else if ((cache[195] & 0x40) == 0x40) {
+			/* only page 1 and 3 */
+			ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+			if (ret <= 0 || ret != 128) {
+				dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+				goto bail;
+			}
+			ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+			if (ret <= 0 || ret != 128) {
+				dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+				goto bail;
+			}
+		} else {
+			/* only page 3 */
+			ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+			if (ret <= 0 || ret != 128) {
+				dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+				goto bail;
+			}
+		}
+	}
+
+	spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+	ppd->qsfp_info.cache_valid = 1;
+	ppd->qsfp_info.cache_refresh_required = 0;
+	spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+	return 0;
+
+bail:
+	memset(cache, 0, (QSFP_MAX_NUM_PAGES*128));
+	return ret;
+}
+
+const char * const hfi1_qsfp_devtech[16] = {
+	"850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
+	"1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
+	"Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
+	"Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
+};
+
+#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
+#define QSFP_DEFAULT_HDR_CNT 224
+
+static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
+
+int qsfp_mod_present(struct hfi1_pportdata *ppd)
+{
+	if (HFI1_CAP_IS_KSET(QSFP_ENABLED)) {
+		struct hfi1_devdata *dd = ppd->dd;
+		u64 reg;
+
+		reg = read_csr(dd,
+			dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+		return !(reg & QSFP_HFI0_MODPRST_N);
+	}
+	/* always return cable present */
+	return 1;
+}
+
+/*
+ * This function maps QSFP memory addresses in 128 byte chunks in the following
+ * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
+ * spec
+ * For addr 000-127, lower page 00h
+ * For addr 128-255, upper page 00h
+ * For addr 256-383, upper page 01h
+ * For addr 384-511, upper page 02h
+ * For addr 512-639, upper page 03h
+ *
+ * For addresses beyond this range, it returns the invalid range of data buffer
+ * set to 0.
+ * For upper pages that are optional, if they are not valid, returns the
+ * particular range of bytes in the data buffer set to 0.
+ */
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
+		   u8 *data)
+{
+	struct hfi1_pportdata *ppd;
+	u32 excess_len = 0;
+	int ret = 0;
+
+	if (port_num > dd->num_pports || port_num < 1) {
+		dd_dev_info(dd, "%s: Invalid port number %d\n",
+				__func__, port_num);
+		ret = -EINVAL;
+		goto set_zeroes;
+	}
+
+	ppd = dd->pport + (port_num - 1);
+	if (!qsfp_mod_present(ppd)) {
+		ret = -ENODEV;
+		goto set_zeroes;
+	}
+
+	if (!ppd->qsfp_info.cache_valid) {
+		ret = -EINVAL;
+		goto set_zeroes;
+	}
+
+	if (addr >= (QSFP_MAX_NUM_PAGES * 128)) {
+		ret = -ERANGE;
+		goto set_zeroes;
+	}
+
+	if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) {
+		excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128);
+		memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len));
+		data += (len - excess_len);
+		goto set_zeroes;
+	}
+
+	memcpy(data, &ppd->qsfp_info.cache[addr], len);
+	return 0;
+
+set_zeroes:
+	memset(data, 0, excess_len);
+	return ret;
+}
+
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
+{
+	u8 *cache = &ppd->qsfp_info.cache[0];
+	u8 bin_buff[QSFP_DUMP_CHUNK];
+	char lenstr[6];
+	int sofar, ret;
+	int bidx = 0;
+	u8 *atten = &cache[QSFP_ATTEN_OFFS];
+	u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+
+	sofar = 0;
+	lenstr[0] = ' ';
+	lenstr[1] = '\0';
+
+	if (ppd->qsfp_info.cache_valid) {
+
+		if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+			sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
+
+		sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
+				pwr_codes +
+				(QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4));
+
+		sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
+				lenstr,
+			hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]);
+
+		sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
+				   QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
+
+		sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
+				   QSFP_OUI(vendor_oui));
+
+		sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
+				   QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
+
+		sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
+				   QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
+
+		if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+			sofar += scnprintf(buf + sofar, len - sofar,
+				"Atten:%d, %d\n",
+				QSFP_ATTEN_SDR(atten),
+				QSFP_ATTEN_DDR(atten));
+
+		sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
+				   QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
+
+		sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
+				   QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
+
+		sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
+				   QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
+
+		while (bidx < QSFP_DEFAULT_HDR_CNT) {
+			int iidx;
+
+			memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK);
+			for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) {
+				sofar += scnprintf(buf + sofar, len-sofar,
+					" %02X", bin_buff[iidx]);
+			}
+			sofar += scnprintf(buf + sofar, len - sofar, "\n");
+			bidx += QSFP_DUMP_CHUNK;
+		}
+	}
+	ret = sofar;
+	return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h
new file mode 100644
index 0000000..d30c2a6
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qsfp.h
@@ -0,0 +1,222 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/* QSFP support common definitions, for hfi driver */
+
+#define QSFP_DEV 0xA0
+#define QSFP_PWR_LAG_MSEC 2000
+#define QSFP_MODPRS_LAG_MSEC 20
+/* 128 byte pages, per SFF 8636 rev 2.4 */
+#define QSFP_MAX_NUM_PAGES	5
+
+/*
+ * Below are masks for QSFP pins.  Pins are the same for HFI0 and HFI1.
+ * _N means asserted low
+ */
+#define QSFP_HFI0_I2CCLK    (1 << 0)
+#define QSFP_HFI0_I2CDAT    (1 << 1)
+#define QSFP_HFI0_RESET_N   (1 << 2)
+#define QSFP_HFI0_INT_N	    (1 << 3)
+#define QSFP_HFI0_MODPRST_N (1 << 4)
+
+/* QSFP is paged at 256 bytes */
+#define QSFP_PAGESIZE 256
+
+/* Defined fields that Intel requires of qualified cables */
+/* Byte 0 is Identifier, not checked */
+/* Byte 1 is reserved "status MSB" */
+/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */
+/*
+ * Rest of first 128 not used, although 127 is reserved for page select
+ * if module is not "Flat memory".
+ */
+#define QSFP_PAGE_SELECT_BYTE_OFFS 127
+/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
+#define QSFP_MOD_ID_OFFS 128
+/*
+ * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
+ *  0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ */
+#define QSFP_MOD_PWR_OFFS 129
+/* Byte 130 is Connector type. Not Intel req'd */
+/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
+/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
+/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not Intel req'd */
+/* Byte 141 is Extended Rate Select. Not Intel req'd */
+/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
+/* Byte 146 is length for Copper. Units of 1 meter */
+#define QSFP_MOD_LEN_OFFS 146
+/*
+ * Byte 147 is Device technology. D0..3 not Intel req'd
+ * D4..7 select from 15 choices, translated by table:
+ */
+#define QSFP_MOD_TECH_OFFS 147
+extern const char *const hfi1_qsfp_devtech[16];
+/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
+#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
+/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
+#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
+/* Attenuation should be valid for copper other than full/near Eq */
+#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
+/* Length is only valid if technology is "copper" */
+#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
+#define QSFP_TECH_1490 9
+
+#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
+			oui[2])
+#define QSFP_OUI_AMPHENOL 0x415048
+#define QSFP_OUI_FINISAR  0x009065
+#define QSFP_OUI_GORE     0x002177
+
+/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
+#define QSFP_VEND_OFFS 148
+#define QSFP_VEND_LEN 16
+/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
+#define QSFP_IBXCV_OFFS 164
+/* Bytes 165..167 are Vendor OUI number */
+#define QSFP_VOUI_OFFS 165
+#define QSFP_VOUI_LEN 3
+/* Bytes 168..183 are Vendor Part Number, string */
+#define QSFP_PN_OFFS 168
+#define QSFP_PN_LEN 16
+/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
+#define QSFP_REV_OFFS 184
+#define QSFP_REV_LEN 2
+/*
+ * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
+ *  If copper, they are attenuation in dB:
+ * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
+ */
+#define QSFP_ATTEN_OFFS 186
+#define QSFP_ATTEN_LEN 2
+/* Bytes 188,189 are Wavelength tolerance, not Intel req'd */
+/* Byte 190 is Max Case Temp. Not Intel req'd */
+/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
+#define QSFP_CC_OFFS 191
+/* Bytes 192..195 are Options implemented in qsfp. Not Intel req'd */
+/* Bytes 196..211 are Serial Number, String */
+#define QSFP_SN_OFFS 196
+#define QSFP_SN_LEN 16
+/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */
+#define QSFP_DATE_OFFS 212
+#define QSFP_DATE_LEN 6
+/* Bytes 218,219 are optional lot-code, string */
+#define QSFP_LOT_OFFS 218
+#define QSFP_LOT_LEN 2
+/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
+/* Byte 223 is LSB of sum of bytes 192..222 */
+#define QSFP_CC_EXT_OFFS 223
+
+/*
+ * Interrupt flag masks
+ */
+#define QSFP_DATA_NOT_READY		0x01
+
+#define QSFP_HIGH_TEMP_ALARM		0x80
+#define QSFP_LOW_TEMP_ALARM		0x40
+#define QSFP_HIGH_TEMP_WARNING		0x20
+#define QSFP_LOW_TEMP_WARNING		0x10
+
+#define QSFP_HIGH_VCC_ALARM		0x80
+#define QSFP_LOW_VCC_ALARM		0x40
+#define QSFP_HIGH_VCC_WARNING		0x20
+#define QSFP_LOW_VCC_WARNING		0x10
+
+#define QSFP_HIGH_POWER_ALARM		0x88
+#define QSFP_LOW_POWER_ALARM		0x44
+#define QSFP_HIGH_POWER_WARNING		0x22
+#define QSFP_LOW_POWER_WARNING		0x11
+
+#define QSFP_HIGH_BIAS_ALARM		0x88
+#define QSFP_LOW_BIAS_ALARM		0x44
+#define QSFP_HIGH_BIAS_WARNING		0x22
+#define QSFP_LOW_BIAS_WARNING		0x11
+
+/*
+ * struct qsfp_data encapsulates state of QSFP device for one port.
+ * it will be part of port-specific data if a board supports QSFP.
+ *
+ * Since multiple board-types use QSFP, and their pport_data structs
+ * differ (in the chip-specific section), we need a pointer to its head.
+ *
+ * Avoiding premature optimization, we will have one work_struct per port,
+ * and let the qsfp_lock arbitrate access to common resources.
+ *
+ */
+
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
+struct qsfp_data {
+	/* Helps to find our way */
+	struct hfi1_pportdata *ppd;
+	struct work_struct qsfp_work;
+	u8 cache[QSFP_MAX_NUM_PAGES*128];
+	spinlock_t qsfp_lock;
+	u8 check_interrupt_flags;
+	u8 qsfp_interrupt_functional;
+	u8 cache_valid;
+	u8 cache_refresh_required;
+};
+
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
+		       struct qsfp_data *cp);
+int qsfp_mod_present(struct hfi1_pportdata *ppd);
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
+		   u32 len, u8 *data);
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+	      int offset, void *bp, int len);
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+	     int offset, void *bp, int len);
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+	       int len);
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+	      int len);
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c
new file mode 100644
index 0000000..632dd5b
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/rc.c
@@ -0,0 +1,2426 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/io.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "sdma.h"
+#include "trace.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static void rc_timeout(unsigned long arg);
+
+static u32 restart_sge(struct hfi1_sge_state *ss, struct hfi1_swqe *wqe,
+		       u32 psn, u32 pmtu)
+{
+	u32 len;
+
+	len = delta_psn(psn, wqe->psn) * pmtu;
+	ss->sge = wqe->sg_list[0];
+	ss->sg_list = wqe->sg_list + 1;
+	ss->num_sge = wqe->wr.num_sge;
+	ss->total_len = wqe->length;
+	hfi1_skip_sge(ss, len, 0);
+	return wqe->length - len;
+}
+
+static void start_timer(struct hfi1_qp *qp)
+{
+	qp->s_flags |= HFI1_S_TIMER;
+	qp->s_timer.function = rc_timeout;
+	/* 4.096 usec. * (1 << qp->timeout) */
+	qp->s_timer.expires = jiffies + qp->timeout_jiffies;
+	add_timer(&qp->s_timer);
+}
+
+/**
+ * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp,
+		       struct hfi1_other_headers *ohdr, u32 pmtu)
+{
+	struct hfi1_ack_entry *e;
+	u32 hwords;
+	u32 len;
+	u32 bth0;
+	u32 bth2;
+	int middle = 0;
+
+	/* Don't send an ACK if we aren't supposed to. */
+	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
+		goto bail;
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+
+	switch (qp->s_ack_state) {
+	case OP(RDMA_READ_RESPONSE_LAST):
+	case OP(RDMA_READ_RESPONSE_ONLY):
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->rdma_sge.mr) {
+			hfi1_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+		/* FALLTHROUGH */
+	case OP(ATOMIC_ACKNOWLEDGE):
+		/*
+		 * We can increment the tail pointer now that the last
+		 * response has been sent instead of only being
+		 * constructed.
+		 */
+		if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+			qp->s_tail_ack_queue = 0;
+		/* FALLTHROUGH */
+	case OP(SEND_ONLY):
+	case OP(ACKNOWLEDGE):
+		/* Check for no next entry in the queue. */
+		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+			if (qp->s_flags & HFI1_S_ACK_PENDING)
+				goto normal;
+			goto bail;
+		}
+
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST)) {
+			/*
+			 * If a RDMA read response is being resent and
+			 * we haven't seen the duplicate request yet,
+			 * then stop sending the remaining responses the
+			 * responder has seen until the requester re-sends it.
+			 */
+			len = e->rdma_sge.sge_length;
+			if (len && !e->rdma_sge.mr) {
+				qp->s_tail_ack_queue = qp->r_head_ack_queue;
+				goto bail;
+			}
+			/* Copy SGE state in case we need to resend */
+			qp->s_rdma_mr = e->rdma_sge.mr;
+			if (qp->s_rdma_mr)
+				hfi1_get_mr(qp->s_rdma_mr);
+			qp->s_ack_rdma_sge.sge = e->rdma_sge;
+			qp->s_ack_rdma_sge.num_sge = 1;
+			qp->s_cur_sge = &qp->s_ack_rdma_sge;
+			if (len > pmtu) {
+				len = pmtu;
+				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+			} else {
+				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+				e->sent = 1;
+			}
+			ohdr->u.aeth = hfi1_compute_aeth(qp);
+			hwords++;
+			qp->s_ack_rdma_psn = e->psn;
+			bth2 = mask_psn(qp->s_ack_rdma_psn++);
+		} else {
+			/* COMPARE_SWAP or FETCH_ADD */
+			qp->s_cur_sge = NULL;
+			len = 0;
+			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+			ohdr->u.at.aeth = hfi1_compute_aeth(qp);
+			ohdr->u.at.atomic_ack_eth[0] =
+				cpu_to_be32(e->atomic_data >> 32);
+			ohdr->u.at.atomic_ack_eth[1] =
+				cpu_to_be32(e->atomic_data);
+			hwords += sizeof(ohdr->u.at) / sizeof(u32);
+			bth2 = mask_psn(e->psn);
+			e->sent = 1;
+		}
+		bth0 = qp->s_ack_state << 24;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		qp->s_cur_sge = &qp->s_ack_rdma_sge;
+		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
+		if (qp->s_rdma_mr)
+			hfi1_get_mr(qp->s_rdma_mr);
+		len = qp->s_ack_rdma_sge.sge.sge_length;
+		if (len > pmtu) {
+			len = pmtu;
+			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+		} else {
+			ohdr->u.aeth = hfi1_compute_aeth(qp);
+			hwords++;
+			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+			e->sent = 1;
+		}
+		bth0 = qp->s_ack_state << 24;
+		bth2 = mask_psn(qp->s_ack_rdma_psn++);
+		break;
+
+	default:
+normal:
+		/*
+		 * Send a regular ACK.
+		 * Set the s_ack_state so we wait until after sending
+		 * the ACK before setting s_ack_state to ACKNOWLEDGE
+		 * (see above).
+		 */
+		qp->s_ack_state = OP(SEND_ONLY);
+		qp->s_flags &= ~HFI1_S_ACK_PENDING;
+		qp->s_cur_sge = NULL;
+		if (qp->s_nak_state)
+			ohdr->u.aeth =
+				cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+					    (qp->s_nak_state <<
+					     HFI1_AETH_CREDIT_SHIFT));
+		else
+			ohdr->u.aeth = hfi1_compute_aeth(qp);
+		hwords++;
+		len = 0;
+		bth0 = OP(ACKNOWLEDGE) << 24;
+		bth2 = mask_psn(qp->s_ack_psn);
+	}
+	qp->s_rdma_ack_cnt++;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_size = len;
+	hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle);
+	return 1;
+
+bail:
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+	/*
+	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+	 * HFI1_S_RESP_PENDING
+	 */
+	smp_wmb();
+	qp->s_flags &= ~(HFI1_S_RESP_PENDING
+				| HFI1_S_ACK_PENDING
+				| HFI1_S_AHG_VALID);
+	return 0;
+}
+
+/**
+ * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_rc_req(struct hfi1_qp *qp)
+{
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	struct hfi1_other_headers *ohdr;
+	struct hfi1_sge_state *ss;
+	struct hfi1_swqe *wqe;
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	u32 hwords = 5;
+	u32 len;
+	u32 bth0 = 0;
+	u32 bth2;
+	u32 pmtu = qp->pmtu;
+	char newreq;
+	unsigned long flags;
+	int ret = 0;
+	int middle = 0;
+	int delta;
+
+	ohdr = &qp->s_hdr->ibh.u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr->ibh.u.l.oth;
+
+	/*
+	 * The lock is needed to synchronize between the sending tasklet,
+	 * the receive interrupt handler, and timeout re-sends.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Sending responses has higher priority over sending requests. */
+	if ((qp->s_flags & HFI1_S_RESP_PENDING) &&
+	    make_rc_ack(dev, qp, ohdr, pmtu))
+		goto done;
+
+	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
+		if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_iowait.sdma_busy)) {
+			qp->s_flags |= HFI1_S_WAIT_DMA;
+			goto bail;
+		}
+		clear_ahg(qp);
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+		/* will get called again */
+		goto done;
+	}
+
+	if (qp->s_flags & (HFI1_S_WAIT_RNR | HFI1_S_WAIT_ACK))
+		goto bail;
+
+	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+			qp->s_flags |= HFI1_S_WAIT_PSN;
+			goto bail;
+		}
+		qp->s_sending_psn = qp->s_psn;
+		qp->s_sending_hpsn = qp->s_psn - 1;
+	}
+
+	/* Send a request. */
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	switch (qp->s_state) {
+	default:
+		if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK))
+			goto bail;
+		/*
+		 * Resend an old request or start a new one.
+		 *
+		 * We keep track of the current SWQE so that
+		 * we don't reset the "furthest progress" state
+		 * if we need to back up.
+		 */
+		newreq = 0;
+		if (qp->s_cur == qp->s_tail) {
+			/* Check if send work queue is empty. */
+			if (qp->s_tail == qp->s_head) {
+				clear_ahg(qp);
+				goto bail;
+			}
+			/*
+			 * If a fence is requested, wait for previous
+			 * RDMA read and atomic operations to finish.
+			 */
+			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+			    qp->s_num_rd_atomic) {
+				qp->s_flags |= HFI1_S_WAIT_FENCE;
+				goto bail;
+			}
+			wqe->psn = qp->s_next_psn;
+			newreq = 1;
+		}
+		/*
+		 * Note that we have to be careful not to modify the
+		 * original work request since we may need to resend
+		 * it.
+		 */
+		len = wqe->length;
+		ss = &qp->s_sge;
+		bth2 = mask_psn(qp->s_psn);
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+			/* If no credit, return. */
+			if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
+			    cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+				qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
+				goto bail;
+			}
+			wqe->lpsn = wqe->psn;
+			if (len > pmtu) {
+				wqe->lpsn += (len - 1) / pmtu;
+				qp->s_state = OP(SEND_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_SEND)
+				qp->s_state = OP(SEND_ONLY);
+			else {
+				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the BTH */
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+			}
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= IB_BTH_SOLICITED;
+			bth2 |= IB_BTH_REQ_ACK;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+			if (newreq && !(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+				qp->s_lsn++;
+			/* FALLTHROUGH */
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			/* If no credit, return. */
+			if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
+			    cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+				qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
+				goto bail;
+			}
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			hwords += sizeof(struct ib_reth) / sizeof(u32);
+			wqe->lpsn = wqe->psn;
+			if (len > pmtu) {
+				wqe->lpsn += (len - 1) / pmtu;
+				qp->s_state = OP(RDMA_WRITE_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+				qp->s_state = OP(RDMA_WRITE_ONLY);
+			else {
+				qp->s_state =
+					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after RETH */
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+					bth0 |= IB_BTH_SOLICITED;
+			}
+			bth2 |= IB_BTH_REQ_ACK;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_READ:
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow.
+			 */
+			if (newreq) {
+				if (qp->s_num_rd_atomic >=
+				    qp->s_max_rd_atomic) {
+					qp->s_flags |= HFI1_S_WAIT_RDMAR;
+					goto bail;
+				}
+				qp->s_num_rd_atomic++;
+				if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+					qp->s_lsn++;
+				/*
+				 * Adjust s_next_psn to count the
+				 * expected number of responses.
+				 */
+				if (len > pmtu)
+					qp->s_next_psn += (len - 1) / pmtu;
+				wqe->lpsn = qp->s_next_psn++;
+			}
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			qp->s_state = OP(RDMA_READ_REQUEST);
+			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+			ss = NULL;
+			len = 0;
+			bth2 |= IB_BTH_REQ_ACK;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_ATOMIC_CMP_AND_SWP:
+		case IB_WR_ATOMIC_FETCH_AND_ADD:
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow.
+			 */
+			if (newreq) {
+				if (qp->s_num_rd_atomic >=
+				    qp->s_max_rd_atomic) {
+					qp->s_flags |= HFI1_S_WAIT_RDMAR;
+					goto bail;
+				}
+				qp->s_num_rd_atomic++;
+				if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+					qp->s_lsn++;
+				wqe->lpsn = wqe->psn;
+			}
+			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+				qp->s_state = OP(COMPARE_SWAP);
+				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+					wqe->wr.wr.atomic.swap);
+				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+					wqe->wr.wr.atomic.compare_add);
+			} else {
+				qp->s_state = OP(FETCH_ADD);
+				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+					wqe->wr.wr.atomic.compare_add);
+				ohdr->u.atomic_eth.compare_data = 0;
+			}
+			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+				wqe->wr.wr.atomic.remote_addr >> 32);
+			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+				wqe->wr.wr.atomic.remote_addr);
+			ohdr->u.atomic_eth.rkey = cpu_to_be32(
+				wqe->wr.wr.atomic.rkey);
+			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+			ss = NULL;
+			len = 0;
+			bth2 |= IB_BTH_REQ_ACK;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		default:
+			goto bail;
+		}
+		qp->s_sge.sge = wqe->sg_list[0];
+		qp->s_sge.sg_list = wqe->sg_list + 1;
+		qp->s_sge.num_sge = wqe->wr.num_sge;
+		qp->s_sge.total_len = wqe->length;
+		qp->s_len = wqe->length;
+		if (newreq) {
+			qp->s_tail++;
+			if (qp->s_tail >= qp->s_size)
+				qp->s_tail = 0;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_READ)
+			qp->s_psn = wqe->lpsn + 1;
+		else {
+			qp->s_psn++;
+			if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+				qp->s_next_psn = qp->s_psn;
+		}
+		break;
+
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		/*
+		 * qp->s_state is normally set to the opcode of the
+		 * last packet constructed for new requests and therefore
+		 * is never set to RDMA read response.
+		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
+		 * thread to indicate a SEND needs to be restarted from an
+		 * earlier PSN without interfering with the sending thread.
+		 * See restart_rc().
+		 */
+		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+		/* FALLTHROUGH */
+	case OP(SEND_FIRST):
+		qp->s_state = OP(SEND_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		bth2 = mask_psn(qp->s_psn++);
+		if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+			qp->s_next_psn = qp->s_psn;
+		ss = &qp->s_sge;
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_SEND)
+			qp->s_state = OP(SEND_LAST);
+		else {
+			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+		}
+		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+			bth0 |= IB_BTH_SOLICITED;
+		bth2 |= IB_BTH_REQ_ACK;
+		qp->s_cur++;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_LAST):
+		/*
+		 * qp->s_state is normally set to the opcode of the
+		 * last packet constructed for new requests and therefore
+		 * is never set to RDMA read response.
+		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
+		 * thread to indicate a RDMA write needs to be restarted from
+		 * an earlier PSN without interfering with the sending thread.
+		 * See restart_rc().
+		 */
+		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_FIRST):
+		qp->s_state = OP(RDMA_WRITE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		bth2 = mask_psn(qp->s_psn++);
+		if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+			qp->s_next_psn = qp->s_psn;
+		ss = &qp->s_sge;
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+			qp->s_state = OP(RDMA_WRITE_LAST);
+		else {
+			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= IB_BTH_SOLICITED;
+		}
+		bth2 |= IB_BTH_REQ_ACK;
+		qp->s_cur++;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		/*
+		 * qp->s_state is normally set to the opcode of the
+		 * last packet constructed for new requests and therefore
+		 * is never set to RDMA read response.
+		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
+		 * thread to indicate a RDMA read needs to be restarted from
+		 * an earlier PSN without interfering with the sending thread.
+		 * See restart_rc().
+		 */
+		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
+		ohdr->u.rc.reth.vaddr =
+			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+		ohdr->u.rc.reth.rkey =
+			cpu_to_be32(wqe->wr.wr.rdma.rkey);
+		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
+		qp->s_state = OP(RDMA_READ_REQUEST);
+		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
+		qp->s_psn = wqe->lpsn + 1;
+		ss = NULL;
+		len = 0;
+		qp->s_cur++;
+		if (qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		break;
+	}
+	qp->s_sending_hpsn = bth2;
+	delta = delta_psn(bth2, wqe->psn);
+	if (delta && delta % HFI1_PSN_CREDIT == 0)
+		bth2 |= IB_BTH_REQ_ACK;
+	if (qp->s_flags & HFI1_S_SEND_ONE) {
+		qp->s_flags &= ~HFI1_S_SEND_ONE;
+		qp->s_flags |= HFI1_S_WAIT_ACK;
+		bth2 |= IB_BTH_REQ_ACK;
+	}
+	qp->s_len -= len;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_sge = ss;
+	qp->s_cur_size = len;
+	hfi1_make_ruc_header(
+		qp,
+		ohdr,
+		bth0 | (qp->s_state << 24),
+		bth2,
+		middle);
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * hfi1_send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp,
+		      int is_fecn)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u64 pbc, pbc_flags = 0;
+	u16 lrh0;
+	u16 sc5;
+	u32 bth0;
+	u32 hwords;
+	u32 vl, plen;
+	struct send_context *sc;
+	struct pio_buf *pbuf;
+	struct hfi1_ib_header hdr;
+	struct hfi1_other_headers *ohdr;
+
+	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+	if (qp->s_flags & HFI1_S_RESP_PENDING)
+		goto queue_ack;
+
+	/* Ensure s_rdma_ack_cnt changes are committed */
+	smp_read_barrier_depends();
+	if (qp->s_rdma_ack_cnt)
+		goto queue_ack;
+
+	/* Construct the header */
+	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
+	hwords = 6;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
+				       &qp->remote_ah_attr.grh, hwords, 0);
+		ohdr = &hdr.u.l.oth;
+		lrh0 = HFI1_LRH_GRH;
+	} else {
+		ohdr = &hdr.u.oth;
+		lrh0 = HFI1_LRH_BTH;
+	}
+	/* read pkey_index w/o lock (its atomic) */
+	bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
+	if (qp->s_mig_state == IB_MIG_MIGRATED)
+		bth0 |= IB_BTH_MIG_REQ;
+	if (qp->r_nak_state)
+		ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+					    (qp->r_nak_state <<
+					     HFI1_AETH_CREDIT_SHIFT));
+	else
+		ohdr->u.aeth = hfi1_compute_aeth(qp);
+	sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+	/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+	pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
+	lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+	hdr.lrh[0] = cpu_to_be16(lrh0);
+	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+	hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+	ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
+	ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
+
+	/* Don't try to send ACKs if the link isn't ACTIVE */
+	if (driver_lstate(ppd) != IB_PORT_ACTIVE)
+		return;
+
+	sc = rcd->sc;
+	plen = 2 /* PBC */ + hwords;
+	vl = sc_to_vlt(ppd->dd, sc5);
+	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+
+	pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+	if (!pbuf) {
+		/*
+		 * We have no room to send at the moment.  Pass
+		 * responsibility for sending the ACK to the send tasklet
+		 * so that when enough buffer space becomes available,
+		 * the ACK is sent ahead of other outgoing packets.
+		 */
+		goto queue_ack;
+	}
+
+	trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
+
+	/* write the pbc and data */
+	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
+
+	return;
+
+queue_ack:
+	this_cpu_inc(*ibp->rc_qacks);
+	spin_lock(&qp->s_lock);
+	qp->s_flags |= HFI1_S_ACK_PENDING | HFI1_S_RESP_PENDING;
+	qp->s_nak_state = qp->r_nak_state;
+	qp->s_ack_psn = qp->r_ack_psn;
+	if (is_fecn)
+		qp->s_flags |= HFI1_S_ECN;
+
+	/* Schedule the send tasklet. */
+	hfi1_schedule_send(qp);
+	spin_unlock(&qp->s_lock);
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct hfi1_qp *qp, u32 psn)
+{
+	u32 n = qp->s_acked;
+	struct hfi1_swqe *wqe = get_swqe_ptr(qp, n);
+	u32 opcode;
+
+	qp->s_cur = n;
+
+	/*
+	 * If we are starting the request from the beginning,
+	 * let the normal send code handle initialization.
+	 */
+	if (cmp_psn(psn, wqe->psn) <= 0) {
+		qp->s_state = OP(SEND_LAST);
+		goto done;
+	}
+
+	/* Find the work request opcode corresponding to the given PSN. */
+	opcode = wqe->wr.opcode;
+	for (;;) {
+		int diff;
+
+		if (++n == qp->s_size)
+			n = 0;
+		if (n == qp->s_tail)
+			break;
+		wqe = get_swqe_ptr(qp, n);
+		diff = cmp_psn(psn, wqe->psn);
+		if (diff < 0)
+			break;
+		qp->s_cur = n;
+		/*
+		 * If we are starting the request from the beginning,
+		 * let the normal send code handle initialization.
+		 */
+		if (diff == 0) {
+			qp->s_state = OP(SEND_LAST);
+			goto done;
+		}
+		opcode = wqe->wr.opcode;
+	}
+
+	/*
+	 * Set the state to restart in the middle of a request.
+	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
+	 * See hfi1_make_rc_req().
+	 */
+	switch (opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+		break;
+
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+		break;
+
+	case IB_WR_RDMA_READ:
+		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+		break;
+
+	default:
+		/*
+		 * This case shouldn't happen since its only
+		 * one PSN per req.
+		 */
+		qp->s_state = OP(SEND_LAST);
+	}
+done:
+	qp->s_psn = psn;
+	/*
+	 * Set HFI1_S_WAIT_PSN as rc_complete() may start the timer
+	 * asynchronously before the send tasklet can get scheduled.
+	 * Doing it in hfi1_make_rc_req() is too late.
+	 */
+	if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
+	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
+		qp->s_flags |= HFI1_S_WAIT_PSN;
+	qp->s_flags &= ~HFI1_S_AHG_VALID;
+}
+
+/*
+ * Back up requester to resend the last un-ACKed request.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ */
+static void restart_rc(struct hfi1_qp *qp, u32 psn, int wait)
+{
+	struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
+	struct hfi1_ibport *ibp;
+
+	if (qp->s_retry == 0) {
+		if (qp->s_mig_state == IB_MIG_ARMED) {
+			hfi1_migrate_qp(qp);
+			qp->s_retry = qp->s_retry_cnt;
+		} else if (qp->s_last == qp->s_acked) {
+			hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+			hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			return;
+		} else /* need to handle delayed completion */
+			return;
+	} else
+		qp->s_retry--;
+
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+	if (wqe->wr.opcode == IB_WR_RDMA_READ)
+		ibp->n_rc_resends++;
+	else
+		ibp->n_rc_resends += delta_psn(qp->s_psn, psn);
+
+	qp->s_flags &= ~(HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR |
+			 HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_PSN |
+			 HFI1_S_WAIT_ACK);
+	if (wait)
+		qp->s_flags |= HFI1_S_SEND_ONE;
+	reset_psn(qp, psn);
+}
+
+/*
+ * This is called from s_timer for missing responses.
+ */
+static void rc_timeout(unsigned long arg)
+{
+	struct hfi1_qp *qp = (struct hfi1_qp *)arg;
+	struct hfi1_ibport *ibp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->r_lock, flags);
+	spin_lock(&qp->s_lock);
+	if (qp->s_flags & HFI1_S_TIMER) {
+		ibp = to_iport(qp->ibqp.device, qp->port_num);
+		ibp->n_rc_timeouts++;
+		qp->s_flags &= ~HFI1_S_TIMER;
+		del_timer(&qp->s_timer);
+		restart_rc(qp, qp->s_last_psn + 1, 1);
+		hfi1_schedule_send(qp);
+	}
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+/*
+ * This is called from s_timer for RNR timeouts.
+ */
+void hfi1_rc_rnr_retry(unsigned long arg)
+{
+	struct hfi1_qp *qp = (struct hfi1_qp *)arg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (qp->s_flags & HFI1_S_WAIT_RNR) {
+		qp->s_flags &= ~HFI1_S_WAIT_RNR;
+		del_timer(&qp->s_timer);
+		hfi1_schedule_send(qp);
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Set qp->s_sending_psn to the next PSN after the given one.
+ * This would be psn+1 except when RDMA reads are present.
+ */
+static void reset_sending_psn(struct hfi1_qp *qp, u32 psn)
+{
+	struct hfi1_swqe *wqe;
+	u32 n = qp->s_last;
+
+	/* Find the work request corresponding to the given PSN. */
+	for (;;) {
+		wqe = get_swqe_ptr(qp, n);
+		if (cmp_psn(psn, wqe->lpsn) <= 0) {
+			if (wqe->wr.opcode == IB_WR_RDMA_READ)
+				qp->s_sending_psn = wqe->lpsn + 1;
+			else
+				qp->s_sending_psn = psn + 1;
+			break;
+		}
+		if (++n == qp->s_size)
+			n = 0;
+		if (n == qp->s_tail)
+			break;
+	}
+}
+
+/*
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr)
+{
+	struct hfi1_other_headers *ohdr;
+	struct hfi1_swqe *wqe;
+	struct ib_wc wc;
+	unsigned i;
+	u32 opcode;
+	u32 psn;
+
+	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+		return;
+
+	/* Find out where the BTH is */
+	if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else
+		ohdr = &hdr->u.l.oth;
+
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+		WARN_ON(!qp->s_rdma_ack_cnt);
+		qp->s_rdma_ack_cnt--;
+		return;
+	}
+
+	psn = be32_to_cpu(ohdr->bth[2]);
+	reset_sending_psn(qp, psn);
+
+	/*
+	 * Start timer after a packet requesting an ACK has been sent and
+	 * there are still requests that haven't been acked.
+	 */
+	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+	    !(qp->s_flags &
+		(HFI1_S_TIMER | HFI1_S_WAIT_RNR | HFI1_S_WAIT_PSN)) &&
+		(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
+		start_timer(qp);
+
+	while (qp->s_last != qp->s_acked) {
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+			break;
+		for (i = 0; i < wqe->wr.num_sge; i++) {
+			struct hfi1_sge *sge = &wqe->sg_list[i];
+
+			hfi1_put_mr(sge->mr);
+		}
+		/* Post a send completion queue entry if requested. */
+		if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+			memset(&wc, 0, sizeof(wc));
+			wc.wr_id = wqe->wr.wr_id;
+			wc.status = IB_WC_SUCCESS;
+			wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+			wc.byte_len = wqe->length;
+			wc.qp = &qp->ibqp;
+			hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+		}
+		if (++qp->s_last >= qp->s_size)
+			qp->s_last = 0;
+	}
+	/*
+	 * If we were waiting for sends to complete before re-sending,
+	 * and they are now complete, restart sending.
+	 */
+	trace_hfi1_rc_sendcomplete(qp, psn);
+	if (qp->s_flags & HFI1_S_WAIT_PSN &&
+	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+		qp->s_flags &= ~HFI1_S_WAIT_PSN;
+		qp->s_sending_psn = qp->s_psn;
+		qp->s_sending_hpsn = qp->s_psn - 1;
+		hfi1_schedule_send(qp);
+	}
+}
+
+static inline void update_last_psn(struct hfi1_qp *qp, u32 psn)
+{
+	qp->s_last_psn = psn;
+}
+
+/*
+ * Generate a SWQE completion.
+ * This is similar to hfi1_send_complete but has to check to be sure
+ * that the SGEs are not being referenced if the SWQE is being resent.
+ */
+static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp,
+					  struct hfi1_swqe *wqe,
+					  struct hfi1_ibport *ibp)
+{
+	struct ib_wc wc;
+	unsigned i;
+
+	/*
+	 * Don't decrement refcount and don't generate a
+	 * completion if the SWQE is being resent until the send
+	 * is finished.
+	 */
+	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
+	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+		for (i = 0; i < wqe->wr.num_sge; i++) {
+			struct hfi1_sge *sge = &wqe->sg_list[i];
+
+			hfi1_put_mr(sge->mr);
+		}
+		/* Post a send completion queue entry if requested. */
+		if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+			memset(&wc, 0, sizeof(wc));
+			wc.wr_id = wqe->wr.wr_id;
+			wc.status = IB_WC_SUCCESS;
+			wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+			wc.byte_len = wqe->length;
+			wc.qp = &qp->ibqp;
+			hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+		}
+		if (++qp->s_last >= qp->s_size)
+			qp->s_last = 0;
+	} else {
+		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+		this_cpu_inc(*ibp->rc_delayed_comp);
+		/*
+		 * If send progress not running attempt to progress
+		 * SDMA queue.
+		 */
+		if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
+			struct sdma_engine *engine;
+			u8 sc5;
+
+			/* For now use sc to find engine */
+			sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+			engine = qp_to_sdma_engine(qp, sc5);
+			sdma_engine_progress_schedule(engine);
+		}
+	}
+
+	qp->s_retry = qp->s_retry_cnt;
+	update_last_psn(qp, wqe->lpsn);
+
+	/*
+	 * If we are completing a request which is in the process of
+	 * being resent, we can stop re-sending it since we know the
+	 * responder has already seen it.
+	 */
+	if (qp->s_acked == qp->s_cur) {
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		qp->s_acked = qp->s_cur;
+		wqe = get_swqe_ptr(qp, qp->s_cur);
+		if (qp->s_acked != qp->s_tail) {
+			qp->s_state = OP(SEND_LAST);
+			qp->s_psn = wqe->psn;
+		}
+	} else {
+		if (++qp->s_acked >= qp->s_size)
+			qp->s_acked = 0;
+		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
+			qp->s_draining = 0;
+		wqe = get_swqe_ptr(qp, qp->s_acked);
+	}
+	return wqe;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode,
+		     u64 val, struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_ibport *ibp;
+	enum ib_wc_status status;
+	struct hfi1_swqe *wqe;
+	int ret = 0;
+	u32 ack_psn;
+	int diff;
+
+	/* Remove QP from retry timer */
+	if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+		qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+		del_timer(&qp->s_timer);
+	}
+
+	/*
+	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+	 * requests and implicitly NAK RDMA read and atomic requests issued
+	 * before the NAK'ed request.  The MSN won't include the NAK'ed
+	 * request but will include an ACK'ed request(s).
+	 */
+	ack_psn = psn;
+	if (aeth >> 29)
+		ack_psn--;
+	wqe = get_swqe_ptr(qp, qp->s_acked);
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+	/*
+	 * The MSN might be for a later WQE than the PSN indicates so
+	 * only complete WQEs that the PSN finishes.
+	 */
+	while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
+		/*
+		 * RDMA_READ_RESPONSE_ONLY is a special case since
+		 * we want to generate completion events for everything
+		 * before the RDMA read, copy the data, then generate
+		 * the completion for the read.
+		 */
+		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+		    diff == 0) {
+			ret = 1;
+			goto bail;
+		}
+		/*
+		 * If this request is a RDMA read or atomic, and the ACK is
+		 * for a later operation, this ACK NAKs the RDMA read or
+		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
+		 * can ACK a RDMA read and likewise for atomic ops.  Note
+		 * that the NAK case can only happen if relaxed ordering is
+		 * used and requests are sent after an RDMA read or atomic
+		 * is sent but before the response is received.
+		 */
+		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+			/* Retry this request. */
+			if (!(qp->r_flags & HFI1_R_RDMAR_SEQ)) {
+				qp->r_flags |= HFI1_R_RDMAR_SEQ;
+				restart_rc(qp, qp->s_last_psn + 1, 0);
+				if (list_empty(&qp->rspwait)) {
+					qp->r_flags |= HFI1_R_RSP_SEND;
+					atomic_inc(&qp->refcount);
+					list_add_tail(&qp->rspwait,
+						      &rcd->qp_wait_list);
+				}
+			}
+			/*
+			 * No need to process the ACK/NAK since we are
+			 * restarting an earlier request.
+			 */
+			goto bail;
+		}
+		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+			u64 *vaddr = wqe->sg_list[0].vaddr;
+			*vaddr = val;
+		}
+		if (qp->s_num_rd_atomic &&
+		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+			qp->s_num_rd_atomic--;
+			/* Restart sending task if fence is complete */
+			if ((qp->s_flags & HFI1_S_WAIT_FENCE) &&
+			    !qp->s_num_rd_atomic) {
+				qp->s_flags &= ~(HFI1_S_WAIT_FENCE |
+						 HFI1_S_WAIT_ACK);
+				hfi1_schedule_send(qp);
+			} else if (qp->s_flags & HFI1_S_WAIT_RDMAR) {
+				qp->s_flags &= ~(HFI1_S_WAIT_RDMAR |
+						 HFI1_S_WAIT_ACK);
+				hfi1_schedule_send(qp);
+			}
+		}
+		wqe = do_rc_completion(qp, wqe, ibp);
+		if (qp->s_acked == qp->s_tail)
+			break;
+	}
+
+	switch (aeth >> 29) {
+	case 0:         /* ACK */
+		this_cpu_inc(*ibp->rc_acks);
+		if (qp->s_acked != qp->s_tail) {
+			/*
+			 * We are expecting more ACKs so
+			 * reset the re-transmit timer.
+			 */
+			start_timer(qp);
+			/*
+			 * We can stop re-sending the earlier packets and
+			 * continue with the next packet the receiver wants.
+			 */
+			if (cmp_psn(qp->s_psn, psn) <= 0)
+				reset_psn(qp, psn + 1);
+		} else if (cmp_psn(qp->s_psn, psn) <= 0) {
+			qp->s_state = OP(SEND_LAST);
+			qp->s_psn = psn + 1;
+		}
+		if (qp->s_flags & HFI1_S_WAIT_ACK) {
+			qp->s_flags &= ~HFI1_S_WAIT_ACK;
+			hfi1_schedule_send(qp);
+		}
+		hfi1_get_credit(qp, aeth);
+		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+		qp->s_retry = qp->s_retry_cnt;
+		update_last_psn(qp, psn);
+		ret = 1;
+		goto bail;
+
+	case 1:         /* RNR NAK */
+		ibp->n_rnr_naks++;
+		if (qp->s_acked == qp->s_tail)
+			goto bail;
+		if (qp->s_flags & HFI1_S_WAIT_RNR)
+			goto bail;
+		if (qp->s_rnr_retry == 0) {
+			status = IB_WC_RNR_RETRY_EXC_ERR;
+			goto class_b;
+		}
+		if (qp->s_rnr_retry_cnt < 7)
+			qp->s_rnr_retry--;
+
+		/* The last valid PSN is the previous PSN. */
+		update_last_psn(qp, psn - 1);
+
+		ibp->n_rc_resends += delta_psn(qp->s_psn, psn);
+
+		reset_psn(qp, psn);
+
+		qp->s_flags &= ~(HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_ACK);
+		qp->s_flags |= HFI1_S_WAIT_RNR;
+		qp->s_timer.function = hfi1_rc_rnr_retry;
+		qp->s_timer.expires = jiffies + usecs_to_jiffies(
+			ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
+					   HFI1_AETH_CREDIT_MASK]);
+		add_timer(&qp->s_timer);
+		goto bail;
+
+	case 3:         /* NAK */
+		if (qp->s_acked == qp->s_tail)
+			goto bail;
+		/* The last valid PSN is the previous PSN. */
+		update_last_psn(qp, psn - 1);
+		switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
+			HFI1_AETH_CREDIT_MASK) {
+		case 0: /* PSN sequence error */
+			ibp->n_seq_naks++;
+			/*
+			 * Back up to the responder's expected PSN.
+			 * Note that we might get a NAK in the middle of an
+			 * RDMA READ response which terminates the RDMA
+			 * READ.
+			 */
+			restart_rc(qp, psn, 0);
+			hfi1_schedule_send(qp);
+			break;
+
+		case 1: /* Invalid Request */
+			status = IB_WC_REM_INV_REQ_ERR;
+			ibp->n_other_naks++;
+			goto class_b;
+
+		case 2: /* Remote Access Error */
+			status = IB_WC_REM_ACCESS_ERR;
+			ibp->n_other_naks++;
+			goto class_b;
+
+		case 3: /* Remote Operation Error */
+			status = IB_WC_REM_OP_ERR;
+			ibp->n_other_naks++;
+class_b:
+			if (qp->s_last == qp->s_acked) {
+				hfi1_send_complete(qp, wqe, status);
+				hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			}
+			break;
+
+		default:
+			/* Ignore other reserved NAK error codes */
+			goto reserved;
+		}
+		qp->s_retry = qp->s_retry_cnt;
+		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+		goto bail;
+
+	default:                /* 2: reserved */
+reserved:
+		/* Ignore reserved NAK codes. */
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+/*
+ * We have seen an out of sequence RDMA read middle or last packet.
+ * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
+ */
+static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn,
+			 struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_swqe *wqe;
+
+	/* Remove QP from retry timer */
+	if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+		qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+		del_timer(&qp->s_timer);
+	}
+
+	wqe = get_swqe_ptr(qp, qp->s_acked);
+
+	while (cmp_psn(psn, wqe->lpsn) > 0) {
+		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+			break;
+		wqe = do_rc_completion(qp, wqe, ibp);
+	}
+
+	ibp->n_rdma_seq++;
+	qp->r_flags |= HFI1_R_RDMAR_SEQ;
+	restart_rc(qp, qp->s_last_psn + 1, 0);
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= HFI1_R_RSP_SEND;
+		atomic_inc(&qp->refcount);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+}
+
+/**
+ * rc_rcv_resp - process an incoming RC response packet
+ * @ibp: the port this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static void rc_rcv_resp(struct hfi1_ibport *ibp,
+			struct hfi1_other_headers *ohdr,
+			void *data, u32 tlen, struct hfi1_qp *qp,
+			u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
+			struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_swqe *wqe;
+	enum ib_wc_status status;
+	unsigned long flags;
+	int diff;
+	u32 pad;
+	u32 aeth;
+	u64 val;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Ignore invalid responses. */
+	if (cmp_psn(psn, qp->s_next_psn) >= 0)
+		goto ack_done;
+
+	/* Ignore duplicate responses. */
+	diff = cmp_psn(psn, qp->s_last_psn);
+	if (unlikely(diff <= 0)) {
+		/* Update credits for "ghost" ACKs */
+		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+			aeth = be32_to_cpu(ohdr->u.aeth);
+			if ((aeth >> 29) == 0)
+				hfi1_get_credit(qp, aeth);
+		}
+		goto ack_done;
+	}
+
+	/*
+	 * Skip everything other than the PSN we expect, if we are waiting
+	 * for a reply to a restarted RDMA read or atomic op.
+	 */
+	if (qp->r_flags & HFI1_R_RDMAR_SEQ) {
+		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+			goto ack_done;
+		qp->r_flags &= ~HFI1_R_RDMAR_SEQ;
+	}
+
+	if (unlikely(qp->s_acked == qp->s_tail))
+		goto ack_done;
+	wqe = get_swqe_ptr(qp, qp->s_acked);
+	status = IB_WC_SUCCESS;
+
+	switch (opcode) {
+	case OP(ACKNOWLEDGE):
+	case OP(ATOMIC_ACKNOWLEDGE):
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		aeth = be32_to_cpu(ohdr->u.aeth);
+		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+			__be32 *p = ohdr->u.at.atomic_ack_eth;
+
+			val = ((u64) be32_to_cpu(p[0]) << 32) |
+				be32_to_cpu(p[1]);
+		} else
+			val = 0;
+		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
+		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
+			goto ack_done;
+		wqe = get_swqe_ptr(qp, qp->s_acked);
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+		/*
+		 * If this is a response to a resent RDMA read, we
+		 * have to be careful to copy the data to the right
+		 * location.
+		 */
+		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+						  wqe, psn, pmtu);
+		goto read_middle;
+
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		/* no AETH, no ACK */
+		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+			goto ack_seq_err;
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+read_middle:
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto ack_len_err;
+		if (unlikely(pmtu >= qp->s_rdma_read_len))
+			goto ack_len_err;
+
+		/*
+		 * We got a response so update the timeout.
+		 * 4.096 usec. * (1 << qp->timeout)
+		 */
+		qp->s_flags |= HFI1_S_TIMER;
+		mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
+		if (qp->s_flags & HFI1_S_WAIT_ACK) {
+			qp->s_flags &= ~HFI1_S_WAIT_ACK;
+			hfi1_schedule_send(qp);
+		}
+
+		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+			qp->s_retry = qp->s_retry_cnt;
+
+		/*
+		 * Update the RDMA receive state but do the copy w/o
+		 * holding the locks and blocking interrupts.
+		 */
+		qp->s_rdma_read_len -= pmtu;
+		update_last_psn(qp, psn);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
+		goto bail;
+
+	case OP(RDMA_READ_RESPONSE_ONLY):
+		aeth = be32_to_cpu(ohdr->u.aeth);
+		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+			goto ack_done;
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/*
+		 * Check that the data size is >= 0 && <= pmtu.
+		 * Remember to account for ICRC (4).
+		 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto ack_len_err;
+		/*
+		 * If this is a response to a resent RDMA read, we
+		 * have to be careful to copy the data to the right
+		 * location.
+		 */
+		wqe = get_swqe_ptr(qp, qp->s_acked);
+		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+						  wqe, psn, pmtu);
+		goto read_last;
+
+	case OP(RDMA_READ_RESPONSE_LAST):
+		/* ACKs READ req. */
+		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+			goto ack_seq_err;
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/*
+		 * Check that the data size is >= 1 && <= pmtu.
+		 * Remember to account for ICRC (4).
+		 */
+		if (unlikely(tlen <= (hdrsize + pad + 4)))
+			goto ack_len_err;
+read_last:
+		tlen -= hdrsize + pad + 4;
+		if (unlikely(tlen != qp->s_rdma_read_len))
+			goto ack_len_err;
+		aeth = be32_to_cpu(ohdr->u.aeth);
+		hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
+		WARN_ON(qp->s_rdma_read_sge.num_sge);
+		(void) do_rc_ack(qp, aeth, psn,
+				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
+		goto ack_done;
+	}
+
+ack_op_err:
+	status = IB_WC_LOC_QP_OP_ERR;
+	goto ack_err;
+
+ack_seq_err:
+	rdma_seq_err(qp, ibp, psn, rcd);
+	goto ack_done;
+
+ack_len_err:
+	status = IB_WC_LOC_LEN_ERR;
+ack_err:
+	if (qp->s_last == qp->s_acked) {
+		hfi1_send_complete(qp, wqe, status);
+		hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+	}
+ack_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+	return;
+}
+
+/**
+ * rc_rcv_error - process an incoming duplicate or error RC packet
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ *
+ * This is called from hfi1_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
+			struct hfi1_qp *qp, u32 opcode, u32 psn, int diff,
+			struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_ack_entry *e;
+	unsigned long flags;
+	u8 i, prev;
+	int old_req;
+
+	if (diff > 0) {
+		/*
+		 * Packet sequence error.
+		 * A NAK will ACK earlier sends and RDMA writes.
+		 * Don't queue the NAK if we already sent one.
+		 */
+		if (!qp->r_nak_state) {
+			ibp->n_rc_seqnak++;
+			qp->r_nak_state = IB_NAK_PSN_ERROR;
+			/* Use the expected PSN. */
+			qp->r_ack_psn = qp->r_psn;
+			/*
+			 * Wait to send the sequence NAK until all packets
+			 * in the receive queue have been processed.
+			 * Otherwise, we end up propagating congestion.
+			 */
+			if (list_empty(&qp->rspwait)) {
+				qp->r_flags |= HFI1_R_RSP_NAK;
+				atomic_inc(&qp->refcount);
+				list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+			}
+		}
+		goto done;
+	}
+
+	/*
+	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
+	 * write or atomic op.  Don't NAK errors, just silently drop
+	 * the duplicate request.  Note that r_sge, r_len, and
+	 * r_rcv_len may be in use so don't modify them.
+	 *
+	 * We are supposed to ACK the earliest duplicate PSN but we
+	 * can coalesce an outstanding duplicate ACK.  We have to
+	 * send the earliest so that RDMA reads can be restarted at
+	 * the requester's expected PSN.
+	 *
+	 * First, find where this duplicate PSN falls within the
+	 * ACKs previously sent.
+	 * old_req is true if there is an older response that is scheduled
+	 * to be sent before sending this one.
+	 */
+	e = NULL;
+	old_req = 1;
+	ibp->n_rc_dupreq++;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	for (i = qp->r_head_ack_queue; ; i = prev) {
+		if (i == qp->s_tail_ack_queue)
+			old_req = 0;
+		if (i)
+			prev = i - 1;
+		else
+			prev = HFI1_MAX_RDMA_ATOMIC;
+		if (prev == qp->r_head_ack_queue) {
+			e = NULL;
+			break;
+		}
+		e = &qp->s_ack_queue[prev];
+		if (!e->opcode) {
+			e = NULL;
+			break;
+		}
+		if (cmp_psn(psn, e->psn) >= 0) {
+			if (prev == qp->s_tail_ack_queue &&
+			    cmp_psn(psn, e->lpsn) <= 0)
+				old_req = 0;
+			break;
+		}
+	}
+	switch (opcode) {
+	case OP(RDMA_READ_REQUEST): {
+		struct ib_reth *reth;
+		u32 offset;
+		u32 len;
+
+		/*
+		 * If we didn't find the RDMA read request in the ack queue,
+		 * we can ignore this request.
+		 */
+		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
+			goto unlock_done;
+		/* RETH comes after BTH */
+		reth = &ohdr->u.rc.reth;
+		/*
+		 * Address range must be a subset of the original
+		 * request and start on pmtu boundaries.
+		 * We reuse the old ack_queue slot since the requester
+		 * should not back up and request an earlier PSN for the
+		 * same request.
+		 */
+		offset = delta_psn(psn, e->psn) * qp->pmtu;
+		len = be32_to_cpu(reth->length);
+		if (unlikely(offset + len != e->rdma_sge.sge_length))
+			goto unlock_done;
+		if (e->rdma_sge.mr) {
+			hfi1_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+		if (len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+					  IB_ACCESS_REMOTE_READ);
+			if (unlikely(!ok))
+				goto unlock_done;
+		} else {
+			e->rdma_sge.vaddr = NULL;
+			e->rdma_sge.length = 0;
+			e->rdma_sge.sge_length = 0;
+		}
+		e->psn = psn;
+		if (old_req)
+			goto unlock_done;
+		qp->s_tail_ack_queue = prev;
+		break;
+	}
+
+	case OP(COMPARE_SWAP):
+	case OP(FETCH_ADD): {
+		/*
+		 * If we didn't find the atomic request in the ack queue
+		 * or the send tasklet is already backed up to send an
+		 * earlier entry, we can ignore this request.
+		 */
+		if (!e || e->opcode != (u8) opcode || old_req)
+			goto unlock_done;
+		qp->s_tail_ack_queue = prev;
+		break;
+	}
+
+	default:
+		/*
+		 * Ignore this operation if it doesn't request an ACK
+		 * or an earlier RDMA read or atomic is going to be resent.
+		 */
+		if (!(psn & IB_BTH_REQ_ACK) || old_req)
+			goto unlock_done;
+		/*
+		 * Resend the most recent ACK if this request is
+		 * after all the previous RDMA reads and atomics.
+		 */
+		if (i == qp->r_head_ack_queue) {
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qp->r_nak_state = 0;
+			qp->r_ack_psn = qp->r_psn - 1;
+			goto send_ack;
+		}
+
+		/*
+		 * Resend the RDMA read or atomic op which
+		 * ACKs this duplicate request.
+		 */
+		qp->s_tail_ack_queue = i;
+		break;
+	}
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+	qp->s_flags |= HFI1_S_RESP_PENDING;
+	qp->r_nak_state = 0;
+	hfi1_schedule_send(qp);
+
+unlock_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return 1;
+
+send_ack:
+	return 0;
+}
+
+void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err)
+{
+	unsigned long flags;
+	int lastwqe;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	lastwqe = hfi1_error_qp(qp, err);
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	if (lastwqe) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+}
+
+static inline void update_ack_queue(struct hfi1_qp *qp, unsigned n)
+{
+	unsigned next;
+
+	next = n + 1;
+	if (next > HFI1_MAX_RDMA_ATOMIC)
+		next = 0;
+	qp->s_tail_ack_queue = next;
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
+			  u32 lqpn, u32 rqpn, u8 svc_type)
+{
+	struct opa_hfi1_cong_log_event_internal *cc_event;
+
+	if (sl >= OPA_MAX_SLS)
+		return;
+
+	spin_lock(&ppd->cc_log_lock);
+
+	ppd->threshold_cong_event_map[sl/8] |= 1 << (sl % 8);
+	ppd->threshold_event_counter++;
+
+	cc_event = &ppd->cc_events[ppd->cc_log_idx++];
+	if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
+		ppd->cc_log_idx = 0;
+	cc_event->lqpn = lqpn & HFI1_QPN_MASK;
+	cc_event->rqpn = rqpn & HFI1_QPN_MASK;
+	cc_event->sl = sl;
+	cc_event->svc_type = svc_type;
+	cc_event->rlid = rlid;
+	/* keep timestamp in units of 1.024 usec */
+	cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
+
+	spin_unlock(&ppd->cc_log_lock);
+}
+
+void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
+		  u32 rqpn, u8 svc_type)
+{
+	struct cca_timer *cca_timer;
+	u16 ccti, ccti_incr, ccti_timer, ccti_limit;
+	u8 trigger_threshold;
+	struct cc_state *cc_state;
+
+	if (sl >= OPA_MAX_SLS)
+		return;
+
+	cca_timer = &ppd->cca_timer[sl];
+
+	cc_state = get_cc_state(ppd);
+
+	if (cc_state == NULL)
+		return;
+
+	/*
+	 * 1) increase CCTI (for this SL)
+	 * 2) select IPG (i.e., call set_link_ipg())
+	 * 3) start timer
+	 */
+	ccti_limit = cc_state->cct.ccti_limit;
+	ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
+	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+	trigger_threshold =
+		cc_state->cong_setting.entries[sl].trigger_threshold;
+
+	spin_lock(&ppd->cca_timer_lock);
+
+	if (cca_timer->ccti < ccti_limit) {
+		if (cca_timer->ccti + ccti_incr <= ccti_limit)
+			cca_timer->ccti += ccti_incr;
+		else
+			cca_timer->ccti = ccti_limit;
+		set_link_ipg(ppd);
+	}
+
+	spin_unlock(&ppd->cca_timer_lock);
+
+	ccti = cca_timer->ccti;
+
+	if (!hrtimer_active(&cca_timer->hrtimer)) {
+		/* ccti_timer is in units of 1.024 usec */
+		unsigned long nsec = 1024 * ccti_timer;
+
+		hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
+			      HRTIMER_MODE_REL);
+	}
+
+	if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
+		log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
+}
+
+/**
+ * hfi1_rc_rcv - process an incoming RC packet
+ * @rcd: the context pointer
+ * @hdr: the header of this packet
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_rc_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct hfi1_ib_header *hdr = packet->hdr;
+	u32 rcv_flags = packet->rcv_flags;
+	void *data = packet->ebuf;
+	u32 tlen = packet->tlen;
+	struct hfi1_qp *qp = packet->qp;
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct hfi1_other_headers *ohdr = packet->ohdr;
+	u32 bth0, opcode;
+	u32 hdrsize = packet->hlen;
+	u32 psn;
+	u32 pad;
+	struct ib_wc wc;
+	u32 pmtu = qp->pmtu;
+	int diff;
+	struct ib_reth *reth;
+	unsigned long flags;
+	u32 bth1;
+	int ret, is_fecn = 0;
+
+	bth0 = be32_to_cpu(ohdr->bth[0]);
+	if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
+		return;
+
+	bth1 = be32_to_cpu(ohdr->bth[1]);
+	if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+		if (bth1 & HFI1_BECN_SMASK) {
+			u16 rlid = qp->remote_ah_attr.dlid;
+			u32 lqpn, rqpn;
+
+			lqpn = qp->ibqp.qp_num;
+			rqpn = qp->remote_qpn;
+			process_becn(
+				ppd,
+				qp->remote_ah_attr.sl,
+				rlid, lqpn, rqpn,
+				IB_CC_SVCTYPE_RC);
+		}
+		is_fecn = bth1 & HFI1_FECN_SMASK;
+	}
+
+	psn = be32_to_cpu(ohdr->bth[2]);
+	opcode = bth0 >> 24;
+
+	/*
+	 * Process responses (ACKs) before anything else.  Note that the
+	 * packet sequence number will be for something in the send work
+	 * queue rather than the expected receive packet sequence number.
+	 * In other words, this QP is the requester.
+	 */
+	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+		rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
+			    hdrsize, pmtu, rcd);
+		if (is_fecn)
+			goto send_ack;
+		return;
+	}
+
+	/* Compute 24 bits worth of difference. */
+	diff = delta_psn(psn, qp->r_psn);
+	if (unlikely(diff)) {
+		if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
+			return;
+		goto send_ack;
+	}
+
+	/* Check for opcode sequence errors. */
+	switch (qp->r_state) {
+	case OP(SEND_FIRST):
+	case OP(SEND_MIDDLE):
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+			break;
+		goto nack_inv;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_MIDDLE):
+		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			break;
+		goto nack_inv;
+
+	default:
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			goto nack_inv;
+		/*
+		 * Note that it is up to the requester to not send a new
+		 * RDMA read or atomic operation before receiving an ACK
+		 * for the previous operation.
+		 */
+		break;
+	}
+
+	if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
+		qp_comm_est(qp);
+
+	/* OK, process the packet. */
+	switch (opcode) {
+	case OP(SEND_FIRST):
+		ret = hfi1_get_rwqe(qp, 0);
+		if (ret < 0)
+			goto nack_op_err;
+		if (!ret)
+			goto rnr_nak;
+		qp->r_rcv_len = 0;
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+	case OP(RDMA_WRITE_MIDDLE):
+send_middle:
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto nack_inv;
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len))
+			goto nack_inv;
+		hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
+		break;
+
+	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+		/* consume RWQE */
+		ret = hfi1_get_rwqe(qp, 1);
+		if (ret < 0)
+			goto nack_op_err;
+		if (!ret)
+			goto rnr_nak;
+		goto send_last_imm;
+
+	case OP(SEND_ONLY):
+	case OP(SEND_ONLY_WITH_IMMEDIATE):
+		ret = hfi1_get_rwqe(qp, 0);
+		if (ret < 0)
+			goto nack_op_err;
+		if (!ret)
+			goto rnr_nak;
+		qp->r_rcv_len = 0;
+		if (opcode == OP(SEND_ONLY))
+			goto no_immediate_data;
+		/* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
+	case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+		wc.ex.imm_data = ohdr->u.imm_data;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		goto send_last;
+	case OP(SEND_LAST):
+	case OP(RDMA_WRITE_LAST):
+no_immediate_data:
+		wc.wc_flags = 0;
+		wc.ex.imm_data = 0;
+send_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (bth0 >> 20) & 3;
+		/* Check for invalid length. */
+		/* LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto nack_inv;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		wc.byte_len = tlen + qp->r_rcv_len;
+		if (unlikely(wc.byte_len > qp->r_len))
+			goto nack_inv;
+		hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+		hfi1_put_ss(&qp->r_sge);
+		qp->r_msn++;
+		if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+			break;
+		wc.wr_id = qp->r_wr_id;
+		wc.status = IB_WC_SUCCESS;
+		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		else
+			wc.opcode = IB_WC_RECV;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = qp->remote_qpn;
+		wc.slid = qp->remote_ah_attr.dlid;
+		/*
+		 * It seems that IB mandates the presence of an SL in a
+		 * work completion only for the UD transport (see section
+		 * 11.4.2 of IBTA Vol. 1).
+		 *
+		 * However, the way the SL is chosen below is consistent
+		 * with the way that IB/qib works and is trying avoid
+		 * introducing incompatibilities.
+		 *
+		 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+		 */
+		wc.sl = qp->remote_ah_attr.sl;
+		/* zero fields that are N/A */
+		wc.vendor_err = 0;
+		wc.pkey_index = 0;
+		wc.dlid_path_bits = 0;
+		wc.port_num = 0;
+		/* Signal completion event if the solicited bit is set. */
+		hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+			      (bth0 & IB_BTH_SOLICITED) != 0);
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_ONLY):
+	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto nack_inv;
+		/* consume RWQE */
+		reth = &ohdr->u.rc.reth;
+		qp->r_len = be32_to_cpu(reth->length);
+		qp->r_rcv_len = 0;
+		qp->r_sge.sg_list = NULL;
+		if (qp->r_len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey & NAK */
+			ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
+					  rkey, IB_ACCESS_REMOTE_WRITE);
+			if (unlikely(!ok))
+				goto nack_acc;
+			qp->r_sge.num_sge = 1;
+		} else {
+			qp->r_sge.num_sge = 0;
+			qp->r_sge.sge.mr = NULL;
+			qp->r_sge.sge.vaddr = NULL;
+			qp->r_sge.sge.length = 0;
+			qp->r_sge.sge.sge_length = 0;
+		}
+		if (opcode == OP(RDMA_WRITE_FIRST))
+			goto send_middle;
+		else if (opcode == OP(RDMA_WRITE_ONLY))
+			goto no_immediate_data;
+		ret = hfi1_get_rwqe(qp, 1);
+		if (ret < 0)
+			goto nack_op_err;
+		if (!ret)
+			goto rnr_nak;
+		wc.ex.imm_data = ohdr->u.rc.imm_data;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		goto send_last;
+
+	case OP(RDMA_READ_REQUEST): {
+		struct hfi1_ack_entry *e;
+		u32 len;
+		u8 next;
+
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+			goto nack_inv;
+		next = qp->r_head_ack_queue + 1;
+		/* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
+		if (next > HFI1_MAX_RDMA_ATOMIC)
+			next = 0;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (unlikely(next == qp->s_tail_ack_queue)) {
+			if (!qp->s_ack_queue[next].sent)
+				goto nack_inv_unlck;
+			update_ack_queue(qp, next);
+		}
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+			hfi1_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+		reth = &ohdr->u.rc.reth;
+		len = be32_to_cpu(reth->length);
+		if (len) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey & NAK */
+			ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+					  rkey, IB_ACCESS_REMOTE_READ);
+			if (unlikely(!ok))
+				goto nack_acc_unlck;
+			/*
+			 * Update the next expected PSN.  We add 1 later
+			 * below, so only add the remainder here.
+			 */
+			if (len > pmtu)
+				qp->r_psn += (len - 1) / pmtu;
+		} else {
+			e->rdma_sge.mr = NULL;
+			e->rdma_sge.vaddr = NULL;
+			e->rdma_sge.length = 0;
+			e->rdma_sge.sge_length = 0;
+		}
+		e->opcode = opcode;
+		e->sent = 0;
+		e->psn = psn;
+		e->lpsn = qp->r_psn;
+		/*
+		 * We need to increment the MSN here instead of when we
+		 * finish sending the result since a duplicate request would
+		 * increment it more than once.
+		 */
+		qp->r_msn++;
+		qp->r_psn++;
+		qp->r_state = opcode;
+		qp->r_nak_state = 0;
+		qp->r_head_ack_queue = next;
+
+		/* Schedule the send tasklet. */
+		qp->s_flags |= HFI1_S_RESP_PENDING;
+		hfi1_schedule_send(qp);
+
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		if (is_fecn)
+			goto send_ack;
+		return;
+	}
+
+	case OP(COMPARE_SWAP):
+	case OP(FETCH_ADD): {
+		struct ib_atomic_eth *ateth;
+		struct hfi1_ack_entry *e;
+		u64 vaddr;
+		atomic64_t *maddr;
+		u64 sdata;
+		u32 rkey;
+		u8 next;
+
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+			goto nack_inv;
+		next = qp->r_head_ack_queue + 1;
+		if (next > HFI1_MAX_RDMA_ATOMIC)
+			next = 0;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (unlikely(next == qp->s_tail_ack_queue)) {
+			if (!qp->s_ack_queue[next].sent)
+				goto nack_inv_unlck;
+			update_ack_queue(qp, next);
+		}
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+			hfi1_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+		ateth = &ohdr->u.atomic_eth;
+		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+			be32_to_cpu(ateth->vaddr[1]);
+		if (unlikely(vaddr & (sizeof(u64) - 1)))
+			goto nack_inv_unlck;
+		rkey = be32_to_cpu(ateth->rkey);
+		/* Check rkey & NAK */
+		if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+					   vaddr, rkey,
+					   IB_ACCESS_REMOTE_ATOMIC)))
+			goto nack_acc_unlck;
+		/* Perform atomic OP and save result. */
+		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+		sdata = be64_to_cpu(ateth->swap_data);
+		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+			(u64) atomic64_add_return(sdata, maddr) - sdata :
+			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+				      be64_to_cpu(ateth->compare_data),
+				      sdata);
+		hfi1_put_mr(qp->r_sge.sge.mr);
+		qp->r_sge.num_sge = 0;
+		e->opcode = opcode;
+		e->sent = 0;
+		e->psn = psn;
+		e->lpsn = psn;
+		qp->r_msn++;
+		qp->r_psn++;
+		qp->r_state = opcode;
+		qp->r_nak_state = 0;
+		qp->r_head_ack_queue = next;
+
+		/* Schedule the send tasklet. */
+		qp->s_flags |= HFI1_S_RESP_PENDING;
+		hfi1_schedule_send(qp);
+
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		if (is_fecn)
+			goto send_ack;
+		return;
+	}
+
+	default:
+		/* NAK unknown opcodes. */
+		goto nack_inv;
+	}
+	qp->r_psn++;
+	qp->r_state = opcode;
+	qp->r_ack_psn = psn;
+	qp->r_nak_state = 0;
+	/* Send an ACK if requested or required. */
+	if (psn & (1 << 31))
+		goto send_ack;
+	return;
+
+rnr_nak:
+	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue RNR NAK for later */
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= HFI1_R_RSP_NAK;
+		atomic_inc(&qp->refcount);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+	return;
+
+nack_op_err:
+	hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue NAK for later */
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= HFI1_R_RSP_NAK;
+		atomic_inc(&qp->refcount);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+	return;
+
+nack_inv_unlck:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+	hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue NAK for later */
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= HFI1_R_RSP_NAK;
+		atomic_inc(&qp->refcount);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+	return;
+
+nack_acc_unlck:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+	hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+send_ack:
+	hfi1_send_rc_ack(rcd, qp, is_fecn);
+}
+
+void hfi1_rc_hdrerr(
+	struct hfi1_ctxtdata *rcd,
+	struct hfi1_ib_header *hdr,
+	u32 rcv_flags,
+	struct hfi1_qp *qp)
+{
+	int has_grh = rcv_flags & HFI1_HAS_GRH;
+	struct hfi1_other_headers *ohdr;
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	int diff;
+	u8 opcode;
+	u32 psn;
+
+	/* Check for GRH */
+	ohdr = &hdr->u.oth;
+	if (has_grh)
+		ohdr = &hdr->u.l.oth;
+
+	opcode = be32_to_cpu(ohdr->bth[0]);
+	if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+		return;
+
+	psn = be32_to_cpu(ohdr->bth[2]);
+	opcode >>= 24;
+
+	/* Only deal with RDMA Writes for now */
+	if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
+		diff = delta_psn(psn, qp->r_psn);
+		if (!qp->r_nak_state && diff >= 0) {
+			ibp->n_rc_seqnak++;
+			qp->r_nak_state = IB_NAK_PSN_ERROR;
+			/* Use the expected PSN. */
+			qp->r_ack_psn = qp->r_psn;
+			/*
+			 * Wait to send the sequence
+			 * NAK until all packets
+			 * in the receive queue have
+			 * been processed.
+			 * Otherwise, we end up
+			 * propagating congestion.
+			 */
+			if (list_empty(&qp->rspwait)) {
+				qp->r_flags |= HFI1_R_RSP_NAK;
+				atomic_inc(&qp->refcount);
+				list_add_tail(
+					&qp->rspwait,
+					&rcd->qp_wait_list);
+				}
+		} /* Out of sequence NAK */
+	} /* QP Request NAKs */
+}
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
new file mode 100644
index 0000000..a411528
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/ruc.c
@@ -0,0 +1,948 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+#include "sdma.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ */
+const u32 ib_hfi1_rnr_table[32] = {
+	655360,	/* 00: 655.36 */
+	10,	/* 01:    .01 */
+	20,	/* 02     .02 */
+	30,	/* 03:    .03 */
+	40,	/* 04:    .04 */
+	60,	/* 05:    .06 */
+	80,	/* 06:    .08 */
+	120,	/* 07:    .12 */
+	160,	/* 08:    .16 */
+	240,	/* 09:    .24 */
+	320,	/* 0A:    .32 */
+	480,	/* 0B:    .48 */
+	640,	/* 0C:    .64 */
+	960,	/* 0D:    .96 */
+	1280,	/* 0E:   1.28 */
+	1920,	/* 0F:   1.92 */
+	2560,	/* 10:   2.56 */
+	3840,	/* 11:   3.84 */
+	5120,	/* 12:   5.12 */
+	7680,	/* 13:   7.68 */
+	10240,	/* 14:  10.24 */
+	15360,	/* 15:  15.36 */
+	20480,	/* 16:  20.48 */
+	30720,	/* 17:  30.72 */
+	40960,	/* 18:  40.96 */
+	61440,	/* 19:  61.44 */
+	81920,	/* 1A:  81.92 */
+	122880,	/* 1B: 122.88 */
+	163840,	/* 1C: 163.84 */
+	245760,	/* 1D: 245.76 */
+	327680,	/* 1E: 327.68 */
+	491520	/* 1F: 491.52 */
+};
+
+/*
+ * Validate a RWQE and fill in the SGE state.
+ * Return 1 if OK.
+ */
+static int init_sge(struct hfi1_qp *qp, struct hfi1_rwqe *wqe)
+{
+	int i, j, ret;
+	struct ib_wc wc;
+	struct hfi1_lkey_table *rkt;
+	struct hfi1_pd *pd;
+	struct hfi1_sge_state *ss;
+
+	rkt = &to_idev(qp->ibqp.device)->lk_table;
+	pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+	ss = &qp->r_sge;
+	ss->sg_list = qp->r_sg_list;
+	qp->r_len = 0;
+	for (i = j = 0; i < wqe->num_sge; i++) {
+		if (wqe->sg_list[i].length == 0)
+			continue;
+		/* Check LKEY */
+		if (!hfi1_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+				  &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+			goto bad_lkey;
+		qp->r_len += wqe->sg_list[i].length;
+		j++;
+	}
+	ss->num_sge = j;
+	ss->total_len = qp->r_len;
+	ret = 1;
+	goto bail;
+
+bad_lkey:
+	while (j) {
+		struct hfi1_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+		hfi1_put_mr(sge->mr);
+	}
+	ss->num_sge = 0;
+	memset(&wc, 0, sizeof(wc));
+	wc.wr_id = wqe->wr_id;
+	wc.status = IB_WC_LOC_PROT_ERR;
+	wc.opcode = IB_WC_RECV;
+	wc.qp = &qp->ibqp;
+	/* Signal solicited completion event. */
+	hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * hfi1_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only)
+{
+	unsigned long flags;
+	struct hfi1_rq *rq;
+	struct hfi1_rwq *wq;
+	struct hfi1_srq *srq;
+	struct hfi1_rwqe *wqe;
+	void (*handler)(struct ib_event *, void *);
+	u32 tail;
+	int ret;
+
+	if (qp->ibqp.srq) {
+		srq = to_isrq(qp->ibqp.srq);
+		handler = srq->ibsrq.event_handler;
+		rq = &srq->rq;
+	} else {
+		srq = NULL;
+		handler = NULL;
+		rq = &qp->r_rq;
+	}
+
+	spin_lock_irqsave(&rq->lock, flags);
+	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) {
+		ret = 0;
+		goto unlock;
+	}
+
+	wq = rq->wq;
+	tail = wq->tail;
+	/* Validate tail before using it since it is user writable. */
+	if (tail >= rq->size)
+		tail = 0;
+	if (unlikely(tail == wq->head)) {
+		ret = 0;
+		goto unlock;
+	}
+	/* Make sure entry is read after head index is read. */
+	smp_rmb();
+	wqe = get_rwqe_ptr(rq, tail);
+	/*
+	 * Even though we update the tail index in memory, the verbs
+	 * consumer is not supposed to post more entries until a
+	 * completion is generated.
+	 */
+	if (++tail >= rq->size)
+		tail = 0;
+	wq->tail = tail;
+	if (!wr_id_only && !init_sge(qp, wqe)) {
+		ret = -1;
+		goto unlock;
+	}
+	qp->r_wr_id = wqe->wr_id;
+
+	ret = 1;
+	set_bit(HFI1_R_WRID_VALID, &qp->r_aflags);
+	if (handler) {
+		u32 n;
+
+		/*
+		 * Validate head pointer value and compute
+		 * the number of remaining WQEs.
+		 */
+		n = wq->head;
+		if (n >= rq->size)
+			n = 0;
+		if (n < tail)
+			n += rq->size - tail;
+		else
+			n -= tail;
+		if (n < srq->limit) {
+			struct ib_event ev;
+
+			srq->limit = 0;
+			spin_unlock_irqrestore(&rq->lock, flags);
+			ev.device = qp->ibqp.device;
+			ev.element.srq = qp->ibqp.srq;
+			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			handler(&ev, srq->ibsrq.srq_context);
+			goto bail;
+		}
+	}
+unlock:
+	spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+	return ret;
+}
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void hfi1_migrate_qp(struct hfi1_qp *qp)
+{
+	struct ib_event ev;
+
+	qp->s_mig_state = IB_MIG_MIGRATED;
+	qp->remote_ah_attr = qp->alt_ah_attr;
+	qp->port_num = qp->alt_ah_attr.port_num;
+	qp->s_pkey_index = qp->s_alt_pkey_index;
+	qp->s_flags |= HFI1_S_AHG_CLEAR;
+
+	ev.device = qp->ibqp.device;
+	ev.element.qp = &qp->ibqp;
+	ev.event = IB_EVENT_PATH_MIG;
+	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
+{
+	if (!index) {
+		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+		return cpu_to_be64(ppd->guid);
+	}
+	return ibp->guids[index - 1];
+}
+
+static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
+{
+	return (gid->global.interface_id == id &&
+		(gid->global.subnet_prefix == gid_prefix ||
+		 gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
+}
+
+/*
+ *
+ * This should be called with the QP r_lock held.
+ *
+ * The s_lock will be acquired around the hfi1_migrate_qp() call.
+ */
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+		       int has_grh, struct hfi1_qp *qp, u32 bth0)
+{
+	__be64 guid;
+	unsigned long flags;
+	u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+
+	if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
+		if (!has_grh) {
+			if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
+				goto err;
+		} else {
+			if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
+				goto err;
+			guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
+			if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+				goto err;
+			if (!gid_ok(&hdr->u.l.grh.sgid,
+			    qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
+			    qp->alt_ah_attr.grh.dgid.global.interface_id))
+				goto err;
+		}
+		if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+					    sc5, be16_to_cpu(hdr->lrh[3])))) {
+			hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+				       (u16)bth0,
+				       (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+				       0, qp->ibqp.qp_num,
+				       hdr->lrh[3], hdr->lrh[1]);
+			goto err;
+		}
+		/* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
+		if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
+		    ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
+			goto err;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		hfi1_migrate_qp(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	} else {
+		if (!has_grh) {
+			if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+				goto err;
+		} else {
+			if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
+				goto err;
+			guid = get_sguid(ibp,
+					 qp->remote_ah_attr.grh.sgid_index);
+			if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+				goto err;
+			if (!gid_ok(&hdr->u.l.grh.sgid,
+			    qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
+			    qp->remote_ah_attr.grh.dgid.global.interface_id))
+				goto err;
+		}
+		if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+					    sc5, be16_to_cpu(hdr->lrh[3])))) {
+			hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+				       (u16)bth0,
+				       (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+				       0, qp->ibqp.qp_num,
+				       hdr->lrh[3], hdr->lrh[1]);
+			goto err;
+		}
+		/* Validate the SLID. See Ch. 9.6.1.5 */
+		if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
+		    ppd_from_ibp(ibp)->port != qp->port_num)
+			goto err;
+		if (qp->s_mig_state == IB_MIG_REARM &&
+		    !(bth0 & IB_BTH_MIG_REQ))
+			qp->s_mig_state = IB_MIG_ARMED;
+	}
+
+	return 0;
+
+err:
+	return 1;
+}
+
+/**
+ * ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from hfi1_do_send() to
+ * forward a WQE addressed to the same HFI.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ruc_loopback(struct hfi1_qp *sqp)
+{
+	struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+	struct hfi1_qp *qp;
+	struct hfi1_swqe *wqe;
+	struct hfi1_sge *sge;
+	unsigned long flags;
+	struct ib_wc wc;
+	u64 sdata;
+	atomic64_t *maddr;
+	enum ib_wc_status send_status;
+	int release;
+	int ret;
+
+	rcu_read_lock();
+
+	/*
+	 * Note that we check the responder QP state after
+	 * checking the requester's state.
+	 */
+	qp = hfi1_lookup_qpn(ibp, sqp->remote_qpn);
+
+	spin_lock_irqsave(&sqp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if ((sqp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT)) ||
+	    !(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+		goto unlock;
+
+	sqp->s_flags |= HFI1_S_BUSY;
+
+again:
+	if (sqp->s_last == sqp->s_head)
+		goto clr_busy;
+	wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+	/* Return if it is not OK to start a new work request. */
+	if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_hfi1_state_ops[sqp->state] & HFI1_FLUSH_SEND))
+			goto clr_busy;
+		/* We are in the error state, flush the work request. */
+		send_status = IB_WC_WR_FLUSH_ERR;
+		goto flush_send;
+	}
+
+	/*
+	 * We can rely on the entry not changing without the s_lock
+	 * being held until we update s_last.
+	 * We increment s_cur to indicate s_last is in progress.
+	 */
+	if (sqp->s_last == sqp->s_cur) {
+		if (++sqp->s_cur >= sqp->s_size)
+			sqp->s_cur = 0;
+	}
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+	if (!qp || !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) ||
+	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+		ibp->n_pkt_drops++;
+		/*
+		 * For RC, the requester would timeout and retry so
+		 * shortcut the timeouts and just signal too many retries.
+		 */
+		if (sqp->ibqp.qp_type == IB_QPT_RC)
+			send_status = IB_WC_RETRY_EXC_ERR;
+		else
+			send_status = IB_WC_SUCCESS;
+		goto serr;
+	}
+
+	memset(&wc, 0, sizeof(wc));
+	send_status = IB_WC_SUCCESS;
+
+	release = 1;
+	sqp->s_sge.sge = wqe->sg_list[0];
+	sqp->s_sge.sg_list = wqe->sg_list + 1;
+	sqp->s_sge.num_sge = wqe->wr.num_sge;
+	sqp->s_len = wqe->length;
+	switch (wqe->wr.opcode) {
+	case IB_WR_SEND_WITH_IMM:
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		/* FALLTHROUGH */
+	case IB_WR_SEND:
+		ret = hfi1_get_rwqe(qp, 0);
+		if (ret < 0)
+			goto op_err;
+		if (!ret)
+			goto rnr_nak;
+		break;
+
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		ret = hfi1_get_rwqe(qp, 1);
+		if (ret < 0)
+			goto op_err;
+		if (!ret)
+			goto rnr_nak;
+		/* FALLTHROUGH */
+	case IB_WR_RDMA_WRITE:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		if (wqe->length == 0)
+			break;
+		if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+					   wqe->wr.wr.rdma.remote_addr,
+					   wqe->wr.wr.rdma.rkey,
+					   IB_ACCESS_REMOTE_WRITE)))
+			goto acc_err;
+		qp->r_sge.sg_list = NULL;
+		qp->r_sge.num_sge = 1;
+		qp->r_sge.total_len = wqe->length;
+		break;
+
+	case IB_WR_RDMA_READ:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+			goto inv_err;
+		if (unlikely(!hfi1_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+					   wqe->wr.wr.rdma.remote_addr,
+					   wqe->wr.wr.rdma.rkey,
+					   IB_ACCESS_REMOTE_READ)))
+			goto acc_err;
+		release = 0;
+		sqp->s_sge.sg_list = NULL;
+		sqp->s_sge.num_sge = 1;
+		qp->r_sge.sge = wqe->sg_list[0];
+		qp->r_sge.sg_list = wqe->sg_list + 1;
+		qp->r_sge.num_sge = wqe->wr.num_sge;
+		qp->r_sge.total_len = wqe->length;
+		break;
+
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+			goto inv_err;
+		if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+					   wqe->wr.wr.atomic.remote_addr,
+					   wqe->wr.wr.atomic.rkey,
+					   IB_ACCESS_REMOTE_ATOMIC)))
+			goto acc_err;
+		/* Perform atomic OP and save result. */
+		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+		sdata = wqe->wr.wr.atomic.compare_add;
+		*(u64 *) sqp->s_sge.sge.vaddr =
+			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+			(u64) atomic64_add_return(sdata, maddr) - sdata :
+			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+				      sdata, wqe->wr.wr.atomic.swap);
+		hfi1_put_mr(qp->r_sge.sge.mr);
+		qp->r_sge.num_sge = 0;
+		goto send_comp;
+
+	default:
+		send_status = IB_WC_LOC_QP_OP_ERR;
+		goto serr;
+	}
+
+	sge = &sqp->s_sge.sge;
+	while (sqp->s_len) {
+		u32 len = sqp->s_len;
+
+		if (len > sge->length)
+			len = sge->length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		WARN_ON_ONCE(len == 0);
+		hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (!release)
+				hfi1_put_mr(sge->mr);
+			if (--sqp->s_sge.num_sge)
+				*sge = *sqp->s_sge.sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= HFI1_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		sqp->s_len -= len;
+	}
+	if (release)
+		hfi1_put_ss(&qp->r_sge);
+
+	if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+		goto send_comp;
+
+	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+	else
+		wc.opcode = IB_WC_RECV;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.byte_len = wqe->length;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = qp->remote_qpn;
+	wc.slid = qp->remote_ah_attr.dlid;
+	wc.sl = qp->remote_ah_attr.sl;
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		      wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	ibp->n_loop_pkts++;
+flush_send:
+	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+	hfi1_send_complete(sqp, wqe, send_status);
+	goto again;
+
+rnr_nak:
+	/* Handle RNR NAK */
+	if (qp->ibqp.qp_type == IB_QPT_UC)
+		goto send_comp;
+	ibp->n_rnr_naks++;
+	/*
+	 * Note: we don't need the s_lock held since the BUSY flag
+	 * makes this single threaded.
+	 */
+	if (sqp->s_rnr_retry == 0) {
+		send_status = IB_WC_RNR_RETRY_EXC_ERR;
+		goto serr;
+	}
+	if (sqp->s_rnr_retry_cnt < 7)
+		sqp->s_rnr_retry--;
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_RECV_OK))
+		goto clr_busy;
+	sqp->s_flags |= HFI1_S_WAIT_RNR;
+	sqp->s_timer.function = hfi1_rc_rnr_retry;
+	sqp->s_timer.expires = jiffies +
+		usecs_to_jiffies(ib_hfi1_rnr_table[qp->r_min_rnr_timer]);
+	add_timer(&sqp->s_timer);
+	goto clr_busy;
+
+op_err:
+	send_status = IB_WC_REM_OP_ERR;
+	wc.status = IB_WC_LOC_QP_OP_ERR;
+	goto err;
+
+inv_err:
+	send_status = IB_WC_REM_INV_REQ_ERR;
+	wc.status = IB_WC_LOC_QP_OP_ERR;
+	goto err;
+
+acc_err:
+	send_status = IB_WC_REM_ACCESS_ERR;
+	wc.status = IB_WC_LOC_PROT_ERR;
+err:
+	/* responder goes to error state */
+	hfi1_rc_error(qp, wc.status);
+
+serr:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	hfi1_send_complete(sqp, wqe, send_status);
+	if (sqp->ibqp.qp_type == IB_QPT_RC) {
+		int lastwqe = hfi1_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+		sqp->s_flags &= ~HFI1_S_BUSY;
+		spin_unlock_irqrestore(&sqp->s_lock, flags);
+		if (lastwqe) {
+			struct ib_event ev;
+
+			ev.device = sqp->ibqp.device;
+			ev.element.qp = &sqp->ibqp;
+			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+		}
+		goto done;
+	}
+clr_busy:
+	sqp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+	rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_grh - construct a GRH header
+ * @ibp: a pointer to the IB port
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+		  struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+	hdr->version_tclass_flow =
+		cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
+			    (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
+			    (grh->flow_label << IB_GRH_FLOW_SHIFT));
+	hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+	/* next_hdr is defined by C8-7 in ch. 8.4.1 */
+	hdr->next_hdr = IB_GRH_NEXT_HDR;
+	hdr->hop_limit = grh->hop_limit;
+	/* The SGID is 32-bit aligned. */
+	hdr->sgid.global.subnet_prefix = ibp->gid_prefix;
+	hdr->sgid.global.interface_id =
+		grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
+		ibp->guids[grh->sgid_index - 1] :
+			cpu_to_be64(ppd_from_ibp(ibp)->guid);
+	hdr->dgid = grh->dgid;
+
+	/* GRH header size in 32-bit words. */
+	return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+/*
+ * free_ahg - clear ahg from QP
+ */
+void clear_ahg(struct hfi1_qp *qp)
+{
+	qp->s_hdr->ahgcount = 0;
+	qp->s_flags &= ~(HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR);
+	if (qp->s_sde)
+		sdma_ahg_free(qp->s_sde, qp->s_ahgidx);
+	qp->s_ahgidx = -1;
+	qp->s_sde = NULL;
+}
+
+#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
+
+/**
+ * build_ahg - create ahg in s_hdr
+ * @qp: a pointer to QP
+ * @npsn: the next PSN for the request/response
+ *
+ * This routine handles the AHG by allocating an ahg entry and causing the
+ * copy of the first middle.
+ *
+ * Subsequent middles use the copied entry, editing the
+ * PSN with 1 or 2 edits.
+ */
+static inline void build_ahg(struct hfi1_qp *qp, u32 npsn)
+{
+	if (unlikely(qp->s_flags & HFI1_S_AHG_CLEAR))
+		clear_ahg(qp);
+	if (!(qp->s_flags & HFI1_S_AHG_VALID)) {
+		/* first middle that needs copy  */
+		if (qp->s_ahgidx < 0) {
+			if (!qp->s_sde)
+				qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
+			qp->s_ahgidx = sdma_ahg_alloc(qp->s_sde);
+		}
+		if (qp->s_ahgidx >= 0) {
+			qp->s_ahgpsn = npsn;
+			qp->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
+			/* save to protect a change in another thread */
+			qp->s_hdr->sde = qp->s_sde;
+			qp->s_hdr->ahgidx = qp->s_ahgidx;
+			qp->s_flags |= HFI1_S_AHG_VALID;
+		}
+	} else {
+		/* subsequent middle after valid */
+		if (qp->s_ahgidx >= 0) {
+			qp->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
+			qp->s_hdr->ahgidx = qp->s_ahgidx;
+			qp->s_hdr->ahgcount++;
+			qp->s_hdr->ahgdesc[0] =
+				sdma_build_ahg_descriptor(
+					(__force u16)cpu_to_be16((u16)npsn),
+					BTH2_OFFSET,
+					16,
+					16);
+			if ((npsn & 0xffff0000) !=
+					(qp->s_ahgpsn & 0xffff0000)) {
+				qp->s_hdr->ahgcount++;
+				qp->s_hdr->ahgdesc[1] =
+					sdma_build_ahg_descriptor(
+						(__force u16)cpu_to_be16(
+							(u16)(npsn >> 16)),
+						BTH2_OFFSET,
+						0,
+						16);
+			}
+		}
+	}
+}
+
+void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
+			  u32 bth0, u32 bth2, int middle)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	u16 lrh0;
+	u32 nwords;
+	u32 extra_bytes;
+	u8 sc5;
+	u32 bth1;
+
+	/* Construct the header. */
+	extra_bytes = -qp->s_cur_size & 3;
+	nwords = (qp->s_cur_size + extra_bytes) >> 2;
+	lrh0 = HFI1_LRH_BTH;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh,
+					       &qp->remote_ah_attr.grh,
+					       qp->s_hdrwords, nwords);
+		lrh0 = HFI1_LRH_GRH;
+		middle = 0;
+	}
+	sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+	lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+	qp->s_sc = sc5;
+	/*
+	 * reset s_hdr/AHG fields
+	 *
+	 * This insures that the ahgentry/ahgcount
+	 * are at a non-AHG default to protect
+	 * build_verbs_tx_desc() from using
+	 * an include ahgidx.
+	 *
+	 * build_ahg() will modify as appropriate
+	 * to use the AHG feature.
+	 */
+	qp->s_hdr->tx_flags = 0;
+	qp->s_hdr->ahgcount = 0;
+	qp->s_hdr->ahgidx = 0;
+	qp->s_hdr->sde = NULL;
+	if (qp->s_mig_state == IB_MIG_MIGRATED)
+		bth0 |= IB_BTH_MIG_REQ;
+	else
+		middle = 0;
+	if (middle)
+		build_ahg(qp, bth2);
+	else
+		qp->s_flags &= ~HFI1_S_AHG_VALID;
+	qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr->ibh.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	qp->s_hdr->ibh.lrh[2] =
+		cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+	qp->s_hdr->ibh.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
+				       qp->remote_ah_attr.src_path_bits);
+	bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+	bth0 |= extra_bytes << 20;
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	bth1 = qp->remote_qpn;
+	if (qp->s_flags & HFI1_S_ECN) {
+		qp->s_flags &= ~HFI1_S_ECN;
+		/* we recently received a FECN, so return a BECN */
+		bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+	}
+	ohdr->bth[1] = cpu_to_be32(bth1);
+	ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/**
+ * hfi1_do_send - perform a send on a QP
+ * @work: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void hfi1_do_send(struct work_struct *work)
+{
+	struct iowait *wait = container_of(work, struct iowait, iowork);
+	struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait);
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	int (*make_req)(struct hfi1_qp *qp);
+	unsigned long flags;
+
+	if ((qp->ibqp.qp_type == IB_QPT_RC ||
+	     qp->ibqp.qp_type == IB_QPT_UC) &&
+	    !loopback &&
+	    (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) {
+		ruc_loopback(qp);
+		return;
+	}
+
+	if (qp->ibqp.qp_type == IB_QPT_RC)
+		make_req = hfi1_make_rc_req;
+	else if (qp->ibqp.qp_type == IB_QPT_UC)
+		make_req = hfi1_make_uc_req;
+	else
+		make_req = hfi1_make_ud_req;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if (!hfi1_send_ok(qp)) {
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		return;
+	}
+
+	qp->s_flags |= HFI1_S_BUSY;
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	do {
+		/* Check for a constructed packet to be sent. */
+		if (qp->s_hdrwords != 0) {
+			/*
+			 * If the packet cannot be sent now, return and
+			 * the send tasklet will be woken up later.
+			 */
+			if (hfi1_verbs_send(qp, qp->s_hdr, qp->s_hdrwords,
+					    qp->s_cur_sge, qp->s_cur_size))
+				break;
+			/* Record that s_hdr is empty. */
+			qp->s_hdrwords = 0;
+		}
+	} while (make_req(qp));
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe,
+			enum ib_wc_status status)
+{
+	u32 old_last, last;
+	unsigned i;
+
+	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+		return;
+
+	for (i = 0; i < wqe->wr.num_sge; i++) {
+		struct hfi1_sge *sge = &wqe->sg_list[i];
+
+		hfi1_put_mr(sge->mr);
+	}
+	if (qp->ibqp.qp_type == IB_QPT_UD ||
+	    qp->ibqp.qp_type == IB_QPT_SMI ||
+	    qp->ibqp.qp_type == IB_QPT_GSI)
+		atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+
+	/* See ch. 11.2.4.1 and 10.7.3.1 */
+	if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+	    status != IB_WC_SUCCESS) {
+		struct ib_wc wc;
+
+		memset(&wc, 0, sizeof(wc));
+		wc.wr_id = wqe->wr.wr_id;
+		wc.status = status;
+		wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+		wc.qp = &qp->ibqp;
+		if (status == IB_WC_SUCCESS)
+			wc.byte_len = wqe->length;
+		hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+			      status != IB_WC_SUCCESS);
+	}
+
+	last = qp->s_last;
+	old_last = last;
+	if (++last >= qp->s_size)
+		last = 0;
+	qp->s_last = last;
+	if (qp->s_acked == old_last)
+		qp->s_acked = last;
+	if (qp->s_cur == old_last)
+		qp->s_cur = last;
+	if (qp->s_tail == old_last)
+		qp->s_tail = last;
+	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+		qp->s_draining = 0;
+}
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
new file mode 100644
index 0000000..37bd767
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sdma.c
@@ -0,0 +1,2962 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/bitops.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "qp.h"
+#include "sdma.h"
+#include "iowait.h"
+#include "trace.h"
+
+/* must be a power of 2 >= 64 <= 32768 */
+#define SDMA_DESCQ_CNT 1024
+#define INVALID_TAIL 0xffff
+
+static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
+module_param(sdma_descq_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
+
+static uint sdma_idle_cnt = 250;
+module_param(sdma_idle_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
+
+uint mod_num_sdma;
+module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
+MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
+
+#define SDMA_WAIT_BATCH_SIZE 20
+/* max wait time for a SDMA engine to indicate it has halted */
+#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
+/* all SDMA engine errors that cause a halt */
+
+#define SD(name) SEND_DMA_##name
+#define ALL_SDMA_ENG_HALT_ERRS \
+	(SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
+	| SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
+
+/* sdma_sendctrl operations */
+#define SDMA_SENDCTRL_OP_ENABLE    (1U << 0)
+#define SDMA_SENDCTRL_OP_INTENABLE (1U << 1)
+#define SDMA_SENDCTRL_OP_HALT      (1U << 2)
+#define SDMA_SENDCTRL_OP_CLEANUP   (1U << 3)
+
+/* handle long defines */
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
+
+static const char * const sdma_state_names[] = {
+	[sdma_state_s00_hw_down]                = "s00_HwDown",
+	[sdma_state_s10_hw_start_up_halt_wait]  = "s10_HwStartUpHaltWait",
+	[sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
+	[sdma_state_s20_idle]                   = "s20_Idle",
+	[sdma_state_s30_sw_clean_up_wait]       = "s30_SwCleanUpWait",
+	[sdma_state_s40_hw_clean_up_wait]       = "s40_HwCleanUpWait",
+	[sdma_state_s50_hw_halt_wait]           = "s50_HwHaltWait",
+	[sdma_state_s60_idle_halt_wait]         = "s60_IdleHaltWait",
+	[sdma_state_s80_hw_freeze]		= "s80_HwFreeze",
+	[sdma_state_s82_freeze_sw_clean]	= "s82_FreezeSwClean",
+	[sdma_state_s99_running]                = "s99_Running",
+};
+
+static const char * const sdma_event_names[] = {
+	[sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
+	[sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
+	[sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
+	[sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
+	[sdma_event_e30_go_running]   = "e30_GoRunning",
+	[sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
+	[sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
+	[sdma_event_e60_hw_halted]    = "e60_HwHalted",
+	[sdma_event_e70_go_idle]      = "e70_GoIdle",
+	[sdma_event_e80_hw_freeze]    = "e80_HwFreeze",
+	[sdma_event_e81_hw_frozen]    = "e81_HwFrozen",
+	[sdma_event_e82_hw_unfreeze]  = "e82_HwUnfreeze",
+	[sdma_event_e85_link_down]    = "e85_LinkDown",
+	[sdma_event_e90_sw_halted]    = "e90_SwHalted",
+};
+
+static const struct sdma_set_state_action sdma_action_table[] = {
+	[sdma_state_s00_hw_down] = {
+		.go_s99_running_tofalse = 1,
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 0,
+		.op_cleanup = 0,
+	},
+	[sdma_state_s10_hw_start_up_halt_wait] = {
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 1,
+		.op_cleanup = 0,
+	},
+	[sdma_state_s15_hw_start_up_clean_wait] = {
+		.op_enable = 0,
+		.op_intenable = 1,
+		.op_halt = 0,
+		.op_cleanup = 1,
+	},
+	[sdma_state_s20_idle] = {
+		.op_enable = 0,
+		.op_intenable = 1,
+		.op_halt = 0,
+		.op_cleanup = 0,
+	},
+	[sdma_state_s30_sw_clean_up_wait] = {
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 0,
+		.op_cleanup = 0,
+	},
+	[sdma_state_s40_hw_clean_up_wait] = {
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 0,
+		.op_cleanup = 1,
+	},
+	[sdma_state_s50_hw_halt_wait] = {
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 0,
+		.op_cleanup = 0,
+	},
+	[sdma_state_s60_idle_halt_wait] = {
+		.go_s99_running_tofalse = 1,
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 1,
+		.op_cleanup = 0,
+	},
+	[sdma_state_s80_hw_freeze] = {
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 0,
+		.op_cleanup = 0,
+	},
+	[sdma_state_s82_freeze_sw_clean] = {
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 0,
+		.op_cleanup = 0,
+	},
+	[sdma_state_s99_running] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 0,
+		.op_cleanup = 0,
+		.go_s99_running_totrue = 1,
+	},
+};
+
+#define SDMA_TAIL_UPDATE_THRESH 0x1F
+
+/* declare all statics here rather than keep sorting */
+static void sdma_complete(struct kref *);
+static void sdma_finalput(struct sdma_state *);
+static void sdma_get(struct sdma_state *);
+static void sdma_hw_clean_up_task(unsigned long);
+static void sdma_put(struct sdma_state *);
+static void sdma_set_state(struct sdma_engine *, enum sdma_states);
+static void sdma_start_hw_clean_up(struct sdma_engine *);
+static void sdma_start_sw_clean_up(struct sdma_engine *);
+static void sdma_sw_clean_up_task(unsigned long);
+static void sdma_sendctrl(struct sdma_engine *, unsigned);
+static void init_sdma_regs(struct sdma_engine *, u32, uint);
+static void sdma_process_event(
+	struct sdma_engine *sde,
+	enum sdma_events event);
+static void __sdma_process_event(
+	struct sdma_engine *sde,
+	enum sdma_events event);
+static void dump_sdma_state(struct sdma_engine *sde);
+static void sdma_make_progress(struct sdma_engine *sde, u64 status);
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
+static void sdma_flush_descq(struct sdma_engine *sde);
+
+/**
+ * sdma_state_name() - return state string from enum
+ * @state: state
+ */
+static const char *sdma_state_name(enum sdma_states state)
+{
+	return sdma_state_names[state];
+}
+
+static void sdma_get(struct sdma_state *ss)
+{
+	kref_get(&ss->kref);
+}
+
+static void sdma_complete(struct kref *kref)
+{
+	struct sdma_state *ss =
+		container_of(kref, struct sdma_state, kref);
+
+	complete(&ss->comp);
+}
+
+static void sdma_put(struct sdma_state *ss)
+{
+	kref_put(&ss->kref, sdma_complete);
+}
+
+static void sdma_finalput(struct sdma_state *ss)
+{
+	sdma_put(ss);
+	wait_for_completion(&ss->comp);
+}
+
+static inline void write_sde_csr(
+	struct sdma_engine *sde,
+	u32 offset0,
+	u64 value)
+{
+	write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
+}
+
+static inline u64 read_sde_csr(
+	struct sdma_engine *sde,
+	u32 offset0)
+{
+	return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
+}
+
+/*
+ * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
+ * sdma engine 'sde' to drop to 0.
+ */
+static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
+					int pause)
+{
+	u64 off = 8 * sde->this_idx;
+	struct hfi1_devdata *dd = sde->dd;
+	int lcnt = 0;
+
+	while (1) {
+		u64 reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
+
+		reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
+		reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
+		if (reg == 0)
+			break;
+		if (lcnt++ > 100) {
+			dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u\n",
+				  __func__, sde->this_idx, (u32)reg);
+			break;
+		}
+		udelay(1);
+	}
+}
+
+/*
+ * sdma_wait() - wait for packet egress to complete for all SDMA engines,
+ * and pause for credit return.
+ */
+void sdma_wait(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < dd->num_sdma; i++) {
+		struct sdma_engine *sde = &dd->per_sdma[i];
+
+		sdma_wait_for_packet_egress(sde, 0);
+	}
+}
+
+static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
+{
+	u64 reg;
+
+	if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
+		return;
+	reg = cnt;
+	reg &= SD(DESC_CNT_CNT_MASK);
+	reg <<= SD(DESC_CNT_CNT_SHIFT);
+	write_sde_csr(sde, SD(DESC_CNT), reg);
+}
+
+/*
+ * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
+ *
+ * Depending on timing there can be txreqs in two places:
+ * - in the descq ring
+ * - in the flush list
+ *
+ * To avoid ordering issues the descq ring needs to be flushed
+ * first followed by the flush list.
+ *
+ * This routine is called from two places
+ * - From a work queue item
+ * - Directly from the state machine just before setting the
+ *   state to running
+ *
+ * Must be called with head_lock held
+ *
+ */
+static void sdma_flush(struct sdma_engine *sde)
+{
+	struct sdma_txreq *txp, *txp_next;
+	LIST_HEAD(flushlist);
+
+	/* flush from head to tail */
+	sdma_flush_descq(sde);
+	spin_lock(&sde->flushlist_lock);
+	/* copy flush list */
+	list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
+		list_del_init(&txp->list);
+		list_add_tail(&txp->list, &flushlist);
+	}
+	spin_unlock(&sde->flushlist_lock);
+	/* flush from flush list */
+	list_for_each_entry_safe(txp, txp_next, &flushlist, list) {
+		int drained = 0;
+		/* protect against complete modifying */
+		struct iowait *wait = txp->wait;
+
+		list_del_init(&txp->list);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+		trace_hfi1_sdma_out_sn(sde, txp->sn);
+		if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+			dd_dev_err(sde->dd, "expected %llu got %llu\n",
+				sde->head_sn, txp->sn);
+		sde->head_sn++;
+#endif
+		sdma_txclean(sde->dd, txp);
+		if (wait)
+			drained = atomic_dec_and_test(&wait->sdma_busy);
+		if (txp->complete)
+			(*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained);
+		if (wait && drained)
+			iowait_drain_wakeup(wait);
+	}
+}
+
+/*
+ * Fields a work request for flushing the descq ring
+ * and the flush list
+ *
+ * If the engine has been brought to running during
+ * the scheduling delay, the flush is ignored, assuming
+ * that the process of bringing the engine to running
+ * would have done this flush prior to going to running.
+ *
+ */
+static void sdma_field_flush(struct work_struct *work)
+{
+	unsigned long flags;
+	struct sdma_engine *sde =
+		container_of(work, struct sdma_engine, flush_worker);
+
+	write_seqlock_irqsave(&sde->head_lock, flags);
+	if (!__sdma_running(sde))
+		sdma_flush(sde);
+	write_sequnlock_irqrestore(&sde->head_lock, flags);
+}
+
+static void sdma_err_halt_wait(struct work_struct *work)
+{
+	struct sdma_engine *sde = container_of(work, struct sdma_engine,
+						err_halt_worker);
+	u64 statuscsr;
+	unsigned long timeout;
+
+	timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
+	while (1) {
+		statuscsr = read_sde_csr(sde, SD(STATUS));
+		statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
+		if (statuscsr)
+			break;
+		if (time_after(jiffies, timeout)) {
+			dd_dev_err(sde->dd,
+				"SDMA engine %d - timeout waiting for engine to halt\n",
+				sde->this_idx);
+			/*
+			 * Continue anyway.  This could happen if there was
+			 * an uncorrectable error in the wrong spot.
+			 */
+			break;
+		}
+		usleep_range(80, 120);
+	}
+
+	sdma_process_event(sde, sdma_event_e15_hw_halt_done);
+}
+
+static void sdma_start_err_halt_wait(struct sdma_engine *sde)
+{
+	schedule_work(&sde->err_halt_worker);
+}
+
+
+static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
+{
+	if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
+
+		unsigned index;
+		struct hfi1_devdata *dd = sde->dd;
+
+		for (index = 0; index < dd->num_sdma; index++) {
+			struct sdma_engine *curr_sdma = &dd->per_sdma[index];
+
+			if (curr_sdma != sde)
+				curr_sdma->progress_check_head =
+							curr_sdma->descq_head;
+		}
+		dd_dev_err(sde->dd,
+			   "SDMA engine %d - check scheduled\n",
+				sde->this_idx);
+		mod_timer(&sde->err_progress_check_timer, jiffies + 10);
+	}
+}
+
+static void sdma_err_progress_check(unsigned long data)
+{
+	unsigned index;
+	struct sdma_engine *sde = (struct sdma_engine *)data;
+
+	dd_dev_err(sde->dd, "SDE progress check event\n");
+	for (index = 0; index < sde->dd->num_sdma; index++) {
+		struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
+		unsigned long flags;
+
+		/* check progress on each engine except the current one */
+		if (curr_sde == sde)
+			continue;
+		/*
+		 * We must lock interrupts when acquiring sde->lock,
+		 * to avoid a deadlock if interrupt triggers and spins on
+		 * the same lock on same CPU
+		 */
+		spin_lock_irqsave(&curr_sde->tail_lock, flags);
+		write_seqlock(&curr_sde->head_lock);
+
+		/* skip non-running queues */
+		if (curr_sde->state.current_state != sdma_state_s99_running) {
+			write_sequnlock(&curr_sde->head_lock);
+			spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+			continue;
+		}
+
+		if ((curr_sde->descq_head != curr_sde->descq_tail) &&
+		    (curr_sde->descq_head ==
+				curr_sde->progress_check_head))
+			__sdma_process_event(curr_sde,
+					     sdma_event_e90_sw_halted);
+		write_sequnlock(&curr_sde->head_lock);
+		spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+	}
+	schedule_work(&sde->err_halt_worker);
+}
+
+static void sdma_hw_clean_up_task(unsigned long opaque)
+{
+	struct sdma_engine *sde = (struct sdma_engine *) opaque;
+	u64 statuscsr;
+
+	while (1) {
+#ifdef CONFIG_SDMA_VERBOSITY
+		dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+			   sde->this_idx, slashstrip(__FILE__), __LINE__,
+			__func__);
+#endif
+		statuscsr = read_sde_csr(sde, SD(STATUS));
+		statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
+		if (statuscsr)
+			break;
+		udelay(10);
+	}
+
+	sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
+}
+
+static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
+{
+	smp_read_barrier_depends(); /* see sdma_update_tail() */
+	return sde->tx_ring[sde->tx_head & sde->sdma_mask];
+}
+
+/*
+ * flush ring for recovery
+ */
+static void sdma_flush_descq(struct sdma_engine *sde)
+{
+	u16 head, tail;
+	int progress = 0;
+	struct sdma_txreq *txp = get_txhead(sde);
+
+	/* The reason for some of the complexity of this code is that
+	 * not all descriptors have corresponding txps.  So, we have to
+	 * be able to skip over descs until we wander into the range of
+	 * the next txp on the list.
+	 */
+	head = sde->descq_head & sde->sdma_mask;
+	tail = sde->descq_tail & sde->sdma_mask;
+	while (head != tail) {
+		/* advance head, wrap if needed */
+		head = ++sde->descq_head & sde->sdma_mask;
+		/* if now past this txp's descs, do the callback */
+		if (txp && txp->next_descq_idx == head) {
+			int drained = 0;
+			/* protect against complete modifying */
+			struct iowait *wait = txp->wait;
+
+			/* remove from list */
+			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+			if (wait)
+				drained = atomic_dec_and_test(&wait->sdma_busy);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+			trace_hfi1_sdma_out_sn(sde, txp->sn);
+			if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+				dd_dev_err(sde->dd, "expected %llu got %llu\n",
+					sde->head_sn, txp->sn);
+			sde->head_sn++;
+#endif
+			sdma_txclean(sde->dd, txp);
+			trace_hfi1_sdma_progress(sde, head, tail, txp);
+			if (txp->complete)
+				(*txp->complete)(
+					txp,
+					SDMA_TXREQ_S_ABORTED,
+					drained);
+			if (wait && drained)
+				iowait_drain_wakeup(wait);
+			/* see if there is another txp */
+			txp = get_txhead(sde);
+		}
+		progress++;
+	}
+	if (progress)
+		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+static void sdma_sw_clean_up_task(unsigned long opaque)
+{
+	struct sdma_engine *sde = (struct sdma_engine *) opaque;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sde->tail_lock, flags);
+	write_seqlock(&sde->head_lock);
+
+	/*
+	 * At this point, the following should always be true:
+	 * - We are halted, so no more descriptors are getting retired.
+	 * - We are not running, so no one is submitting new work.
+	 * - Only we can send the e40_sw_cleaned, so we can't start
+	 *   running again until we say so.  So, the active list and
+	 *   descq are ours to play with.
+	 */
+
+
+	/*
+	 * In the error clean up sequence, software clean must be called
+	 * before the hardware clean so we can use the hardware head in
+	 * the progress routine.  A hardware clean or SPC unfreeze will
+	 * reset the hardware head.
+	 *
+	 * Process all retired requests. The progress routine will use the
+	 * latest physical hardware head - we are not running so speed does
+	 * not matter.
+	 */
+	sdma_make_progress(sde, 0);
+
+	sdma_flush(sde);
+
+	/*
+	 * Reset our notion of head and tail.
+	 * Note that the HW registers have been reset via an earlier
+	 * clean up.
+	 */
+	sde->descq_tail = 0;
+	sde->descq_head = 0;
+	sde->desc_avail = sdma_descq_freecnt(sde);
+	*sde->head_dma = 0;
+
+	__sdma_process_event(sde, sdma_event_e40_sw_cleaned);
+
+	write_sequnlock(&sde->head_lock);
+	spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sw_tear_down(struct sdma_engine *sde)
+{
+	struct sdma_state *ss = &sde->state;
+
+	/* Releasing this reference means the state machine has stopped. */
+	sdma_put(ss);
+
+	/* stop waiting for all unfreeze events to complete */
+	atomic_set(&sde->dd->sdma_unfreeze_count, -1);
+	wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+}
+
+static void sdma_start_hw_clean_up(struct sdma_engine *sde)
+{
+	tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
+}
+
+static void sdma_start_sw_clean_up(struct sdma_engine *sde)
+{
+	tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+}
+
+static void sdma_set_state(struct sdma_engine *sde,
+	enum sdma_states next_state)
+{
+	struct sdma_state *ss = &sde->state;
+	const struct sdma_set_state_action *action = sdma_action_table;
+	unsigned op = 0;
+
+	trace_hfi1_sdma_state(
+		sde,
+		sdma_state_names[ss->current_state],
+		sdma_state_names[next_state]);
+
+	/* debugging bookkeeping */
+	ss->previous_state = ss->current_state;
+	ss->previous_op = ss->current_op;
+	ss->current_state = next_state;
+
+	if (ss->previous_state != sdma_state_s99_running
+		&& next_state == sdma_state_s99_running)
+		sdma_flush(sde);
+
+	if (action[next_state].op_enable)
+		op |= SDMA_SENDCTRL_OP_ENABLE;
+
+	if (action[next_state].op_intenable)
+		op |= SDMA_SENDCTRL_OP_INTENABLE;
+
+	if (action[next_state].op_halt)
+		op |= SDMA_SENDCTRL_OP_HALT;
+
+	if (action[next_state].op_cleanup)
+		op |= SDMA_SENDCTRL_OP_CLEANUP;
+
+	if (action[next_state].go_s99_running_tofalse)
+		ss->go_s99_running = 0;
+
+	if (action[next_state].go_s99_running_totrue)
+		ss->go_s99_running = 1;
+
+	ss->current_op = op;
+	sdma_sendctrl(sde, ss->current_op);
+}
+
+/**
+ * sdma_get_descq_cnt() - called when device probed
+ *
+ * Return a validated descq count.
+ *
+ * This is currently only used in the verbs initialization to build the tx
+ * list.
+ *
+ * This will probably be deleted in favor of a more scalable approach to
+ * alloc tx's.
+ *
+ */
+u16 sdma_get_descq_cnt(void)
+{
+	u16 count = sdma_descq_cnt;
+
+	if (!count)
+		return SDMA_DESCQ_CNT;
+	/* count must be a power of 2 greater than 64 and less than
+	 * 32768.   Otherwise return default.
+	 */
+	if (!is_power_of_2(count))
+		return SDMA_DESCQ_CNT;
+	if (count < 64 && count > 32768)
+		return SDMA_DESCQ_CNT;
+	return count;
+}
+/**
+ * sdma_select_engine_vl() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ *
+ * This function returns an engine based on the selector and a vl.  The
+ * mapping fields are protected by RCU.
+ */
+struct sdma_engine *sdma_select_engine_vl(
+	struct hfi1_devdata *dd,
+	u32 selector,
+	u8 vl)
+{
+	struct sdma_vl_map *m;
+	struct sdma_map_elem *e;
+	struct sdma_engine *rval;
+
+	if (WARN_ON(vl > 8))
+		return NULL;
+
+	rcu_read_lock();
+	m = rcu_dereference(dd->sdma_map);
+	if (unlikely(!m)) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	e = m->map[vl & m->mask];
+	rval = e->sde[selector & e->mask];
+	rcu_read_unlock();
+
+	trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
+	return rval;
+}
+
+/**
+ * sdma_select_engine_sc() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ *
+ * This function returns an engine based on the selector and an sc.
+ */
+struct sdma_engine *sdma_select_engine_sc(
+	struct hfi1_devdata *dd,
+	u32 selector,
+	u8 sc5)
+{
+	u8 vl = sc_to_vlt(dd, sc5);
+
+	return sdma_select_engine_vl(dd, selector, vl);
+}
+
+/*
+ * Free the indicated map struct
+ */
+static void sdma_map_free(struct sdma_vl_map *m)
+{
+	int i;
+
+	for (i = 0; m && i < m->actual_vls; i++)
+		kfree(m->map[i]);
+	kfree(m);
+}
+
+/*
+ * Handle RCU callback
+ */
+static void sdma_map_rcu_callback(struct rcu_head *list)
+{
+	struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
+
+	sdma_map_free(m);
+}
+
+/**
+ * sdma_map_init - called when # vls change
+ * @dd: hfi1_devdata
+ * @port: port number
+ * @num_vls: number of vls
+ * @vl_engines: per vl engine mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_engines is used to specify a non-uniform vl/engine loading. NULL
+ * implies auto computing the loading and giving each VLs a uniform
+ * distribution of engines per VL.
+ *
+ * The auto algorithm computes the sde_per_vl and the number of extra
+ * engines.  Any extra engines are added from the last VL on down.
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ *
+ * If either the num_vls or num_sdma are non-power of 2, the array sizes
+ * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
+ * up to the next highest power of 2 and the first entry is reused
+ * in a round robin fashion.
+ *
+ * If an error occurs the map change is not done and the mapping is
+ * not changed.
+ *
+ */
+int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
+{
+	int i, j;
+	int extra, sde_per_vl;
+	int engine = 0;
+	u8 lvl_engines[OPA_MAX_VLS];
+	struct sdma_vl_map *oldmap, *newmap;
+
+	if (!(dd->flags & HFI1_HAS_SEND_DMA))
+		return 0;
+
+	if (!vl_engines) {
+		/* truncate divide */
+		sde_per_vl = dd->num_sdma / num_vls;
+		/* extras */
+		extra = dd->num_sdma % num_vls;
+		vl_engines = lvl_engines;
+		/* add extras from last vl down */
+		for (i = num_vls - 1; i >= 0; i--, extra--)
+			vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
+	}
+	/* build new map */
+	newmap = kzalloc(
+		sizeof(struct sdma_vl_map) +
+			roundup_pow_of_two(num_vls) *
+			sizeof(struct sdma_map_elem *),
+		GFP_KERNEL);
+	if (!newmap)
+		goto bail;
+	newmap->actual_vls = num_vls;
+	newmap->vls = roundup_pow_of_two(num_vls);
+	newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+	for (i = 0; i < newmap->vls; i++) {
+		/* save for wrap around */
+		int first_engine = engine;
+
+		if (i < newmap->actual_vls) {
+			int sz = roundup_pow_of_two(vl_engines[i]);
+
+			/* only allocate once */
+			newmap->map[i] = kzalloc(
+				sizeof(struct sdma_map_elem) +
+					sz * sizeof(struct sdma_engine *),
+				GFP_KERNEL);
+			if (!newmap->map[i])
+				goto bail;
+			newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+			/* assign engines */
+			for (j = 0; j < sz; j++) {
+				newmap->map[i]->sde[j] =
+					&dd->per_sdma[engine];
+				if (++engine >= first_engine + vl_engines[i])
+					/* wrap back to first engine */
+					engine = first_engine;
+			}
+		} else {
+			/* just re-use entry without allocating */
+			newmap->map[i] = newmap->map[i % num_vls];
+		}
+		engine = first_engine + vl_engines[i];
+	}
+	/* newmap in hand, save old map */
+	spin_lock_irq(&dd->sde_map_lock);
+	oldmap = rcu_dereference_protected(dd->sdma_map,
+			lockdep_is_held(&dd->sde_map_lock));
+
+	/* publish newmap */
+	rcu_assign_pointer(dd->sdma_map, newmap);
+
+	spin_unlock_irq(&dd->sde_map_lock);
+	/* success, free any old map after grace period */
+	if (oldmap)
+		call_rcu(&oldmap->list, sdma_map_rcu_callback);
+	return 0;
+bail:
+	/* free any partial allocation */
+	sdma_map_free(newmap);
+	return -ENOMEM;
+}
+
+/*
+ * Clean up allocated memory.
+ *
+ * This routine is can be called regardless of the success of sdma_init()
+ *
+ */
+static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
+{
+	size_t i;
+	struct sdma_engine *sde;
+
+	if (dd->sdma_pad_dma) {
+		dma_free_coherent(&dd->pcidev->dev, 4,
+				  (void *)dd->sdma_pad_dma,
+				  dd->sdma_pad_phys);
+		dd->sdma_pad_dma = NULL;
+		dd->sdma_pad_phys = 0;
+	}
+	if (dd->sdma_heads_dma) {
+		dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
+				  (void *)dd->sdma_heads_dma,
+				  dd->sdma_heads_phys);
+		dd->sdma_heads_dma = NULL;
+		dd->sdma_heads_phys = 0;
+	}
+	for (i = 0; dd->per_sdma && i < num_engines; ++i) {
+		sde = &dd->per_sdma[i];
+
+		sde->head_dma = NULL;
+		sde->head_phys = 0;
+
+		if (sde->descq) {
+			dma_free_coherent(
+				&dd->pcidev->dev,
+				sde->descq_cnt * sizeof(u64[2]),
+				sde->descq,
+				sde->descq_phys
+			);
+			sde->descq = NULL;
+			sde->descq_phys = 0;
+		}
+		if (is_vmalloc_addr(sde->tx_ring))
+			vfree(sde->tx_ring);
+		else
+			kfree(sde->tx_ring);
+		sde->tx_ring = NULL;
+	}
+	spin_lock_irq(&dd->sde_map_lock);
+	kfree(rcu_access_pointer(dd->sdma_map));
+	RCU_INIT_POINTER(dd->sdma_map, NULL);
+	spin_unlock_irq(&dd->sde_map_lock);
+	synchronize_rcu();
+	kfree(dd->per_sdma);
+	dd->per_sdma = NULL;
+}
+
+/**
+ * sdma_init() - called when device probed
+ * @dd: hfi1_devdata
+ * @port: port number (currently only zero)
+ *
+ * sdma_init initializes the specified number of engines.
+ *
+ * The code initializes each sde, its csrs.  Interrupts
+ * are not required to be enabled.
+ *
+ * Returns:
+ * 0 - success, -errno on failure
+ */
+int sdma_init(struct hfi1_devdata *dd, u8 port)
+{
+	unsigned this_idx;
+	struct sdma_engine *sde;
+	u16 descq_cnt;
+	void *curr_head;
+	struct hfi1_pportdata *ppd = dd->pport + port;
+	u32 per_sdma_credits;
+	uint idle_cnt = sdma_idle_cnt;
+	size_t num_engines = dd->chip_sdma_engines;
+
+	if (!HFI1_CAP_IS_KSET(SDMA)) {
+		HFI1_CAP_CLEAR(SDMA_AHG);
+		return 0;
+	}
+	if (mod_num_sdma &&
+		/* can't exceed chip support */
+		mod_num_sdma <= dd->chip_sdma_engines &&
+		/* count must be >= vls */
+		mod_num_sdma >= num_vls)
+		num_engines = mod_num_sdma;
+
+	dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
+	dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
+	dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
+		dd->chip_sdma_mem_size);
+
+	per_sdma_credits =
+		dd->chip_sdma_mem_size/(num_engines * SDMA_BLOCK_SIZE);
+
+	/* set up freeze waitqueue */
+	init_waitqueue_head(&dd->sdma_unfreeze_wq);
+	atomic_set(&dd->sdma_unfreeze_count, 0);
+
+	descq_cnt = sdma_get_descq_cnt();
+	dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
+		num_engines, descq_cnt);
+
+	/* alloc memory for array of send engines */
+	dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
+	if (!dd->per_sdma)
+		return -ENOMEM;
+
+	idle_cnt = ns_to_cclock(dd, idle_cnt);
+	/* Allocate memory for SendDMA descriptor FIFOs */
+	for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+		sde = &dd->per_sdma[this_idx];
+		sde->dd = dd;
+		sde->ppd = ppd;
+		sde->this_idx = this_idx;
+		sde->descq_cnt = descq_cnt;
+		sde->desc_avail = sdma_descq_freecnt(sde);
+		sde->sdma_shift = ilog2(descq_cnt);
+		sde->sdma_mask = (1 << sde->sdma_shift) - 1;
+		sde->descq_full_count = 0;
+
+		/* Create a mask for all 3 chip interrupt sources */
+		sde->imask = (u64)1 << (0*TXE_NUM_SDMA_ENGINES + this_idx)
+			| (u64)1 << (1*TXE_NUM_SDMA_ENGINES + this_idx)
+			| (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx);
+		/* Create a mask specifically for sdma_idle */
+		sde->idle_mask =
+			(u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx);
+		/* Create a mask specifically for sdma_progress */
+		sde->progress_mask =
+			(u64)1 << (TXE_NUM_SDMA_ENGINES + this_idx);
+		spin_lock_init(&sde->tail_lock);
+		seqlock_init(&sde->head_lock);
+		spin_lock_init(&sde->senddmactrl_lock);
+		spin_lock_init(&sde->flushlist_lock);
+		/* insure there is always a zero bit */
+		sde->ahg_bits = 0xfffffffe00000000ULL;
+
+		sdma_set_state(sde, sdma_state_s00_hw_down);
+
+		/* set up reference counting */
+		kref_init(&sde->state.kref);
+		init_completion(&sde->state.comp);
+
+		INIT_LIST_HEAD(&sde->flushlist);
+		INIT_LIST_HEAD(&sde->dmawait);
+
+		sde->tail_csr =
+			get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
+
+		if (idle_cnt)
+			dd->default_desc1 =
+				SDMA_DESC1_HEAD_TO_HOST_FLAG;
+		else
+			dd->default_desc1 =
+				SDMA_DESC1_INT_REQ_FLAG;
+
+		tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
+			(unsigned long)sde);
+
+		tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
+			(unsigned long)sde);
+		INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
+		INIT_WORK(&sde->flush_worker, sdma_field_flush);
+
+		sde->progress_check_head = 0;
+
+		init_timer(&sde->err_progress_check_timer);
+		sde->err_progress_check_timer.function =
+						sdma_err_progress_check;
+		sde->err_progress_check_timer.data = (unsigned long)sde;
+
+		sde->descq = dma_zalloc_coherent(
+			&dd->pcidev->dev,
+			descq_cnt * sizeof(u64[2]),
+			&sde->descq_phys,
+			GFP_KERNEL
+		);
+		if (!sde->descq)
+			goto bail;
+		sde->tx_ring =
+			kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
+				GFP_KERNEL);
+		if (!sde->tx_ring)
+			sde->tx_ring =
+				vzalloc(
+					sizeof(struct sdma_txreq *) *
+					descq_cnt);
+		if (!sde->tx_ring)
+			goto bail;
+	}
+
+	dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
+	/* Allocate memory for DMA of head registers to memory */
+	dd->sdma_heads_dma = dma_zalloc_coherent(
+		&dd->pcidev->dev,
+		dd->sdma_heads_size,
+		&dd->sdma_heads_phys,
+		GFP_KERNEL
+	);
+	if (!dd->sdma_heads_dma) {
+		dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
+		goto bail;
+	}
+
+	/* Allocate memory for pad */
+	dd->sdma_pad_dma = dma_zalloc_coherent(
+		&dd->pcidev->dev,
+		sizeof(u32),
+		&dd->sdma_pad_phys,
+		GFP_KERNEL
+	);
+	if (!dd->sdma_pad_dma) {
+		dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
+		goto bail;
+	}
+
+	/* assign each engine to different cacheline and init registers */
+	curr_head = (void *)dd->sdma_heads_dma;
+	for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+		unsigned long phys_offset;
+
+		sde = &dd->per_sdma[this_idx];
+
+		sde->head_dma = curr_head;
+		curr_head += L1_CACHE_BYTES;
+		phys_offset = (unsigned long)sde->head_dma -
+			      (unsigned long)dd->sdma_heads_dma;
+		sde->head_phys = dd->sdma_heads_phys + phys_offset;
+		init_sdma_regs(sde, per_sdma_credits, idle_cnt);
+	}
+	dd->flags |= HFI1_HAS_SEND_DMA;
+	dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
+	dd->num_sdma = num_engines;
+	if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
+		goto bail;
+	dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
+	return 0;
+
+bail:
+	sdma_clean(dd, num_engines);
+	return -ENOMEM;
+}
+
+/**
+ * sdma_all_running() - called when the link goes up
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the running state.
+ */
+void sdma_all_running(struct hfi1_devdata *dd)
+{
+	struct sdma_engine *sde;
+	unsigned int i;
+
+	/* move all engines to running */
+	for (i = 0; i < dd->num_sdma; ++i) {
+		sde = &dd->per_sdma[i];
+		sdma_process_event(sde, sdma_event_e30_go_running);
+	}
+}
+
+/**
+ * sdma_all_idle() - called when the link goes down
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the idle state.
+ */
+void sdma_all_idle(struct hfi1_devdata *dd)
+{
+	struct sdma_engine *sde;
+	unsigned int i;
+
+	/* idle all engines */
+	for (i = 0; i < dd->num_sdma; ++i) {
+		sde = &dd->per_sdma[i];
+		sdma_process_event(sde, sdma_event_e70_go_idle);
+	}
+}
+
+/**
+ * sdma_start() - called to kick off state processing for all engines
+ * @dd: hfi1_devdata
+ *
+ * This routine is for kicking off the state processing for all required
+ * sdma engines.  Interrupts need to be working at this point.
+ *
+ */
+void sdma_start(struct hfi1_devdata *dd)
+{
+	unsigned i;
+	struct sdma_engine *sde;
+
+	/* kick off the engines state processing */
+	for (i = 0; i < dd->num_sdma; ++i) {
+		sde = &dd->per_sdma[i];
+		sdma_process_event(sde, sdma_event_e10_go_hw_start);
+	}
+}
+
+/**
+ * sdma_exit() - used when module is removed
+ * @dd: hfi1_devdata
+ */
+void sdma_exit(struct hfi1_devdata *dd)
+{
+	unsigned this_idx;
+	struct sdma_engine *sde;
+
+	for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
+			++this_idx) {
+
+		sde = &dd->per_sdma[this_idx];
+		if (!list_empty(&sde->dmawait))
+			dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
+				sde->this_idx);
+		sdma_process_event(sde, sdma_event_e00_go_hw_down);
+
+		del_timer_sync(&sde->err_progress_check_timer);
+
+		/*
+		 * This waits for the state machine to exit so it is not
+		 * necessary to kill the sdma_sw_clean_up_task to make sure
+		 * it is not running.
+		 */
+		sdma_finalput(&sde->state);
+	}
+	sdma_clean(dd, dd->num_sdma);
+}
+
+/*
+ * unmap the indicated descriptor
+ */
+static inline void sdma_unmap_desc(
+	struct hfi1_devdata *dd,
+	struct sdma_desc *descp)
+{
+	switch (sdma_mapping_type(descp)) {
+	case SDMA_MAP_SINGLE:
+		dma_unmap_single(
+			&dd->pcidev->dev,
+			sdma_mapping_addr(descp),
+			sdma_mapping_len(descp),
+			DMA_TO_DEVICE);
+		break;
+	case SDMA_MAP_PAGE:
+		dma_unmap_page(
+			&dd->pcidev->dev,
+			sdma_mapping_addr(descp),
+			sdma_mapping_len(descp),
+			DMA_TO_DEVICE);
+		break;
+	}
+}
+
+/*
+ * return the mode as indicated by the first
+ * descriptor in the tx.
+ */
+static inline u8 ahg_mode(struct sdma_txreq *tx)
+{
+	return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+		>> SDMA_DESC1_HEADER_MODE_SHIFT;
+}
+
+/**
+ * sdma_txclean() - clean tx of mappings, descp *kmalloc's
+ * @dd: hfi1_devdata for unmapping
+ * @tx: tx request to clean
+ *
+ * This is used in the progress routine to clean the tx or
+ * by the ULP to toss an in-process tx build.
+ *
+ * The code can be called multiple times without issue.
+ *
+ */
+void sdma_txclean(
+	struct hfi1_devdata *dd,
+	struct sdma_txreq *tx)
+{
+	u16 i;
+
+	if (tx->num_desc) {
+		u8 skip = 0, mode = ahg_mode(tx);
+
+		/* unmap first */
+		sdma_unmap_desc(dd, &tx->descp[0]);
+		/* determine number of AHG descriptors to skip */
+		if (mode > SDMA_AHG_APPLY_UPDATE1)
+			skip = mode >> 1;
+		for (i = 1 + skip; i < tx->num_desc; i++)
+			sdma_unmap_desc(dd, &tx->descp[i]);
+		tx->num_desc = 0;
+	}
+	kfree(tx->coalesce_buf);
+	tx->coalesce_buf = NULL;
+	/* kmalloc'ed descp */
+	if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
+		tx->desc_limit = ARRAY_SIZE(tx->descs);
+		kfree(tx->descp);
+	}
+}
+
+static inline u16 sdma_gethead(struct sdma_engine *sde)
+{
+	struct hfi1_devdata *dd = sde->dd;
+	int use_dmahead;
+	u16 hwhead;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+retry:
+	use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
+					(dd->flags & HFI1_HAS_SDMA_TIMEOUT);
+	hwhead = use_dmahead ?
+		(u16) le64_to_cpu(*sde->head_dma) :
+		(u16) read_sde_csr(sde, SD(HEAD));
+
+	if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
+		u16 cnt;
+		u16 swtail;
+		u16 swhead;
+		int sane;
+
+		swhead = sde->descq_head & sde->sdma_mask;
+		/* this code is really bad for cache line trading */
+		swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+		cnt = sde->descq_cnt;
+
+		if (swhead < swtail)
+			/* not wrapped */
+			sane = (hwhead >= swhead) & (hwhead <= swtail);
+		else if (swhead > swtail)
+			/* wrapped around */
+			sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+				(hwhead <= swtail);
+		else
+			/* empty */
+			sane = (hwhead == swhead);
+
+		if (unlikely(!sane)) {
+			dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
+				sde->this_idx,
+				use_dmahead ? "dma" : "kreg",
+				hwhead, swhead, swtail, cnt);
+			if (use_dmahead) {
+				/* try one more time, using csr */
+				use_dmahead = 0;
+				goto retry;
+			}
+			/* proceed as if no progress */
+			hwhead = swhead;
+		}
+	}
+	return hwhead;
+}
+
+/*
+ * This is called when there are send DMA descriptors that might be
+ * available.
+ *
+ * This is called with head_lock held.
+ */
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
+{
+	struct iowait *wait, *nw;
+	struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
+	unsigned i, n = 0, seq;
+	struct sdma_txreq *stx;
+	struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+		   slashstrip(__FILE__), __LINE__, __func__);
+	dd_dev_err(sde->dd, "avail: %u\n", avail);
+#endif
+
+	do {
+		seq = read_seqbegin(&dev->iowait_lock);
+		if (!list_empty(&sde->dmawait)) {
+			/* at least one item */
+			write_seqlock(&dev->iowait_lock);
+			/* Harvest waiters wanting DMA descriptors */
+			list_for_each_entry_safe(
+					wait,
+					nw,
+					&sde->dmawait,
+					list) {
+				u16 num_desc = 0;
+
+				if (!wait->wakeup)
+					continue;
+				if (n == ARRAY_SIZE(waits))
+					break;
+				if (!list_empty(&wait->tx_head)) {
+					stx = list_first_entry(
+						&wait->tx_head,
+						struct sdma_txreq,
+						list);
+					num_desc = stx->num_desc;
+				}
+				if (num_desc > avail)
+					break;
+				avail -= num_desc;
+				list_del_init(&wait->list);
+				waits[n++] = wait;
+			}
+			write_sequnlock(&dev->iowait_lock);
+			break;
+		}
+	} while (read_seqretry(&dev->iowait_lock, seq));
+
+	for (i = 0; i < n; i++)
+		waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
+}
+
+/* head_lock must be held */
+static void sdma_make_progress(struct sdma_engine *sde, u64 status)
+{
+	struct sdma_txreq *txp = NULL;
+	int progress = 0;
+	u16 hwhead, swhead, swtail;
+	int idle_check_done = 0;
+
+	hwhead = sdma_gethead(sde);
+
+	/* The reason for some of the complexity of this code is that
+	 * not all descriptors have corresponding txps.  So, we have to
+	 * be able to skip over descs until we wander into the range of
+	 * the next txp on the list.
+	 */
+
+retry:
+	txp = get_txhead(sde);
+	swhead = sde->descq_head & sde->sdma_mask;
+	trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+	while (swhead != hwhead) {
+		/* advance head, wrap if needed */
+		swhead = ++sde->descq_head & sde->sdma_mask;
+
+		/* if now past this txp's descs, do the callback */
+		if (txp && txp->next_descq_idx == swhead) {
+			int drained = 0;
+			/* protect against complete modifying */
+			struct iowait *wait = txp->wait;
+
+			/* remove from list */
+			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+			if (wait)
+				drained = atomic_dec_and_test(&wait->sdma_busy);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+			trace_hfi1_sdma_out_sn(sde, txp->sn);
+			if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+				dd_dev_err(sde->dd, "expected %llu got %llu\n",
+					sde->head_sn, txp->sn);
+			sde->head_sn++;
+#endif
+			sdma_txclean(sde->dd, txp);
+			if (txp->complete)
+				(*txp->complete)(
+					txp,
+					SDMA_TXREQ_S_OK,
+					drained);
+			if (wait && drained)
+				iowait_drain_wakeup(wait);
+			/* see if there is another txp */
+			txp = get_txhead(sde);
+		}
+		trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+		progress++;
+	}
+
+	/*
+	 * The SDMA idle interrupt is not guaranteed to be ordered with respect
+	 * to updates to the the dma_head location in host memory. The head
+	 * value read might not be fully up to date. If there are pending
+	 * descriptors and the SDMA idle interrupt fired then read from the
+	 * CSR SDMA head instead to get the latest value from the hardware.
+	 * The hardware SDMA head should be read at most once in this invocation
+	 * of sdma_make_progress(..) which is ensured by idle_check_done flag
+	 */
+	if ((status & sde->idle_mask) && !idle_check_done) {
+		swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+		if (swtail != hwhead) {
+			hwhead = (u16)read_sde_csr(sde, SD(HEAD));
+			idle_check_done = 1;
+			goto retry;
+		}
+	}
+
+	sde->last_status = status;
+	if (progress)
+		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+/*
+ * sdma_engine_interrupt() - interrupt handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ *
+ * Status is a mask of the 3 possible interrupts for this engine.  It will
+ * contain bits _only_ for this SDMA engine.  It will contain at least one
+ * bit, it may contain more.
+ */
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
+{
+	trace_hfi1_sdma_engine_interrupt(sde, status);
+	write_seqlock(&sde->head_lock);
+	sdma_set_desc_cnt(sde, sde->descq_cnt / 2);
+	sdma_make_progress(sde, status);
+	write_sequnlock(&sde->head_lock);
+}
+
+/**
+ * sdma_engine_error() - error handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ */
+void sdma_engine_error(struct sdma_engine *sde, u64 status)
+{
+	unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
+		   sde->this_idx,
+		   (unsigned long long)status,
+		   sdma_state_names[sde->state.current_state]);
+#endif
+	spin_lock_irqsave(&sde->tail_lock, flags);
+	write_seqlock(&sde->head_lock);
+	if (status & ALL_SDMA_ENG_HALT_ERRS)
+		__sdma_process_event(sde, sdma_event_e60_hw_halted);
+	if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
+		dd_dev_err(sde->dd,
+			"SDMA (%u) engine error: 0x%llx state %s\n",
+			sde->this_idx,
+			(unsigned long long)status,
+			sdma_state_names[sde->state.current_state]);
+		dump_sdma_state(sde);
+	}
+	write_sequnlock(&sde->head_lock);
+	spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
+{
+	u64 set_senddmactrl = 0;
+	u64 clr_senddmactrl = 0;
+	unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
+		   sde->this_idx,
+		   (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
+		   (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
+		   (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
+		   (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
+#endif
+
+	if (op & SDMA_SENDCTRL_OP_ENABLE)
+		set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+	else
+		clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+
+	if (op & SDMA_SENDCTRL_OP_INTENABLE)
+		set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+	else
+		clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+
+	if (op & SDMA_SENDCTRL_OP_HALT)
+		set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+	else
+		clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+
+	spin_lock_irqsave(&sde->senddmactrl_lock, flags);
+
+	sde->p_senddmactrl |= set_senddmactrl;
+	sde->p_senddmactrl &= ~clr_senddmactrl;
+
+	if (op & SDMA_SENDCTRL_OP_CLEANUP)
+		write_sde_csr(sde, SD(CTRL),
+			sde->p_senddmactrl |
+			SD(CTRL_SDMA_CLEANUP_SMASK));
+	else
+		write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
+
+	spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+	sdma_dumpstate(sde);
+#endif
+}
+
+static void sdma_setlengen(struct sdma_engine *sde)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+	/*
+	 * Set SendDmaLenGen and clear-then-set the MSB of the generation
+	 * count to enable generation checking and load the internal
+	 * generation counter.
+	 */
+	write_sde_csr(sde, SD(LEN_GEN),
+		(sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT)
+	);
+	write_sde_csr(sde, SD(LEN_GEN),
+		((sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT))
+		| (4ULL << SD(LEN_GEN_GENERATION_SHIFT))
+	);
+}
+
+static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
+{
+	/* Commit writes to memory and advance the tail on the chip */
+	smp_wmb(); /* see get_txhead() */
+	writeq(tail, sde->tail_csr);
+}
+
+/*
+ * This is called when changing to state s10_hw_start_up_halt_wait as
+ * a result of send buffer errors or send DMA descriptor errors.
+ */
+static void sdma_hw_start_up(struct sdma_engine *sde)
+{
+	u64 reg;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+	sdma_setlengen(sde);
+	sdma_update_tail(sde, 0); /* Set SendDmaTail */
+	*sde->head_dma = 0;
+
+	reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
+	      SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
+	write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+/*
+ * set_sdma_integrity
+ *
+ * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
+ */
+static void set_sdma_integrity(struct sdma_engine *sde)
+{
+	struct hfi1_devdata *dd = sde->dd;
+	u64 reg;
+
+	if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
+		return;
+
+	reg = hfi1_pkt_base_sdma_integrity(dd);
+
+	if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
+		CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+	else
+		SET_STATIC_RATE_CONTROL_SMASK(reg);
+
+	write_sde_csr(sde, SD(CHECK_ENABLE), reg);
+}
+
+
+static void init_sdma_regs(
+	struct sdma_engine *sde,
+	u32 credits,
+	uint idle_cnt)
+{
+	u8 opval, opmask;
+#ifdef CONFIG_SDMA_VERBOSITY
+	struct hfi1_devdata *dd = sde->dd;
+
+	dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+	write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
+	sdma_setlengen(sde);
+	sdma_update_tail(sde, 0); /* Set SendDmaTail */
+	write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
+	write_sde_csr(sde, SD(DESC_CNT), 0);
+	write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
+	write_sde_csr(sde, SD(MEMORY),
+		((u64)credits <<
+			SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
+		((u64)(credits * sde->this_idx) <<
+			SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
+	write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
+	set_sdma_integrity(sde);
+	opmask = OPCODE_CHECK_MASK_DISABLED;
+	opval = OPCODE_CHECK_VAL_DISABLED;
+	write_sde_csr(sde, SD(CHECK_OPCODE),
+		(opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
+		(opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
+}
+
+#ifdef CONFIG_SDMA_VERBOSITY
+
+#define sdma_dumpstate_helper0(reg) do { \
+		csr = read_csr(sde->dd, reg); \
+		dd_dev_err(sde->dd, "%36s     0x%016llx\n", #reg, csr); \
+	} while (0)
+
+#define sdma_dumpstate_helper(reg) do { \
+		csr = read_sde_csr(sde, reg); \
+		dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
+			#reg, sde->this_idx, csr); \
+	} while (0)
+
+#define sdma_dumpstate_helper2(reg) do { \
+		csr = read_csr(sde->dd, reg + (8 * i)); \
+		dd_dev_err(sde->dd, "%33s_%02u     0x%016llx\n", \
+				#reg, i, csr); \
+	} while (0)
+
+void sdma_dumpstate(struct sdma_engine *sde)
+{
+	u64 csr;
+	unsigned i;
+
+	sdma_dumpstate_helper(SD(CTRL));
+	sdma_dumpstate_helper(SD(STATUS));
+	sdma_dumpstate_helper0(SD(ERR_STATUS));
+	sdma_dumpstate_helper0(SD(ERR_MASK));
+	sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
+	sdma_dumpstate_helper(SD(ENG_ERR_MASK));
+
+	for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
+		sdma_dumpstate_helper2(CCE_INT_STATUS));
+		sdma_dumpstate_helper2(CCE_INT_MASK);
+		sdma_dumpstate_helper2(CCE_INT_BLOCKED);
+	}
+
+	sdma_dumpstate_helper(SD(TAIL));
+	sdma_dumpstate_helper(SD(HEAD));
+	sdma_dumpstate_helper(SD(PRIORITY_THLD));
+	sdma_dumpstate_helper(SD(IDLE_CNT);
+	sdma_dumpstate_helper(SD(RELOAD_CNT));
+	sdma_dumpstate_helper(SD(DESC_CNT));
+	sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
+	sdma_dumpstate_helper(SD(MEMORY));
+	sdma_dumpstate_helper0(SD(ENGINES));
+	sdma_dumpstate_helper0(SD(MEM_SIZE));
+	/* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS);  */
+	sdma_dumpstate_helper(SD(BASE_ADDR));
+	sdma_dumpstate_helper(SD(LEN_GEN));
+	sdma_dumpstate_helper(SD(HEAD_ADDR));
+	sdma_dumpstate_helper(SD(CHECK_ENABLE));
+	sdma_dumpstate_helper(SD(CHECK_VL));
+	sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
+	sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
+	sdma_dumpstate_helper(SD(CHECK_SLID));
+	sdma_dumpstate_helper(SD(CHECK_OPCODE));
+}
+#endif
+
+static void dump_sdma_state(struct sdma_engine *sde)
+{
+	struct hw_sdma_desc *descq;
+	struct hw_sdma_desc *descqp;
+	u64 desc[2];
+	u64 addr;
+	u8 gen;
+	u16 len;
+	u16 head, tail, cnt;
+
+	head = sde->descq_head & sde->sdma_mask;
+	tail = sde->descq_tail & sde->sdma_mask;
+	cnt = sdma_descq_freecnt(sde);
+	descq = sde->descq;
+
+	dd_dev_err(sde->dd,
+		"SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
+		sde->this_idx,
+		head,
+		tail,
+		cnt,
+		!list_empty(&sde->flushlist));
+
+	/* print info for each entry in the descriptor queue */
+	while (head != tail) {
+		char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+		descqp = &sde->descq[head];
+		desc[0] = le64_to_cpu(descqp->qw[0]);
+		desc[1] = le64_to_cpu(descqp->qw[1]);
+		flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+		flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+				'H' : '-';
+		flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+		flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+		addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+			& SDMA_DESC0_PHY_ADDR_MASK;
+		gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+			& SDMA_DESC1_GENERATION_MASK;
+		len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+			& SDMA_DESC0_BYTE_COUNT_MASK;
+		dd_dev_err(sde->dd,
+			"SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+			 head, flags, addr, gen, len);
+		dd_dev_err(sde->dd,
+			"\tdesc0:0x%016llx desc1 0x%016llx\n",
+			 desc[0], desc[1]);
+		if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+			dd_dev_err(sde->dd,
+				"\taidx: %u amode: %u alen: %u\n",
+				(u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK)
+					>> SDMA_DESC1_HEADER_INDEX_MASK),
+				(u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+					>> SDMA_DESC1_HEADER_MODE_SHIFT),
+				(u8)((desc[1] & SDMA_DESC1_HEADER_DWS_SMASK)
+					>> SDMA_DESC1_HEADER_DWS_SHIFT));
+		head++;
+		head &= sde->sdma_mask;
+	}
+}
+
+#define SDE_FMT \
+	"SDE %u STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
+/**
+ * sdma_seqfile_dump_sde() - debugfs dump of sde
+ * @s: seq file
+ * @sde: send dma engine to dump
+ *
+ * This routine dumps the sde to the indicated seq file.
+ */
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
+{
+	u16 head, tail;
+	struct hw_sdma_desc *descqp;
+	u64 desc[2];
+	u64 addr;
+	u8 gen;
+	u16 len;
+
+	head = sde->descq_head & sde->sdma_mask;
+	tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+	seq_printf(s, SDE_FMT, sde->this_idx,
+		sdma_state_name(sde->state.current_state),
+		(unsigned long long)read_sde_csr(sde, SD(CTRL)),
+		(unsigned long long)read_sde_csr(sde, SD(STATUS)),
+		(unsigned long long)read_sde_csr(sde,
+			SD(ENG_ERR_STATUS)),
+		(unsigned long long)read_sde_csr(sde, SD(TAIL)),
+		tail,
+		(unsigned long long)read_sde_csr(sde, SD(HEAD)),
+		head,
+		(unsigned long long)le64_to_cpu(*sde->head_dma),
+		(unsigned long long)read_sde_csr(sde, SD(MEMORY)),
+		(unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
+		(unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
+		(unsigned long long)sde->last_status,
+		(unsigned long long)sde->ahg_bits,
+		sde->tx_tail,
+		sde->tx_head,
+		sde->descq_tail,
+		sde->descq_head,
+		   !list_empty(&sde->flushlist),
+		sde->descq_full_count,
+		(unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
+
+	/* print info for each entry in the descriptor queue */
+	while (head != tail) {
+		char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+		descqp = &sde->descq[head];
+		desc[0] = le64_to_cpu(descqp->qw[0]);
+		desc[1] = le64_to_cpu(descqp->qw[1]);
+		flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+		flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+				'H' : '-';
+		flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+		flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+		addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+			& SDMA_DESC0_PHY_ADDR_MASK;
+		gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+			& SDMA_DESC1_GENERATION_MASK;
+		len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+			& SDMA_DESC0_BYTE_COUNT_MASK;
+		seq_printf(s,
+			"\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+			head, flags, addr, gen, len);
+		if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+			seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
+				(u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK)
+					>> SDMA_DESC1_HEADER_INDEX_MASK),
+				(u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+					>> SDMA_DESC1_HEADER_MODE_SHIFT));
+		head = (head + 1) & sde->sdma_mask;
+	}
+}
+
+/*
+ * add the generation number into
+ * the qw1 and return
+ */
+static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
+{
+	u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
+
+	qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
+	qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
+			<< SDMA_DESC1_GENERATION_SHIFT;
+	return qw1;
+}
+
+/*
+ * This routine submits the indicated tx
+ *
+ * Space has already been guaranteed and
+ * tail side of ring is locked.
+ *
+ * The hardware tail update is done
+ * in the caller and that is facilitated
+ * by returning the new tail.
+ *
+ * There is special case logic for ahg
+ * to not add the generation number for
+ * up to 2 descriptors that follow the
+ * first descriptor.
+ *
+ */
+static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
+{
+	int i;
+	u16 tail;
+	struct sdma_desc *descp = tx->descp;
+	u8 skip = 0, mode = ahg_mode(tx);
+
+	tail = sde->descq_tail & sde->sdma_mask;
+	sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+	sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
+	trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
+				   tail, &sde->descq[tail]);
+	tail = ++sde->descq_tail & sde->sdma_mask;
+	descp++;
+	if (mode > SDMA_AHG_APPLY_UPDATE1)
+		skip = mode >> 1;
+	for (i = 1; i < tx->num_desc; i++, descp++) {
+		u64 qw1;
+
+		sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+		if (skip) {
+			/* edits don't have generation */
+			qw1 = descp->qw[1];
+			skip--;
+		} else {
+			/* replace generation with real one for non-edits */
+			qw1 = add_gen(sde, descp->qw[1]);
+		}
+		sde->descq[tail].qw[1] = cpu_to_le64(qw1);
+		trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
+					   tail, &sde->descq[tail]);
+		tail = ++sde->descq_tail & sde->sdma_mask;
+	}
+	tx->next_descq_idx = tail;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+	tx->sn = sde->tail_sn++;
+	trace_hfi1_sdma_in_sn(sde, tx->sn);
+	WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
+#endif
+	sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
+	sde->desc_avail -= tx->num_desc;
+	return tail;
+}
+
+/*
+ * Check for progress
+ */
+static int sdma_check_progress(
+	struct sdma_engine *sde,
+	struct iowait *wait,
+	struct sdma_txreq *tx)
+{
+	int ret;
+
+	sde->desc_avail = sdma_descq_freecnt(sde);
+	if (tx->num_desc <= sde->desc_avail)
+		return -EAGAIN;
+	/* pulse the head_lock */
+	if (wait && wait->sleep) {
+		unsigned seq;
+
+		seq = raw_seqcount_begin(
+			(const seqcount_t *)&sde->head_lock.seqcount);
+		ret = wait->sleep(sde, wait, tx, seq);
+		if (ret == -EAGAIN)
+			sde->desc_avail = sdma_descq_freecnt(sde);
+	} else
+		ret = -EBUSY;
+	return ret;
+}
+
+/**
+ * sdma_send_txreq() - submit a tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx: sdma_txreq to submit
+ *
+ * The call submits the tx into the ring.  If a iowait structure is non-NULL
+ * the packet will be queued to the list in wait.
+ *
+ * Return:
+ * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
+ * ring (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txreq(struct sdma_engine *sde,
+		    struct iowait *wait,
+		    struct sdma_txreq *tx)
+{
+	int ret = 0;
+	u16 tail;
+	unsigned long flags;
+
+	/* user should have supplied entire packet */
+	if (unlikely(tx->tlen))
+		return -EINVAL;
+	tx->wait = wait;
+	spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+	if (unlikely(!__sdma_running(sde)))
+		goto unlock_noconn;
+	if (unlikely(tx->num_desc > sde->desc_avail))
+		goto nodesc;
+	tail = submit_tx(sde, tx);
+	if (wait)
+		atomic_inc(&wait->sdma_busy);
+	sdma_update_tail(sde, tail);
+unlock:
+	spin_unlock_irqrestore(&sde->tail_lock, flags);
+	return ret;
+unlock_noconn:
+	if (wait)
+		atomic_inc(&wait->sdma_busy);
+	tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+	tx->sn = sde->tail_sn++;
+	trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+	spin_lock(&sde->flushlist_lock);
+	list_add_tail(&tx->list, &sde->flushlist);
+	spin_unlock(&sde->flushlist_lock);
+	if (wait) {
+		wait->tx_count++;
+		wait->count += tx->num_desc;
+	}
+	schedule_work(&sde->flush_worker);
+	ret = -ECOMM;
+	goto unlock;
+nodesc:
+	ret = sdma_check_progress(sde, wait, tx);
+	if (ret == -EAGAIN) {
+		ret = 0;
+		goto retry;
+	}
+	sde->descq_full_count++;
+	goto unlock;
+}
+
+/**
+ * sdma_send_txlist() - submit a list of tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx_list: list of sdma_txreqs to submit
+ *
+ * The call submits the list into the ring.
+ *
+ * If the iowait structure is non-NULL and not equal to the iowait list
+ * the unprocessed part of the list  will be appended to the list in wait.
+ *
+ * In all cases, the tx_list will be updated so the head of the tx_list is
+ * the list of descriptors that have yet to be transmitted.
+ *
+ * The intent of this call is to provide a more efficient
+ * way of submitting multiple packets to SDMA while holding the tail
+ * side locking.
+ *
+ * Return:
+ * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring
+ * (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txlist(struct sdma_engine *sde,
+		    struct iowait *wait,
+		    struct list_head *tx_list)
+{
+	struct sdma_txreq *tx, *tx_next;
+	int ret = 0;
+	unsigned long flags;
+	u16 tail = INVALID_TAIL;
+	int count = 0;
+
+	spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+		tx->wait = wait;
+		if (unlikely(!__sdma_running(sde)))
+			goto unlock_noconn;
+		if (unlikely(tx->num_desc > sde->desc_avail))
+			goto nodesc;
+		if (unlikely(tx->tlen)) {
+			ret = -EINVAL;
+			goto update_tail;
+		}
+		list_del_init(&tx->list);
+		tail = submit_tx(sde, tx);
+		count++;
+		if (tail != INVALID_TAIL &&
+		    (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
+			sdma_update_tail(sde, tail);
+			tail = INVALID_TAIL;
+		}
+	}
+update_tail:
+	if (wait)
+		atomic_add(count, &wait->sdma_busy);
+	if (tail != INVALID_TAIL)
+		sdma_update_tail(sde, tail);
+	spin_unlock_irqrestore(&sde->tail_lock, flags);
+	return ret;
+unlock_noconn:
+	spin_lock(&sde->flushlist_lock);
+	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+		tx->wait = wait;
+		list_del_init(&tx->list);
+		if (wait)
+			atomic_inc(&wait->sdma_busy);
+		tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+		tx->sn = sde->tail_sn++;
+		trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+		list_add_tail(&tx->list, &sde->flushlist);
+		if (wait) {
+			wait->tx_count++;
+			wait->count += tx->num_desc;
+		}
+	}
+	spin_unlock(&sde->flushlist_lock);
+	schedule_work(&sde->flush_worker);
+	ret = -ECOMM;
+	goto update_tail;
+nodesc:
+	ret = sdma_check_progress(sde, wait, tx);
+	if (ret == -EAGAIN) {
+		ret = 0;
+		goto retry;
+	}
+	sde->descq_full_count++;
+	goto update_tail;
+}
+
+static void sdma_process_event(struct sdma_engine *sde,
+	enum sdma_events event)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&sde->tail_lock, flags);
+	write_seqlock(&sde->head_lock);
+
+	__sdma_process_event(sde, event);
+
+	if (sde->state.current_state == sdma_state_s99_running)
+		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+
+	write_sequnlock(&sde->head_lock);
+	spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void __sdma_process_event(struct sdma_engine *sde,
+	enum sdma_events event)
+{
+	struct sdma_state *ss = &sde->state;
+	int need_progress = 0;
+
+	/* CONFIG SDMA temporary */
+#ifdef CONFIG_SDMA_VERBOSITY
+	dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
+		   sdma_state_names[ss->current_state],
+		   sdma_event_names[event]);
+#endif
+
+	switch (ss->current_state) {
+	case sdma_state_s00_hw_down:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			break;
+		case sdma_event_e30_go_running:
+			/*
+			 * If down, but running requested (usually result
+			 * of link up, then we need to start up.
+			 * This can happen when hw down is requested while
+			 * bringing the link up with traffic active on
+			 * 7220, e.g. */
+			ss->go_s99_running = 1;
+			/* fall through and start dma engine */
+		case sdma_event_e10_go_hw_start:
+			/* This reference means the state machine is started */
+			sdma_get(&sde->state);
+			sdma_set_state(sde,
+				sdma_state_s10_hw_start_up_halt_wait);
+			break;
+		case sdma_event_e15_hw_halt_done:
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e40_sw_cleaned:
+			sdma_sw_tear_down(sde);
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			break;
+		case sdma_event_e70_go_idle:
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e85_link_down:
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s10_hw_start_up_halt_wait:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_sw_tear_down(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			sdma_set_state(sde,
+				sdma_state_s15_hw_start_up_clean_wait);
+			sdma_start_hw_clean_up(sde);
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			sdma_start_err_halt_wait(sde);
+			break;
+		case sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e85_link_down:
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s15_hw_start_up_clean_wait:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_sw_tear_down(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			sdma_hw_start_up(sde);
+			sdma_set_state(sde, ss->go_s99_running ?
+				       sdma_state_s99_running :
+				       sdma_state_s20_idle);
+			break;
+		case sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			break;
+		case sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e85_link_down:
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s20_idle:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_sw_tear_down(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e30_go_running:
+			sdma_set_state(sde, sdma_state_s99_running);
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+			sdma_start_err_halt_wait(sde);
+			break;
+		case sdma_event_e70_go_idle:
+			break;
+		case sdma_event_e85_link_down:
+			/* fall through */
+		case sdma_event_e80_hw_freeze:
+			sdma_set_state(sde, sdma_state_s80_hw_freeze);
+			atomic_dec(&sde->dd->sdma_unfreeze_count);
+			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s30_sw_clean_up_wait:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
+			sdma_start_hw_clean_up(sde);
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			break;
+		case sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e85_link_down:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s40_hw_clean_up_wait:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			sdma_hw_start_up(sde);
+			sdma_set_state(sde, ss->go_s99_running ?
+				       sdma_state_s99_running :
+				       sdma_state_s20_idle);
+			break;
+		case sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			break;
+		case sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e85_link_down:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s50_hw_halt_wait:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			sdma_start_err_halt_wait(sde);
+			break;
+		case sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e85_link_down:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s60_idle_halt_wait:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			sdma_start_err_halt_wait(sde);
+			break;
+		case sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e85_link_down:
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s80_hw_freeze:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			break;
+		case sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		case sdma_event_e85_link_down:
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s82_freeze_sw_clean:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case sdma_event_e40_sw_cleaned:
+			/* notify caller this engine is done cleaning */
+			atomic_dec(&sde->dd->sdma_unfreeze_count);
+			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			break;
+		case sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case sdma_event_e80_hw_freeze:
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			sdma_hw_start_up(sde);
+			sdma_set_state(sde, ss->go_s99_running ?
+				       sdma_state_s99_running :
+				       sdma_state_s20_idle);
+			break;
+		case sdma_event_e85_link_down:
+			break;
+		case sdma_event_e90_sw_halted:
+			break;
+		}
+		break;
+
+	case sdma_state_s99_running:
+		switch (event) {
+		case sdma_event_e00_go_hw_down:
+			sdma_set_state(sde, sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(sde);
+			break;
+		case sdma_event_e10_go_hw_start:
+			break;
+		case sdma_event_e15_hw_halt_done:
+			break;
+		case sdma_event_e25_hw_clean_up_done:
+			break;
+		case sdma_event_e30_go_running:
+			break;
+		case sdma_event_e40_sw_cleaned:
+			break;
+		case sdma_event_e50_hw_cleaned:
+			break;
+		case sdma_event_e60_hw_halted:
+			need_progress = 1;
+			sdma_err_progress_check_schedule(sde);
+		case sdma_event_e90_sw_halted:
+			/*
+			* SW initiated halt does not perform engines
+			* progress check
+			*/
+			sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+			sdma_start_err_halt_wait(sde);
+			break;
+		case sdma_event_e70_go_idle:
+			sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
+			break;
+		case sdma_event_e85_link_down:
+			ss->go_s99_running = 0;
+			/* fall through */
+		case sdma_event_e80_hw_freeze:
+			sdma_set_state(sde, sdma_state_s80_hw_freeze);
+			atomic_dec(&sde->dd->sdma_unfreeze_count);
+			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+			break;
+		case sdma_event_e81_hw_frozen:
+			break;
+		case sdma_event_e82_hw_unfreeze:
+			break;
+		}
+		break;
+	}
+
+	ss->last_event = event;
+	if (need_progress)
+		sdma_make_progress(sde, 0);
+}
+
+/*
+ * _extend_sdma_tx_descs() - helper to extend txreq
+ *
+ * This is called once the initial nominal allocation
+ * of descriptors in the sdma_txreq is exhausted.
+ *
+ * The code will bump the allocation up to the max
+ * of MAX_DESC (64) descriptors.  There doesn't seem
+ * much point in an interim step.
+ *
+ */
+int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+	int i;
+
+	tx->descp = kmalloc_array(
+			MAX_DESC,
+			sizeof(struct sdma_desc),
+			GFP_ATOMIC);
+	if (!tx->descp)
+		return -ENOMEM;
+	tx->desc_limit = MAX_DESC;
+	/* copy ones already built */
+	for (i = 0; i < tx->num_desc; i++)
+		tx->descp[i] = tx->descs[i];
+	return 0;
+}
+
+/* Update sdes when the lmc changes */
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
+{
+	struct sdma_engine *sde;
+	int i;
+	u64 sreg;
+
+	sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
+		SD(CHECK_SLID_MASK_SHIFT)) |
+		(((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
+		SD(CHECK_SLID_VALUE_SHIFT));
+
+	for (i = 0; i < dd->num_sdma; i++) {
+		hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
+			  i, (u32)sreg);
+		sde = &dd->per_sdma[i];
+		write_sde_csr(sde, SD(CHECK_SLID), sreg);
+	}
+}
+
+/* tx not dword sized - pad */
+int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+	int rval = 0;
+
+	if ((unlikely(tx->num_desc == tx->desc_limit))) {
+		rval = _extend_sdma_tx_descs(dd, tx);
+		if (rval)
+			return rval;
+	}
+	/* finish the one just added  */
+	tx->num_desc++;
+	make_tx_sdma_desc(
+		tx,
+		SDMA_MAP_NONE,
+		dd->sdma_pad_phys,
+		sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
+	_sdma_close_tx(dd, tx);
+	return rval;
+}
+
+/*
+ * Add ahg to the sdma_txreq
+ *
+ * The logic will consume up to 3
+ * descriptors at the beginning of
+ * sdma_txreq.
+ */
+void _sdma_txreq_ahgadd(
+	struct sdma_txreq *tx,
+	u8 num_ahg,
+	u8 ahg_entry,
+	u32 *ahg,
+	u8 ahg_hlen)
+{
+	u32 i, shift = 0, desc = 0;
+	u8 mode;
+
+	WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
+	/* compute mode */
+	if (num_ahg == 1)
+		mode = SDMA_AHG_APPLY_UPDATE1;
+	else if (num_ahg <= 5)
+		mode = SDMA_AHG_APPLY_UPDATE2;
+	else
+		mode = SDMA_AHG_APPLY_UPDATE3;
+	tx->num_desc++;
+	/* initialize to consumed descriptors to zero */
+	switch (mode) {
+	case SDMA_AHG_APPLY_UPDATE3:
+		tx->num_desc++;
+		tx->descs[2].qw[0] = 0;
+		tx->descs[2].qw[1] = 0;
+		/* FALLTHROUGH */
+	case SDMA_AHG_APPLY_UPDATE2:
+		tx->num_desc++;
+		tx->descs[1].qw[0] = 0;
+		tx->descs[1].qw[1] = 0;
+		break;
+	}
+	ahg_hlen >>= 2;
+	tx->descs[0].qw[1] |=
+		(((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+			<< SDMA_DESC1_HEADER_INDEX_SHIFT) |
+		(((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
+			<< SDMA_DESC1_HEADER_DWS_SHIFT) |
+		(((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
+			<< SDMA_DESC1_HEADER_MODE_SHIFT) |
+		(((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
+			<< SDMA_DESC1_HEADER_UPDATE1_SHIFT);
+	for (i = 0; i < (num_ahg - 1); i++) {
+		if (!shift && !(i & 2))
+			desc++;
+		tx->descs[desc].qw[!!(i & 2)] |=
+			(((u64)ahg[i + 1])
+				<< shift);
+		shift = (shift + 32) & 63;
+	}
+}
+
+/**
+ * sdma_ahg_alloc - allocate an AHG entry
+ * @sde: engine to allocate from
+ *
+ * Return:
+ * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
+ * -ENOSPC if an entry is not available
+ */
+int sdma_ahg_alloc(struct sdma_engine *sde)
+{
+	int nr;
+	int oldbit;
+
+	if (!sde) {
+		trace_hfi1_ahg_allocate(sde, -EINVAL);
+		return -EINVAL;
+	}
+	while (1) {
+		nr = ffz(ACCESS_ONCE(sde->ahg_bits));
+		if (nr > 31) {
+			trace_hfi1_ahg_allocate(sde, -ENOSPC);
+			return -ENOSPC;
+		}
+		oldbit = test_and_set_bit(nr, &sde->ahg_bits);
+		if (!oldbit)
+			break;
+		cpu_relax();
+	}
+	trace_hfi1_ahg_allocate(sde, nr);
+	return nr;
+}
+
+/**
+ * sdma_ahg_free - free an AHG entry
+ * @sde: engine to return AHG entry
+ * @ahg_index: index to free
+ *
+ * This routine frees the indicate AHG entry.
+ */
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
+{
+	if (!sde)
+		return;
+	trace_hfi1_ahg_deallocate(sde, ahg_index);
+	if (ahg_index < 0 || ahg_index > 31)
+		return;
+	clear_bit(ahg_index, &sde->ahg_bits);
+}
+
+/*
+ * SPC freeze handling for SDMA engines.  Called when the driver knows
+ * the SPC is going into a freeze but before the freeze is fully
+ * settled.  Generally an error interrupt.
+ *
+ * This event will pull the engine out of running so no more entries can be
+ * added to the engine's queue.
+ */
+void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
+{
+	int i;
+	enum sdma_events event = link_down ? sdma_event_e85_link_down :
+					     sdma_event_e80_hw_freeze;
+
+	/* set up the wait but do not wait here */
+	atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+	/* tell all engines to stop running and wait */
+	for (i = 0; i < dd->num_sdma; i++)
+		sdma_process_event(&dd->per_sdma[i], event);
+
+	/* sdma_freeze() will wait for all engines to have stopped */
+}
+
+/*
+ * SPC freeze handling for SDMA engines.  Called when the driver knows
+ * the SPC is fully frozen.
+ */
+void sdma_freeze(struct hfi1_devdata *dd)
+{
+	int i;
+	int ret;
+
+	/*
+	 * Make sure all engines have moved out of the running state before
+	 * continuing.
+	 */
+	ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
+				atomic_read(&dd->sdma_unfreeze_count) <= 0);
+	/* interrupted or count is negative, then unloading - just exit */
+	if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
+		return;
+
+	/* set up the count for the next wait */
+	atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+	/* tell all engines that the SPC is frozen, they can start cleaning */
+	for (i = 0; i < dd->num_sdma; i++)
+		sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
+
+	/*
+	 * Wait for everyone to finish software clean before exiting.  The
+	 * software clean will read engine CSRs, so must be completed before
+	 * the next step, which will clear the engine CSRs.
+	 */
+	(void) wait_event_interruptible(dd->sdma_unfreeze_wq,
+				atomic_read(&dd->sdma_unfreeze_count) <= 0);
+	/* no need to check results - done no matter what */
+}
+
+/*
+ * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
+ *
+ * The SPC freeze acts like a SDMA halt and a hardware clean combined.  All
+ * that is left is a software clean.  We could do it after the SPC is fully
+ * frozen, but then we'd have to add another state to wait for the unfreeze.
+ * Instead, just defer the software clean until the unfreeze step.
+ */
+void sdma_unfreeze(struct hfi1_devdata *dd)
+{
+	int i;
+
+	/* tell all engines start freeze clean up */
+	for (i = 0; i < dd->num_sdma; i++)
+		sdma_process_event(&dd->per_sdma[i],
+					sdma_event_e82_hw_unfreeze);
+}
+
+/**
+ * _sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ */
+void _sdma_engine_progress_schedule(
+	struct sdma_engine *sde)
+{
+	trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
+	/* assume we have selected a good cpu */
+	write_csr(sde->dd,
+		  CCE_INT_FORCE + (8*(IS_SDMA_START/64)), sde->progress_mask);
+}
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h
new file mode 100644
index 0000000..1e613fc
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sdma.h
@@ -0,0 +1,1123 @@
+#ifndef _HFI1_SDMA_H
+#define _HFI1_SDMA_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+
+#include "hfi.h"
+#include "verbs.h"
+
+/* increased for AHG */
+#define NUM_DESC 6
+/* Hardware limit */
+#define MAX_DESC 64
+/* Hardware limit for SDMA packet size */
+#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
+
+
+#define SDMA_TXREQ_S_OK        0
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED   2
+#define SDMA_TXREQ_S_SHUTDOWN  3
+
+/* flags bits */
+#define SDMA_TXREQ_F_URGENT       0x0001
+#define SDMA_TXREQ_F_AHG_COPY     0x0002
+#define SDMA_TXREQ_F_USE_AHG      0x0004
+
+#define SDMA_MAP_NONE          0
+#define SDMA_MAP_SINGLE        1
+#define SDMA_MAP_PAGE          2
+
+#define SDMA_AHG_VALUE_MASK          0xffff
+#define SDMA_AHG_VALUE_SHIFT         0
+#define SDMA_AHG_INDEX_MASK          0xf
+#define SDMA_AHG_INDEX_SHIFT         16
+#define SDMA_AHG_FIELD_LEN_MASK      0xf
+#define SDMA_AHG_FIELD_LEN_SHIFT     20
+#define SDMA_AHG_FIELD_START_MASK    0x1f
+#define SDMA_AHG_FIELD_START_SHIFT   24
+#define SDMA_AHG_UPDATE_ENABLE_MASK  0x1
+#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
+
+/* AHG modes */
+
+/*
+ * Be aware the ordering and values
+ * for SDMA_AHG_APPLY_UPDATE[123]
+ * are assumed in generating a skip
+ * count in submit_tx() in sdma.c
+ */
+#define SDMA_AHG_NO_AHG              0
+#define SDMA_AHG_COPY                1
+#define SDMA_AHG_APPLY_UPDATE1       2
+#define SDMA_AHG_APPLY_UPDATE2       3
+#define SDMA_AHG_APPLY_UPDATE3       4
+
+/*
+ * Bits defined in the send DMA descriptor.
+ */
+#define SDMA_DESC0_FIRST_DESC_FLAG      (1ULL<<63)
+#define SDMA_DESC0_LAST_DESC_FLAG       (1ULL<<62)
+#define SDMA_DESC0_BYTE_COUNT_SHIFT     48
+#define SDMA_DESC0_BYTE_COUNT_WIDTH     14
+#define SDMA_DESC0_BYTE_COUNT_MASK \
+	((1ULL<<SDMA_DESC0_BYTE_COUNT_WIDTH)-1ULL)
+#define SDMA_DESC0_BYTE_COUNT_SMASK \
+	(SDMA_DESC0_BYTE_COUNT_MASK<<SDMA_DESC0_BYTE_COUNT_SHIFT)
+#define SDMA_DESC0_PHY_ADDR_SHIFT       0
+#define SDMA_DESC0_PHY_ADDR_WIDTH       48
+#define SDMA_DESC0_PHY_ADDR_MASK \
+	((1ULL<<SDMA_DESC0_PHY_ADDR_WIDTH)-1ULL)
+#define SDMA_DESC0_PHY_ADDR_SMASK \
+	(SDMA_DESC0_PHY_ADDR_MASK<<SDMA_DESC0_PHY_ADDR_SHIFT)
+
+#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
+#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
+#define SDMA_DESC1_HEADER_UPDATE1_MASK \
+	((1ULL<<SDMA_DESC1_HEADER_UPDATE1_WIDTH)-1ULL)
+#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
+	(SDMA_DESC1_HEADER_UPDATE1_MASK<<SDMA_DESC1_HEADER_UPDATE1_SHIFT)
+#define SDMA_DESC1_HEADER_MODE_SHIFT    13
+#define SDMA_DESC1_HEADER_MODE_WIDTH    3
+#define SDMA_DESC1_HEADER_MODE_MASK \
+	((1ULL<<SDMA_DESC1_HEADER_MODE_WIDTH)-1ULL)
+#define SDMA_DESC1_HEADER_MODE_SMASK \
+	(SDMA_DESC1_HEADER_MODE_MASK<<SDMA_DESC1_HEADER_MODE_SHIFT)
+#define SDMA_DESC1_HEADER_INDEX_SHIFT   8
+#define SDMA_DESC1_HEADER_INDEX_WIDTH   5
+#define SDMA_DESC1_HEADER_INDEX_MASK \
+	((1ULL<<SDMA_DESC1_HEADER_INDEX_WIDTH)-1ULL)
+#define SDMA_DESC1_HEADER_INDEX_SMASK \
+	(SDMA_DESC1_HEADER_INDEX_MASK<<SDMA_DESC1_HEADER_INDEX_SHIFT)
+#define SDMA_DESC1_HEADER_DWS_SHIFT     4
+#define SDMA_DESC1_HEADER_DWS_WIDTH     4
+#define SDMA_DESC1_HEADER_DWS_MASK \
+	((1ULL<<SDMA_DESC1_HEADER_DWS_WIDTH)-1ULL)
+#define SDMA_DESC1_HEADER_DWS_SMASK \
+	(SDMA_DESC1_HEADER_DWS_MASK<<SDMA_DESC1_HEADER_DWS_SHIFT)
+#define SDMA_DESC1_GENERATION_SHIFT     2
+#define SDMA_DESC1_GENERATION_WIDTH     2
+#define SDMA_DESC1_GENERATION_MASK \
+	((1ULL<<SDMA_DESC1_GENERATION_WIDTH)-1ULL)
+#define SDMA_DESC1_GENERATION_SMASK \
+	(SDMA_DESC1_GENERATION_MASK<<SDMA_DESC1_GENERATION_SHIFT)
+#define SDMA_DESC1_INT_REQ_FLAG         (1ULL<<1)
+#define SDMA_DESC1_HEAD_TO_HOST_FLAG    (1ULL<<0)
+
+enum sdma_states {
+	sdma_state_s00_hw_down,
+	sdma_state_s10_hw_start_up_halt_wait,
+	sdma_state_s15_hw_start_up_clean_wait,
+	sdma_state_s20_idle,
+	sdma_state_s30_sw_clean_up_wait,
+	sdma_state_s40_hw_clean_up_wait,
+	sdma_state_s50_hw_halt_wait,
+	sdma_state_s60_idle_halt_wait,
+	sdma_state_s80_hw_freeze,
+	sdma_state_s82_freeze_sw_clean,
+	sdma_state_s99_running,
+};
+
+enum sdma_events {
+	sdma_event_e00_go_hw_down,
+	sdma_event_e10_go_hw_start,
+	sdma_event_e15_hw_halt_done,
+	sdma_event_e25_hw_clean_up_done,
+	sdma_event_e30_go_running,
+	sdma_event_e40_sw_cleaned,
+	sdma_event_e50_hw_cleaned,
+	sdma_event_e60_hw_halted,
+	sdma_event_e70_go_idle,
+	sdma_event_e80_hw_freeze,
+	sdma_event_e81_hw_frozen,
+	sdma_event_e82_hw_unfreeze,
+	sdma_event_e85_link_down,
+	sdma_event_e90_sw_halted,
+};
+
+struct sdma_set_state_action {
+	unsigned op_enable:1;
+	unsigned op_intenable:1;
+	unsigned op_halt:1;
+	unsigned op_cleanup:1;
+	unsigned go_s99_running_tofalse:1;
+	unsigned go_s99_running_totrue:1;
+};
+
+struct sdma_state {
+	struct kref          kref;
+	struct completion    comp;
+	enum sdma_states current_state;
+	unsigned             current_op;
+	unsigned             go_s99_running;
+	/* debugging/development */
+	enum sdma_states previous_state;
+	unsigned             previous_op;
+	enum sdma_events last_event;
+};
+
+/**
+ * DOC: sdma exported routines
+ *
+ * These sdma routines fit into three categories:
+ * - The SDMA API for building and submitting packets
+ *   to the ring
+ *
+ * - Initialization and tear down routines to buildup
+ *   and tear down SDMA
+ *
+ * - ISR entrances to handle interrupts, state changes
+ *   and errors
+ */
+
+/**
+ * DOC: sdma PSM/verbs API
+ *
+ * The sdma API is designed to be used by both PSM
+ * and verbs to supply packets to the SDMA ring.
+ *
+ * The usage of the API is as follows:
+ *
+ * Embed a struct iowait in the QP or
+ * PQ.  The iowait should be initialized with a
+ * call to iowait_init().
+ *
+ * The user of the API should create an allocation method
+ * for their version of the txreq. slabs, pre-allocated lists,
+ * and dma pools can be used.  Once the user's overload of
+ * the sdma_txreq has been allocated, the sdma_txreq member
+ * must be initialized with sdma_txinit() or sdma_txinit_ahg().
+ *
+ * The txreq must be declared with the sdma_txreq first.
+ *
+ * The tx request, once initialized,  is manipulated with calls to
+ * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
+ * for each disjoint memory location.  It is the user's responsibility
+ * to understand the packet boundaries and page boundaries to do the
+ * appropriate number of sdma_txadd_* calls..  The user
+ * must be prepared to deal with failures from these routines due to
+ * either memory allocation or dma_mapping failures.
+ *
+ * The mapping specifics for each memory location are recorded
+ * in the tx. Memory locations added with sdma_txadd_page()
+ * and sdma_txadd_kvaddr() are automatically mapped when added
+ * to the tx and nmapped as part of the progress processing in the
+ * SDMA interrupt handling.
+ *
+ * sdma_txadd_daddr() is used to add an dma_addr_t memory to the
+ * tx.   An example of a use case would be a pre-allocated
+ * set of headers allocated via dma_pool_alloc() or
+ * dma_alloc_coherent().  For these memory locations, it
+ * is the responsibility of the user to handle that unmapping.
+ * (This would usually be at an unload or job termination.)
+ *
+ * The routine sdma_send_txreq() is used to submit
+ * a tx to the ring after the appropriate number of
+ * sdma_txadd_* have been done.
+ *
+ * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
+ * can be used to submit a list of packets.
+ *
+ * The user is free to use the link overhead in the struct sdma_txreq as
+ * long as the tx isn't in flight.
+ *
+ * The extreme degenerate case of the number of descriptors
+ * exceeding the ring size is automatically handled as
+ * memory locations are added.  An overflow of the descriptor
+ * array that is part of the sdma_txreq is also automatically
+ * handled.
+ *
+ */
+
+/**
+ * DOC: Infrastructure calls
+ *
+ * sdma_init() is used to initialize data structures and
+ * CSRs for the desired number of SDMA engines.
+ *
+ * sdma_start() is used to kick the SDMA engines initialized
+ * with sdma_init().   Interrupts must be enabled at this
+ * point since aspects of the state machine are interrupt
+ * driven.
+ *
+ * sdma_engine_error() and sdma_engine_interrupt() are
+ * entrances for interrupts.
+ *
+ * sdma_map_init() is for the management of the mapping
+ * table when the number of vls is changed.
+ *
+ */
+
+/*
+ * struct hw_sdma_desc - raw 128 bit SDMA descriptor
+ *
+ * This is the raw descriptor in the SDMA ring
+ */
+struct hw_sdma_desc {
+	/* private:  don't use directly */
+	__le64 qw[2];
+};
+
+/*
+ * struct sdma_desc - canonical fragment descriptor
+ *
+ * This is the descriptor carried in the tx request
+ * corresponding to each fragment.
+ *
+ */
+struct sdma_desc {
+	/* private:  don't use directly */
+	u64 qw[2];
+};
+
+struct sdma_txreq;
+typedef void (*callback_t)(struct sdma_txreq *, int, int);
+
+/**
+ * struct sdma_txreq - the sdma_txreq structure (one per packet)
+ * @list: for use by user and by queuing for wait
+ *
+ * This is the representation of a packet which consists of some
+ * number of fragments.   Storage is provided to within the structure.
+ * for all fragments.
+ *
+ * The storage for the descriptors are automatically extended as needed
+ * when the currently allocation is exceeded.
+ *
+ * The user (Verbs or PSM) may overload this structure with fields
+ * specific to their use by putting this struct first in their struct.
+ * The method of allocation of the overloaded structure is user dependent
+ *
+ * The list is the only public field in the structure.
+ *
+ */
+
+struct sdma_txreq {
+	struct list_head list;
+	/* private: */
+	struct sdma_desc *descp;
+	/* private: */
+	void *coalesce_buf;
+	/* private: */
+	struct iowait *wait;
+	/* private: */
+	callback_t                  complete;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+	u64 sn;
+#endif
+	/* private: - used in coalesce/pad processing */
+	u16                         packet_len;
+	/* private: - down-counted to trigger last */
+	u16                         tlen;
+	/* private: flags */
+	u16                         flags;
+	/* private: */
+	u16                         num_desc;
+	/* private: */
+	u16                         desc_limit;
+	/* private: */
+	u16                         next_descq_idx;
+	/* private: */
+	struct sdma_desc descs[NUM_DESC];
+};
+
+struct verbs_txreq {
+	struct hfi1_pio_header	phdr;
+	struct sdma_txreq       txreq;
+	struct hfi1_qp           *qp;
+	struct hfi1_swqe         *wqe;
+	struct hfi1_mregion	*mr;
+	struct hfi1_sge_state    *ss;
+	struct sdma_engine     *sde;
+	u16                     hdr_dwords;
+	u16                     hdr_inx;
+};
+
+/**
+ * struct sdma_engine - Data pertaining to each SDMA engine.
+ * @dd: a back-pointer to the device data
+ * @ppd: per port back-pointer
+ * @imask: mask for irq manipulation
+ * @idle_mask: mask for determining if an interrupt is due to sdma_idle
+ *
+ * This structure has the state for each sdma_engine.
+ *
+ * Accessing to non public fields are not supported
+ * since the private members are subject to change.
+ */
+struct sdma_engine {
+	/* read mostly */
+	struct hfi1_devdata *dd;
+	struct hfi1_pportdata *ppd;
+	/* private: */
+	void __iomem *tail_csr;
+	u64 imask;			/* clear interrupt mask */
+	u64 idle_mask;
+	u64 progress_mask;
+	/* private: */
+	struct workqueue_struct *wq;
+	/* private: */
+	volatile __le64      *head_dma; /* DMA'ed by chip */
+	/* private: */
+	dma_addr_t            head_phys;
+	/* private: */
+	struct hw_sdma_desc *descq;
+	/* private: */
+	unsigned descq_full_count;
+	struct sdma_txreq **tx_ring;
+	/* private: */
+	dma_addr_t            descq_phys;
+	/* private */
+	u32 sdma_mask;
+	/* private */
+	struct sdma_state state;
+	/* private: */
+	u8 sdma_shift;
+	/* private: */
+	u8 this_idx; /* zero relative engine */
+	/* protect changes to senddmactrl shadow */
+	spinlock_t senddmactrl_lock;
+	/* private: */
+	u64 p_senddmactrl;		/* shadow per-engine SendDmaCtrl */
+
+	/* read/write using tail_lock */
+	spinlock_t            tail_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+	/* private: */
+	u64                   tail_sn;
+#endif
+	/* private: */
+	u32                   descq_tail;
+	/* private: */
+	unsigned long         ahg_bits;
+	/* private: */
+	u16                   desc_avail;
+	/* private: */
+	u16                   tx_tail;
+	/* private: */
+	u16 descq_cnt;
+
+	/* read/write using head_lock */
+	/* private: */
+	seqlock_t            head_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+	/* private: */
+	u64                   head_sn;
+#endif
+	/* private: */
+	u32                   descq_head;
+	/* private: */
+	u16                   tx_head;
+	/* private: */
+	u64                   last_status;
+
+	/* private: */
+	struct list_head      dmawait;
+
+	/* CONFIG SDMA for now, just blindly duplicate */
+	/* private: */
+	struct tasklet_struct sdma_hw_clean_up_task
+		____cacheline_aligned_in_smp;
+
+	/* private: */
+	struct tasklet_struct sdma_sw_clean_up_task
+		____cacheline_aligned_in_smp;
+	/* private: */
+	struct work_struct err_halt_worker;
+	/* private */
+	struct timer_list     err_progress_check_timer;
+	u32                   progress_check_head;
+	/* private: */
+	struct work_struct flush_worker;
+	spinlock_t flushlist_lock;
+	/* private: */
+	struct list_head flushlist;
+};
+
+
+int sdma_init(struct hfi1_devdata *dd, u8 port);
+void sdma_start(struct hfi1_devdata *dd);
+void sdma_exit(struct hfi1_devdata *dd);
+void sdma_all_running(struct hfi1_devdata *dd);
+void sdma_all_idle(struct hfi1_devdata *dd);
+void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
+void sdma_freeze(struct hfi1_devdata *dd);
+void sdma_unfreeze(struct hfi1_devdata *dd);
+void sdma_wait(struct hfi1_devdata *dd);
+
+/**
+ * sdma_empty() - idle engine test
+ * @engine: sdma engine
+ *
+ * Currently used by verbs as a latency optimization.
+ *
+ * Return:
+ * 1 - empty, 0 - non-empty
+ */
+static inline int sdma_empty(struct sdma_engine *sde)
+{
+	return sde->descq_tail == sde->descq_head;
+}
+
+static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
+{
+	return sde->descq_cnt -
+		(sde->descq_tail -
+		 ACCESS_ONCE(sde->descq_head)) - 1;
+}
+
+static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
+{
+	return sde->descq_cnt - sdma_descq_freecnt(sde);
+}
+
+/*
+ * Either head_lock or tail lock required to see
+ * a steady state.
+ */
+static inline int __sdma_running(struct sdma_engine *engine)
+{
+	return engine->state.current_state == sdma_state_s99_running;
+}
+
+
+/**
+ * sdma_running() - state suitability test
+ * @engine: sdma engine
+ *
+ * sdma_running probes the internal state to determine if it is suitable
+ * for submitting packets.
+ *
+ * Return:
+ * 1 - ok to submit, 0 - not ok to submit
+ *
+ */
+static inline int sdma_running(struct sdma_engine *engine)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&engine->tail_lock, flags);
+	ret = __sdma_running(engine);
+	spin_unlock_irqrestore(&engine->tail_lock, flags);
+	return ret;
+}
+
+void _sdma_txreq_ahgadd(
+	struct sdma_txreq *tx,
+	u8 num_ahg,
+	u8 ahg_entry,
+	u32 *ahg,
+	u8 ahg_hlen);
+
+
+/**
+ * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @ahg_entry: ahg entry to use  (0 - 31)
+ * @num_ahg: ahg descriptor for first descriptor (0 - 9)
+ * @ahg: array of AHG descriptors (up to 9 entries)
+ * @ahg_hlen: number of bytes from ASIC entry to use
+ * @cb: callback
+ *
+ * The allocation of the sdma_txreq and it enclosing structure is user
+ * dependent.  This routine must be called to initialize the user independent
+ * fields.
+ *
+ * The currently supported flags are SDMA_TXREQ_F_URGENT,
+ * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
+ * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in
+ * the AHG descriptors into the first 1 to 3 descriptors.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.   Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted. The callback will be provided this tx, a status, and a flag.
+ *
+ * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ * The flag, if the is the iowait had been used, indicates the iowait
+ * sdma_busy count has reached zero.
+ *
+ * user data portion of tlen should be precise.   The sdma_txadd_* entrances
+ * will pad with a descriptor references 1 - 3 bytes when the number of bytes
+ * specified in tlen have been supplied to the sdma_txreq.
+ *
+ * ahg_hlen is used to determine the number of on-chip entry bytes to
+ * use as the header.   This is for cases where the stored header is
+ * larger than the header to be used in a packet.  This is typical
+ * for verbs where an RDMA_WRITE_FIRST is larger than the packet in
+ * and RDMA_WRITE_MIDDLE.
+ *
+ */
+static inline int sdma_txinit_ahg(
+	struct sdma_txreq *tx,
+	u16 flags,
+	u16 tlen,
+	u8 ahg_entry,
+	u8 num_ahg,
+	u32 *ahg,
+	u8 ahg_hlen,
+	void (*cb)(struct sdma_txreq *, int, int))
+{
+	if (tlen == 0)
+		return -ENODATA;
+	if (tlen > MAX_SDMA_PKT_SIZE)
+		return -EMSGSIZE;
+	tx->desc_limit = ARRAY_SIZE(tx->descs);
+	tx->descp = &tx->descs[0];
+	INIT_LIST_HEAD(&tx->list);
+	tx->num_desc = 0;
+	tx->flags = flags;
+	tx->complete = cb;
+	tx->coalesce_buf = NULL;
+	tx->wait = NULL;
+	tx->tlen = tx->packet_len = tlen;
+	tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
+	tx->descs[0].qw[1] = 0;
+	if (flags & SDMA_TXREQ_F_AHG_COPY)
+		tx->descs[0].qw[1] |=
+			(((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+				<< SDMA_DESC1_HEADER_INDEX_SHIFT) |
+			(((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
+				<< SDMA_DESC1_HEADER_MODE_SHIFT);
+	else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
+		_sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
+	return 0;
+}
+
+/**
+ * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @cb: callback pointer
+ *
+ * The allocation of the sdma_txreq and it enclosing structure is user
+ * dependent.  This routine must be called to initialize the user
+ * independent fields.
+ *
+ * The currently supported flags is SDMA_TXREQ_F_URGENT.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.   The head size of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted.
+ *
+ * The callback, if non-NULL,  will be provided this tx and a status.  The
+ * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ */
+static inline int sdma_txinit(
+	struct sdma_txreq *tx,
+	u16 flags,
+	u16 tlen,
+	void (*cb)(struct sdma_txreq *, int, int))
+{
+	return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
+}
+
+/* helpers - don't use */
+static inline int sdma_mapping_type(struct sdma_desc *d)
+{
+	return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
+		>> SDMA_DESC1_GENERATION_SHIFT;
+}
+
+static inline size_t sdma_mapping_len(struct sdma_desc *d)
+{
+	return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
+		>> SDMA_DESC0_BYTE_COUNT_SHIFT;
+}
+
+static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
+{
+	return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
+		>> SDMA_DESC0_PHY_ADDR_SHIFT;
+}
+
+static inline void make_tx_sdma_desc(
+	struct sdma_txreq *tx,
+	int type,
+	dma_addr_t addr,
+	size_t len)
+{
+	struct sdma_desc *desc = &tx->descp[tx->num_desc];
+
+	if (!tx->num_desc) {
+		/* qw[0] zero; qw[1] first, ahg mode already in from init */
+		desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
+				<< SDMA_DESC1_GENERATION_SHIFT;
+	} else {
+		desc->qw[0] = 0;
+		desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
+				<< SDMA_DESC1_GENERATION_SHIFT;
+	}
+	desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
+				<< SDMA_DESC0_PHY_ADDR_SHIFT) |
+			(((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
+				<< SDMA_DESC0_BYTE_COUNT_SHIFT);
+}
+
+/* helper to extend txreq */
+int _extend_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
+
+/* helpers used by public routines */
+static inline void _sdma_close_tx(struct hfi1_devdata *dd,
+				  struct sdma_txreq *tx)
+{
+	tx->descp[tx->num_desc].qw[0] |=
+		SDMA_DESC0_LAST_DESC_FLAG;
+	tx->descp[tx->num_desc].qw[1] |=
+		dd->default_desc1;
+	if (tx->flags & SDMA_TXREQ_F_URGENT)
+		tx->descp[tx->num_desc].qw[1] |=
+			(SDMA_DESC1_HEAD_TO_HOST_FLAG|
+			 SDMA_DESC1_INT_REQ_FLAG);
+}
+
+static inline int _sdma_txadd_daddr(
+	struct hfi1_devdata *dd,
+	int type,
+	struct sdma_txreq *tx,
+	dma_addr_t addr,
+	u16 len)
+{
+	int rval = 0;
+
+	if ((unlikely(tx->num_desc == tx->desc_limit))) {
+		rval = _extend_sdma_tx_descs(dd, tx);
+		if (rval)
+			return rval;
+	}
+	make_tx_sdma_desc(
+		tx,
+		type,
+		addr, len);
+	WARN_ON(len > tx->tlen);
+	tx->tlen -= len;
+	/* special cases for last */
+	if (!tx->tlen) {
+		if (tx->packet_len & (sizeof(u32) - 1))
+			rval = _pad_sdma_tx_descs(dd, tx);
+		else
+			_sdma_close_tx(dd, tx);
+	}
+	tx->num_desc++;
+	return rval;
+}
+
+/**
+ * sdma_txadd_page() - add a page to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: tx request to which the page is added
+ * @page: page to map
+ * @offset: offset within the page
+ * @len: length in bytes
+ *
+ * This is used to add a page/offset/length descriptor.
+ *
+ * The mapping/unmapping of the page/offset/len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
+ * extend descriptor array or couldn't allocate coalesce
+ * buffer.
+ *
+ */
+static inline int sdma_txadd_page(
+	struct hfi1_devdata *dd,
+	struct sdma_txreq *tx,
+	struct page *page,
+	unsigned long offset,
+	u16 len)
+{
+	dma_addr_t addr =
+		dma_map_page(
+			&dd->pcidev->dev,
+			page,
+			offset,
+			len,
+			DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+		sdma_txclean(dd, tx);
+		return -ENOSPC;
+	}
+	return _sdma_txadd_daddr(
+			dd, SDMA_MAP_PAGE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_daddr() - add a dma address to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @addr: dma address mapped by caller
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor for memory that is already dma mapped.
+ *
+ * In this case, there is no unmapping as part of the progress processing for
+ * this memory location.
+ *
+ * Return:
+ * 0 - success, -ENOMEM - couldn't extend descriptor array
+ */
+
+static inline int sdma_txadd_daddr(
+	struct hfi1_devdata *dd,
+	struct sdma_txreq *tx,
+	dma_addr_t addr,
+	u16 len)
+{
+	return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @kvaddr: the kernel virtual address
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor referenced by the indicated kvaddr and
+ * len.
+ *
+ * The mapping/unmapping of the kvaddr and len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend
+ * descriptor array
+ */
+static inline int sdma_txadd_kvaddr(
+	struct hfi1_devdata *dd,
+	struct sdma_txreq *tx,
+	void *kvaddr,
+	u16 len)
+{
+	dma_addr_t addr =
+		dma_map_single(
+			&dd->pcidev->dev,
+			kvaddr,
+			len,
+			DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+		sdma_txclean(dd, tx);
+		return -ENOSPC;
+	}
+	return _sdma_txadd_daddr(
+			dd, SDMA_MAP_SINGLE, tx, addr, len);
+}
+
+struct iowait;
+
+int sdma_send_txreq(struct sdma_engine *sde,
+		    struct iowait *wait,
+		    struct sdma_txreq *tx);
+int sdma_send_txlist(struct sdma_engine *sde,
+		     struct iowait *wait,
+		     struct list_head *tx_list);
+
+int sdma_ahg_alloc(struct sdma_engine *sde);
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
+
+/**
+ * sdma_build_ahg - build ahg descriptor
+ * @data
+ * @dwindex
+ * @startbit
+ * @bits
+ *
+ * Build and return a 32 bit descriptor.
+ */
+static inline u32 sdma_build_ahg_descriptor(
+	u16 data,
+	u8 dwindex,
+	u8 startbit,
+	u8 bits)
+{
+	return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
+		((startbit & SDMA_AHG_FIELD_START_MASK) <<
+		SDMA_AHG_FIELD_START_SHIFT) |
+		((bits & SDMA_AHG_FIELD_LEN_MASK) <<
+		SDMA_AHG_FIELD_LEN_SHIFT) |
+		((dwindex & SDMA_AHG_INDEX_MASK) <<
+		SDMA_AHG_INDEX_SHIFT) |
+		((data & SDMA_AHG_VALUE_MASK) <<
+		SDMA_AHG_VALUE_SHIFT));
+}
+
+/**
+ * sdma_progress - use seq number of detect head progress
+ * @sde: sdma_engine to check
+ * @seq: base seq count
+ * @tx: txreq for which we need to check descriptor availability
+ *
+ * This is used in the appropriate spot in the sleep routine
+ * to check for potential ring progress.  This routine gets the
+ * seqcount before queuing the iowait structure for progress.
+ *
+ * If the seqcount indicates that progress needs to be checked,
+ * re-submission is detected by checking whether the descriptor
+ * queue has enough descriptor for the txreq.
+ */
+static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
+				     struct sdma_txreq *tx)
+{
+	if (read_seqretry(&sde->head_lock, seq)) {
+		sde->desc_avail = sdma_descq_freecnt(sde);
+		if (tx->num_desc > sde->desc_avail)
+			return 0;
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * sdma_iowait_schedule() - initialize wait structure
+ * @sde: sdma_engine to schedule
+ * @wait: wait struct to schedule
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+static inline void sdma_iowait_schedule(
+	struct sdma_engine *sde,
+	struct iowait *wait)
+{
+	iowait_schedule(wait, sde->wq);
+}
+
+/* for use by interrupt handling */
+void sdma_engine_error(struct sdma_engine *sde, u64 status);
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
+
+/*
+ *
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform engines per vl, the
+ * number of engines for a vl is either the vl_engines[vl] or
+ * a computation based on num_sdma/num_vls:
+ *
+ * For example:
+ * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where there are num_sdma/num_vls doesn't divide
+ * evenly, the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the engines are assigned
+ * in a round robin fashion wrapping back to the first engine
+ * for a particular vl.
+ *
+ *               dd->sdma_map
+ *                    |                                   sdma_map_elem[0]
+ *                    |                                +--------------------+
+ *                    v                                |       mask         |
+ *               sdma_vl_map                           |--------------------|
+ *      +--------------------------+                   | sde[0] -> eng 1    |
+ *      |    list (RCU)            |                   |--------------------|
+ *      |--------------------------|                 ->| sde[1] -> eng 2    |
+ *      |    mask                  |              --/  |--------------------|
+ *      |--------------------------|            -/     |        *           |
+ *      |    actual_vls (max 8)    |          -/       |--------------------|
+ *      |--------------------------|       --/         | sde[n] -> eng n    |
+ *      |    vls (max 8)           |     -/            +--------------------+
+ *      |--------------------------|  --/
+ *      |    map[0]                |-/
+ *      |--------------------------|                   +--------------------+
+ *      |    map[1]                |---                |       mask         |
+ *      |--------------------------|   \----           |--------------------|
+ *      |           *              |        \--        | sde[0] -> eng 1+n  |
+ *      |           *              |           \----   |--------------------|
+ *      |           *              |                \->| sde[1] -> eng 2+n  |
+ *      |--------------------------|                   |--------------------|
+ *      |   map[vls - 1]           |-                  |         *          |
+ *      +--------------------------+ \-                |--------------------|
+ *                                     \-              | sde[m] -> eng m+n  |
+ *                                       \             +--------------------+
+ *                                        \-
+ *                                          \
+ *                                           \-        +--------------------+
+ *                                             \-      |       mask         |
+ *                                               \     |--------------------|
+ *                                                \-   | sde[0] -> eng 1+m+n|
+ *                                                  \- |--------------------|
+ *                                                    >| sde[1] -> eng 2+m+n|
+ *                                                     |--------------------|
+ *                                                     |         *          |
+ *                                                     |--------------------|
+ *                                                     | sde[o] -> eng o+m+n|
+ *                                                     +--------------------+
+ *
+ */
+
+/**
+ * struct sdma_map_elem - mapping for a vl
+ * @mask - selector mask
+ * @sde - array of engines for this vl
+ *
+ * The mask is used to "mod" the selector
+ * to produce index into the trailing
+ * array of sdes.
+ */
+struct sdma_map_elem {
+	u32 mask;
+	struct sdma_engine *sde[0];
+};
+
+/**
+ * struct sdma_map_el - mapping for a vl
+ * @list - rcu head for free callback
+ * @mask - vl mask to "mod" the vl to produce an index to map array
+ * @actual_vls - number of vls
+ * @vls - number of vls rounded to next power of 2
+ * @map - array of sdma_map_elem entries
+ *
+ * This is the parent mapping structure.  The trailing
+ * members of the struct point to sdma_map_elem entries, which
+ * in turn point to an array of sde's for that vl.
+ */
+struct sdma_vl_map {
+	struct rcu_head list;
+	u32 mask;
+	u8 actual_vls;
+	u8 vls;
+	struct sdma_map_elem *map[0];
+};
+
+int sdma_map_init(
+	struct hfi1_devdata *dd,
+	u8 port,
+	u8 num_vls,
+	u8 *vl_engines);
+
+/* slow path */
+void _sdma_engine_progress_schedule(struct sdma_engine *sde);
+
+/**
+ * sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ * This is the fast path.
+ *
+ */
+static inline void sdma_engine_progress_schedule(
+	struct sdma_engine *sde)
+{
+	if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
+		return;
+	_sdma_engine_progress_schedule(sde);
+}
+
+struct sdma_engine *sdma_select_engine_sc(
+	struct hfi1_devdata *dd,
+	u32 selector,
+	u8 sc5);
+
+struct sdma_engine *sdma_select_engine_vl(
+	struct hfi1_devdata *dd,
+	u32 selector,
+	u8 vl);
+
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+void sdma_dumpstate(struct sdma_engine *);
+#endif
+static inline char *slashstrip(char *s)
+{
+	char *r = s;
+
+	while (*s)
+		if (*s++ == '/')
+			r = s;
+	return r;
+}
+
+u16 sdma_get_descq_cnt(void);
+
+extern uint mod_num_sdma;
+
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/srq.c b/drivers/staging/rdma/hfi1/srq.c
new file mode 100644
index 0000000..67786d4
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/srq.c
@@ -0,0 +1,397 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "verbs.h"
+
+/**
+ * hfi1_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: A pointer to the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr)
+{
+	struct hfi1_srq *srq = to_isrq(ibsrq);
+	struct hfi1_rwq *wq;
+	unsigned long flags;
+	int ret;
+
+	for (; wr; wr = wr->next) {
+		struct hfi1_rwqe *wqe;
+		u32 next;
+		int i;
+
+		if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		spin_lock_irqsave(&srq->rq.lock, flags);
+		wq = srq->rq.wq;
+		next = wq->head + 1;
+		if (next >= srq->rq.size)
+			next = 0;
+		if (next == wq->tail) {
+			spin_unlock_irqrestore(&srq->rq.lock, flags);
+			*bad_wr = wr;
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		wqe = get_rwqe_ptr(&srq->rq, wq->head);
+		wqe->wr_id = wr->wr_id;
+		wqe->num_sge = wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++)
+			wqe->sg_list[i] = wr->sg_list[i];
+		/* Make sure queue entry is written before the head index. */
+		smp_wmb();
+		wq->head = next;
+		spin_unlock_irqrestore(&srq->rq.lock, flags);
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * hfi1_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libibverbs when creating a user SRQ
+ */
+struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd,
+			       struct ib_srq_init_attr *srq_init_attr,
+			       struct ib_udata *udata)
+{
+	struct hfi1_ibdev *dev = to_idev(ibpd->device);
+	struct hfi1_srq *srq;
+	u32 sz;
+	struct ib_srq *ret;
+
+	if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+		ret = ERR_PTR(-ENOSYS);
+		goto done;
+	}
+
+	if (srq_init_attr->attr.max_sge == 0 ||
+	    srq_init_attr->attr.max_sge > hfi1_max_srq_sges ||
+	    srq_init_attr->attr.max_wr == 0 ||
+	    srq_init_attr->attr.max_wr > hfi1_max_srq_wrs) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto done;
+	}
+
+	/*
+	 * Need to use vmalloc() if we want to support large #s of entries.
+	 */
+	srq->rq.size = srq_init_attr->attr.max_wr + 1;
+	srq->rq.max_sge = srq_init_attr->attr.max_sge;
+	sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+		sizeof(struct hfi1_rwqe);
+	srq->rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) + srq->rq.size * sz);
+	if (!srq->rq.wq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_srq;
+	}
+
+	/*
+	 * Return the address of the RWQ as the offset to mmap.
+	 * See hfi1_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+		u32 s = sizeof(struct hfi1_rwq) + srq->rq.size * sz;
+
+		srq->ip =
+		    hfi1_create_mmap_info(dev, s, ibpd->uobject->context,
+					  srq->rq.wq);
+		if (!srq->ip) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_wq;
+		}
+
+		err = ib_copy_to_udata(udata, &srq->ip->offset,
+				       sizeof(srq->ip->offset));
+		if (err) {
+			ret = ERR_PTR(err);
+			goto bail_ip;
+		}
+	} else
+		srq->ip = NULL;
+
+	/*
+	 * ib_create_srq() will initialize srq->ibsrq.
+	 */
+	spin_lock_init(&srq->rq.lock);
+	srq->rq.wq->head = 0;
+	srq->rq.wq->tail = 0;
+	srq->limit = srq_init_attr->attr.srq_limit;
+
+	spin_lock(&dev->n_srqs_lock);
+	if (dev->n_srqs_allocated == hfi1_max_srqs) {
+		spin_unlock(&dev->n_srqs_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_srqs_allocated++;
+	spin_unlock(&dev->n_srqs_lock);
+
+	if (srq->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = &srq->ibsrq;
+	goto done;
+
+bail_ip:
+	kfree(srq->ip);
+bail_wq:
+	vfree(srq->rq.wq);
+bail_srq:
+	kfree(srq);
+done:
+	return ret;
+}
+
+/**
+ * hfi1_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for libibverbs.so
+ */
+int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		    enum ib_srq_attr_mask attr_mask,
+		    struct ib_udata *udata)
+{
+	struct hfi1_srq *srq = to_isrq(ibsrq);
+	struct hfi1_rwq *wq;
+	int ret = 0;
+
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		struct hfi1_rwq *owq;
+		struct hfi1_rwqe *p;
+		u32 sz, size, n, head, tail;
+
+		/* Check that the requested sizes are below the limits. */
+		if ((attr->max_wr > hfi1_max_srq_wrs) ||
+		    ((attr_mask & IB_SRQ_LIMIT) ?
+		     attr->srq_limit : srq->limit) > attr->max_wr) {
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		sz = sizeof(struct hfi1_rwqe) +
+			srq->rq.max_sge * sizeof(struct ib_sge);
+		size = attr->max_wr + 1;
+		wq = vmalloc_user(sizeof(struct hfi1_rwq) + size * sz);
+		if (!wq) {
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		/* Check that we can write the offset to mmap. */
+		if (udata && udata->inlen >= sizeof(__u64)) {
+			__u64 offset_addr;
+			__u64 offset = 0;
+
+			ret = ib_copy_from_udata(&offset_addr, udata,
+						 sizeof(offset_addr));
+			if (ret)
+				goto bail_free;
+			udata->outbuf =
+				(void __user *) (unsigned long) offset_addr;
+			ret = ib_copy_to_udata(udata, &offset,
+					       sizeof(offset));
+			if (ret)
+				goto bail_free;
+		}
+
+		spin_lock_irq(&srq->rq.lock);
+		/*
+		 * validate head and tail pointer values and compute
+		 * the number of remaining WQEs.
+		 */
+		owq = srq->rq.wq;
+		head = owq->head;
+		tail = owq->tail;
+		if (head >= srq->rq.size || tail >= srq->rq.size) {
+			ret = -EINVAL;
+			goto bail_unlock;
+		}
+		n = head;
+		if (n < tail)
+			n += srq->rq.size - tail;
+		else
+			n -= tail;
+		if (size <= n) {
+			ret = -EINVAL;
+			goto bail_unlock;
+		}
+		n = 0;
+		p = wq->wq;
+		while (tail != head) {
+			struct hfi1_rwqe *wqe;
+			int i;
+
+			wqe = get_rwqe_ptr(&srq->rq, tail);
+			p->wr_id = wqe->wr_id;
+			p->num_sge = wqe->num_sge;
+			for (i = 0; i < wqe->num_sge; i++)
+				p->sg_list[i] = wqe->sg_list[i];
+			n++;
+			p = (struct hfi1_rwqe *)((char *)p + sz);
+			if (++tail >= srq->rq.size)
+				tail = 0;
+		}
+		srq->rq.wq = wq;
+		srq->rq.size = size;
+		wq->head = n;
+		wq->tail = 0;
+		if (attr_mask & IB_SRQ_LIMIT)
+			srq->limit = attr->srq_limit;
+		spin_unlock_irq(&srq->rq.lock);
+
+		vfree(owq);
+
+		if (srq->ip) {
+			struct hfi1_mmap_info *ip = srq->ip;
+			struct hfi1_ibdev *dev = to_idev(srq->ibsrq.device);
+			u32 s = sizeof(struct hfi1_rwq) + size * sz;
+
+			hfi1_update_mmap_info(dev, ip, s, wq);
+
+			/*
+			 * Return the offset to mmap.
+			 * See hfi1_mmap() for details.
+			 */
+			if (udata && udata->inlen >= sizeof(__u64)) {
+				ret = ib_copy_to_udata(udata, &ip->offset,
+						       sizeof(ip->offset));
+				if (ret)
+					goto bail;
+			}
+
+			/*
+			 * Put user mapping info onto the pending list
+			 * unless it already is on the list.
+			 */
+			spin_lock_irq(&dev->pending_lock);
+			if (list_empty(&ip->pending_mmaps))
+				list_add(&ip->pending_mmaps,
+					 &dev->pending_mmaps);
+			spin_unlock_irq(&dev->pending_lock);
+		}
+	} else if (attr_mask & IB_SRQ_LIMIT) {
+		spin_lock_irq(&srq->rq.lock);
+		if (attr->srq_limit >= srq->rq.size)
+			ret = -EINVAL;
+		else
+			srq->limit = attr->srq_limit;
+		spin_unlock_irq(&srq->rq.lock);
+	}
+	goto bail;
+
+bail_unlock:
+	spin_unlock_irq(&srq->rq.lock);
+bail_free:
+	vfree(wq);
+bail:
+	return ret;
+}
+
+int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+	struct hfi1_srq *srq = to_isrq(ibsrq);
+
+	attr->max_wr = srq->rq.size - 1;
+	attr->max_sge = srq->rq.max_sge;
+	attr->srq_limit = srq->limit;
+	return 0;
+}
+
+/**
+ * hfi1_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int hfi1_destroy_srq(struct ib_srq *ibsrq)
+{
+	struct hfi1_srq *srq = to_isrq(ibsrq);
+	struct hfi1_ibdev *dev = to_idev(ibsrq->device);
+
+	spin_lock(&dev->n_srqs_lock);
+	dev->n_srqs_allocated--;
+	spin_unlock(&dev->n_srqs_lock);
+	if (srq->ip)
+		kref_put(&srq->ip->ref, hfi1_release_mmap_info);
+	else
+		vfree(srq->rq.wq);
+	kfree(srq);
+
+	return 0;
+}
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
new file mode 100644
index 0000000..b78c728
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sysfs.c
@@ -0,0 +1,739 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/ctype.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+
+
+/*
+ * Start of per-port congestion control structures and support code
+ */
+
+/*
+ * Congestion control table size followed by table entries
+ */
+static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *bin_attr,
+		char *buf, loff_t pos, size_t count)
+{
+	int ret;
+	struct hfi1_pportdata *ppd =
+		container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+	struct cc_state *cc_state;
+
+	ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
+		 + sizeof(__be16);
+
+	if (pos > ret)
+		return -EINVAL;
+
+	if (count > ret - pos)
+		count = ret - pos;
+
+	if (!count)
+		return count;
+
+	rcu_read_lock();
+	cc_state = get_cc_state(ppd);
+	if (cc_state == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	memcpy(buf, &cc_state->cct, count);
+	rcu_read_unlock();
+
+	return count;
+}
+
+static void port_release(struct kobject *kobj)
+{
+	/* nothing to do since memory is freed by hfi1_free_devdata() */
+}
+
+static struct kobj_type port_cc_ktype = {
+	.release = port_release,
+};
+
+static struct bin_attribute cc_table_bin_attr = {
+	.attr = {.name = "cc_table_bin", .mode = 0444},
+	.read = read_cc_table_bin,
+	.size = PAGE_SIZE,
+};
+
+/*
+ * Congestion settings: port control, control map and an array of 16
+ * entries for the congestion entries - increase, timer, event log
+ * trigger threshold and the minimum injection rate delay.
+ */
+static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *bin_attr,
+		char *buf, loff_t pos, size_t count)
+{
+	int ret;
+	struct hfi1_pportdata *ppd =
+		container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+	struct cc_state *cc_state;
+
+	ret = sizeof(struct opa_congestion_setting_attr_shadow);
+
+	if (pos > ret)
+		return -EINVAL;
+	if (count > ret - pos)
+		count = ret - pos;
+
+	if (!count)
+		return count;
+
+	rcu_read_lock();
+	cc_state = get_cc_state(ppd);
+	if (cc_state == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	memcpy(buf, &cc_state->cong_setting, count);
+	rcu_read_unlock();
+
+	return count;
+}
+
+static struct bin_attribute cc_setting_bin_attr = {
+	.attr = {.name = "cc_settings_bin", .mode = 0444},
+	.read = read_cc_setting_bin,
+	.size = PAGE_SIZE,
+};
+
+/* Start sc2vl */
+#define HFI1_SC2VL_ATTR(N)				    \
+	static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
+		.attr = { .name = __stringify(N), .mode = 0444 }, \
+		.sc = N \
+	}
+
+struct hfi1_sc2vl_attr {
+	struct attribute attr;
+	int sc;
+};
+
+HFI1_SC2VL_ATTR(0);
+HFI1_SC2VL_ATTR(1);
+HFI1_SC2VL_ATTR(2);
+HFI1_SC2VL_ATTR(3);
+HFI1_SC2VL_ATTR(4);
+HFI1_SC2VL_ATTR(5);
+HFI1_SC2VL_ATTR(6);
+HFI1_SC2VL_ATTR(7);
+HFI1_SC2VL_ATTR(8);
+HFI1_SC2VL_ATTR(9);
+HFI1_SC2VL_ATTR(10);
+HFI1_SC2VL_ATTR(11);
+HFI1_SC2VL_ATTR(12);
+HFI1_SC2VL_ATTR(13);
+HFI1_SC2VL_ATTR(14);
+HFI1_SC2VL_ATTR(15);
+HFI1_SC2VL_ATTR(16);
+HFI1_SC2VL_ATTR(17);
+HFI1_SC2VL_ATTR(18);
+HFI1_SC2VL_ATTR(19);
+HFI1_SC2VL_ATTR(20);
+HFI1_SC2VL_ATTR(21);
+HFI1_SC2VL_ATTR(22);
+HFI1_SC2VL_ATTR(23);
+HFI1_SC2VL_ATTR(24);
+HFI1_SC2VL_ATTR(25);
+HFI1_SC2VL_ATTR(26);
+HFI1_SC2VL_ATTR(27);
+HFI1_SC2VL_ATTR(28);
+HFI1_SC2VL_ATTR(29);
+HFI1_SC2VL_ATTR(30);
+HFI1_SC2VL_ATTR(31);
+
+
+static struct attribute *sc2vl_default_attributes[] = {
+	&hfi1_sc2vl_attr_0.attr,
+	&hfi1_sc2vl_attr_1.attr,
+	&hfi1_sc2vl_attr_2.attr,
+	&hfi1_sc2vl_attr_3.attr,
+	&hfi1_sc2vl_attr_4.attr,
+	&hfi1_sc2vl_attr_5.attr,
+	&hfi1_sc2vl_attr_6.attr,
+	&hfi1_sc2vl_attr_7.attr,
+	&hfi1_sc2vl_attr_8.attr,
+	&hfi1_sc2vl_attr_9.attr,
+	&hfi1_sc2vl_attr_10.attr,
+	&hfi1_sc2vl_attr_11.attr,
+	&hfi1_sc2vl_attr_12.attr,
+	&hfi1_sc2vl_attr_13.attr,
+	&hfi1_sc2vl_attr_14.attr,
+	&hfi1_sc2vl_attr_15.attr,
+	&hfi1_sc2vl_attr_16.attr,
+	&hfi1_sc2vl_attr_17.attr,
+	&hfi1_sc2vl_attr_18.attr,
+	&hfi1_sc2vl_attr_19.attr,
+	&hfi1_sc2vl_attr_20.attr,
+	&hfi1_sc2vl_attr_21.attr,
+	&hfi1_sc2vl_attr_22.attr,
+	&hfi1_sc2vl_attr_23.attr,
+	&hfi1_sc2vl_attr_24.attr,
+	&hfi1_sc2vl_attr_25.attr,
+	&hfi1_sc2vl_attr_26.attr,
+	&hfi1_sc2vl_attr_27.attr,
+	&hfi1_sc2vl_attr_28.attr,
+	&hfi1_sc2vl_attr_29.attr,
+	&hfi1_sc2vl_attr_30.attr,
+	&hfi1_sc2vl_attr_31.attr,
+	NULL
+};
+
+static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct hfi1_sc2vl_attr *sattr =
+		container_of(attr, struct hfi1_sc2vl_attr, attr);
+	struct hfi1_pportdata *ppd =
+		container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
+	struct hfi1_devdata *dd = ppd->dd;
+
+	return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc));
+}
+
+static const struct sysfs_ops hfi1_sc2vl_ops = {
+	.show = sc2vl_attr_show,
+};
+
+static struct kobj_type hfi1_sc2vl_ktype = {
+	.release = port_release,
+	.sysfs_ops = &hfi1_sc2vl_ops,
+	.default_attrs = sc2vl_default_attributes
+};
+
+/* End sc2vl */
+
+/* Start sl2sc */
+#define HFI1_SL2SC_ATTR(N)				    \
+	static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = {	  \
+		.attr = { .name = __stringify(N), .mode = 0444 }, \
+		.sl = N						  \
+	}
+
+struct hfi1_sl2sc_attr {
+	struct attribute attr;
+	int sl;
+};
+
+HFI1_SL2SC_ATTR(0);
+HFI1_SL2SC_ATTR(1);
+HFI1_SL2SC_ATTR(2);
+HFI1_SL2SC_ATTR(3);
+HFI1_SL2SC_ATTR(4);
+HFI1_SL2SC_ATTR(5);
+HFI1_SL2SC_ATTR(6);
+HFI1_SL2SC_ATTR(7);
+HFI1_SL2SC_ATTR(8);
+HFI1_SL2SC_ATTR(9);
+HFI1_SL2SC_ATTR(10);
+HFI1_SL2SC_ATTR(11);
+HFI1_SL2SC_ATTR(12);
+HFI1_SL2SC_ATTR(13);
+HFI1_SL2SC_ATTR(14);
+HFI1_SL2SC_ATTR(15);
+HFI1_SL2SC_ATTR(16);
+HFI1_SL2SC_ATTR(17);
+HFI1_SL2SC_ATTR(18);
+HFI1_SL2SC_ATTR(19);
+HFI1_SL2SC_ATTR(20);
+HFI1_SL2SC_ATTR(21);
+HFI1_SL2SC_ATTR(22);
+HFI1_SL2SC_ATTR(23);
+HFI1_SL2SC_ATTR(24);
+HFI1_SL2SC_ATTR(25);
+HFI1_SL2SC_ATTR(26);
+HFI1_SL2SC_ATTR(27);
+HFI1_SL2SC_ATTR(28);
+HFI1_SL2SC_ATTR(29);
+HFI1_SL2SC_ATTR(30);
+HFI1_SL2SC_ATTR(31);
+
+
+static struct attribute *sl2sc_default_attributes[] = {
+	&hfi1_sl2sc_attr_0.attr,
+	&hfi1_sl2sc_attr_1.attr,
+	&hfi1_sl2sc_attr_2.attr,
+	&hfi1_sl2sc_attr_3.attr,
+	&hfi1_sl2sc_attr_4.attr,
+	&hfi1_sl2sc_attr_5.attr,
+	&hfi1_sl2sc_attr_6.attr,
+	&hfi1_sl2sc_attr_7.attr,
+	&hfi1_sl2sc_attr_8.attr,
+	&hfi1_sl2sc_attr_9.attr,
+	&hfi1_sl2sc_attr_10.attr,
+	&hfi1_sl2sc_attr_11.attr,
+	&hfi1_sl2sc_attr_12.attr,
+	&hfi1_sl2sc_attr_13.attr,
+	&hfi1_sl2sc_attr_14.attr,
+	&hfi1_sl2sc_attr_15.attr,
+	&hfi1_sl2sc_attr_16.attr,
+	&hfi1_sl2sc_attr_17.attr,
+	&hfi1_sl2sc_attr_18.attr,
+	&hfi1_sl2sc_attr_19.attr,
+	&hfi1_sl2sc_attr_20.attr,
+	&hfi1_sl2sc_attr_21.attr,
+	&hfi1_sl2sc_attr_22.attr,
+	&hfi1_sl2sc_attr_23.attr,
+	&hfi1_sl2sc_attr_24.attr,
+	&hfi1_sl2sc_attr_25.attr,
+	&hfi1_sl2sc_attr_26.attr,
+	&hfi1_sl2sc_attr_27.attr,
+	&hfi1_sl2sc_attr_28.attr,
+	&hfi1_sl2sc_attr_29.attr,
+	&hfi1_sl2sc_attr_30.attr,
+	&hfi1_sl2sc_attr_31.attr,
+	NULL
+};
+
+static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct hfi1_sl2sc_attr *sattr =
+		container_of(attr, struct hfi1_sl2sc_attr, attr);
+	struct hfi1_pportdata *ppd =
+		container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+
+	return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]);
+}
+
+static const struct sysfs_ops hfi1_sl2sc_ops = {
+	.show = sl2sc_attr_show,
+};
+
+static struct kobj_type hfi1_sl2sc_ktype = {
+	.release = port_release,
+	.sysfs_ops = &hfi1_sl2sc_ops,
+	.default_attrs = sl2sc_default_attributes
+};
+
+/* End sl2sc */
+
+/* Start vl2mtu */
+
+#define HFI1_VL2MTU_ATTR(N) \
+	static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
+		.attr = { .name = __stringify(N), .mode = 0444 }, \
+		.vl = N						  \
+	}
+
+struct hfi1_vl2mtu_attr {
+	struct attribute attr;
+	int vl;
+};
+
+HFI1_VL2MTU_ATTR(0);
+HFI1_VL2MTU_ATTR(1);
+HFI1_VL2MTU_ATTR(2);
+HFI1_VL2MTU_ATTR(3);
+HFI1_VL2MTU_ATTR(4);
+HFI1_VL2MTU_ATTR(5);
+HFI1_VL2MTU_ATTR(6);
+HFI1_VL2MTU_ATTR(7);
+HFI1_VL2MTU_ATTR(8);
+HFI1_VL2MTU_ATTR(9);
+HFI1_VL2MTU_ATTR(10);
+HFI1_VL2MTU_ATTR(11);
+HFI1_VL2MTU_ATTR(12);
+HFI1_VL2MTU_ATTR(13);
+HFI1_VL2MTU_ATTR(14);
+HFI1_VL2MTU_ATTR(15);
+
+static struct attribute *vl2mtu_default_attributes[] = {
+	&hfi1_vl2mtu_attr_0.attr,
+	&hfi1_vl2mtu_attr_1.attr,
+	&hfi1_vl2mtu_attr_2.attr,
+	&hfi1_vl2mtu_attr_3.attr,
+	&hfi1_vl2mtu_attr_4.attr,
+	&hfi1_vl2mtu_attr_5.attr,
+	&hfi1_vl2mtu_attr_6.attr,
+	&hfi1_vl2mtu_attr_7.attr,
+	&hfi1_vl2mtu_attr_8.attr,
+	&hfi1_vl2mtu_attr_9.attr,
+	&hfi1_vl2mtu_attr_10.attr,
+	&hfi1_vl2mtu_attr_11.attr,
+	&hfi1_vl2mtu_attr_12.attr,
+	&hfi1_vl2mtu_attr_13.attr,
+	&hfi1_vl2mtu_attr_14.attr,
+	&hfi1_vl2mtu_attr_15.attr,
+	NULL
+};
+
+static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
+				char *buf)
+{
+	struct hfi1_vl2mtu_attr *vlattr =
+		container_of(attr, struct hfi1_vl2mtu_attr, attr);
+	struct hfi1_pportdata *ppd =
+		container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
+	struct hfi1_devdata *dd = ppd->dd;
+
+	return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu);
+}
+
+static const struct sysfs_ops hfi1_vl2mtu_ops = {
+	.show = vl2mtu_attr_show,
+};
+
+static struct kobj_type hfi1_vl2mtu_ktype = {
+	.release = port_release,
+	.sysfs_ops = &hfi1_vl2mtu_ops,
+	.default_attrs = vl2mtu_default_attributes
+};
+
+
+/* end of per-port file structures and support code */
+
+/*
+ * Start of per-unit (or driver, in some cases, but replicated
+ * per unit) functions (these get a device *)
+ */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct hfi1_ibdev *dev =
+		container_of(device, struct hfi1_ibdev, ibdev.dev);
+
+	return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
+}
+
+static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct hfi1_ibdev *dev =
+		container_of(device, struct hfi1_ibdev, ibdev.dev);
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+	int ret;
+
+	if (!dd->boardname)
+		ret = -EINVAL;
+	else
+		ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
+	return ret;
+}
+
+static ssize_t show_boardversion(struct device *device,
+				 struct device_attribute *attr, char *buf)
+{
+	struct hfi1_ibdev *dev =
+		container_of(device, struct hfi1_ibdev, ibdev.dev);
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
+}
+
+
+static ssize_t show_nctxts(struct device *device,
+			   struct device_attribute *attr, char *buf)
+{
+	struct hfi1_ibdev *dev =
+		container_of(device, struct hfi1_ibdev, ibdev.dev);
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+
+	/*
+	 * Return the smaller of send and receive contexts.
+	 * Normally, user level applications would require both a send
+	 * and a receive context, so returning the smaller of the two counts
+	 * give a more accurate picture of total contexts available.
+	 */
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			 min(dd->num_rcv_contexts - dd->first_user_ctxt,
+			     (u32)dd->sc_sizes[SC_USER].count));
+}
+
+static ssize_t show_nfreectxts(struct device *device,
+			   struct device_attribute *attr, char *buf)
+{
+	struct hfi1_ibdev *dev =
+		container_of(device, struct hfi1_ibdev, ibdev.dev);
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+
+	/* Return the number of free user ports (contexts) available. */
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
+}
+
+static ssize_t show_serial(struct device *device,
+			   struct device_attribute *attr, char *buf)
+{
+	struct hfi1_ibdev *dev =
+		container_of(device, struct hfi1_ibdev, ibdev.dev);
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
+
+}
+
+static ssize_t store_chip_reset(struct device *device,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct hfi1_ibdev *dev =
+		container_of(device, struct hfi1_ibdev, ibdev.dev);
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+	int ret;
+
+	if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ret = hfi1_reset_device(dd->unit);
+bail:
+	return ret < 0 ? ret : count;
+}
+
+/*
+ * Convert the reported temperature from an integer (reported in
+ * units of 0.25C) to a floating point number.
+ */
+#define temp2str(temp, buf, size, idx)					\
+	scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ",		\
+			      ((temp) >> 2), ((temp) & 0x3) * 25)
+
+/*
+ * Dump tempsense values, in decimal, to ease shell-scripts.
+ */
+static ssize_t show_tempsense(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct hfi1_ibdev *dev =
+		container_of(device, struct hfi1_ibdev, ibdev.dev);
+	struct hfi1_devdata *dd = dd_from_dev(dev);
+	struct hfi1_temp temp;
+	int ret = -ENXIO;
+
+	ret = hfi1_tempsense_rd(dd, &temp);
+	if (!ret) {
+		int idx = 0;
+
+		idx += temp2str(temp.curr, buf, PAGE_SIZE, idx);
+		idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx);
+		idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx);
+		idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx);
+		idx += scnprintf(buf + idx, PAGE_SIZE - idx,
+				"%u %u %u\n", temp.triggers & 0x1,
+				temp.triggers & 0x2, temp.triggers & 0x4);
+		ret = idx;
+	}
+	return ret;
+}
+
+/*
+ * end of per-unit (or driver, in some cases, but replicated
+ * per unit) functions
+ */
+
+/* start of per-unit file structures and support code */
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
+static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
+static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
+static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+
+static struct device_attribute *hfi1_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_board_id,
+	&dev_attr_nctxts,
+	&dev_attr_nfreectxts,
+	&dev_attr_serial,
+	&dev_attr_boardversion,
+	&dev_attr_tempsense,
+	&dev_attr_chip_reset,
+};
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+			   struct kobject *kobj)
+{
+	struct hfi1_pportdata *ppd;
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	int ret;
+
+	if (!port_num || port_num > dd->num_pports) {
+		dd_dev_err(dd,
+			"Skipping infiniband class with invalid port %u\n",
+			port_num);
+		return -ENODEV;
+	}
+	ppd = &dd->pport[port_num - 1];
+
+	ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
+				   "sc2vl");
+	if (ret) {
+		dd_dev_err(dd,
+			   "Skipping sc2vl sysfs info, (err %d) port %u\n",
+			   ret, port_num);
+		goto bail;
+	}
+	kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
+
+	ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
+				   "sl2sc");
+	if (ret) {
+		dd_dev_err(dd,
+			   "Skipping sl2sc sysfs info, (err %d) port %u\n",
+			   ret, port_num);
+		goto bail_sc2vl;
+	}
+	kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
+
+	ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
+				   "vl2mtu");
+	if (ret) {
+		dd_dev_err(dd,
+			   "Skipping vl2mtu sysfs info, (err %d) port %u\n",
+			   ret, port_num);
+		goto bail_sl2sc;
+	}
+	kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
+
+
+	ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
+				   kobj, "CCMgtA");
+	if (ret) {
+		dd_dev_err(dd,
+		 "Skipping Congestion Control sysfs info, (err %d) port %u\n",
+		 ret, port_num);
+		goto bail_vl2mtu;
+	}
+
+	kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
+
+	ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+				&cc_setting_bin_attr);
+	if (ret) {
+		dd_dev_err(dd,
+		 "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
+		 ret, port_num);
+		goto bail_cc;
+	}
+
+	ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+				&cc_table_bin_attr);
+	if (ret) {
+		dd_dev_err(dd,
+		 "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
+		 ret, port_num);
+		goto bail_cc_entry_bin;
+	}
+
+	dd_dev_info(dd,
+		"IB%u: Congestion Control Agent enabled for port %d\n",
+		dd->unit, port_num);
+
+	return 0;
+
+bail_cc_entry_bin:
+	sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+			      &cc_setting_bin_attr);
+bail_cc:
+	kobject_put(&ppd->pport_cc_kobj);
+bail_vl2mtu:
+	kobject_put(&ppd->vl2mtu_kobj);
+bail_sl2sc:
+	kobject_put(&ppd->sl2sc_kobj);
+bail_sc2vl:
+	kobject_put(&ppd->sc2vl_kobj);
+bail:
+	return ret;
+}
+
+/*
+ * Register and create our files in /sys/class/infiniband.
+ */
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
+{
+	struct ib_device *dev = &dd->verbs_dev.ibdev;
+	int i, ret;
+
+	for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
+		ret = device_create_file(&dev->dev, hfi1_attributes[i]);
+		if (ret)
+			goto bail;
+	}
+
+	return 0;
+bail:
+	for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
+		device_remove_file(&dev->dev, hfi1_attributes[i]);
+	return ret;
+}
+
+/*
+ * Unregister and remove our files in /sys/class/infiniband.
+ */
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd;
+	int i;
+
+	for (i = 0; i < dd->num_pports; i++) {
+		ppd = &dd->pport[i];
+
+		sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+				      &cc_setting_bin_attr);
+		sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+				      &cc_table_bin_attr);
+		kobject_put(&ppd->pport_cc_kobj);
+		kobject_put(&ppd->vl2mtu_kobj);
+		kobject_put(&ppd->sl2sc_kobj);
+		kobject_put(&ppd->sc2vl_kobj);
+	}
+}
diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/staging/rdma/hfi1/trace.c
new file mode 100644
index 0000000..70ad7b9
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/trace.c
@@ -0,0 +1,221 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
+{
+	struct hfi1_other_headers *ohdr;
+	u8 opcode;
+	u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+
+	if (lnh == HFI1_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else
+		ohdr = &hdr->u.l.oth;
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	return hdr_len_by_opcode[opcode] == 0 ?
+	       0 : hdr_len_by_opcode[opcode] - (12 + 8);
+}
+
+#define IMM_PRN  "imm %d"
+#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
+#define AETH_PRN "aeth syn 0x%.2x msn 0x%.8x"
+#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
+#define ATOMICACKETH_PRN "origdata %lld"
+#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
+
+#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
+
+static u64 ib_u64_get(__be32 *p)
+{
+	return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]);
+}
+
+const char *parse_everbs_hdrs(
+	struct trace_seq *p,
+	u8 opcode,
+	void *ehdrs)
+{
+	union ib_ehdrs *eh = ehdrs;
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	switch (opcode) {
+	/* imm */
+	case OP(RC, SEND_LAST_WITH_IMMEDIATE):
+	case OP(UC, SEND_LAST_WITH_IMMEDIATE):
+	case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
+	case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
+	case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+	case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+		trace_seq_printf(p, IMM_PRN,
+			be32_to_cpu(eh->imm_data));
+		break;
+	/* reth + imm */
+	case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+	case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+		trace_seq_printf(p, RETH_PRN " " IMM_PRN,
+			(unsigned long long)ib_u64_get(
+				(__be32 *)&eh->rc.reth.vaddr),
+			be32_to_cpu(eh->rc.reth.rkey),
+			be32_to_cpu(eh->rc.reth.length),
+			be32_to_cpu(eh->rc.imm_data));
+		break;
+	/* reth */
+	case OP(RC, RDMA_READ_REQUEST):
+	case OP(RC, RDMA_WRITE_FIRST):
+	case OP(UC, RDMA_WRITE_FIRST):
+	case OP(RC, RDMA_WRITE_ONLY):
+	case OP(UC, RDMA_WRITE_ONLY):
+		trace_seq_printf(p, RETH_PRN,
+			(unsigned long long)ib_u64_get(
+				(__be32 *)&eh->rc.reth.vaddr),
+			be32_to_cpu(eh->rc.reth.rkey),
+			be32_to_cpu(eh->rc.reth.length));
+		break;
+	case OP(RC, RDMA_READ_RESPONSE_FIRST):
+	case OP(RC, RDMA_READ_RESPONSE_LAST):
+	case OP(RC, RDMA_READ_RESPONSE_ONLY):
+	case OP(RC, ACKNOWLEDGE):
+		trace_seq_printf(p, AETH_PRN,
+			be32_to_cpu(eh->aeth) >> 24,
+			be32_to_cpu(eh->aeth) & HFI1_QPN_MASK);
+		break;
+	/* aeth + atomicacketh */
+	case OP(RC, ATOMIC_ACKNOWLEDGE):
+		trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
+			(be32_to_cpu(eh->at.aeth) >> 24) & 0xff,
+			be32_to_cpu(eh->at.aeth) & HFI1_QPN_MASK,
+			(unsigned long long)ib_u64_get(eh->at.atomic_ack_eth));
+		break;
+	/* atomiceth */
+	case OP(RC, COMPARE_SWAP):
+	case OP(RC, FETCH_ADD):
+		trace_seq_printf(p, ATOMICETH_PRN,
+			(unsigned long long)ib_u64_get(eh->atomic_eth.vaddr),
+			eh->atomic_eth.rkey,
+			(unsigned long long)ib_u64_get(
+				(__be32 *)&eh->atomic_eth.swap_data),
+			(unsigned long long) ib_u64_get(
+				 (__be32 *)&eh->atomic_eth.compare_data));
+		break;
+	/* deth */
+	case OP(UD, SEND_ONLY):
+	case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
+		trace_seq_printf(p, DETH_PRN,
+			be32_to_cpu(eh->ud.deth[0]),
+			be32_to_cpu(eh->ud.deth[1]) & HFI1_QPN_MASK);
+		break;
+	}
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+const char *parse_sdma_flags(
+	struct trace_seq *p,
+	u64 desc0, u64 desc1)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	char flags[5] = { 'x', 'x', 'x', 'x', 0 };
+
+	flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+	flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?  'H' : '-';
+	flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+	flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+	trace_seq_printf(p, "%s", flags);
+	if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
+		trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
+			(u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT)
+				& SDMA_DESC1_HEADER_MODE_MASK),
+			(u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT)
+				& SDMA_DESC1_HEADER_INDEX_MASK),
+			(u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT)
+				& SDMA_DESC1_HEADER_DWS_MASK));
+	return ret;
+}
+
+const char *print_u32_array(
+	struct trace_seq *p,
+	u32 *arr, int len)
+{
+	int i;
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	for (i = 0; i < len ; i++)
+		trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+const char *print_u64_array(
+	struct trace_seq *p,
+	u64 *arr, int len)
+{
+	int i;
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	for (i = 0; i < len; i++)
+		trace_seq_printf(p, "%s0x%016llx", i == 0 ? "" : " ", arr[i]);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+__hfi1_trace_fn(PKT);
+__hfi1_trace_fn(PROC);
+__hfi1_trace_fn(SDMA);
+__hfi1_trace_fn(LINKVERB);
+__hfi1_trace_fn(DEBUG);
+__hfi1_trace_fn(SNOOP);
+__hfi1_trace_fn(CNTR);
+__hfi1_trace_fn(PIO);
+__hfi1_trace_fn(DC8051);
+__hfi1_trace_fn(FIRMWARE);
+__hfi1_trace_fn(RCVCTRL);
+__hfi1_trace_fn(TID);
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h
new file mode 100644
index 0000000..d7851c0
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/trace.h
@@ -0,0 +1,1409 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR hfi1
+
+#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "sdma.h"
+
+#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype)                  \
+__print_symbolic(etype,                         \
+	packettype_name(EXPECTED),              \
+	packettype_name(EAGER),                 \
+	packettype_name(IB),                    \
+	packettype_name(ERROR),                 \
+	packettype_name(BYPASS))
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+
+TRACE_EVENT(hfi1_rcvhdr,
+	TP_PROTO(struct hfi1_devdata *dd,
+		 u64 eflags,
+		 u32 ctxt,
+		 u32 etype,
+		 u32 hlen,
+		 u32 tlen,
+		 u32 updegr,
+		 u32 etail),
+	TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		__field(u64, eflags)
+		__field(u32, ctxt)
+		__field(u32, etype)
+		__field(u32, hlen)
+		__field(u32, tlen)
+		__field(u32, updegr)
+		__field(u32, etail)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd);
+		__entry->eflags = eflags;
+		__entry->ctxt = ctxt;
+		__entry->etype = etype;
+		__entry->hlen = hlen;
+		__entry->tlen = tlen;
+		__entry->updegr = updegr;
+		__entry->etail = etail;
+	),
+	TP_printk(
+"[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+		__get_str(dev),
+		__entry->ctxt,
+		__entry->eflags,
+		__entry->etype, show_packettype(__entry->etype),
+		__entry->hlen,
+		__entry->tlen,
+		__entry->updegr,
+		__entry->etail
+	)
+);
+
+TRACE_EVENT(hfi1_receive_interrupt,
+	TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+	TP_ARGS(dd, ctxt),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		__field(u32, ctxt)
+		__field(u8, slow_path)
+		__field(u8, dma_rtail)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd);
+		__entry->ctxt = ctxt;
+		if (dd->rcd[ctxt]->do_interrupt ==
+		    &handle_receive_interrupt) {
+			__entry->slow_path = 1;
+			__entry->dma_rtail = 0xFF;
+		} else if (dd->rcd[ctxt]->do_interrupt ==
+			&handle_receive_interrupt_dma_rtail){
+			__entry->dma_rtail = 1;
+			__entry->slow_path = 0;
+		} else if (dd->rcd[ctxt]->do_interrupt ==
+			 &handle_receive_interrupt_nodma_rtail) {
+			__entry->dma_rtail = 0;
+			__entry->slow_path = 0;
+		}
+	),
+	TP_printk(
+		"[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+		__get_str(dev),
+		__entry->ctxt,
+		__entry->slow_path,
+		__entry->dma_rtail
+	)
+);
+
+const char *print_u64_array(struct trace_seq *, u64 *, int);
+
+TRACE_EVENT(hfi1_exp_tid_map,
+	    TP_PROTO(unsigned ctxt, u16 subctxt, int dir,
+		     unsigned long *maps, u16 count),
+	    TP_ARGS(ctxt, subctxt, dir, maps, count),
+	    TP_STRUCT__entry(
+		    __field(unsigned, ctxt)
+		    __field(u16, subctxt)
+		    __field(int, dir)
+		    __field(u16, count)
+		    __dynamic_array(unsigned long, maps, sizeof(*maps) * count)
+		    ),
+	    TP_fast_assign(
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->dir = dir;
+		    __entry->count = count;
+		    memcpy(__get_dynamic_array(maps), maps,
+			   sizeof(*maps) * count);
+		    ),
+	    TP_printk("[%3u:%02u] %s tidmaps %s",
+		      __entry->ctxt,
+		      __entry->subctxt,
+		      (__entry->dir ? ">" : "<"),
+		      print_u64_array(p, __get_dynamic_array(maps),
+				      __entry->count)
+		    )
+	);
+
+TRACE_EVENT(hfi1_exp_rcv_set,
+	    TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
+		     unsigned long vaddr, u64 phys_addr, void *page),
+	    TP_ARGS(ctxt, subctxt, tid, vaddr, phys_addr, page),
+	    TP_STRUCT__entry(
+		    __field(unsigned, ctxt)
+		    __field(u16, subctxt)
+		    __field(u32, tid)
+		    __field(unsigned long, vaddr)
+		    __field(u64, phys_addr)
+		    __field(void *, page)
+		    ),
+	    TP_fast_assign(
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->tid = tid;
+		    __entry->vaddr = vaddr;
+		    __entry->phys_addr = phys_addr;
+		    __entry->page = page;
+		    ),
+	    TP_printk("[%u:%u] TID %u, vaddrs 0x%lx, physaddr 0x%llx, pgp %p",
+		      __entry->ctxt,
+		      __entry->subctxt,
+		      __entry->tid,
+		      __entry->vaddr,
+		      __entry->phys_addr,
+		      __entry->page
+		    )
+	);
+
+TRACE_EVENT(hfi1_exp_rcv_free,
+	    TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
+		     unsigned long phys, void *page),
+	    TP_ARGS(ctxt, subctxt, tid, phys, page),
+	    TP_STRUCT__entry(
+		    __field(unsigned, ctxt)
+		    __field(u16, subctxt)
+		    __field(u32, tid)
+		    __field(unsigned long, phys)
+		    __field(void *, page)
+		    ),
+	    TP_fast_assign(
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->tid = tid;
+		    __entry->phys = phys;
+		    __entry->page = page;
+		    ),
+	    TP_printk("[%u:%u] freeing TID %u, 0x%lx, pgp %p",
+		      __entry->ctxt,
+		      __entry->subctxt,
+		      __entry->tid,
+		      __entry->phys,
+		      __entry->page
+		    )
+	);
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tx
+
+TRACE_EVENT(hfi1_piofree,
+	TP_PROTO(struct send_context *sc, int extra),
+	TP_ARGS(sc, extra),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sc->dd)
+		__field(u32, sw_index)
+		__field(u32, hw_context)
+		__field(int, extra)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sc->dd);
+		__entry->sw_index = sc->sw_index;
+		__entry->hw_context = sc->hw_context;
+		__entry->extra = extra;
+	),
+	TP_printk(
+		"[%s] ctxt %u(%u) extra %d",
+		__get_str(dev),
+		__entry->sw_index,
+		__entry->hw_context,
+		__entry->extra
+	)
+);
+
+TRACE_EVENT(hfi1_wantpiointr,
+	TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
+	TP_ARGS(sc, needint, credit_ctrl),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sc->dd)
+		__field(u32, sw_index)
+		__field(u32, hw_context)
+		__field(u32, needint)
+		__field(u64, credit_ctrl)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sc->dd);
+		__entry->sw_index = sc->sw_index;
+		__entry->hw_context = sc->hw_context;
+		__entry->needint = needint;
+		__entry->credit_ctrl = credit_ctrl;
+	),
+	TP_printk(
+		"[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
+		__get_str(dev),
+		__entry->sw_index,
+		__entry->hw_context,
+		__entry->needint,
+		(unsigned long long)__entry->credit_ctrl
+	)
+);
+
+DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
+	TP_PROTO(struct hfi1_qp *qp, u32 flags),
+	TP_ARGS(qp, flags),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, flags)
+		__field(u32, s_flags)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->flags = flags;
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->s_flags = qp->s_flags;
+	),
+	TP_printk(
+		"[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->flags,
+		__entry->s_flags
+	)
+);
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
+	     TP_PROTO(struct hfi1_qp *qp, u32 flags),
+	     TP_ARGS(qp, flags));
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
+	     TP_PROTO(struct hfi1_qp *qp, u32 flags),
+	     TP_ARGS(qp, flags));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_qphash
+DECLARE_EVENT_CLASS(hfi1_qphash_template,
+	TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+	TP_ARGS(qp, bucket),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, bucket)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->bucket = bucket;
+	),
+	TP_printk(
+		"[%s] qpn 0x%x bucket %u",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->bucket
+	)
+);
+
+DEFINE_EVENT(hfi1_qphash_template, hfi1_qpinsert,
+	TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+	TP_ARGS(qp, bucket));
+
+DEFINE_EVENT(hfi1_qphash_template, hfi1_qpremove,
+	TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+	TP_ARGS(qp, bucket));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ibhdrs
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
+const char *parse_everbs_hdrs(
+	struct trace_seq *p,
+	u8 opcode,
+	void *ehdrs);
+
+#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
+
+const char *parse_sdma_flags(
+	struct trace_seq *p,
+	u64 desc0, u64 desc1);
+
+#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
+
+
+#define lrh_name(lrh) { HFI1_##lrh, #lrh }
+#define show_lnh(lrh)                    \
+__print_symbolic(lrh,                    \
+	lrh_name(LRH_BTH),               \
+	lrh_name(LRH_GRH))
+
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
+#define show_ib_opcode(opcode)                             \
+__print_symbolic(opcode,                                   \
+	ib_opcode_name(RC_SEND_FIRST),                     \
+	ib_opcode_name(RC_SEND_MIDDLE),                    \
+	ib_opcode_name(RC_SEND_LAST),                      \
+	ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
+	ib_opcode_name(RC_SEND_ONLY),                      \
+	ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
+	ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
+	ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
+	ib_opcode_name(RC_RDMA_WRITE_LAST),                \
+	ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+	ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
+	ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+	ib_opcode_name(RC_RDMA_READ_REQUEST),              \
+	ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
+	ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
+	ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
+	ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
+	ib_opcode_name(RC_ACKNOWLEDGE),                    \
+	ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
+	ib_opcode_name(RC_COMPARE_SWAP),                   \
+	ib_opcode_name(RC_FETCH_ADD),                      \
+	ib_opcode_name(UC_SEND_FIRST),                     \
+	ib_opcode_name(UC_SEND_MIDDLE),                    \
+	ib_opcode_name(UC_SEND_LAST),                      \
+	ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
+	ib_opcode_name(UC_SEND_ONLY),                      \
+	ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
+	ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
+	ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
+	ib_opcode_name(UC_RDMA_WRITE_LAST),                \
+	ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+	ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
+	ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+	ib_opcode_name(UD_SEND_ONLY),                      \
+	ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE))
+
+
+#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define BTH_PRN \
+	"op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
+	"f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
+#define EHDR_PRN "%s"
+
+DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+	TP_PROTO(struct hfi1_devdata *dd,
+		 struct hfi1_ib_header *hdr),
+	TP_ARGS(dd, hdr),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		/* LRH */
+		__field(u8, vl)
+		__field(u8, lver)
+		__field(u8, sl)
+		__field(u8, lnh)
+		__field(u16, dlid)
+		__field(u16, len)
+		__field(u16, slid)
+		/* BTH */
+		__field(u8, opcode)
+		__field(u8, se)
+		__field(u8, m)
+		__field(u8, pad)
+		__field(u8, tver)
+		__field(u16, pkey)
+		__field(u8, f)
+		__field(u8, b)
+		__field(u32, qpn)
+		__field(u8, a)
+		__field(u32, psn)
+		/* extended headers */
+		__dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+	),
+	TP_fast_assign(
+		struct hfi1_other_headers *ohdr;
+
+		DD_DEV_ASSIGN(dd);
+		/* LRH */
+		__entry->vl =
+			(u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
+		__entry->lver =
+			(u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
+		__entry->sl =
+			(u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+		__entry->lnh =
+			(u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+		__entry->dlid =
+			be16_to_cpu(hdr->lrh[1]);
+		/* allow for larger len */
+		__entry->len =
+			be16_to_cpu(hdr->lrh[2]);
+		__entry->slid =
+			be16_to_cpu(hdr->lrh[3]);
+		/* BTH */
+		if (__entry->lnh == HFI1_LRH_BTH)
+			ohdr = &hdr->u.oth;
+		else
+			ohdr = &hdr->u.l.oth;
+		__entry->opcode =
+			(be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+		__entry->se =
+			(be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
+		__entry->m =
+			 (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
+		__entry->pad =
+			(be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		__entry->tver =
+			(be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
+		__entry->pkey =
+			be32_to_cpu(ohdr->bth[0]) & 0xffff;
+		__entry->f =
+			(be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT)
+			& HFI1_FECN_MASK;
+		__entry->b =
+			(be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT)
+			& HFI1_BECN_MASK;
+		__entry->qpn =
+			be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+		__entry->a =
+			(be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
+		/* allow for larger PSN */
+		__entry->psn =
+			be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
+		/* extended headers */
+		 memcpy(
+			__get_dynamic_array(ehdrs),
+			&ohdr->u,
+			ibhdr_exhdr_len(hdr));
+	),
+	TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
+		__get_str(dev),
+		/* LRH */
+		__entry->vl,
+		__entry->lver,
+		__entry->sl,
+		__entry->lnh, show_lnh(__entry->lnh),
+		__entry->dlid,
+		__entry->len,
+		__entry->slid,
+		/* BTH */
+		__entry->opcode, show_ib_opcode(__entry->opcode),
+		__entry->se,
+		__entry->m,
+		__entry->pad,
+		__entry->tver,
+		__entry->pkey,
+		__entry->f,
+		__entry->b,
+		__entry->qpn,
+		__entry->a,
+		__entry->psn,
+		/* extended headers */
+		__parse_ib_ehdrs(
+			__entry->opcode,
+			(void *)__get_dynamic_array(ehdrs))
+	)
+);
+
+DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
+	     TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+	     TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, output_ibhdr,
+	     TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+	     TP_ARGS(dd, hdr));
+
+#define SNOOP_PRN \
+	"slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+	"svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_snoop
+
+
+TRACE_EVENT(snoop_capture,
+	TP_PROTO(struct hfi1_devdata *dd,
+		 int hdr_len,
+		 struct hfi1_ib_header *hdr,
+		 int data_len,
+		 void *data),
+	TP_ARGS(dd, hdr_len, hdr, data_len, data),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		__field(u16, slid)
+		__field(u16, dlid)
+		__field(u32, qpn)
+		__field(u8, opcode)
+		__field(u8, sl)
+		__field(u16, pkey)
+		__field(u32, hdr_len)
+		__field(u32, data_len)
+		__field(u8, lnh)
+		__dynamic_array(u8, raw_hdr, hdr_len)
+		__dynamic_array(u8, raw_pkt, data_len)
+	),
+	TP_fast_assign(
+		struct hfi1_other_headers *ohdr;
+
+		__entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+		if (__entry->lnh == HFI1_LRH_BTH)
+			ohdr = &hdr->u.oth;
+		else
+			ohdr = &hdr->u.l.oth;
+		DD_DEV_ASSIGN(dd);
+		__entry->slid = be16_to_cpu(hdr->lrh[3]);
+		__entry->dlid = be16_to_cpu(hdr->lrh[1]);
+		__entry->qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+		__entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+		__entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+		__entry->pkey =	be32_to_cpu(ohdr->bth[0]) & 0xffff;
+		__entry->hdr_len = hdr_len;
+		__entry->data_len = data_len;
+		memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+		memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+	),
+	TP_printk("[%s] " SNOOP_PRN,
+		__get_str(dev),
+		__entry->slid,
+		__entry->dlid,
+		__entry->qpn,
+		__entry->opcode,
+		show_ib_opcode(__entry->opcode),
+		__entry->sl,
+		__entry->pkey,
+		__entry->hdr_len,
+		__entry->data_len
+	)
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ctxts
+
+#define UCTXT_FMT \
+	"cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, "	\
+	"rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
+TRACE_EVENT(hfi1_uctxtdata,
+	    TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
+	    TP_ARGS(dd, uctxt),
+	    TP_STRUCT__entry(
+		    DD_DEV_ENTRY(dd)
+		    __field(unsigned, ctxt)
+		    __field(u32, credits)
+		    __field(u64, hw_free)
+		    __field(u64, piobase)
+		    __field(u16, rcvhdrq_cnt)
+		    __field(u64, rcvhdrq_phys)
+		    __field(u32, eager_cnt)
+		    __field(u64, rcvegr_phys)
+		    ),
+	    TP_fast_assign(
+		    DD_DEV_ASSIGN(dd);
+		    __entry->ctxt = uctxt->ctxt;
+		    __entry->credits = uctxt->sc->credits;
+		    __entry->hw_free = (u64)uctxt->sc->hw_free;
+		    __entry->piobase = (u64)uctxt->sc->base_addr;
+		    __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+		    __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
+		    __entry->eager_cnt = uctxt->egrbufs.alloced;
+		    __entry->rcvegr_phys = uctxt->egrbufs.rcvtids[0].phys;
+		    ),
+	    TP_printk(
+		    "[%s] ctxt %u " UCTXT_FMT,
+		    __get_str(dev),
+		    __entry->ctxt,
+		    __entry->credits,
+		    __entry->hw_free,
+		    __entry->piobase,
+		    __entry->rcvhdrq_cnt,
+		    __entry->rcvhdrq_phys,
+		    __entry->eager_cnt,
+		    __entry->rcvegr_phys
+		    )
+	);
+
+#define CINFO_FMT \
+	"egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
+TRACE_EVENT(hfi1_ctxt_info,
+	    TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
+		     struct hfi1_ctxt_info cinfo),
+	    TP_ARGS(dd, ctxt, subctxt, cinfo),
+	    TP_STRUCT__entry(
+		    DD_DEV_ENTRY(dd)
+		    __field(unsigned, ctxt)
+		    __field(unsigned, subctxt)
+		    __field(u16, egrtids)
+		    __field(u16, rcvhdrq_cnt)
+		    __field(u16, rcvhdrq_size)
+		    __field(u16, sdma_ring_size)
+		    __field(u32, rcvegr_size)
+		    ),
+	    TP_fast_assign(
+		    DD_DEV_ASSIGN(dd);
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->egrtids = cinfo.egrtids;
+		    __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
+		    __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
+		    __entry->sdma_ring_size = cinfo.sdma_ring_size;
+		    __entry->rcvegr_size = cinfo.rcvegr_size;
+		    ),
+	    TP_printk(
+		    "[%s] ctxt %u:%u " CINFO_FMT,
+		    __get_str(dev),
+		    __entry->ctxt,
+		    __entry->subctxt,
+		    __entry->egrtids,
+		    __entry->rcvegr_size,
+		    __entry->rcvhdrq_cnt,
+		    __entry->rcvhdrq_size,
+		    __entry->sdma_ring_size
+		    )
+	);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sma
+
+#define BCT_FORMAT \
+	"shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
+
+#define BCT(field) \
+	be16_to_cpu( \
+		((struct buffer_control *)__get_dynamic_array(bct))->field \
+	)
+
+DECLARE_EVENT_CLASS(hfi1_bct_template,
+	TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+	TP_ARGS(dd, bc),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		__dynamic_array(u8, bct, sizeof(*bc))
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd);
+		memcpy(
+			__get_dynamic_array(bct),
+			bc,
+			sizeof(*bc));
+	),
+	TP_printk(BCT_FORMAT,
+		BCT(overall_shared_limit),
+
+		BCT(vl[0].dedicated),
+		BCT(vl[0].shared),
+
+		BCT(vl[1].dedicated),
+		BCT(vl[1].shared),
+
+		BCT(vl[2].dedicated),
+		BCT(vl[2].shared),
+
+		BCT(vl[3].dedicated),
+		BCT(vl[3].shared),
+
+		BCT(vl[4].dedicated),
+		BCT(vl[4].shared),
+
+		BCT(vl[5].dedicated),
+		BCT(vl[5].shared),
+
+		BCT(vl[6].dedicated),
+		BCT(vl[6].shared),
+
+		BCT(vl[7].dedicated),
+		BCT(vl[7].shared),
+
+		BCT(vl[15].dedicated),
+		BCT(vl[15].shared)
+	)
+);
+
+
+DEFINE_EVENT(hfi1_bct_template, bct_set,
+	     TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+	     TP_ARGS(dd, bc));
+
+DEFINE_EVENT(hfi1_bct_template, bct_get,
+	     TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+	     TP_ARGS(dd, bc));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sdma
+
+TRACE_EVENT(hfi1_sdma_descriptor,
+	TP_PROTO(
+		struct sdma_engine *sde,
+		u64 desc0,
+		u64 desc1,
+		u16 e,
+		void *descp),
+	TP_ARGS(sde, desc0, desc1, e, descp),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sde->dd)
+		__field(void *, descp)
+		__field(u64, desc0)
+		__field(u64, desc1)
+		__field(u16, e)
+		__field(u8, idx)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sde->dd);
+		__entry->desc0 = desc0;
+		__entry->desc1 = desc1;
+		__entry->idx = sde->this_idx;
+		__entry->descp = descp;
+		__entry->e = e;
+	),
+	TP_printk(
+		"[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
+		__get_str(dev),
+		__entry->idx,
+		__parse_sdma_flags(__entry->desc0, __entry->desc1),
+		(__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT)
+			& SDMA_DESC0_PHY_ADDR_MASK,
+		(u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT)
+			& SDMA_DESC1_GENERATION_MASK),
+		(u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+			& SDMA_DESC0_BYTE_COUNT_MASK),
+		__entry->desc0,
+		__entry->desc1,
+		__entry->descp,
+		__entry->e
+	)
+);
+
+TRACE_EVENT(hfi1_sdma_engine_select,
+	TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
+	TP_ARGS(dd, sel, vl, idx),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		__field(u32, sel)
+		__field(u8, vl)
+		__field(u8, idx)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd);
+		__entry->sel = sel;
+		__entry->vl = vl;
+		__entry->idx = idx;
+	),
+	TP_printk(
+		"[%s] selecting SDE %u sel 0x%x vl %u",
+		__get_str(dev),
+		__entry->idx,
+		__entry->sel,
+		__entry->vl
+	)
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
+	TP_PROTO(
+		struct sdma_engine *sde,
+		u64 status
+	),
+	TP_ARGS(sde, status),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sde->dd)
+		__field(u64, status)
+		__field(u8, idx)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sde->dd);
+		__entry->status = status;
+		__entry->idx = sde->this_idx;
+	),
+	TP_printk(
+		"[%s] SDE(%u) status %llx",
+		__get_str(dev),
+		__entry->idx,
+		(unsigned long long)__entry->status
+	)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
+	TP_PROTO(
+		struct sdma_engine *sde,
+		u64 status
+	),
+	TP_ARGS(sde, status)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
+	TP_PROTO(
+		struct sdma_engine *sde,
+		u64 status
+	),
+	TP_ARGS(sde, status)
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
+	TP_PROTO(
+		struct sdma_engine *sde,
+		int aidx
+	),
+	TP_ARGS(sde, aidx),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sde->dd)
+		__field(int, aidx)
+		__field(u8, idx)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sde->dd);
+		__entry->idx = sde->this_idx;
+		__entry->aidx = aidx;
+	),
+	TP_printk(
+		"[%s] SDE(%u) aidx %d",
+		__get_str(dev),
+		__entry->idx,
+		__entry->aidx
+	)
+);
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
+	     TP_PROTO(
+		struct sdma_engine *sde,
+		int aidx
+	     ),
+	     TP_ARGS(sde, aidx));
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
+	     TP_PROTO(
+		struct sdma_engine *sde,
+		int aidx
+	     ),
+	     TP_ARGS(sde, aidx));
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+TRACE_EVENT(hfi1_sdma_progress,
+	TP_PROTO(
+		struct sdma_engine *sde,
+		u16 hwhead,
+		u16 swhead,
+		struct sdma_txreq *txp
+	),
+	TP_ARGS(sde, hwhead, swhead, txp),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sde->dd)
+		__field(u64, sn)
+		__field(u16, hwhead)
+		__field(u16, swhead)
+		__field(u16, txnext)
+		__field(u16, tx_tail)
+		__field(u16, tx_head)
+		__field(u8, idx)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sde->dd);
+		__entry->hwhead = hwhead;
+		__entry->swhead = swhead;
+		__entry->tx_tail = sde->tx_tail;
+		__entry->tx_head = sde->tx_head;
+		__entry->txnext = txp ? txp->next_descq_idx : ~0;
+		__entry->idx = sde->this_idx;
+		__entry->sn = txp ? txp->sn : ~0;
+	),
+	TP_printk(
+		"[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+		__get_str(dev),
+		__entry->idx,
+		__entry->sn,
+		__entry->hwhead,
+		__entry->swhead,
+		__entry->txnext,
+		__entry->tx_head,
+		__entry->tx_tail
+	)
+);
+#else
+TRACE_EVENT(hfi1_sdma_progress,
+	    TP_PROTO(
+		struct sdma_engine *sde,
+		u16 hwhead,
+		u16 swhead,
+		struct sdma_txreq *txp
+	    ),
+	TP_ARGS(sde, hwhead, swhead, txp),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sde->dd)
+		__field(u16, hwhead)
+		__field(u16, swhead)
+		__field(u16, txnext)
+		__field(u16, tx_tail)
+		__field(u16, tx_head)
+		__field(u8, idx)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sde->dd);
+		__entry->hwhead = hwhead;
+		__entry->swhead = swhead;
+		__entry->tx_tail = sde->tx_tail;
+		__entry->tx_head = sde->tx_head;
+		__entry->txnext = txp ? txp->next_descq_idx : ~0;
+		__entry->idx = sde->this_idx;
+	),
+	TP_printk(
+		"[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+		__get_str(dev),
+		__entry->idx,
+		__entry->hwhead,
+		__entry->swhead,
+		__entry->txnext,
+		__entry->tx_head,
+		__entry->tx_tail
+	)
+);
+#endif
+
+DECLARE_EVENT_CLASS(hfi1_sdma_sn,
+	TP_PROTO(
+		struct sdma_engine *sde,
+		u64 sn
+	),
+	TP_ARGS(sde, sn),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sde->dd)
+		__field(u64, sn)
+		__field(u8, idx)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sde->dd);
+		__entry->sn = sn;
+		__entry->idx = sde->this_idx;
+	),
+	TP_printk(
+		"[%s] SDE(%u) sn %llu",
+		__get_str(dev),
+		__entry->idx,
+		__entry->sn
+	)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
+	     TP_PROTO(
+		struct sdma_engine *sde,
+		u64 sn
+	     ),
+	     TP_ARGS(sde, sn)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
+	     TP_PROTO(
+		struct sdma_engine *sde,
+		u64 sn
+	     ),
+	     TP_ARGS(sde, sn)
+);
+
+#define USDMA_HDR_FORMAT \
+	"[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
+
+TRACE_EVENT(hfi1_sdma_user_header,
+	    TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+		     struct hfi1_pkt_header *hdr, u32 tidval),
+	    TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
+	    TP_STRUCT__entry(
+		    DD_DEV_ENTRY(dd)
+		    __field(u16, ctxt)
+		    __field(u8, subctxt)
+		    __field(u16, req)
+		    __field(__le32, pbc0)
+		    __field(__le32, pbc1)
+		    __field(__be32, lrh0)
+		    __field(__be32, lrh1)
+		    __field(__be32, bth0)
+		    __field(__be32, bth1)
+		    __field(__be32, bth2)
+		    __field(__le32, kdeth0)
+		    __field(__le32, kdeth1)
+		    __field(__le32, kdeth2)
+		    __field(__le32, kdeth3)
+		    __field(__le32, kdeth4)
+		    __field(__le32, kdeth5)
+		    __field(__le32, kdeth6)
+		    __field(__le32, kdeth7)
+		    __field(__le32, kdeth8)
+		    __field(u32, tidval)
+		    ),
+	    TP_fast_assign(
+		    __le32 *pbc = (__le32 *)hdr->pbc;
+		    __be32 *lrh = (__be32 *)hdr->lrh;
+		    __be32 *bth = (__be32 *)hdr->bth;
+		    __le32 *kdeth = (__le32 *)&hdr->kdeth;
+
+		    DD_DEV_ASSIGN(dd);
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->req = req;
+		    __entry->pbc0 = pbc[0];
+		    __entry->pbc1 = pbc[1];
+		    __entry->lrh0 = be32_to_cpu(lrh[0]);
+		    __entry->lrh1 = be32_to_cpu(lrh[1]);
+		    __entry->bth0 = be32_to_cpu(bth[0]);
+		    __entry->bth1 = be32_to_cpu(bth[1]);
+		    __entry->bth2 = be32_to_cpu(bth[2]);
+		    __entry->kdeth0 = kdeth[0];
+		    __entry->kdeth1 = kdeth[1];
+		    __entry->kdeth2 = kdeth[2];
+		    __entry->kdeth3 = kdeth[3];
+		    __entry->kdeth4 = kdeth[4];
+		    __entry->kdeth5 = kdeth[5];
+		    __entry->kdeth6 = kdeth[6];
+		    __entry->kdeth7 = kdeth[7];
+		    __entry->kdeth8 = kdeth[8];
+		    __entry->tidval = tidval;
+		    ),
+	    TP_printk(USDMA_HDR_FORMAT,
+		      __get_str(dev),
+		      __entry->ctxt,
+		      __entry->subctxt,
+		      __entry->req,
+		      __entry->pbc1,
+		      __entry->pbc0,
+		      __entry->lrh0,
+		      __entry->lrh1,
+		      __entry->bth0,
+		      __entry->bth1,
+		      __entry->bth2,
+		      __entry->kdeth0,
+		      __entry->kdeth1,
+		      __entry->kdeth2,
+		      __entry->kdeth3,
+		      __entry->kdeth4,
+		      __entry->kdeth5,
+		      __entry->kdeth6,
+		      __entry->kdeth7,
+		      __entry->kdeth8,
+		      __entry->tidval
+		    )
+	);
+
+#define SDMA_UREQ_FMT \
+	"[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
+TRACE_EVENT(hfi1_sdma_user_reqinfo,
+	    TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
+	    TP_ARGS(dd, ctxt, subctxt, i),
+	    TP_STRUCT__entry(
+		    DD_DEV_ENTRY(dd);
+		    __field(u16, ctxt)
+		    __field(u8, subctxt)
+		    __field(u8, ver_opcode)
+		    __field(u8, iovcnt)
+		    __field(u16, npkts)
+		    __field(u16, fragsize)
+		    __field(u16, comp_idx)
+		    ),
+	    TP_fast_assign(
+		    DD_DEV_ASSIGN(dd);
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->ver_opcode = i[0] & 0xff;
+		    __entry->iovcnt = (i[0] >> 8) & 0xff;
+		    __entry->npkts = i[1];
+		    __entry->fragsize = i[2];
+		    __entry->comp_idx = i[3];
+		    ),
+	    TP_printk(SDMA_UREQ_FMT,
+		      __get_str(dev),
+		      __entry->ctxt,
+		      __entry->subctxt,
+		      __entry->ver_opcode,
+		      __entry->iovcnt,
+		      __entry->npkts,
+		      __entry->fragsize,
+		      __entry->comp_idx
+		    )
+	);
+
+#define usdma_complete_name(st) { st, #st }
+#define show_usdma_complete_state(st)			\
+	__print_symbolic(st,				\
+			 usdma_complete_name(FREE),	\
+			 usdma_complete_name(QUEUED),	\
+			 usdma_complete_name(COMPLETE), \
+			 usdma_complete_name(ERROR))
+
+TRACE_EVENT(hfi1_sdma_user_completion,
+	    TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
+		     u8 state, int code),
+	    TP_ARGS(dd, ctxt, subctxt, idx, state, code),
+	    TP_STRUCT__entry(
+		    DD_DEV_ENTRY(dd)
+		    __field(u16, ctxt)
+		    __field(u8, subctxt)
+		    __field(u16, idx)
+		    __field(u8, state)
+		    __field(int, code)
+		    ),
+	    TP_fast_assign(
+		    DD_DEV_ASSIGN(dd);
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->idx = idx;
+		    __entry->state = state;
+		    __entry->code = code;
+		    ),
+	    TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
+		      __get_str(dev), __entry->ctxt, __entry->subctxt,
+		      __entry->idx, show_usdma_complete_state(__entry->state),
+		      __entry->code)
+	);
+
+const char *print_u32_array(struct trace_seq *, u32 *, int);
+#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
+
+TRACE_EVENT(hfi1_sdma_user_header_ahg,
+	    TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+		     u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
+	    TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
+	    TP_STRUCT__entry(
+		    DD_DEV_ENTRY(dd)
+		    __field(u16, ctxt)
+		    __field(u8, subctxt)
+		    __field(u16, req)
+		    __field(u8, sde)
+		    __field(u8, idx)
+		    __field(int, len)
+		    __field(u32, tidval)
+		    __array(u32, ahg, 10)
+		    ),
+	    TP_fast_assign(
+		    DD_DEV_ASSIGN(dd);
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->req = req;
+		    __entry->sde = sde;
+		    __entry->idx = ahgidx;
+		    __entry->len = len;
+		    __entry->tidval = tidval;
+		    memcpy(__entry->ahg, ahg, len * sizeof(u32));
+		    ),
+	    TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
+		      __get_str(dev),
+		      __entry->ctxt,
+		      __entry->subctxt,
+		      __entry->req,
+		      __entry->sde,
+		      __entry->idx,
+		      __entry->len - 1,
+		      __print_u32_hex(__entry->ahg, __entry->len),
+		      __entry->tidval
+		    )
+	);
+
+TRACE_EVENT(hfi1_sdma_state,
+	TP_PROTO(
+		struct sdma_engine *sde,
+		const char *cstate,
+		const char *nstate
+	),
+	TP_ARGS(sde, cstate, nstate),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(sde->dd)
+		__string(curstate, cstate)
+		__string(newstate, nstate)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(sde->dd);
+		__assign_str(curstate, cstate);
+		__assign_str(newstate, nstate);
+	),
+	TP_printk("[%s] current state %s new state %s",
+		__get_str(dev),
+		__get_str(curstate),
+		__get_str(newstate)
+	)
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+
+DECLARE_EVENT_CLASS(hfi1_sdma_rc,
+	TP_PROTO(struct hfi1_qp *qp, u32 psn),
+	TP_ARGS(qp, psn),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, flags)
+		__field(u32, psn)
+		__field(u32, sending_psn)
+		__field(u32, sending_hpsn)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->flags = qp->s_flags;
+		__entry->psn = psn;
+		__entry->sending_psn = qp->s_sending_psn;
+		__entry->sending_hpsn = qp->s_sending_hpsn;
+	),
+	TP_printk(
+		"[%s] qpn 0x%x flags 0x%x psn 0x%x sending_psn 0x%x sending_hpsn 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->flags,
+		__entry->psn,
+		__entry->sending_psn,
+		__entry->sending_psn
+	)
+);
+
+DEFINE_EVENT(hfi1_sdma_rc, hfi1_rc_sendcomplete,
+	     TP_PROTO(struct hfi1_qp *qp, u32 psn),
+	     TP_ARGS(qp, psn)
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+
+TRACE_EVENT(hfi1_interrupt,
+	TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+		 int src),
+	TP_ARGS(dd, is_entry, src),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		__array(char, buf, 64)
+		__field(int, src)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd)
+		is_entry->is_name(__entry->buf, 64, src - is_entry->start);
+		__entry->src = src;
+	),
+	TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
+		  __entry->src)
+);
+
+/*
+ * Note:
+ * This produces a REALLY ugly trace in the console output when the string is
+ * too long.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_trace
+
+#define MAX_MSG_LEN 512
+
+DECLARE_EVENT_CLASS(hfi1_trace_template,
+	TP_PROTO(const char *function, struct va_format *vaf),
+	TP_ARGS(function, vaf),
+	TP_STRUCT__entry(
+		__string(function, function)
+		__dynamic_array(char, msg, MAX_MSG_LEN)
+	),
+	TP_fast_assign(
+		__assign_str(function, function);
+		WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
+		     MAX_MSG_LEN, vaf->fmt,
+		     *vaf->va) >= MAX_MSG_LEN);
+	),
+	TP_printk("(%s) %s",
+		  __get_str(function),
+		  __get_str(msg))
+);
+
+/*
+ * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an
+ * actual function to work and can not be in a macro.
+ */
+#define __hfi1_trace_def(lvl) \
+void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);		\
+									\
+DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,				\
+	TP_PROTO(const char *function, struct va_format *vaf),		\
+	TP_ARGS(function, vaf))
+
+#define __hfi1_trace_fn(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...)		\
+{									\
+	struct va_format vaf = {					\
+		.fmt = fmt,						\
+	};								\
+	va_list args;							\
+									\
+	va_start(args, fmt);						\
+	vaf.va = &args;							\
+	trace_hfi1_ ##lvl(func, &vaf);					\
+	va_end(args);							\
+	return;								\
+}
+
+/*
+ * To create a new trace level simply define it below and as a __hfi1_trace_fn
+ * in trace.c. This will create all the hooks for calling
+ * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
+ * the debugfs stuff.
+ */
+__hfi1_trace_def(PKT);
+__hfi1_trace_def(PROC);
+__hfi1_trace_def(SDMA);
+__hfi1_trace_def(LINKVERB);
+__hfi1_trace_def(DEBUG);
+__hfi1_trace_def(SNOOP);
+__hfi1_trace_def(CNTR);
+__hfi1_trace_def(PIO);
+__hfi1_trace_def(DC8051);
+__hfi1_trace_def(FIRMWARE);
+__hfi1_trace_def(RCVCTRL);
+__hfi1_trace_def(TID);
+
+#define hfi1_cdbg(which, fmt, ...) \
+	__hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
+
+#define hfi1_dbg(fmt, ...) \
+	hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
+
+/*
+ * Define HFI1_EARLY_DBG at compile time or here to enable early trace
+ * messages. Do not check in an enablement for this.
+ */
+
+#ifdef HFI1_EARLY_DBG
+#define hfi1_dbg_early(fmt, ...) \
+	trace_printk(fmt, ##__VA_ARGS__)
+#else
+#define hfi1_dbg_early(fmt, ...)
+#endif
+
+#endif /* __HFI1_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/staging/rdma/hfi1/twsi.c
new file mode 100644
index 0000000..ea54fd2
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/twsi.c
@@ -0,0 +1,518 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * "Two Wire Serial Interface" support.
+ *
+ * Originally written for a not-quite-i2c serial eeprom, which is
+ * still used on some supported boards. Later boards have added a
+ * variety of other uses, most board-specific, so the bit-boffing
+ * part has been split off to this file, while the other parts
+ * have been moved to chip-specific files.
+ *
+ * We have also dropped all pretense of fully generic (e.g. pretend
+ * we don't know whether '1' is the higher voltage) interface, as
+ * the restrictions of the generic i2c interface (e.g. no access from
+ * driver itself) make it unsuitable for this use.
+ */
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the hfi1_ib device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
+{
+	/*
+	 * implicit read of EXTStatus is as good as explicit
+	 * read of scratch, if all we want to do is flush
+	 * writes.
+	 */
+	hfi1_gpio_mod(dd, target, 0, 0, 0);
+	rmb(); /* inlined, so prevent compiler reordering */
+}
+
+/*
+ * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
+ * for "almost compliant" modules
+ */
+#define SCL_WAIT_USEC 1000
+
+/* BUF_WAIT is time bus must be free between STOP or ACK and to next START.
+ * Should be 20, but some chips need more.
+ */
+#define TWSI_BUF_WAIT_USEC 60
+
+static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+	u32 mask;
+
+	udelay(1);
+
+	mask = QSFP_HFI0_I2CCLK;
+
+	/* SCL is meant to be bare-drain, so never set "OUT", just DIR */
+	hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
+
+	/*
+	 * Allow for slow slaves by simple
+	 * delay for falling edge, sampling on rise.
+	 */
+	if (!bit)
+		udelay(2);
+	else {
+		int rise_usec;
+
+		for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
+			if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0))
+				break;
+			udelay(2);
+		}
+		if (rise_usec <= 0)
+			dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
+				    SCL_WAIT_USEC);
+	}
+	i2c_wait_for_writes(dd, target);
+}
+
+static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+	u32 mask;
+
+	mask = QSFP_HFI0_I2CDAT;
+
+	/* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+	hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
+
+	i2c_wait_for_writes(dd, target);
+	udelay(2);
+}
+
+static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
+{
+	u32 read_val, mask;
+
+	mask = QSFP_HFI0_I2CDAT;
+	/* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+	hfi1_gpio_mod(dd, target, 0, 0, mask);
+	read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
+	if (wait)
+		i2c_wait_for_writes(dd, target);
+	return (read_val & mask) >> GPIO_SDA_NUM;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the hfi1_ib device
+ */
+static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
+{
+	u8 ack_received;
+
+	/* AT ENTRY SCL = LOW */
+	/* change direction, ignore data */
+	ack_received = sda_in(dd, target, 1);
+	scl_out(dd, target, 1);
+	ack_received = sda_in(dd, target, 1) == 0;
+	scl_out(dd, target, 0);
+	return ack_received;
+}
+
+static void stop_cmd(struct hfi1_devdata *dd, u32 target);
+
+/**
+ * rd_byte - read a byte, sending STOP on last, else ACK
+ * @dd: the hfi1_ib device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
+{
+	int bit_cntr, data;
+
+	data = 0;
+
+	for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+		data <<= 1;
+		scl_out(dd, target, 1);
+		data |= sda_in(dd, target, 0);
+		scl_out(dd, target, 0);
+	}
+	if (last) {
+		scl_out(dd, target, 1);
+		stop_cmd(dd, target);
+	} else {
+		sda_out(dd, target, 0);
+		scl_out(dd, target, 1);
+		scl_out(dd, target, 0);
+		sda_out(dd, target, 1);
+	}
+	return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the hfi1_ib device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
+{
+	int bit_cntr;
+	u8 bit;
+
+	for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+		bit = (data >> bit_cntr) & 1;
+		sda_out(dd, target, bit);
+		scl_out(dd, target, 1);
+		scl_out(dd, target, 0);
+	}
+	return (!i2c_ackrcv(dd, target)) ? 1 : 0;
+}
+
+/*
+ * issue TWSI start sequence:
+ * (both clock/data high, clock high, data low while clock is high)
+ */
+static void start_seq(struct hfi1_devdata *dd, u32 target)
+{
+	sda_out(dd, target, 1);
+	scl_out(dd, target, 1);
+	sda_out(dd, target, 0);
+	udelay(1);
+	scl_out(dd, target, 0);
+}
+
+/**
+ * stop_seq - transmit the stop sequence
+ * @dd: the hfi1_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_seq(struct hfi1_devdata *dd, u32 target)
+{
+	scl_out(dd, target, 0);
+	sda_out(dd, target, 0);
+	scl_out(dd, target, 1);
+	sda_out(dd, target, 1);
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the hfi1_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct hfi1_devdata *dd, u32 target)
+{
+	stop_seq(dd, target);
+	udelay(TWSI_BUF_WAIT_USEC);
+}
+
+/**
+ * hfi1_twsi_reset - reset I2C communication
+ * @dd: the hfi1_ib device
+ */
+
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
+{
+	int clock_cycles_left = 9;
+	int was_high = 0;
+	u32 pins, mask;
+
+	/* Both SCL and SDA should be high. If not, there
+	 * is something wrong.
+	 */
+	mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
+
+	/*
+	 * Force pins to desired innocuous state.
+	 * This is the default power-on state with out=0 and dir=0,
+	 * So tri-stated and should be floating high (barring HW problems)
+	 */
+	hfi1_gpio_mod(dd, target, 0, 0, mask);
+
+	/*
+	 * Clock nine times to get all listeners into a sane state.
+	 * If SDA does not go high at any point, we are wedged.
+	 * One vendor recommends then issuing START followed by STOP.
+	 * we cannot use our "normal" functions to do that, because
+	 * if SCL drops between them, another vendor's part will
+	 * wedge, dropping SDA and keeping it low forever, at the end of
+	 * the next transaction (even if it was not the device addressed).
+	 * So our START and STOP take place with SCL held high.
+	 */
+	while (clock_cycles_left--) {
+		scl_out(dd, target, 0);
+		scl_out(dd, target, 1);
+		/* Note if SDA is high, but keep clocking to sync slave */
+		was_high |= sda_in(dd, target, 0);
+	}
+
+	if (was_high) {
+		/*
+		 * We saw a high, which we hope means the slave is sync'd.
+		 * Issue START, STOP, pause for T_BUF.
+		 */
+
+		pins = hfi1_gpio_mod(dd, target, 0, 0, 0);
+		if ((pins & mask) != mask)
+			dd_dev_err(dd, "GPIO pins not at rest: %d\n",
+				    pins & mask);
+		/* Drop SDA to issue START */
+		udelay(1); /* Guarantee .6 uSec setup */
+		sda_out(dd, target, 0);
+		udelay(1); /* Guarantee .6 uSec hold */
+		/* At this point, SCL is high, SDA low. Raise SDA for STOP */
+		sda_out(dd, target, 1);
+		udelay(TWSI_BUF_WAIT_USEC);
+	}
+
+	return !was_high;
+}
+
+#define HFI1_TWSI_START 0x100
+#define HFI1_TWSI_STOP 0x200
+
+/* Write byte to TWSI, optionally prefixed with START or suffixed with
+ * STOP.
+ * returns 0 if OK (ACK received), else != 0
+ */
+static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
+{
+	int ret = 1;
+
+	if (flags & HFI1_TWSI_START)
+		start_seq(dd, target);
+
+	/* Leaves SCL low (from i2c_ackrcv()) */
+	ret = wr_byte(dd, target, data);
+
+	if (flags & HFI1_TWSI_STOP)
+		stop_cmd(dd, target);
+	return ret;
+}
+
+/* Added functionality for IBA7220-based cards */
+#define HFI1_TEMP_DEV 0x98
+
+/*
+ * hfi1_twsi_blk_rd
+ * General interface for data transfer from twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * address within device. On all other devices found on board handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device from which data should
+ * be read.
+ */
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+		     void *buffer, int len)
+{
+	int ret;
+	u8 *bp = buffer;
+
+	ret = 1;
+
+	if (dev == HFI1_TWSI_NO_DEV) {
+		/* legacy not-really-I2C */
+		addr = (addr << 1) | READ_CMD;
+		ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
+	} else {
+		/* Actual I2C */
+		ret = twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START);
+		if (ret) {
+			stop_cmd(dd, target);
+			ret = 1;
+			goto bail;
+		}
+		/*
+		 * SFF spec claims we do _not_ stop after the addr
+		 * but simply issue a start with the "read" dev-addr.
+		 * Since we are implicitly waiting for ACK here,
+		 * we need t_buf (nominally 20uSec) before that start,
+		 * and cannot rely on the delay built in to the STOP
+		 */
+		ret = twsi_wr(dd, target, addr, 0);
+		udelay(TWSI_BUF_WAIT_USEC);
+
+		if (ret) {
+			dd_dev_err(dd,
+				"Failed to write interface read addr %02X\n",
+				addr);
+			ret = 1;
+			goto bail;
+		}
+		ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
+	}
+	if (ret) {
+		stop_cmd(dd, target);
+		ret = 1;
+		goto bail;
+	}
+
+	/*
+	 * block devices keeps clocking data out as long as we ack,
+	 * automatically incrementing the address. Some have "pages"
+	 * whose boundaries will not be crossed, but the handling
+	 * of these is left to the caller, who is in a better
+	 * position to know.
+	 */
+	while (len-- > 0) {
+		/*
+		 * Get and store data, sending ACK if length remaining,
+		 * else STOP
+		 */
+		*bp++ = rd_byte(dd, target, !len);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/*
+ * hfi1_twsi_blk_wr
+ * General interface for data transfer to twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * address within device. On all other devices found on board handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device to which data should
+ * be written.
+ */
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+		     const void *buffer, int len)
+{
+	int sub_len;
+	const u8 *bp = buffer;
+	int max_wait_time, i;
+	int ret = 1;
+
+	while (len > 0) {
+		if (dev == HFI1_TWSI_NO_DEV) {
+			if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
+				    HFI1_TWSI_START)) {
+				goto failed_write;
+			}
+		} else {
+			/* Real I2C */
+			if (twsi_wr(dd, target,
+				    dev | WRITE_CMD, HFI1_TWSI_START))
+				goto failed_write;
+			ret = twsi_wr(dd, target, addr, 0);
+			if (ret) {
+				dd_dev_err(dd,
+					"Failed to write interface write addr %02X\n",
+					addr);
+				goto failed_write;
+			}
+		}
+
+		sub_len = min(len, 4);
+		addr += sub_len;
+		len -= sub_len;
+
+		for (i = 0; i < sub_len; i++)
+			if (twsi_wr(dd, target, *bp++, 0))
+				goto failed_write;
+
+		stop_cmd(dd, target);
+
+		/*
+		 * Wait for write complete by waiting for a successful
+		 * read (the chip replies with a zero after the write
+		 * cmd completes, and before it writes to the eeprom.
+		 * The startcmd for the read will fail the ack until
+		 * the writes have completed.   We do this inline to avoid
+		 * the debug prints that are in the real read routine
+		 * if the startcmd fails.
+		 * We also use the proper device address, so it doesn't matter
+		 * whether we have real eeprom_dev. Legacy likes any address.
+		 */
+		max_wait_time = 100;
+		while (twsi_wr(dd, target,
+			       dev | READ_CMD, HFI1_TWSI_START)) {
+			stop_cmd(dd, target);
+			if (!--max_wait_time)
+				goto failed_write;
+		}
+		/* now read (and ignore) the resulting byte */
+		rd_byte(dd, target, 1);
+	}
+
+	ret = 0;
+	goto bail;
+
+failed_write:
+	stop_cmd(dd, target);
+	ret = 1;
+
+bail:
+	return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/staging/rdma/hfi1/twsi.h
new file mode 100644
index 0000000..5907e02
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/twsi.h
@@ -0,0 +1,68 @@
+#ifndef _TWSI_H
+#define _TWSI_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define HFI1_TWSI_NO_DEV 0xFF
+
+struct hfi1_devdata;
+
+/* Bit position of SDA pin in ASIC_QSFP* registers  */
+#define  GPIO_SDA_NUM 1
+
+/* these functions must be called with qsfp_lock held */
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+		     void *buffer, int len);
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+		     const void *buffer, int len);
+
+
+#endif /* _TWSI_H */
diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/staging/rdma/hfi1/uc.c
new file mode 100644
index 0000000..b536f39
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/uc.c
@@ -0,0 +1,585 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "sdma.h"
+#include "qp.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_uc_req(struct hfi1_qp *qp)
+{
+	struct hfi1_other_headers *ohdr;
+	struct hfi1_swqe *wqe;
+	unsigned long flags;
+	u32 hwords = 5;
+	u32 bth0 = 0;
+	u32 len;
+	u32 pmtu = qp->pmtu;
+	int ret = 0;
+	int middle = 0;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
+		if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_iowait.sdma_busy)) {
+			qp->s_flags |= HFI1_S_WAIT_DMA;
+			goto bail;
+		}
+		clear_ahg(qp);
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	ohdr = &qp->s_hdr->ibh.u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr->ibh.u.l.oth;
+
+	/* Get the next send request. */
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	qp->s_wqe = NULL;
+	switch (qp->s_state) {
+	default:
+		if (!(ib_hfi1_state_ops[qp->state] &
+		    HFI1_PROCESS_NEXT_SEND_OK))
+			goto bail;
+		/* Check if send work queue is empty. */
+		if (qp->s_cur == qp->s_head) {
+			clear_ahg(qp);
+			goto bail;
+		}
+		/*
+		 * Start a new request.
+		 */
+		wqe->psn = qp->s_next_psn;
+		qp->s_psn = qp->s_next_psn;
+		qp->s_sge.sge = wqe->sg_list[0];
+		qp->s_sge.sg_list = wqe->sg_list + 1;
+		qp->s_sge.num_sge = wqe->wr.num_sge;
+		qp->s_sge.total_len = wqe->length;
+		len = wqe->length;
+		qp->s_len = len;
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+			if (len > pmtu) {
+				qp->s_state = OP(SEND_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_SEND)
+				qp->s_state = OP(SEND_ONLY);
+			else {
+				qp->s_state =
+					OP(SEND_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the BTH */
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+			}
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= IB_BTH_SOLICITED;
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			hwords += sizeof(struct ib_reth) / 4;
+			if (len > pmtu) {
+				qp->s_state = OP(RDMA_WRITE_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+				qp->s_state = OP(RDMA_WRITE_ONLY);
+			else {
+				qp->s_state =
+					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the RETH */
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+					bth0 |= IB_BTH_SOLICITED;
+			}
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		default:
+			goto bail;
+		}
+		break;
+
+	case OP(SEND_FIRST):
+		qp->s_state = OP(SEND_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_SEND)
+			qp->s_state = OP(SEND_LAST);
+		else {
+			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+		}
+		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+			bth0 |= IB_BTH_SOLICITED;
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+		qp->s_state = OP(RDMA_WRITE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+			qp->s_state = OP(RDMA_WRITE_LAST);
+		else {
+			qp->s_state =
+				OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= IB_BTH_SOLICITED;
+		}
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+	}
+	qp->s_len -= len;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_cur_size = len;
+	hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
+			     mask_psn(qp->s_next_psn++), middle);
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * hfi1_uc_rcv - handle an incoming UC packet
+ * @ibp: the port the packet came in on
+ * @hdr: the header of the packet
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_uc_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+	struct hfi1_ib_header *hdr = packet->hdr;
+	u32 rcv_flags = packet->rcv_flags;
+	void *data = packet->ebuf;
+	u32 tlen = packet->tlen;
+	struct hfi1_qp *qp = packet->qp;
+	struct hfi1_other_headers *ohdr = packet->ohdr;
+	u32 opcode;
+	u32 hdrsize = packet->hlen;
+	u32 psn;
+	u32 pad;
+	struct ib_wc wc;
+	u32 pmtu = qp->pmtu;
+	struct ib_reth *reth;
+	int has_grh = rcv_flags & HFI1_HAS_GRH;
+	int ret;
+	u32 bth1;
+	struct ib_grh *grh = NULL;
+
+	opcode = be32_to_cpu(ohdr->bth[0]);
+	if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+		return;
+
+	bth1 = be32_to_cpu(ohdr->bth[1]);
+	if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+		if (bth1 & HFI1_BECN_SMASK) {
+			struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+			u32 rqpn, lqpn;
+			u16 rlid = be16_to_cpu(hdr->lrh[3]);
+			u8 sl, sc5;
+
+			lqpn = bth1 & HFI1_QPN_MASK;
+			rqpn = qp->remote_qpn;
+
+			sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+			sl = ibp->sc_to_sl[sc5];
+
+			process_becn(ppd, sl, rlid, lqpn, rqpn,
+					IB_CC_SVCTYPE_UC);
+		}
+
+		if (bth1 & HFI1_FECN_SMASK) {
+			u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+			u16 slid = be16_to_cpu(hdr->lrh[3]);
+			u16 dlid = be16_to_cpu(hdr->lrh[1]);
+			u32 src_qp = qp->remote_qpn;
+			u8 sc5;
+
+			sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+
+			return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
+		}
+	}
+
+	psn = be32_to_cpu(ohdr->bth[2]);
+	opcode >>= 24;
+
+	/* Compare the PSN verses the expected PSN. */
+	if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
+		/*
+		 * Handle a sequence error.
+		 * Silently drop any current message.
+		 */
+		qp->r_psn = psn;
+inv:
+		if (qp->r_state == OP(SEND_FIRST) ||
+		    qp->r_state == OP(SEND_MIDDLE)) {
+			set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags);
+			qp->r_sge.num_sge = 0;
+		} else
+			hfi1_put_ss(&qp->r_sge);
+		qp->r_state = OP(SEND_LAST);
+		switch (opcode) {
+		case OP(SEND_FIRST):
+		case OP(SEND_ONLY):
+		case OP(SEND_ONLY_WITH_IMMEDIATE):
+			goto send_first;
+
+		case OP(RDMA_WRITE_FIRST):
+		case OP(RDMA_WRITE_ONLY):
+		case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+			goto rdma_first;
+
+		default:
+			goto drop;
+		}
+	}
+
+	/* Check for opcode sequence errors. */
+	switch (qp->r_state) {
+	case OP(SEND_FIRST):
+	case OP(SEND_MIDDLE):
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+			break;
+		goto inv;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_MIDDLE):
+		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			break;
+		goto inv;
+
+	default:
+		if (opcode == OP(SEND_FIRST) ||
+		    opcode == OP(SEND_ONLY) ||
+		    opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_FIRST) ||
+		    opcode == OP(RDMA_WRITE_ONLY) ||
+		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			break;
+		goto inv;
+	}
+
+	if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
+		qp_comm_est(qp);
+
+	/* OK, process the packet. */
+	switch (opcode) {
+	case OP(SEND_FIRST):
+	case OP(SEND_ONLY):
+	case OP(SEND_ONLY_WITH_IMMEDIATE):
+send_first:
+		if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+			qp->r_sge = qp->s_rdma_read_sge;
+		else {
+			ret = hfi1_get_rwqe(qp, 0);
+			if (ret < 0)
+				goto op_err;
+			if (!ret)
+				goto drop;
+			/*
+			 * qp->s_rdma_read_sge will be the owner
+			 * of the mr references.
+			 */
+			qp->s_rdma_read_sge = qp->r_sge;
+		}
+		qp->r_rcv_len = 0;
+		if (opcode == OP(SEND_ONLY))
+			goto no_immediate_data;
+		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+			goto send_last_imm;
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto rewind;
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len))
+			goto rewind;
+		hfi1_copy_sge(&qp->r_sge, data, pmtu, 0);
+		break;
+
+	case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+		wc.ex.imm_data = ohdr->u.imm_data;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		goto send_last;
+	case OP(SEND_LAST):
+no_immediate_data:
+		wc.ex.imm_data = 0;
+		wc.wc_flags = 0;
+send_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto rewind;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		wc.byte_len = tlen + qp->r_rcv_len;
+		if (unlikely(wc.byte_len > qp->r_len))
+			goto rewind;
+		wc.opcode = IB_WC_RECV;
+		hfi1_copy_sge(&qp->r_sge, data, tlen, 0);
+		hfi1_put_ss(&qp->s_rdma_read_sge);
+last_imm:
+		wc.wr_id = qp->r_wr_id;
+		wc.status = IB_WC_SUCCESS;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = qp->remote_qpn;
+		wc.slid = qp->remote_ah_attr.dlid;
+		/*
+		 * It seems that IB mandates the presence of an SL in a
+		 * work completion only for the UD transport (see section
+		 * 11.4.2 of IBTA Vol. 1).
+		 *
+		 * However, the way the SL is chosen below is consistent
+		 * with the way that IB/qib works and is trying avoid
+		 * introducing incompatibilities.
+		 *
+		 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+		 */
+		wc.sl = qp->remote_ah_attr.sl;
+		/* zero fields that are N/A */
+		wc.vendor_err = 0;
+		wc.pkey_index = 0;
+		wc.dlid_path_bits = 0;
+		wc.port_num = 0;
+		/* Signal completion event if the solicited bit is set. */
+		hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+			      (ohdr->bth[0] &
+				cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_ONLY):
+	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+rdma_first:
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_WRITE))) {
+			goto drop;
+		}
+		reth = &ohdr->u.rc.reth;
+		qp->r_len = be32_to_cpu(reth->length);
+		qp->r_rcv_len = 0;
+		qp->r_sge.sg_list = NULL;
+		if (qp->r_len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey */
+			ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
+					  vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
+			if (unlikely(!ok))
+				goto drop;
+			qp->r_sge.num_sge = 1;
+		} else {
+			qp->r_sge.num_sge = 0;
+			qp->r_sge.sge.mr = NULL;
+			qp->r_sge.sge.vaddr = NULL;
+			qp->r_sge.sge.length = 0;
+			qp->r_sge.sge.sge_length = 0;
+		}
+		if (opcode == OP(RDMA_WRITE_ONLY))
+			goto rdma_last;
+		else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
+			wc.ex.imm_data = ohdr->u.rc.imm_data;
+			goto rdma_last_imm;
+		}
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto drop;
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len))
+			goto drop;
+		hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
+		break;
+
+	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+		wc.ex.imm_data = ohdr->u.imm_data;
+rdma_last_imm:
+		wc.wc_flags = IB_WC_WITH_IMM;
+
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto drop;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+			goto drop;
+		if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+			hfi1_put_ss(&qp->s_rdma_read_sge);
+		else {
+			ret = hfi1_get_rwqe(qp, 1);
+			if (ret < 0)
+				goto op_err;
+			if (!ret)
+				goto drop;
+		}
+		wc.byte_len = qp->r_len;
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+		hfi1_put_ss(&qp->r_sge);
+		goto last_imm;
+
+	case OP(RDMA_WRITE_LAST):
+rdma_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto drop;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+			goto drop;
+		hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+		hfi1_put_ss(&qp->r_sge);
+		break;
+
+	default:
+		/* Drop packet for unknown opcodes. */
+		goto drop;
+	}
+	qp->r_psn++;
+	qp->r_state = opcode;
+	return;
+
+rewind:
+	set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags);
+	qp->r_sge.num_sge = 0;
+drop:
+	ibp->n_pkt_drops++;
+	return;
+
+op_err:
+	hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	return;
+
+}
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
new file mode 100644
index 0000000..d40d1a1
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/ud.c
@@ -0,0 +1,885 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+
+/**
+ * ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from hfi1_make_ud_req() to forward a WQE addressed
+ * to the same HFI.
+ * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
+ * while this is being called.
+ */
+static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe)
+{
+	struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+	struct hfi1_pportdata *ppd;
+	struct hfi1_qp *qp;
+	struct ib_ah_attr *ah_attr;
+	unsigned long flags;
+	struct hfi1_sge_state ssge;
+	struct hfi1_sge *sge;
+	struct ib_wc wc;
+	u32 length;
+	enum ib_qp_type sqptype, dqptype;
+
+	rcu_read_lock();
+
+	qp = hfi1_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn);
+	if (!qp) {
+		ibp->n_pkt_drops++;
+		rcu_read_unlock();
+		return;
+	}
+
+	sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
+			IB_QPT_UD : sqp->ibqp.qp_type;
+	dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
+			IB_QPT_UD : qp->ibqp.qp_type;
+
+	if (dqptype != sqptype ||
+	    !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) {
+		ibp->n_pkt_drops++;
+		goto drop;
+	}
+
+	ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+	ppd = ppd_from_ibp(ibp);
+
+	if (qp->ibqp.qp_num > 1) {
+		u16 pkey;
+		u16 slid;
+		u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
+
+		pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
+		slid = ppd->lid | (ah_attr->src_path_bits &
+				   ((1 << ppd->lmc) - 1));
+		if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
+						qp->s_pkey_index, slid))) {
+			hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey,
+				       ah_attr->sl,
+				       sqp->ibqp.qp_num, qp->ibqp.qp_num,
+				       cpu_to_be16(slid),
+				       cpu_to_be16(ah_attr->dlid));
+			goto drop;
+		}
+	}
+
+	/*
+	 * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+	 * Qkeys with the high order bit set mean use the
+	 * qkey from the QP context instead of the WR (see 10.2.5).
+	 */
+	if (qp->ibqp.qp_num) {
+		u32 qkey;
+
+		qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ?
+			sqp->qkey : swqe->wr.wr.ud.remote_qkey;
+		if (unlikely(qkey != qp->qkey)) {
+			u16 lid;
+
+			lid = ppd->lid | (ah_attr->src_path_bits &
+					  ((1 << ppd->lmc) - 1));
+			hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+				       ah_attr->sl,
+				       sqp->ibqp.qp_num, qp->ibqp.qp_num,
+				       cpu_to_be16(lid),
+				       cpu_to_be16(ah_attr->dlid));
+			goto drop;
+		}
+	}
+
+	/*
+	 * A GRH is expected to precede the data even if not
+	 * present on the wire.
+	 */
+	length = swqe->length;
+	memset(&wc, 0, sizeof(wc));
+	wc.byte_len = length + sizeof(struct ib_grh);
+
+	if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = swqe->wr.ex.imm_data;
+	}
+
+	spin_lock_irqsave(&qp->r_lock, flags);
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 */
+	if (qp->r_flags & HFI1_R_REUSE_SGE)
+		qp->r_flags &= ~HFI1_R_REUSE_SGE;
+	else {
+		int ret;
+
+		ret = hfi1_get_rwqe(qp, 0);
+		if (ret < 0) {
+			hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+			goto bail_unlock;
+		}
+		if (!ret) {
+			if (qp->ibqp.qp_num == 0)
+				ibp->n_vl15_dropped++;
+			goto bail_unlock;
+		}
+	}
+	/* Silently drop packets which are too big. */
+	if (unlikely(wc.byte_len > qp->r_len)) {
+		qp->r_flags |= HFI1_R_REUSE_SGE;
+		ibp->n_pkt_drops++;
+		goto bail_unlock;
+	}
+
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
+			      sizeof(struct ib_grh), 1);
+		wc.wc_flags |= IB_WC_GRH;
+	} else
+		hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+	ssge.sg_list = swqe->sg_list + 1;
+	ssge.sge = *swqe->sg_list;
+	ssge.num_sge = swqe->wr.num_sge;
+	sge = &ssge.sge;
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		WARN_ON_ONCE(len == 0);
+		hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ssge.num_sge)
+				*sge = *ssge.sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= HFI1_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+	hfi1_put_ss(&qp->r_sge);
+	if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+		goto bail_unlock;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = sqp->ibqp.qp_num;
+	if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
+		if (sqp->ibqp.qp_type == IB_QPT_GSI ||
+		    sqp->ibqp.qp_type == IB_QPT_SMI)
+			wc.pkey_index = swqe->wr.wr.ud.pkey_index;
+		else
+			wc.pkey_index = sqp->s_pkey_index;
+	} else {
+		wc.pkey_index = 0;
+	}
+	wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
+	/* Check for loopback when the port lid is not set */
+	if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
+		wc.slid = HFI1_PERMISSIVE_LID;
+	wc.sl = ah_attr->sl;
+	wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
+	wc.port_num = qp->port_num;
+	/* Signal completion event if the solicited bit is set. */
+	hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		      swqe->wr.send_flags & IB_SEND_SOLICITED);
+	ibp->n_loop_pkts++;
+bail_unlock:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+drop:
+	rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_ud_req(struct hfi1_qp *qp)
+{
+	struct hfi1_other_headers *ohdr;
+	struct ib_ah_attr *ah_attr;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_ibport *ibp;
+	struct hfi1_swqe *wqe;
+	unsigned long flags;
+	u32 nwords;
+	u32 extra_bytes;
+	u32 bth0;
+	u16 lrh0;
+	u16 lid;
+	int ret = 0;
+	int next_cur;
+	u8 sc5;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_iowait.sdma_busy)) {
+			qp->s_flags |= HFI1_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	if (qp->s_cur == qp->s_head)
+		goto bail;
+
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	next_cur = qp->s_cur + 1;
+	if (next_cur >= qp->s_size)
+		next_cur = 0;
+
+	/* Construct the header. */
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+	ppd = ppd_from_ibp(ibp);
+	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+	if (ah_attr->dlid < HFI1_MULTICAST_LID_BASE ||
+	    ah_attr->dlid == HFI1_PERMISSIVE_LID) {
+		lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
+		if (unlikely(!loopback && (lid == ppd->lid ||
+		    (lid == HFI1_PERMISSIVE_LID &&
+		     qp->ibqp.qp_type == IB_QPT_GSI)))) {
+			/*
+			 * If DMAs are in progress, we can't generate
+			 * a completion for the loopback packet since
+			 * it would be out of order.
+			 * Instead of waiting, we could queue a
+			 * zero length descriptor so we get a callback.
+			 */
+			if (atomic_read(&qp->s_iowait.sdma_busy)) {
+				qp->s_flags |= HFI1_S_WAIT_DMA;
+				goto bail;
+			}
+			qp->s_cur = next_cur;
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			ud_loopback(qp, wqe);
+			spin_lock_irqsave(&qp->s_lock, flags);
+			hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+			goto done;
+		}
+	}
+
+	qp->s_cur = next_cur;
+	extra_bytes = -wqe->length & 3;
+	nwords = (wqe->length + extra_bytes) >> 2;
+
+	/* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+	qp->s_hdrwords = 7;
+	qp->s_cur_size = wqe->length;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_srate = ah_attr->static_rate;
+	qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+	qp->s_wqe = wqe;
+	qp->s_sge.sge = wqe->sg_list[0];
+	qp->s_sge.sg_list = wqe->sg_list + 1;
+	qp->s_sge.num_sge = wqe->wr.num_sge;
+	qp->s_sge.total_len = wqe->length;
+
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		/* Header size in 32-bit words. */
+		qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh,
+					       &ah_attr->grh,
+					       qp->s_hdrwords, nwords);
+		lrh0 = HFI1_LRH_GRH;
+		ohdr = &qp->s_hdr->ibh.u.l.oth;
+		/*
+		 * Don't worry about sending to locally attached multicast
+		 * QPs.  It is unspecified by the spec. what happens.
+		 */
+	} else {
+		/* Header size in 32-bit words. */
+		lrh0 = HFI1_LRH_BTH;
+		ohdr = &qp->s_hdr->ibh.u.oth;
+	}
+	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		qp->s_hdrwords++;
+		ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+		bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+	} else
+		bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+	sc5 = ibp->sl_to_sc[ah_attr->sl];
+	lrh0 |= (ah_attr->sl & 0xf) << 4;
+	if (qp->ibqp.qp_type == IB_QPT_SMI) {
+		lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+		qp->s_sc = 0xf;
+	} else {
+		lrh0 |= (sc5 & 0xf) << 12;
+		qp->s_sc = sc5;
+	}
+	qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr->ibh.lrh[1] = cpu_to_be16(ah_attr->dlid);  /* DEST LID */
+	qp->s_hdr->ibh.lrh[2] =
+		cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+	if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE))
+		qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE;
+	else {
+		lid = ppd->lid;
+		if (lid) {
+			lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
+			qp->s_hdr->ibh.lrh[3] = cpu_to_be16(lid);
+		} else
+			qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE;
+	}
+	if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+		bth0 |= IB_BTH_SOLICITED;
+	bth0 |= extra_bytes << 20;
+	if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
+		bth0 |= hfi1_get_pkey(ibp, wqe->wr.wr.ud.pkey_index);
+	else
+		bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	ohdr->bth[1] = cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(mask_psn(qp->s_next_psn++));
+	/*
+	 * Qkeys with the high order bit set mean use the
+	 * qkey from the QP context instead of the WR (see 10.2.5).
+	 */
+	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+					 qp->qkey : wqe->wr.wr.ud.remote_qkey);
+	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+	/* disarm any ahg */
+	qp->s_hdr->ahgcount = 0;
+	qp->s_hdr->ahgidx = 0;
+	qp->s_hdr->tx_flags = 0;
+	qp->s_hdr->sde = NULL;
+
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/*
+ * Hardware can't check this so we do it here.
+ *
+ * This is a slightly different algorithm than the standard pkey check.  It
+ * special cases the management keys and allows for 0x7fff and 0xffff to be in
+ * the table at the same time.
+ *
+ * @returns the index found or -1 if not found
+ */
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
+{
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	unsigned i;
+
+	if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
+		unsigned lim_idx = -1;
+
+		for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
+			/* here we look for an exact match */
+			if (ppd->pkeys[i] == pkey)
+				return i;
+			if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
+				lim_idx = i;
+		}
+
+		/* did not find 0xffff return 0x7fff idx if found */
+		if (pkey == FULL_MGMT_P_KEY)
+			return lim_idx;
+
+		/* no match...  */
+		return -1;
+	}
+
+	pkey &= 0x7fff; /* remove limited/full membership bit */
+
+	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
+		if ((ppd->pkeys[i] & 0x7fff) == pkey)
+			return i;
+
+	/*
+	 * Should not get here, this means hardware failed to validate pkeys.
+	 */
+	return -1;
+}
+
+void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn,
+		u32 pkey, u32 slid, u32 dlid, u8 sc5,
+		const struct ib_grh *old_grh)
+{
+	u64 pbc, pbc_flags = 0;
+	u32 bth0, plen, vl, hwords = 5;
+	u16 lrh0;
+	u8 sl = ibp->sc_to_sl[sc5];
+	struct hfi1_ib_header hdr;
+	struct hfi1_other_headers *ohdr;
+	struct pio_buf *pbuf;
+	struct send_context *ctxt = qp_to_send_context(qp, sc5);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+	if (old_grh) {
+		struct ib_grh *grh = &hdr.u.l.grh;
+
+		grh->version_tclass_flow = old_grh->version_tclass_flow;
+		grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
+		grh->hop_limit = 0xff;
+		grh->sgid = old_grh->dgid;
+		grh->dgid = old_grh->sgid;
+		ohdr = &hdr.u.l.oth;
+		lrh0 = HFI1_LRH_GRH;
+		hwords += sizeof(struct ib_grh) / sizeof(u32);
+	} else {
+		ohdr = &hdr.u.oth;
+		lrh0 = HFI1_LRH_BTH;
+	}
+
+	lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
+
+	bth0 = pkey | (IB_OPCODE_CNP << 24);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+
+	ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
+	ohdr->bth[2] = 0; /* PSN 0 */
+
+	hdr.lrh[0] = cpu_to_be16(lrh0);
+	hdr.lrh[1] = cpu_to_be16(dlid);
+	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+	hdr.lrh[3] = cpu_to_be16(slid);
+
+	plen = 2 /* PBC */ + hwords;
+	pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+	vl = sc_to_vlt(ppd->dd, sc5);
+	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+	if (ctxt) {
+		pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
+		if (pbuf)
+			ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
+						 &hdr, hwords);
+	}
+}
+
+/*
+ * opa_smp_check() - Do the regular pkey checking, and the additional
+ * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26
+ * ("SMA Packet Checks").
+ *
+ * Note that:
+ *   - Checks are done using the pkey directly from the packet's BTH,
+ *     and specifically _not_ the pkey that we attach to the completion,
+ *     which may be different.
+ *   - These checks are specifically for "non-local" SMPs (i.e., SMPs
+ *     which originated on another node). SMPs which are sent from, and
+ *     destined to this node are checked in opa_local_smp_check().
+ *
+ * At the point where opa_smp_check() is called, we know:
+ *   - destination QP is QP0
+ *
+ * opa_smp_check() returns 0 if all checks succeed, 1 otherwise.
+ */
+static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
+			 struct hfi1_qp *qp, u16 slid, struct opa_smp *smp)
+{
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+	/*
+	 * I don't think it's possible for us to get here with sc != 0xf,
+	 * but check it to be certain.
+	 */
+	if (sc5 != 0xf)
+		return 1;
+
+	if (rcv_pkey_check(ppd, pkey, sc5, slid))
+		return 1;
+
+	/*
+	 * At this point we know (and so don't need to check again) that
+	 * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY
+	 * (see ingress_pkey_check).
+	 */
+	if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE &&
+	    smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) {
+		ingress_pkey_table_fail(ppd, pkey, slid);
+		return 1;
+	}
+
+	/*
+	 * SMPs fall into one of four (disjoint) categories:
+	 * SMA request, SMA response, trap, or trap repress.
+	 * Our response depends, in part, on which type of
+	 * SMP we're processing.
+	 *
+	 * If this is not an SMA request, or trap repress:
+	 *   - accept MAD if the port is running an SM
+	 *   - pkey == FULL_MGMT_P_KEY =>
+	 *       reply with unsupported method (i.e., just mark
+	 *       the smp's status field here, and let it be
+	 *       processed normally)
+	 *   - pkey != LIM_MGMT_P_KEY =>
+	 *       increment port recv constraint errors, drop MAD
+	 * If this is an SMA request or trap repress:
+	 *   - pkey != FULL_MGMT_P_KEY =>
+	 *       increment port recv constraint errors, drop MAD
+	 */
+	switch (smp->method) {
+	case IB_MGMT_METHOD_GET:
+	case IB_MGMT_METHOD_SET:
+	case IB_MGMT_METHOD_REPORT:
+	case IB_MGMT_METHOD_TRAP_REPRESS:
+		if (pkey != FULL_MGMT_P_KEY) {
+			ingress_pkey_table_fail(ppd, pkey, slid);
+			return 1;
+		}
+		break;
+	case IB_MGMT_METHOD_SEND:
+	case IB_MGMT_METHOD_TRAP:
+	case IB_MGMT_METHOD_GET_RESP:
+	case IB_MGMT_METHOD_REPORT_RESP:
+		if (ibp->port_cap_flags & IB_PORT_SM)
+			return 0;
+		if (pkey == FULL_MGMT_P_KEY) {
+			smp->status |= IB_SMP_UNSUP_METHOD;
+			return 0;
+		}
+		if (pkey != LIM_MGMT_P_KEY) {
+			ingress_pkey_table_fail(ppd, pkey, slid);
+			return 1;
+		}
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+
+/**
+ * hfi1_ud_rcv - receive an incoming UD packet
+ * @ibp: the port the packet came in on
+ * @hdr: the packet header
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_ud_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_other_headers *ohdr = packet->ohdr;
+	int opcode;
+	u32 hdrsize = packet->hlen;
+	u32 pad;
+	struct ib_wc wc;
+	u32 qkey;
+	u32 src_qp;
+	u16 dlid, pkey;
+	int mgmt_pkey_idx = -1;
+	struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+	struct hfi1_ib_header *hdr = packet->hdr;
+	u32 rcv_flags = packet->rcv_flags;
+	void *data = packet->ebuf;
+	u32 tlen = packet->tlen;
+	struct hfi1_qp *qp = packet->qp;
+	bool has_grh = rcv_flags & HFI1_HAS_GRH;
+	bool sc4_bit = has_sc4_bit(packet);
+	u8 sc;
+	u32 bth1;
+	int is_mcast;
+	struct ib_grh *grh = NULL;
+
+	qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+	src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
+	dlid = be16_to_cpu(hdr->lrh[1]);
+	is_mcast = (dlid > HFI1_MULTICAST_LID_BASE) &&
+			(dlid != HFI1_PERMISSIVE_LID);
+	bth1 = be32_to_cpu(ohdr->bth[1]);
+	if (unlikely(bth1 & HFI1_BECN_SMASK)) {
+		/*
+		 * In pre-B0 h/w the CNP_OPCODE is handled via an
+		 * error path (errata 291394).
+		 */
+		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+		u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+		u8 sl, sc5;
+
+		sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+		sc5 |= sc4_bit;
+		sl = ibp->sc_to_sl[sc5];
+
+		process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
+	}
+
+	/*
+	 * The opcode is in the low byte when its in network order
+	 * (top byte when in host order).
+	 */
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	opcode &= 0xff;
+
+	pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+	if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
+		u16 slid = be16_to_cpu(hdr->lrh[3]);
+		u8 sc5;
+
+		sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+		sc5 |= sc4_bit;
+
+		return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
+	}
+	/*
+	 * Get the number of bytes the message was padded by
+	 * and drop incomplete packets.
+	 */
+	pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+	if (unlikely(tlen < (hdrsize + pad + 4)))
+		goto drop;
+
+	tlen -= hdrsize + pad + 4;
+
+	/*
+	 * Check that the permissive LID is only used on QP0
+	 * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+	 */
+	if (qp->ibqp.qp_num) {
+		if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+			     hdr->lrh[3] == IB_LID_PERMISSIVE))
+			goto drop;
+		if (qp->ibqp.qp_num > 1) {
+			struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+			u16 slid;
+			u8 sc5;
+
+			sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+			sc5 |= sc4_bit;
+
+			slid = be16_to_cpu(hdr->lrh[3]);
+			if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
+				/*
+				 * Traps will not be sent for packets dropped
+				 * by the HW. This is fine, as sending trap
+				 * for invalid pkeys is optional according to
+				 * IB spec (release 1.3, section 10.9.4)
+				 */
+				hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+					       pkey,
+					       (be16_to_cpu(hdr->lrh[0]) >> 4) &
+						0xF,
+					       src_qp, qp->ibqp.qp_num,
+					       hdr->lrh[3], hdr->lrh[1]);
+				return;
+			}
+		} else {
+			/* GSI packet */
+			mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+			if (mgmt_pkey_idx < 0)
+				goto drop;
+
+		}
+		if (unlikely(qkey != qp->qkey)) {
+			hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+				       (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+				       src_qp, qp->ibqp.qp_num,
+				       hdr->lrh[3], hdr->lrh[1]);
+			return;
+		}
+		/* Drop invalid MAD packets (see 13.5.3.1). */
+		if (unlikely(qp->ibqp.qp_num == 1 &&
+			     (tlen > 2048 ||
+			      (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+			goto drop;
+	} else {
+		/* Received on QP0, and so by definition, this is an SMP */
+		struct opa_smp *smp = (struct opa_smp *)data;
+		u16 slid = be16_to_cpu(hdr->lrh[3]);
+		u8 sc5;
+
+		sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+		sc5 |= sc4_bit;
+
+		if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
+			goto drop;
+
+		if (tlen > 2048)
+			goto drop;
+		if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
+		     hdr->lrh[3] == IB_LID_PERMISSIVE) &&
+		    smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+			goto drop;
+
+		/* look up SMI pkey */
+		mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+		if (mgmt_pkey_idx < 0)
+			goto drop;
+
+	}
+
+	if (qp->ibqp.qp_num > 1 &&
+	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+		wc.ex.imm_data = ohdr->u.ud.imm_data;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		tlen -= sizeof(u32);
+	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+		wc.ex.imm_data = 0;
+		wc.wc_flags = 0;
+	} else
+		goto drop;
+
+	/*
+	 * A GRH is expected to precede the data even if not
+	 * present on the wire.
+	 */
+	wc.byte_len = tlen + sizeof(struct ib_grh);
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 */
+	if (qp->r_flags & HFI1_R_REUSE_SGE)
+		qp->r_flags &= ~HFI1_R_REUSE_SGE;
+	else {
+		int ret;
+
+		ret = hfi1_get_rwqe(qp, 0);
+		if (ret < 0) {
+			hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+			return;
+		}
+		if (!ret) {
+			if (qp->ibqp.qp_num == 0)
+				ibp->n_vl15_dropped++;
+			return;
+		}
+	}
+	/* Silently drop packets which are too big. */
+	if (unlikely(wc.byte_len > qp->r_len)) {
+		qp->r_flags |= HFI1_R_REUSE_SGE;
+		goto drop;
+	}
+	if (has_grh) {
+		hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+			      sizeof(struct ib_grh), 1);
+		wc.wc_flags |= IB_WC_GRH;
+	} else
+		hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+	hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1);
+	hfi1_put_ss(&qp->r_sge);
+	if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+		return;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.vendor_err = 0;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = src_qp;
+
+	if (qp->ibqp.qp_type == IB_QPT_GSI ||
+	    qp->ibqp.qp_type == IB_QPT_SMI) {
+		if (mgmt_pkey_idx < 0) {
+			if (net_ratelimit()) {
+				struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+				struct hfi1_devdata *dd = ppd->dd;
+
+				dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
+					   qp->ibqp.qp_type);
+				mgmt_pkey_idx = 0;
+			}
+		}
+		wc.pkey_index = (unsigned)mgmt_pkey_idx;
+	} else
+		wc.pkey_index = 0;
+
+	wc.slid = be16_to_cpu(hdr->lrh[3]);
+	sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+	sc |= sc4_bit;
+	wc.sl = ibp->sc_to_sl[sc];
+
+	/*
+	 * Save the LMC lower bits if the destination LID is a unicast LID.
+	 */
+	wc.dlid_path_bits = dlid >= HFI1_MULTICAST_LID_BASE ? 0 :
+		dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
+	wc.port_num = qp->port_num;
+	/* Signal completion event if the solicited bit is set. */
+	hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		      (ohdr->bth[0] &
+			cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+	return;
+
+drop:
+	ibp->n_pkt_drops++;
+}
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c
new file mode 100644
index 0000000..9071afb
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_pages.c
@@ -0,0 +1,156 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+
+#include "hfi.h"
+
+static void __hfi1_release_user_pages(struct page **p, size_t num_pages,
+				      int dirty)
+{
+	size_t i;
+
+	for (i = 0; i < num_pages; i++) {
+		if (dirty)
+			set_page_dirty_lock(p[i]);
+		put_page(p[i]);
+	}
+}
+
+/*
+ * Call with current->mm->mmap_sem held.
+ */
+static int __hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
+				 struct page **p)
+{
+	unsigned long lock_limit;
+	size_t got;
+	int ret;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	for (got = 0; got < num_pages; got += ret) {
+		ret = get_user_pages(current, current->mm,
+				     start_page + got * PAGE_SIZE,
+				     num_pages - got, 1, 1,
+				     p + got, NULL);
+		if (ret < 0)
+			goto bail_release;
+	}
+
+	current->mm->pinned_vm += num_pages;
+
+	ret = 0;
+	goto bail;
+
+bail_release:
+	__hfi1_release_user_pages(p, got, 0);
+bail:
+	return ret;
+}
+
+/**
+ * hfi1_map_page - a safety wrapper around pci_map_page()
+ *
+ */
+dma_addr_t hfi1_map_page(struct pci_dev *hwdev, struct page *page,
+			 unsigned long offset, size_t size, int direction)
+{
+	dma_addr_t phys;
+
+	phys = pci_map_page(hwdev, page, offset, size, direction);
+
+	return phys;
+}
+
+/**
+ * hfi1_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages.  For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
+			struct page **p)
+{
+	int ret;
+
+	down_write(&current->mm->mmap_sem);
+
+	ret = __hfi1_get_user_pages(start_page, num_pages, p);
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+void hfi1_release_user_pages(struct page **p, size_t num_pages)
+{
+	if (current->mm) /* during close after signal, mm can be NULL */
+		down_write(&current->mm->mmap_sem);
+
+	__hfi1_release_user_pages(p, num_pages, 1);
+
+	if (current->mm) {
+		current->mm->pinned_vm -= num_pages;
+		up_write(&current->mm->mmap_sem);
+	}
+}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
new file mode 100644
index 0000000..5552661
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -0,0 +1,1444 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "sdma.h"
+#include "user_sdma.h"
+#include "sdma.h"
+#include "verbs.h"  /* for the headers */
+#include "common.h" /* for struct hfi1_tid_info */
+#include "trace.h"
+
+static uint hfi1_sdma_comp_ring_size = 128;
+module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
+
+/* The maximum number of Data io vectors per message/request */
+#define MAX_VECTORS_PER_REQ 8
+/*
+ * Maximum number of packet to send from each message/request
+ * before moving to the next one.
+ */
+#define MAX_PKTS_PER_QUEUE 16
+
+#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
+
+#define req_opcode(x) \
+	(((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_version(x) \
+	(((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_iovcnt(x) \
+	(((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
+
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define BTH_SEQ_MASK 0x7ffull
+
+/*
+ * Define fields in the KDETH header so we can update the header
+ * template.
+ */
+#define KDETH_OFFSET_SHIFT        0
+#define KDETH_OFFSET_MASK         0x7fff
+#define KDETH_OM_SHIFT            15
+#define KDETH_OM_MASK             0x1
+#define KDETH_TID_SHIFT           16
+#define KDETH_TID_MASK            0x3ff
+#define KDETH_TIDCTRL_SHIFT       26
+#define KDETH_TIDCTRL_MASK        0x3
+#define KDETH_INTR_SHIFT          28
+#define KDETH_INTR_MASK           0x1
+#define KDETH_SH_SHIFT            29
+#define KDETH_SH_MASK             0x1
+#define KDETH_HCRC_UPPER_SHIFT    16
+#define KDETH_HCRC_UPPER_MASK     0xff
+#define KDETH_HCRC_LOWER_SHIFT    24
+#define KDETH_HCRC_LOWER_MASK     0xff
+
+#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
+#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
+
+#define KDETH_GET(val, field)						\
+	(((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
+#define KDETH_SET(dw, field, val) do {					\
+		u32 dwval = le32_to_cpu(dw);				\
+		dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
+		dwval |= (((val) & KDETH_##field##_MASK) << \
+			  KDETH_##field##_SHIFT);			\
+		dw = cpu_to_le32(dwval);				\
+	} while (0)
+
+#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)			\
+	do {								\
+		if ((idx) < ARRAY_SIZE((arr)))				\
+			(arr)[(idx++)] = sdma_build_ahg_descriptor(	\
+				(__force u16)(value), (dw), (bit),	\
+							(width));	\
+		else							\
+			return -ERANGE;					\
+	} while (0)
+
+/* KDETH OM multipliers and switch over point */
+#define KDETH_OM_SMALL     4
+#define KDETH_OM_LARGE     64
+#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
+
+/* Last packet in the request */
+#define USER_SDMA_TXREQ_FLAGS_LAST_PKT   (1 << 0)
+
+#define SDMA_REQ_IN_USE     0
+#define SDMA_REQ_FOR_THREAD 1
+#define SDMA_REQ_SEND_DONE  2
+#define SDMA_REQ_HAVE_AHG   3
+#define SDMA_REQ_HAS_ERROR  4
+#define SDMA_REQ_DONE_ERROR 5
+
+#define SDMA_PKT_Q_INACTIVE (1 << 0)
+#define SDMA_PKT_Q_ACTIVE   (1 << 1)
+#define SDMA_PKT_Q_DEFERRED (1 << 2)
+
+/*
+ * Maximum retry attempts to submit a TX request
+ * before putting the process to sleep.
+ */
+#define MAX_DEFER_RETRY_COUNT 1
+
+static unsigned initial_pkt_count = 8;
+
+#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
+
+struct user_sdma_iovec {
+	struct iovec iov;
+	/* number of pages in this vector */
+	unsigned npages;
+	/* array of pinned pages for this vector */
+	struct page **pages;
+	/* offset into the virtual address space of the vector at
+	 * which we last left off. */
+	u64 offset;
+};
+
+struct user_sdma_request {
+	struct sdma_req_info info;
+	struct hfi1_user_sdma_pkt_q *pq;
+	struct hfi1_user_sdma_comp_q *cq;
+	/* This is the original header from user space */
+	struct hfi1_pkt_header hdr;
+	/*
+	 * Pointer to the SDMA engine for this request.
+	 * Since different request could be on different VLs,
+	 * each request will need it's own engine pointer.
+	 */
+	struct sdma_engine *sde;
+	u8 ahg_idx;
+	u32 ahg[9];
+	/*
+	 * KDETH.Offset (Eager) field
+	 * We need to remember the initial value so the headers
+	 * can be updated properly.
+	 */
+	u32 koffset;
+	/*
+	 * KDETH.OFFSET (TID) field
+	 * The offset can cover multiple packets, depending on the
+	 * size of the TID entry.
+	 */
+	u32 tidoffset;
+	/*
+	 * KDETH.OM
+	 * Remember this because the header template always sets it
+	 * to 0.
+	 */
+	u8 omfactor;
+	/*
+	 * pointer to the user's task_struct. We are going to
+	 * get a reference to it so we can process io vectors
+	 * at a later time.
+	 */
+	struct task_struct *user_proc;
+	/*
+	 * pointer to the user's mm_struct. We are going to
+	 * get a reference to it so it doesn't get freed
+	 * since we might not be in process context when we
+	 * are processing the iov's.
+	 * Using this mm_struct, we can get vma based on the
+	 * iov's address (find_vma()).
+	 */
+	struct mm_struct *user_mm;
+	/*
+	 * We copy the iovs for this request (based on
+	 * info.iovcnt). These are only the data vectors
+	 */
+	unsigned data_iovs;
+	/* total length of the data in the request */
+	u32 data_len;
+	/* progress index moving along the iovs array */
+	unsigned iov_idx;
+	struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
+	/* number of elements copied to the tids array */
+	u16 n_tids;
+	/* TID array values copied from the tid_iov vector */
+	u32 *tids;
+	u16 tididx;
+	u32 sent;
+	u64 seqnum;
+	spinlock_t list_lock;
+	struct list_head txps;
+	unsigned long flags;
+};
+
+struct user_sdma_txreq {
+	/* Packet header for the txreq */
+	struct hfi1_pkt_header hdr;
+	struct sdma_txreq txreq;
+	struct user_sdma_request *req;
+	struct user_sdma_iovec *iovec1;
+	struct user_sdma_iovec *iovec2;
+	u16 flags;
+	unsigned busycount;
+	u64 seqnum;
+};
+
+#define SDMA_DBG(req, fmt, ...)				     \
+	hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
+		 (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
+		 ##__VA_ARGS__)
+#define SDMA_Q_DBG(pq, fmt, ...)			 \
+	hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
+		 (pq)->subctxt, ##__VA_ARGS__)
+
+static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
+static int num_user_pages(const struct iovec *);
+static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
+static void user_sdma_free_request(struct user_sdma_request *);
+static int pin_vector_pages(struct user_sdma_request *,
+			    struct user_sdma_iovec *);
+static void unpin_vector_pages(struct user_sdma_iovec *);
+static int check_header_template(struct user_sdma_request *,
+				 struct hfi1_pkt_header *, u32, u32);
+static int set_txreq_header(struct user_sdma_request *,
+			    struct user_sdma_txreq *, u32);
+static int set_txreq_header_ahg(struct user_sdma_request *,
+				struct user_sdma_txreq *, u32);
+static inline void set_comp_state(struct user_sdma_request *,
+					enum hfi1_sdma_comp_state, int);
+static inline u32 set_pkt_bth_psn(__be32, u8, u32);
+static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
+
+static int defer_packet_queue(
+	struct sdma_engine *,
+	struct iowait *,
+	struct sdma_txreq *,
+	unsigned seq);
+static void activate_packet_queue(struct iowait *, int);
+
+static inline int iovec_may_free(struct user_sdma_iovec *iovec,
+				       void (*free)(struct user_sdma_iovec *))
+{
+	if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+		free(iovec);
+		return 1;
+	}
+	return 0;
+}
+
+static inline void iovec_set_complete(struct user_sdma_iovec *iovec)
+{
+	iovec->offset = iovec->iov.iov_len;
+}
+
+static int defer_packet_queue(
+	struct sdma_engine *sde,
+	struct iowait *wait,
+	struct sdma_txreq *txreq,
+	unsigned seq)
+{
+	struct hfi1_user_sdma_pkt_q *pq =
+		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
+	struct user_sdma_txreq *tx =
+		container_of(txreq, struct user_sdma_txreq, txreq);
+
+	if (sdma_progress(sde, seq, txreq)) {
+		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
+			goto eagain;
+	}
+	/*
+	 * We are assuming that if the list is enqueued somewhere, it
+	 * is to the dmawait list since that is the only place where
+	 * it is supposed to be enqueued.
+	 */
+	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
+	write_seqlock(&dev->iowait_lock);
+	if (list_empty(&pq->busy.list))
+		list_add_tail(&pq->busy.list, &sde->dmawait);
+	write_sequnlock(&dev->iowait_lock);
+	return -EBUSY;
+eagain:
+	return -EAGAIN;
+}
+
+static void activate_packet_queue(struct iowait *wait, int reason)
+{
+	struct hfi1_user_sdma_pkt_q *pq =
+		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+	wake_up(&wait->wait_dma);
+};
+
+static void sdma_kmem_cache_ctor(void *obj)
+{
+	struct user_sdma_txreq *tx = (struct user_sdma_txreq *)obj;
+
+	memset(tx, 0, sizeof(*tx));
+}
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
+{
+	int ret = 0;
+	unsigned memsize;
+	char buf[64];
+	struct hfi1_devdata *dd;
+	struct hfi1_user_sdma_comp_q *cq;
+	struct hfi1_user_sdma_pkt_q *pq;
+	unsigned long flags;
+
+	if (!uctxt || !fp) {
+		ret = -EBADF;
+		goto done;
+	}
+
+	if (!hfi1_sdma_comp_ring_size) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	dd = uctxt->dd;
+
+	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
+	if (!pq) {
+		dd_dev_err(dd,
+			   "[%u:%u] Failed to allocate SDMA request struct\n",
+			   uctxt->ctxt, subctxt_fp(fp));
+		goto pq_nomem;
+	}
+	memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
+	pq->reqs = kmalloc(memsize, GFP_KERNEL);
+	if (!pq->reqs) {
+		dd_dev_err(dd,
+			   "[%u:%u] Failed to allocate SDMA request queue (%u)\n",
+			   uctxt->ctxt, subctxt_fp(fp), memsize);
+		goto pq_reqs_nomem;
+	}
+	INIT_LIST_HEAD(&pq->list);
+	pq->dd = dd;
+	pq->ctxt = uctxt->ctxt;
+	pq->subctxt = subctxt_fp(fp);
+	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
+	pq->state = SDMA_PKT_Q_INACTIVE;
+	atomic_set(&pq->n_reqs, 0);
+
+	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
+		    activate_packet_queue);
+	pq->reqidx = 0;
+	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
+		 subctxt_fp(fp));
+	pq->txreq_cache = kmem_cache_create(buf,
+			       sizeof(struct user_sdma_txreq),
+					    L1_CACHE_BYTES,
+					    SLAB_HWCACHE_ALIGN,
+					    sdma_kmem_cache_ctor);
+	if (!pq->txreq_cache) {
+		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
+			   uctxt->ctxt);
+		goto pq_txreq_nomem;
+	}
+	user_sdma_pkt_fp(fp) = pq;
+	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq) {
+		dd_dev_err(dd,
+			   "[%u:%u] Failed to allocate SDMA completion queue\n",
+			   uctxt->ctxt, subctxt_fp(fp));
+		goto cq_nomem;
+	}
+
+	memsize = ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size,
+			PAGE_SIZE);
+	cq->comps = vmalloc_user(memsize);
+	if (!cq->comps) {
+		dd_dev_err(dd,
+		      "[%u:%u] Failed to allocate SDMA completion queue entries\n",
+		      uctxt->ctxt, subctxt_fp(fp));
+		goto cq_comps_nomem;
+	}
+	cq->nentries = hfi1_sdma_comp_ring_size;
+	user_sdma_comp_fp(fp) = cq;
+
+	spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+	list_add(&pq->list, &uctxt->sdma_queues);
+	spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+	goto done;
+
+cq_comps_nomem:
+	kfree(cq);
+cq_nomem:
+	kmem_cache_destroy(pq->txreq_cache);
+pq_txreq_nomem:
+	kfree(pq->reqs);
+pq_reqs_nomem:
+	kfree(pq);
+	user_sdma_pkt_fp(fp) = NULL;
+pq_nomem:
+	ret = -ENOMEM;
+done:
+	return ret;
+}
+
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
+{
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_user_sdma_pkt_q *pq;
+	unsigned long flags;
+
+	hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
+		  uctxt->ctxt, fd->subctxt);
+	pq = fd->pq;
+	if (pq) {
+		u16 i, j;
+
+		spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+		if (!list_empty(&pq->list))
+			list_del_init(&pq->list);
+		spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+		iowait_sdma_drain(&pq->busy);
+		if (pq->reqs) {
+			for (i = 0, j = 0; i < atomic_read(&pq->n_reqs) &&
+				     j < pq->n_max_reqs; j++) {
+				struct user_sdma_request *req = &pq->reqs[j];
+
+				if (test_bit(SDMA_REQ_IN_USE, &req->flags)) {
+					set_comp_state(req, ERROR, -ECOMM);
+					user_sdma_free_request(req);
+					i++;
+				}
+			}
+			kfree(pq->reqs);
+		}
+		if (pq->txreq_cache)
+			kmem_cache_destroy(pq->txreq_cache);
+		kfree(pq);
+		fd->pq = NULL;
+	}
+	if (fd->cq) {
+		if (fd->cq->comps)
+			vfree(fd->cq->comps);
+		kfree(fd->cq);
+		fd->cq = NULL;
+	}
+	return 0;
+}
+
+int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
+				   unsigned long dim, unsigned long *count)
+{
+	int ret = 0, i = 0, sent;
+	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_user_sdma_pkt_q *pq = user_sdma_pkt_fp(fp);
+	struct hfi1_user_sdma_comp_q *cq = user_sdma_comp_fp(fp);
+	struct hfi1_devdata *dd = pq->dd;
+	unsigned long idx = 0;
+	u8 pcount = initial_pkt_count;
+	struct sdma_req_info info;
+	struct user_sdma_request *req;
+	u8 opcode, sc, vl;
+
+	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
+		hfi1_cdbg(
+		   SDMA,
+		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
+		   dd->unit, uctxt->ctxt, subctxt_fp(fp),
+		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
+		ret = -EINVAL;
+		goto done;
+	}
+	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
+	if (ret) {
+		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
+			  dd->unit, uctxt->ctxt, subctxt_fp(fp), ret);
+		ret = -EFAULT;
+		goto done;
+	}
+	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, subctxt_fp(fp),
+				     (u16 *)&info);
+	if (cq->comps[info.comp_idx].status == QUEUED) {
+		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
+			  dd->unit, uctxt->ctxt, subctxt_fp(fp),
+			  info.comp_idx);
+		ret = -EBADSLT;
+		goto done;
+	}
+	if (!info.fragsize) {
+		hfi1_cdbg(SDMA,
+			  "[%u:%u:%u:%u] Request does not specify fragsize",
+			  dd->unit, uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
+		ret = -EINVAL;
+		goto done;
+	}
+	/*
+	 * We've done all the safety checks that we can up to this point,
+	 * "allocate" the request entry.
+	 */
+	hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
+		  uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
+	req = pq->reqs + info.comp_idx;
+	memset(req, 0, sizeof(*req));
+	/* Mark the request as IN_USE before we start filling it in. */
+	set_bit(SDMA_REQ_IN_USE, &req->flags);
+	req->data_iovs = req_iovcnt(info.ctrl) - 1;
+	req->pq = pq;
+	req->cq = cq;
+	INIT_LIST_HEAD(&req->txps);
+	spin_lock_init(&req->list_lock);
+	memcpy(&req->info, &info, sizeof(info));
+
+	if (req_opcode(info.ctrl) == EXPECTED)
+		req->data_iovs--;
+
+	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
+		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
+			 MAX_VECTORS_PER_REQ);
+		ret = -EINVAL;
+		goto done;
+	}
+	/* Copy the header from the user buffer */
+	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
+			     sizeof(req->hdr));
+	if (ret) {
+		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
+		ret = -EFAULT;
+		goto free_req;
+	}
+
+	/* If Static rate control is not enabled, sanitize the header. */
+	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
+		req->hdr.pbc[2] = 0;
+
+	/* Validate the opcode. Do not trust packets from user space blindly. */
+	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
+	if ((opcode & USER_OPCODE_CHECK_MASK) !=
+	     USER_OPCODE_CHECK_VAL) {
+		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
+		ret = -EINVAL;
+		goto free_req;
+	}
+	/*
+	 * Validate the vl. Do not trust packets from user space blindly.
+	 * VL comes from PBC, SC comes from LRH, and the VL needs to
+	 * match the SC look up.
+	 */
+	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
+	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
+	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
+	if (vl >= dd->pport->vls_operational ||
+	    vl != sc_to_vlt(dd, sc)) {
+		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
+		ret = -EINVAL;
+		goto free_req;
+	}
+
+	/*
+	 * Also should check the BTH.lnh. If it says the next header is GRH then
+	 * the RXE parsing will be off and will land in the middle of the KDETH
+	 * or miss it entirely.
+	 */
+	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
+		SDMA_DBG(req, "User tried to pass in a GRH");
+		ret = -EINVAL;
+		goto free_req;
+	}
+
+	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
+	/* Calculate the initial TID offset based on the values of
+	   KDETH.OFFSET and KDETH.OM that are passed in. */
+	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
+		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+		 KDETH_OM_LARGE : KDETH_OM_SMALL);
+	SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
+	idx++;
+
+	/* Save all the IO vector structures */
+	while (i < req->data_iovs) {
+		memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
+		req->iovs[i].offset = 0;
+		req->data_len += req->iovs[i++].iov.iov_len;
+	}
+	SDMA_DBG(req, "total data length %u", req->data_len);
+
+	if (pcount > req->info.npkts)
+		pcount = req->info.npkts;
+	/*
+	 * Copy any TID info
+	 * User space will provide the TID info only when the
+	 * request type is EXPECTED. This is true even if there is
+	 * only one packet in the request and the header is already
+	 * setup. The reason for the singular TID case is that the
+	 * driver needs to perform safety checks.
+	 */
+	if (req_opcode(req->info.ctrl) == EXPECTED) {
+		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
+
+		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
+			ret = -EINVAL;
+			goto free_req;
+		}
+		req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
+		if (!req->tids) {
+			ret = -ENOMEM;
+			goto free_req;
+		}
+		/*
+		 * We have to copy all of the tids because they may vary
+		 * in size and, therefore, the TID count might not be
+		 * equal to the pkt count. However, there is no way to
+		 * tell at this point.
+		 */
+		ret = copy_from_user(req->tids, iovec[idx].iov_base,
+				     ntids * sizeof(*req->tids));
+		if (ret) {
+			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
+				 ntids, ret);
+			ret = -EFAULT;
+			goto free_req;
+		}
+		req->n_tids = ntids;
+		idx++;
+	}
+
+	/* Have to select the engine */
+	req->sde = sdma_select_engine_vl(dd,
+					 (u32)(uctxt->ctxt + subctxt_fp(fp)),
+					 vl);
+	if (!req->sde || !sdma_running(req->sde)) {
+		ret = -ECOMM;
+		goto free_req;
+	}
+
+	/* We don't need an AHG entry if the request contains only one packet */
+	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
+		int ahg = sdma_ahg_alloc(req->sde);
+
+		if (likely(ahg >= 0)) {
+			req->ahg_idx = (u8)ahg;
+			set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
+		}
+	}
+
+	set_comp_state(req, QUEUED, 0);
+	/* Send the first N packets in the request to buy us some time */
+	sent = user_sdma_send_pkts(req, pcount);
+	if (unlikely(sent < 0)) {
+		if (sent != -EBUSY) {
+			ret = sent;
+			goto send_err;
+		} else
+			sent = 0;
+	}
+	atomic_inc(&pq->n_reqs);
+
+	if (sent < req->info.npkts) {
+		/* Take the references to the user's task and mm_struct */
+		get_task_struct(current);
+		req->user_proc = current;
+
+		/*
+		 * This is a somewhat blocking send implementation.
+		 * The driver will block the caller until all packets of the
+		 * request have been submitted to the SDMA engine. However, it
+		 * will not wait for send completions.
+		 */
+		while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+			ret = user_sdma_send_pkts(req, pcount);
+			if (ret < 0) {
+				if (ret != -EBUSY)
+					goto send_err;
+				wait_event_interruptible_timeout(
+					pq->busy.wait_dma,
+					(pq->state == SDMA_PKT_Q_ACTIVE),
+					msecs_to_jiffies(
+						SDMA_IOWAIT_TIMEOUT));
+			}
+		}
+
+	}
+	ret = 0;
+	*count += idx;
+	goto done;
+send_err:
+	set_comp_state(req, ERROR, ret);
+free_req:
+	user_sdma_free_request(req);
+done:
+	return ret;
+}
+
+static inline u32 compute_data_length(struct user_sdma_request *req,
+					    struct user_sdma_txreq *tx)
+{
+	/*
+	 * Determine the proper size of the packet data.
+	 * The size of the data of the first packet is in the header
+	 * template. However, it includes the header and ICRC, which need
+	 * to be subtracted.
+	 * The size of the remaining packets is the minimum of the frag
+	 * size (MTU) or remaining data in the request.
+	 */
+	u32 len;
+
+	if (!req->seqnum) {
+		len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
+		       (sizeof(tx->hdr) - 4));
+	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
+		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
+			PAGE_SIZE;
+		/* Get the data length based on the remaining space in the
+		 * TID pair. */
+		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
+		/* If we've filled up the TID pair, move to the next one. */
+		if (unlikely(!len) && ++req->tididx < req->n_tids &&
+		    req->tids[req->tididx]) {
+			tidlen = EXP_TID_GET(req->tids[req->tididx],
+					     LEN) * PAGE_SIZE;
+			req->tidoffset = 0;
+			len = min_t(u32, tidlen, req->info.fragsize);
+		}
+		/* Since the TID pairs map entire pages, make sure that we
+		 * are not going to try to send more data that we have
+		 * remaining. */
+		len = min(len, req->data_len - req->sent);
+	} else
+		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
+	SDMA_DBG(req, "Data Length = %u", len);
+	return len;
+}
+
+static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
+{
+	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
+	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
+}
+
+static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+{
+	int ret = 0;
+	unsigned npkts = 0;
+	struct user_sdma_txreq *tx = NULL;
+	struct hfi1_user_sdma_pkt_q *pq = NULL;
+	struct user_sdma_iovec *iovec = NULL;
+
+	if (!req->pq) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	pq = req->pq;
+
+	/*
+	 * Check if we might have sent the entire request already
+	 */
+	if (unlikely(req->seqnum == req->info.npkts)) {
+		if (!list_empty(&req->txps))
+			goto dosend;
+		goto done;
+	}
+
+	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
+		maxpkts = req->info.npkts - req->seqnum;
+
+	while (npkts < maxpkts) {
+		u32 datalen = 0, queued = 0, data_sent = 0;
+		u64 iov_offset = 0;
+
+		/*
+		 * Check whether any of the completions have come back
+		 * with errors. If so, we are not going to process any
+		 * more packets from this request.
+		 */
+		if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+			set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+			ret = -EFAULT;
+			goto done;
+		}
+
+		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
+		if (!tx) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		tx->flags = 0;
+		tx->req = req;
+		tx->busycount = 0;
+		tx->iovec1 = NULL;
+		tx->iovec2 = NULL;
+
+		if (req->seqnum == req->info.npkts - 1)
+			tx->flags |= USER_SDMA_TXREQ_FLAGS_LAST_PKT;
+
+		/*
+		 * Calculate the payload size - this is min of the fragment
+		 * (MTU) size or the remaining bytes in the request but only
+		 * if we have payload data.
+		 */
+		if (req->data_len) {
+			iovec = &req->iovs[req->iov_idx];
+			if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+				if (++req->iov_idx == req->data_iovs) {
+					ret = -EFAULT;
+					goto free_txreq;
+				}
+				iovec = &req->iovs[req->iov_idx];
+				WARN_ON(iovec->offset);
+			}
+
+			/*
+			 * This request might include only a header and no user
+			 * data, so pin pages only if there is data and it the
+			 * pages have not been pinned already.
+			 */
+			if (unlikely(!iovec->pages && iovec->iov.iov_len)) {
+				ret = pin_vector_pages(req, iovec);
+				if (ret)
+					goto free_tx;
+			}
+
+			tx->iovec1 = iovec;
+			datalen = compute_data_length(req, tx);
+			if (!datalen) {
+				SDMA_DBG(req,
+					 "Request has data but pkt len is 0");
+				ret = -EFAULT;
+				goto free_tx;
+			}
+		}
+
+		if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
+			if (!req->seqnum) {
+				u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
+				u32 lrhlen = get_lrh_len(req->hdr, datalen);
+				/*
+				 * Copy the request header into the tx header
+				 * because the HW needs a cacheline-aligned
+				 * address.
+				 * This copy can be optimized out if the hdr
+				 * member of user_sdma_request were also
+				 * cacheline aligned.
+				 */
+				memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
+				if (PBC2LRH(pbclen) != lrhlen) {
+					pbclen = (pbclen & 0xf000) |
+						LRH2PBC(lrhlen);
+					tx->hdr.pbc[0] = cpu_to_le16(pbclen);
+				}
+				ret = sdma_txinit_ahg(&tx->txreq,
+						      SDMA_TXREQ_F_AHG_COPY,
+						      sizeof(tx->hdr) + datalen,
+						      req->ahg_idx, 0, NULL, 0,
+						      user_sdma_txreq_cb);
+				if (ret)
+					goto free_tx;
+				ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
+							&tx->hdr,
+							sizeof(tx->hdr));
+				if (ret)
+					goto free_txreq;
+			} else {
+				int changes;
+
+				changes = set_txreq_header_ahg(req, tx,
+							       datalen);
+				if (changes < 0)
+					goto free_tx;
+				sdma_txinit_ahg(&tx->txreq,
+						SDMA_TXREQ_F_USE_AHG,
+						datalen, req->ahg_idx, changes,
+						req->ahg, sizeof(req->hdr),
+						user_sdma_txreq_cb);
+			}
+		} else {
+			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
+					  datalen, user_sdma_txreq_cb);
+			if (ret)
+				goto free_tx;
+			/*
+			 * Modify the header for this packet. This only needs
+			 * to be done if we are not going to use AHG. Otherwise,
+			 * the HW will do it based on the changes we gave it
+			 * during sdma_txinit_ahg().
+			 */
+			ret = set_txreq_header(req, tx, datalen);
+			if (ret)
+				goto free_txreq;
+		}
+
+		/*
+		 * If the request contains any data vectors, add up to
+		 * fragsize bytes to the descriptor.
+		 */
+		while (queued < datalen &&
+		       (req->sent + data_sent) < req->data_len) {
+			unsigned long base, offset;
+			unsigned pageidx, len;
+
+			base = (unsigned long)iovec->iov.iov_base;
+			offset = ((base + iovec->offset + iov_offset) &
+				  ~PAGE_MASK);
+			pageidx = (((iovec->offset + iov_offset +
+				     base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
+			len = offset + req->info.fragsize > PAGE_SIZE ?
+				PAGE_SIZE - offset : req->info.fragsize;
+			len = min((datalen - queued), len);
+			ret = sdma_txadd_page(pq->dd, &tx->txreq,
+					      iovec->pages[pageidx],
+					      offset, len);
+			if (ret) {
+				dd_dev_err(pq->dd,
+					   "SDMA txreq add page failed %d\n",
+					   ret);
+				iovec_set_complete(iovec);
+				goto free_txreq;
+			}
+			iov_offset += len;
+			queued += len;
+			data_sent += len;
+			if (unlikely(queued < datalen &&
+				     pageidx == iovec->npages &&
+				     req->iov_idx < req->data_iovs - 1)) {
+				iovec->offset += iov_offset;
+				iovec = &req->iovs[++req->iov_idx];
+				if (!iovec->pages) {
+					ret = pin_vector_pages(req, iovec);
+					if (ret)
+						goto free_txreq;
+				}
+				iov_offset = 0;
+				tx->iovec2 = iovec;
+
+			}
+		}
+		/*
+		 * The txreq was submitted successfully so we can update
+		 * the counters.
+		 */
+		req->koffset += datalen;
+		if (req_opcode(req->info.ctrl) == EXPECTED)
+			req->tidoffset += datalen;
+		req->sent += data_sent;
+		if (req->data_len) {
+			if (tx->iovec1 && !tx->iovec2)
+				tx->iovec1->offset += iov_offset;
+			else if (tx->iovec2)
+				tx->iovec2->offset += iov_offset;
+		}
+		/*
+		 * It is important to increment this here as it is used to
+		 * generate the BTH.PSN and, therefore, can't be bulk-updated
+		 * outside of the loop.
+		 */
+		tx->seqnum = req->seqnum++;
+		list_add_tail(&tx->txreq.list, &req->txps);
+		npkts++;
+	}
+dosend:
+	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
+	if (list_empty(&req->txps))
+		if (req->seqnum == req->info.npkts) {
+			set_bit(SDMA_REQ_SEND_DONE, &req->flags);
+			/*
+			 * The txreq has already been submitted to the HW queue
+			 * so we can free the AHG entry now. Corruption will not
+			 * happen due to the sequential manner in which
+			 * descriptors are processed.
+			 */
+			if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
+				sdma_ahg_free(req->sde, req->ahg_idx);
+		}
+	goto done;
+free_txreq:
+	sdma_txclean(pq->dd, &tx->txreq);
+free_tx:
+	kmem_cache_free(pq->txreq_cache, tx);
+done:
+	return ret;
+}
+
+/*
+ * How many pages in this iovec element?
+ */
+static inline int num_user_pages(const struct iovec *iov)
+{
+	const unsigned long addr  = (unsigned long) iov->iov_base;
+	const unsigned long len   = iov->iov_len;
+	const unsigned long spage = addr & PAGE_MASK;
+	const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+	return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+static int pin_vector_pages(struct user_sdma_request *req,
+			    struct user_sdma_iovec *iovec) {
+	int ret = 0;
+	unsigned pinned;
+
+	iovec->npages = num_user_pages(&iovec->iov);
+	iovec->pages = kzalloc(sizeof(*iovec->pages) *
+			       iovec->npages, GFP_KERNEL);
+	if (!iovec->pages) {
+		SDMA_DBG(req, "Failed page array alloc");
+		ret = -ENOMEM;
+		goto done;
+	}
+	/* If called by the kernel thread, use the user's mm */
+	if (current->flags & PF_KTHREAD)
+		use_mm(req->user_proc->mm);
+	pinned = get_user_pages_fast(
+		(unsigned long)iovec->iov.iov_base,
+		iovec->npages, 0, iovec->pages);
+	/* If called by the kernel thread, unuse the user's mm */
+	if (current->flags & PF_KTHREAD)
+		unuse_mm(req->user_proc->mm);
+	if (pinned != iovec->npages) {
+		SDMA_DBG(req, "Failed to pin pages (%u/%u)", pinned,
+			 iovec->npages);
+		ret = -EFAULT;
+		goto pfree;
+	}
+	goto done;
+pfree:
+	unpin_vector_pages(iovec);
+done:
+	return ret;
+}
+
+static void unpin_vector_pages(struct user_sdma_iovec *iovec)
+{
+	unsigned i;
+
+	if (ACCESS_ONCE(iovec->offset) != iovec->iov.iov_len) {
+		hfi1_cdbg(SDMA,
+			  "the complete vector has not been sent yet %llu %zu",
+			  iovec->offset, iovec->iov.iov_len);
+		return;
+	}
+	for (i = 0; i < iovec->npages; i++)
+		if (iovec->pages[i])
+			put_page(iovec->pages[i]);
+	kfree(iovec->pages);
+	iovec->pages = NULL;
+	iovec->npages = 0;
+	iovec->offset = 0;
+}
+
+static int check_header_template(struct user_sdma_request *req,
+				 struct hfi1_pkt_header *hdr, u32 lrhlen,
+				 u32 datalen)
+{
+	/*
+	 * Perform safety checks for any type of packet:
+	 *    - transfer size is multiple of 64bytes
+	 *    - packet length is multiple of 4bytes
+	 *    - entire request length is multiple of 4bytes
+	 *    - packet length is not larger than MTU size
+	 *
+	 * These checks are only done for the first packet of the
+	 * transfer since the header is "given" to us by user space.
+	 * For the remainder of the packets we compute the values.
+	 */
+	if (req->info.fragsize % PIO_BLOCK_SIZE ||
+	    lrhlen & 0x3 || req->data_len & 0x3  ||
+	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
+		return -EINVAL;
+
+	if (req_opcode(req->info.ctrl) == EXPECTED) {
+		/*
+		 * The header is checked only on the first packet. Furthermore,
+		 * we ensure that at least one TID entry is copied when the
+		 * request is submitted. Therefore, we don't have to verify that
+		 * tididx points to something sane.
+		 */
+		u32 tidval = req->tids[req->tididx],
+			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
+			tididx = EXP_TID_GET(tidval, IDX),
+			tidctrl = EXP_TID_GET(tidval, CTRL),
+			tidoff;
+		__le32 kval = hdr->kdeth.ver_tid_offset;
+
+		tidoff = KDETH_GET(kval, OFFSET) *
+			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+			   KDETH_OM_LARGE : KDETH_OM_SMALL);
+		/*
+		 * Expected receive packets have the following
+		 * additional checks:
+		 *     - offset is not larger than the TID size
+		 *     - TIDCtrl values match between header and TID array
+		 *     - TID indexes match between header and TID array
+		 */
+		if ((tidoff + datalen > tidlen) ||
+		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
+		    KDETH_GET(kval, TID) != tididx)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Correctly set the BTH.PSN field based on type of
+ * transfer - eager packets can just increment the PSN but
+ * expected packets encode generation and sequence in the
+ * BTH.PSN field so just incrementing will result in errors.
+ */
+static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
+{
+	u32 val = be32_to_cpu(bthpsn),
+		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
+			0xffffffull),
+		psn = val & mask;
+	if (expct)
+		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+	else
+		psn = psn + frags;
+	return psn & mask;
+}
+
+static int set_txreq_header(struct user_sdma_request *req,
+			    struct user_sdma_txreq *tx, u32 datalen)
+{
+	struct hfi1_user_sdma_pkt_q *pq = req->pq;
+	struct hfi1_pkt_header *hdr = &tx->hdr;
+	u16 pbclen;
+	int ret;
+	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
+
+	/* Copy the header template to the request before modification */
+	memcpy(hdr, &req->hdr, sizeof(*hdr));
+
+	/*
+	 * Check if the PBC and LRH length are mismatched. If so
+	 * adjust both in the header.
+	 */
+	pbclen = le16_to_cpu(hdr->pbc[0]);
+	if (PBC2LRH(pbclen) != lrhlen) {
+		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
+		hdr->pbc[0] = cpu_to_le16(pbclen);
+		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
+		/*
+		 * Third packet
+		 * This is the first packet in the sequence that has
+		 * a "static" size that can be used for the rest of
+		 * the packets (besides the last one).
+		 */
+		if (unlikely(req->seqnum == 2)) {
+			/*
+			 * From this point on the lengths in both the
+			 * PBC and LRH are the same until the last
+			 * packet.
+			 * Adjust the template so we don't have to update
+			 * every packet
+			 */
+			req->hdr.pbc[0] = hdr->pbc[0];
+			req->hdr.lrh[2] = hdr->lrh[2];
+		}
+	}
+	/*
+	 * We only have to modify the header if this is not the
+	 * first packet in the request. Otherwise, we use the
+	 * header given to us.
+	 */
+	if (unlikely(!req->seqnum)) {
+		ret = check_header_template(req, hdr, lrhlen, datalen);
+		if (ret)
+			return ret;
+		goto done;
+
+	}
+
+	hdr->bth[2] = cpu_to_be32(
+		set_pkt_bth_psn(hdr->bth[2],
+				(req_opcode(req->info.ctrl) == EXPECTED),
+				req->seqnum));
+
+	/* Set ACK request on last packet */
+	if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+		hdr->bth[2] |= cpu_to_be32(1UL<<31);
+
+	/* Set the new offset */
+	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
+	/* Expected packets have to fill in the new TID information */
+	if (req_opcode(req->info.ctrl) == EXPECTED) {
+		tidval = req->tids[req->tididx];
+		/*
+		 * If the offset puts us at the end of the current TID,
+		 * advance everything.
+		 */
+		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+					 PAGE_SIZE)) {
+			req->tidoffset = 0;
+			/* Since we don't copy all the TIDs, all at once,
+			 * we have to check again. */
+			if (++req->tididx > req->n_tids - 1 ||
+			    !req->tids[req->tididx]) {
+				return -EINVAL;
+			}
+			tidval = req->tids[req->tididx];
+		}
+		req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
+			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
+		/* Set KDETH.TIDCtrl based on value for this TID. */
+		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
+			  EXP_TID_GET(tidval, CTRL));
+		/* Set KDETH.TID based on value for this TID */
+		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
+			  EXP_TID_GET(tidval, IDX));
+		/* Clear KDETH.SH only on the last packet */
+		if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
+		/*
+		 * Set the KDETH.OFFSET and KDETH.OM based on size of
+		 * transfer.
+		 */
+		SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
+			 req->tidoffset, req->tidoffset / req->omfactor,
+			 !!(req->omfactor - KDETH_OM_SMALL));
+		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
+			  req->tidoffset / req->omfactor);
+		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
+			  !!(req->omfactor - KDETH_OM_SMALL));
+	}
+done:
+	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
+				    req->info.comp_idx, hdr, tidval);
+	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
+}
+
+static int set_txreq_header_ahg(struct user_sdma_request *req,
+				struct user_sdma_txreq *tx, u32 len)
+{
+	int diff = 0;
+	struct hfi1_user_sdma_pkt_q *pq = req->pq;
+	struct hfi1_pkt_header *hdr = &req->hdr;
+	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
+	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
+
+	if (PBC2LRH(pbclen) != lrhlen) {
+		/* PBC.PbcLengthDWs */
+		AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
+			       cpu_to_le16(LRH2PBC(lrhlen)));
+		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
+		AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
+			       cpu_to_be16(lrhlen >> 2));
+	}
+
+	/*
+	 * Do the common updates
+	 */
+	/* BTH.PSN and BTH.A */
+	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
+		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
+	if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+		val32 |= 1UL << 31;
+	AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
+	AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
+	/* KDETH.Offset */
+	AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
+		       cpu_to_le16(req->koffset & 0xffff));
+	AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
+		       cpu_to_le16(req->koffset >> 16));
+	if (req_opcode(req->info.ctrl) == EXPECTED) {
+		__le16 val;
+
+		tidval = req->tids[req->tididx];
+
+		/*
+		 * If the offset puts us at the end of the current TID,
+		 * advance everything.
+		 */
+		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+					 PAGE_SIZE)) {
+			req->tidoffset = 0;
+			/* Since we don't copy all the TIDs, all at once,
+			 * we have to check again. */
+			if (++req->tididx > req->n_tids - 1 ||
+			    !req->tids[req->tididx]) {
+				return -EINVAL;
+			}
+			tidval = req->tids[req->tididx];
+		}
+		req->omfactor = ((EXP_TID_GET(tidval, LEN) *
+				  PAGE_SIZE) >=
+				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
+			KDETH_OM_SMALL;
+		/* KDETH.OM and KDETH.OFFSET (TID) */
+		AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
+			       ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
+				((req->tidoffset / req->omfactor) & 0x7fff)));
+		/* KDETH.TIDCtrl, KDETH.TID */
+		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
+					(EXP_TID_GET(tidval, IDX) & 0x3ff));
+		/* Clear KDETH.SH on last packet */
+		if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) {
+			val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
+								INTR) >> 16);
+			val &= cpu_to_le16(~(1U << 13));
+			AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
+		} else
+			AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
+	}
+
+	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
+					req->info.comp_idx, req->sde->this_idx,
+					req->ahg_idx, req->ahg, diff, tidval);
+	return diff;
+}
+
+static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
+			       int drain)
+{
+	struct user_sdma_txreq *tx =
+		container_of(txreq, struct user_sdma_txreq, txreq);
+	struct user_sdma_request *req = tx->req;
+	struct hfi1_user_sdma_pkt_q *pq = req ? req->pq : NULL;
+	u64 tx_seqnum;
+
+	if (unlikely(!req || !pq))
+		return;
+
+	if (tx->iovec1)
+		iovec_may_free(tx->iovec1, unpin_vector_pages);
+	if (tx->iovec2)
+		iovec_may_free(tx->iovec2, unpin_vector_pages);
+
+	tx_seqnum = tx->seqnum;
+	kmem_cache_free(pq->txreq_cache, tx);
+
+	if (status != SDMA_TXREQ_S_OK) {
+		dd_dev_err(pq->dd, "SDMA completion with error %d", status);
+		set_comp_state(req, ERROR, status);
+		set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
+		/* Do not free the request until the sender loop has ack'ed
+		 * the error and we've seen all txreqs. */
+		if (tx_seqnum == ACCESS_ONCE(req->seqnum) &&
+		    test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) {
+			atomic_dec(&pq->n_reqs);
+			user_sdma_free_request(req);
+		}
+	} else {
+		if (tx_seqnum == req->info.npkts - 1) {
+			/* We've sent and completed all packets in this
+			 * request. Signal completion to the user */
+			atomic_dec(&pq->n_reqs);
+			set_comp_state(req, COMPLETE, 0);
+			user_sdma_free_request(req);
+		}
+	}
+	if (!atomic_read(&pq->n_reqs))
+		xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
+}
+
+static void user_sdma_free_request(struct user_sdma_request *req)
+{
+	if (!list_empty(&req->txps)) {
+		struct sdma_txreq *t, *p;
+
+		list_for_each_entry_safe(t, p, &req->txps, list) {
+			struct user_sdma_txreq *tx =
+				container_of(t, struct user_sdma_txreq, txreq);
+			list_del_init(&t->list);
+			sdma_txclean(req->pq->dd, t);
+			kmem_cache_free(req->pq->txreq_cache, tx);
+		}
+	}
+	if (req->data_iovs) {
+		int i;
+
+		for (i = 0; i < req->data_iovs; i++)
+			if (req->iovs[i].npages && req->iovs[i].pages)
+				unpin_vector_pages(&req->iovs[i]);
+	}
+	if (req->user_proc)
+		put_task_struct(req->user_proc);
+	kfree(req->tids);
+	clear_bit(SDMA_REQ_IN_USE, &req->flags);
+}
+
+static inline void set_comp_state(struct user_sdma_request *req,
+					enum hfi1_sdma_comp_state state,
+					int ret)
+{
+	SDMA_DBG(req, "Setting completion status %u %d", state, ret);
+	req->cq->comps[req->info.comp_idx].status = state;
+	if (state == ERROR)
+		req->cq->comps[req->info.comp_idx].errcode = -ret;
+	trace_hfi1_sdma_user_completion(req->pq->dd, req->pq->ctxt,
+					req->pq->subctxt, req->info.comp_idx,
+					state, ret);
+}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h
new file mode 100644
index 0000000..fa44225
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_sdma.h
@@ -0,0 +1,89 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/device.h>
+#include <linux/wait.h>
+
+#include "common.h"
+#include "iowait.h"
+
+#define EXP_TID_TIDLEN_MASK   0x7FFULL
+#define EXP_TID_TIDLEN_SHIFT  0
+#define EXP_TID_TIDCTRL_MASK  0x3ULL
+#define EXP_TID_TIDCTRL_SHIFT 20
+#define EXP_TID_TIDIDX_MASK   0x7FFULL
+#define EXP_TID_TIDIDX_SHIFT  22
+#define EXP_TID_GET(tid, field)	\
+	(((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+
+extern uint extended_psn;
+
+struct hfi1_user_sdma_pkt_q {
+	struct list_head list;
+	unsigned ctxt;
+	unsigned subctxt;
+	u16 n_max_reqs;
+	atomic_t n_reqs;
+	u16 reqidx;
+	struct hfi1_devdata *dd;
+	struct kmem_cache *txreq_cache;
+	struct user_sdma_request *reqs;
+	struct iowait busy;
+	unsigned state;
+};
+
+struct hfi1_user_sdma_comp_q {
+	u16 nentries;
+	struct hfi1_sdma_comp_entry *comps;
+};
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
+int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
+				   unsigned long *);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
new file mode 100644
index 0000000..5f4b661
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -0,0 +1,2142 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "device.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+unsigned int hfi1_lkey_table_size = 16;
+module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
+		   S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+		 "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int hfi1_max_pds = 0xFFFF;
+module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+		 "Maximum number of protection domains to support");
+
+static unsigned int hfi1_max_ahs = 0xFFFF;
+module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int hfi1_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+		 "Maximum number of completion queue entries to support");
+
+unsigned int hfi1_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int hfi1_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int hfi1_max_qps = 16384;
+module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int hfi1_max_sges = 0x60;
+module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int hfi1_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+		 "Maximum number of multicast groups to support");
+
+unsigned int hfi1_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
+		   uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+		 "Maximum number of attached QPs to support");
+
+unsigned int hfi1_max_srqs = 1024;
+module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int hfi1_max_srq_sges = 128;
+module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int hfi1_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+static void verbs_sdma_complete(
+	struct sdma_txreq *cookie,
+	int status,
+	int drained);
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; hfi1_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_hfi1_state_ops[IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = 0,
+	[IB_QPS_INIT] = HFI1_POST_RECV_OK,
+	[IB_QPS_RTR] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK,
+	[IB_QPS_RTS] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+	    HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK |
+	    HFI1_PROCESS_NEXT_SEND_OK,
+	[IB_QPS_SQD] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+	    HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK,
+	[IB_QPS_SQE] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+	    HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
+	[IB_QPS_ERR] = HFI1_POST_RECV_OK | HFI1_FLUSH_RECV |
+	    HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
+};
+
+struct hfi1_ucontext {
+	struct ib_ucontext ibucontext;
+};
+
+static inline struct hfi1_ucontext *to_iucontext(struct ib_ucontext
+						  *ibucontext)
+{
+	return container_of(ibucontext, struct hfi1_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
+	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+	[IB_WR_SEND] = IB_WC_SEND,
+	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * Length of header by opcode, 0 --> not supported
+ */
+const u8 hdr_len_by_opcode[256] = {
+	/* RC */
+	[IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
+	[IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
+	[IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
+	[IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
+	[IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
+	[IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
+	[IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
+	[IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
+	[IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
+	[IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+	[IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
+	[IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+	[IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
+	[IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
+	[IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
+	[IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
+	[IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
+	[IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
+	[IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
+	[IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
+	[IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
+	/* UC */
+	[IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
+	[IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
+	[IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
+	[IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
+	[IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
+	[IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
+	[IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
+	[IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
+	[IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
+	[IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+	[IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
+	[IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+	/* UD */
+	[IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
+	[IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
+};
+
+static const opcode_handler opcode_handler_tbl[256] = {
+	/* RC */
+	[IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
+	[IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
+	/* UC */
+	[IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
+	[IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+	/* UD */
+	[IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
+	[IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
+	/* CNP */
+	[IB_OPCODE_CNP]				      = &hfi1_cnp_rcv
+};
+
+/*
+ * System image GUID.
+ */
+__be64 ib_hfi1_sys_image_guid;
+
+/**
+ * hfi1_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void hfi1_copy_sge(
+	struct hfi1_sge_state *ss,
+	void *data, u32 length,
+	int release)
+{
+	struct hfi1_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		WARN_ON_ONCE(len == 0);
+		memcpy(sge->vaddr, data, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (release)
+				hfi1_put_mr(sge->mr);
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= HFI1_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		data += len;
+		length -= len;
+	}
+}
+
+/**
+ * hfi1_skip_sge - skip over SGE memory
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release)
+{
+	struct hfi1_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		WARN_ON_ONCE(len == 0);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (release)
+				hfi1_put_mr(sge->mr);
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= HFI1_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+}
+
+/**
+ * post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int post_one_send(struct hfi1_qp *qp, struct ib_send_wr *wr)
+{
+	struct hfi1_swqe *wqe;
+	u32 next;
+	int i;
+	int j;
+	int acc;
+	struct hfi1_lkey_table *rkt;
+	struct hfi1_pd *pd;
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	struct hfi1_pportdata *ppd;
+	struct hfi1_ibport *ibp;
+
+	/* IB spec says that num_sge == 0 is OK. */
+	if (unlikely(wr->num_sge > qp->s_max_sge))
+		return -EINVAL;
+
+	ppd = &dd->pport[qp->port_num - 1];
+	ibp = &ppd->ibport_data;
+
+	/*
+	 * Don't allow RDMA reads or atomic operations on UC or
+	 * undefined operations.
+	 * Make sure buffer is large enough to hold the result for atomics.
+	 */
+	if (wr->opcode == IB_WR_FAST_REG_MR) {
+		return -EINVAL;
+	} else if (qp->ibqp.qp_type == IB_QPT_UC) {
+		if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+			return -EINVAL;
+	} else if (qp->ibqp.qp_type != IB_QPT_RC) {
+		/* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
+		if (wr->opcode != IB_WR_SEND &&
+		    wr->opcode != IB_WR_SEND_WITH_IMM)
+			return -EINVAL;
+		/* Check UD destination address PD */
+		if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+			return -EINVAL;
+	} else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+		return -EINVAL;
+	else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+		   (wr->num_sge == 0 ||
+		    wr->sg_list[0].length < sizeof(u64) ||
+		    wr->sg_list[0].addr & (sizeof(u64) - 1)))
+		return -EINVAL;
+	else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+		return -EINVAL;
+
+	next = qp->s_head + 1;
+	if (next >= qp->s_size)
+		next = 0;
+	if (next == qp->s_last)
+		return -ENOMEM;
+
+	rkt = &to_idev(qp->ibqp.device)->lk_table;
+	pd = to_ipd(qp->ibqp.pd);
+	wqe = get_swqe_ptr(qp, qp->s_head);
+	wqe->wr = *wr;
+	wqe->length = 0;
+	j = 0;
+	if (wr->num_sge) {
+		acc = wr->opcode >= IB_WR_RDMA_READ ?
+			IB_ACCESS_LOCAL_WRITE : 0;
+		for (i = 0; i < wr->num_sge; i++) {
+			u32 length = wr->sg_list[i].length;
+			int ok;
+
+			if (length == 0)
+				continue;
+			ok = hfi1_lkey_ok(rkt, pd, &wqe->sg_list[j],
+					  &wr->sg_list[i], acc);
+			if (!ok)
+				goto bail_inval_free;
+			wqe->length += length;
+			j++;
+		}
+		wqe->wr.num_sge = j;
+	}
+	if (qp->ibqp.qp_type == IB_QPT_UC ||
+	    qp->ibqp.qp_type == IB_QPT_RC) {
+		if (wqe->length > 0x80000000U)
+			goto bail_inval_free;
+	} else {
+		struct hfi1_ah *ah = to_iah(wr->wr.ud.ah);
+
+		atomic_inc(&ah->refcount);
+	}
+	wqe->ssn = qp->s_ssn++;
+	qp->s_head = next;
+
+	return 0;
+
+bail_inval_free:
+	/* release mr holds */
+	while (j) {
+		struct hfi1_sge *sge = &wqe->sg_list[--j];
+
+		hfi1_put_mr(sge->mr);
+	}
+	return -EINVAL;
+}
+
+/**
+ * post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		     struct ib_send_wr **bad_wr)
+{
+	struct hfi1_qp *qp = to_iqp(ibqp);
+	int err = 0;
+	int call_send;
+	unsigned long flags;
+	unsigned nreq = 0;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Check that state is OK to post send. */
+	if (unlikely(!(ib_hfi1_state_ops[qp->state] & HFI1_POST_SEND_OK))) {
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		return -EINVAL;
+	}
+
+	/* sq empty and not list -> call send */
+	call_send = qp->s_head == qp->s_last && !wr->next;
+
+	for (; wr; wr = wr->next) {
+		err = post_one_send(qp, wr);
+		if (unlikely(err)) {
+			*bad_wr = wr;
+			goto bail;
+		}
+		nreq++;
+	}
+bail:
+	if (nreq && !call_send)
+		hfi1_schedule_send(qp);
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	if (nreq && call_send)
+		hfi1_do_send(&qp->s_iowait.iowork);
+	return err;
+}
+
+/**
+ * post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			struct ib_recv_wr **bad_wr)
+{
+	struct hfi1_qp *qp = to_iqp(ibqp);
+	struct hfi1_rwq *wq = qp->r_rq.wq;
+	unsigned long flags;
+	int ret;
+
+	/* Check that state is OK to post receive. */
+	if (!(ib_hfi1_state_ops[qp->state] & HFI1_POST_RECV_OK) || !wq) {
+		*bad_wr = wr;
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	for (; wr; wr = wr->next) {
+		struct hfi1_rwqe *wqe;
+		u32 next;
+		int i;
+
+		if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		spin_lock_irqsave(&qp->r_rq.lock, flags);
+		next = wq->head + 1;
+		if (next >= qp->r_rq.size)
+			next = 0;
+		if (next == wq->tail) {
+			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+			*bad_wr = wr;
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+		wqe->wr_id = wr->wr_id;
+		wqe->num_sge = wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++)
+			wqe->sg_list[i] = wr->sg_list[i];
+		/* Make sure queue entry is written before the head index. */
+		smp_wmb();
+		wq->head = next;
+		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/*
+ * Make sure the QP is ready and able to accept the given opcode.
+ */
+static inline int qp_ok(int opcode, struct hfi1_packet *packet)
+{
+	struct hfi1_ibport *ibp;
+
+	if (!(ib_hfi1_state_ops[packet->qp->state] & HFI1_PROCESS_RECV_OK))
+		goto dropit;
+	if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+	    (opcode == IB_OPCODE_CNP))
+		return 1;
+dropit:
+	ibp = &packet->rcd->ppd->ibport_data;
+	ibp->n_pkt_drops++;
+	return 0;
+}
+
+
+/**
+ * hfi1_ib_rcv - process an incoming packet
+ * @packet: data packet information
+ *
+ * This is called to process an incoming packet at interrupt level.
+ *
+ * Tlen is the length of the header + data + CRC in bytes.
+ */
+void hfi1_ib_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct hfi1_ib_header *hdr = packet->hdr;
+	u32 tlen = packet->tlen;
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+	u16 lid;
+
+	/* Check for GRH */
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh == HFI1_LRH_BTH)
+		packet->ohdr = &hdr->u.oth;
+	else if (lnh == HFI1_LRH_GRH) {
+		u32 vtf;
+
+		packet->ohdr = &hdr->u.l.oth;
+		if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+			goto drop;
+		vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+		if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+			goto drop;
+		packet->rcv_flags |= HFI1_HAS_GRH;
+	} else
+		goto drop;
+
+	trace_input_ibhdr(rcd->dd, hdr);
+
+	opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+	inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+	/* Get the destination QP number. */
+	qp_num = be32_to_cpu(packet->ohdr->bth[1]) & HFI1_QPN_MASK;
+	lid = be16_to_cpu(hdr->lrh[1]);
+	if (unlikely((lid >= HFI1_MULTICAST_LID_BASE) &&
+	    (lid != HFI1_PERMISSIVE_LID))) {
+		struct hfi1_mcast *mcast;
+		struct hfi1_mcast_qp *p;
+
+		if (lnh != HFI1_LRH_GRH)
+			goto drop;
+		mcast = hfi1_mcast_find(ibp, &hdr->u.l.grh.dgid);
+		if (mcast == NULL)
+			goto drop;
+		list_for_each_entry_rcu(p, &mcast->qp_list, list) {
+			packet->qp = p->qp;
+			spin_lock(&packet->qp->r_lock);
+			if (likely((qp_ok(opcode, packet))))
+				opcode_handler_tbl[opcode](packet);
+			spin_unlock(&packet->qp->r_lock);
+		}
+		/*
+		 * Notify hfi1_multicast_detach() if it is waiting for us
+		 * to finish.
+		 */
+		if (atomic_dec_return(&mcast->refcount) <= 1)
+			wake_up(&mcast->wait);
+	} else {
+		rcu_read_lock();
+		packet->qp = hfi1_lookup_qpn(ibp, qp_num);
+		if (!packet->qp) {
+			rcu_read_unlock();
+			goto drop;
+		}
+		spin_lock(&packet->qp->r_lock);
+		if (likely((qp_ok(opcode, packet))))
+			opcode_handler_tbl[opcode](packet);
+		spin_unlock(&packet->qp->r_lock);
+		rcu_read_unlock();
+	}
+	return;
+
+drop:
+	ibp->n_pkt_drops++;
+}
+
+/*
+ * This is called from a timer to check for QPs
+ * which need kernel memory in order to send a packet.
+ */
+static void mem_timer(unsigned long data)
+{
+	struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
+	struct list_head *list = &dev->memwait;
+	struct hfi1_qp *qp = NULL;
+	struct iowait *wait;
+	unsigned long flags;
+
+	write_seqlock_irqsave(&dev->iowait_lock, flags);
+	if (!list_empty(list)) {
+		wait = list_first_entry(list, struct iowait, list);
+		qp = container_of(wait, struct hfi1_qp, s_iowait);
+		list_del_init(&qp->s_iowait.list);
+		/* refcount held until actual wake up */
+		if (!list_empty(list))
+			mod_timer(&dev->mem_timer, jiffies + 1);
+	}
+	write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+	if (qp)
+		hfi1_qp_wakeup(qp, HFI1_S_WAIT_KMEM);
+}
+
+void update_sge(struct hfi1_sge_state *ss, u32 length)
+{
+	struct hfi1_sge *sge = &ss->sge;
+
+	sge->vaddr += length;
+	sge->length -= length;
+	sge->sge_length -= length;
+	if (sge->sge_length == 0) {
+		if (--ss->num_sge)
+			*sge = *ss->sg_list++;
+	} else if (sge->length == 0 && sge->mr->lkey) {
+		if (++sge->n >= HFI1_SEGSZ) {
+			if (++sge->m >= sge->mr->mapsz)
+				return;
+			sge->n = 0;
+		}
+		sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+		sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+	}
+}
+
+static noinline struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+						struct hfi1_qp *qp)
+{
+	struct verbs_txreq *tx;
+	unsigned long flags;
+
+	tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+	if (!tx) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		write_seqlock(&dev->iowait_lock);
+		if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK &&
+		    list_empty(&qp->s_iowait.list)) {
+			dev->n_txwait++;
+			qp->s_flags |= HFI1_S_WAIT_TX;
+			list_add_tail(&qp->s_iowait.list, &dev->txwait);
+			trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TX);
+			atomic_inc(&qp->refcount);
+		}
+		qp->s_flags &= ~HFI1_S_BUSY;
+		write_sequnlock(&dev->iowait_lock);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		tx = ERR_PTR(-EBUSY);
+	}
+	return tx;
+}
+
+static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
+					    struct hfi1_qp *qp)
+{
+	struct verbs_txreq *tx;
+
+	tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+	if (!tx)
+		/* call slow path to get the lock */
+		tx =  __get_txreq(dev, qp);
+	if (tx)
+		tx->qp = qp;
+	return tx;
+}
+
+void hfi1_put_txreq(struct verbs_txreq *tx)
+{
+	struct hfi1_ibdev *dev;
+	struct hfi1_qp *qp;
+	unsigned long flags;
+	unsigned int seq;
+
+	qp = tx->qp;
+	dev = to_idev(qp->ibqp.device);
+
+	if (tx->mr) {
+		hfi1_put_mr(tx->mr);
+		tx->mr = NULL;
+	}
+	sdma_txclean(dd_from_dev(dev), &tx->txreq);
+
+	/* Free verbs_txreq and return to slab cache */
+	kmem_cache_free(dev->verbs_txreq_cache, tx);
+
+	do {
+		seq = read_seqbegin(&dev->iowait_lock);
+		if (!list_empty(&dev->txwait)) {
+			struct iowait *wait;
+
+			write_seqlock_irqsave(&dev->iowait_lock, flags);
+			/* Wake up first QP wanting a free struct */
+			wait = list_first_entry(&dev->txwait, struct iowait,
+						list);
+			qp = container_of(wait, struct hfi1_qp, s_iowait);
+			list_del_init(&qp->s_iowait.list);
+			/* refcount held until actual wake up */
+			write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+			hfi1_qp_wakeup(qp, HFI1_S_WAIT_TX);
+			break;
+		}
+	} while (read_seqretry(&dev->iowait_lock, seq));
+}
+
+/*
+ * This is called with progress side lock held.
+ */
+/* New API */
+static void verbs_sdma_complete(
+	struct sdma_txreq *cookie,
+	int status,
+	int drained)
+{
+	struct verbs_txreq *tx =
+		container_of(cookie, struct verbs_txreq, txreq);
+	struct hfi1_qp *qp = tx->qp;
+
+	spin_lock(&qp->s_lock);
+	if (tx->wqe)
+		hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+	else if (qp->ibqp.qp_type == IB_QPT_RC) {
+		struct hfi1_ib_header *hdr;
+
+		hdr = &tx->phdr.hdr;
+		hfi1_rc_send_complete(qp, hdr);
+	}
+	if (drained) {
+		/*
+		 * This happens when the send engine notes
+		 * a QP in the error state and cannot
+		 * do the flush work until that QP's
+		 * sdma work has finished.
+		 */
+		if (qp->s_flags & HFI1_S_WAIT_DMA) {
+			qp->s_flags &= ~HFI1_S_WAIT_DMA;
+			hfi1_schedule_send(qp);
+		}
+	}
+	spin_unlock(&qp->s_lock);
+
+	hfi1_put_txreq(tx);
+}
+
+static int wait_kmem(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+		write_seqlock(&dev->iowait_lock);
+		if (list_empty(&qp->s_iowait.list)) {
+			if (list_empty(&dev->memwait))
+				mod_timer(&dev->mem_timer, jiffies + 1);
+			qp->s_flags |= HFI1_S_WAIT_KMEM;
+			list_add_tail(&qp->s_iowait.list, &dev->memwait);
+			trace_hfi1_qpsleep(qp, HFI1_S_WAIT_KMEM);
+			atomic_inc(&qp->refcount);
+		}
+		write_sequnlock(&dev->iowait_lock);
+		qp->s_flags &= ~HFI1_S_BUSY;
+		ret = -EBUSY;
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	return ret;
+}
+
+/*
+ * This routine calls txadds for each sg entry.
+ *
+ * Add failures will revert the sge cursor
+ */
+static int build_verbs_ulp_payload(
+	struct sdma_engine *sde,
+	struct hfi1_sge_state *ss,
+	u32 length,
+	struct verbs_txreq *tx)
+{
+	struct hfi1_sge *sg_list = ss->sg_list;
+	struct hfi1_sge sge = ss->sge;
+	u8 num_sge = ss->num_sge;
+	u32 len;
+	int ret = 0;
+
+	while (length) {
+		len = ss->sge.length;
+		if (len > length)
+			len = length;
+		if (len > ss->sge.sge_length)
+			len = ss->sge.sge_length;
+		WARN_ON_ONCE(len == 0);
+		ret = sdma_txadd_kvaddr(
+			sde->dd,
+			&tx->txreq,
+			ss->sge.vaddr,
+			len);
+		if (ret)
+			goto bail_txadd;
+		update_sge(ss, len);
+		length -= len;
+	}
+	return ret;
+bail_txadd:
+	/* unwind cursor */
+	ss->sge = sge;
+	ss->num_sge = num_sge;
+	ss->sg_list = sg_list;
+	return ret;
+}
+
+/*
+ * Build the number of DMA descriptors needed to send length bytes of data.
+ *
+ * NOTE: DMA mapping is held in the tx until completed in the ring or
+ *       the tx desc is freed without having been submitted to the ring
+ *
+ * This routine insures the following all the helper routine
+ * calls succeed.
+ */
+/* New API */
+static int build_verbs_tx_desc(
+	struct sdma_engine *sde,
+	struct hfi1_sge_state *ss,
+	u32 length,
+	struct verbs_txreq *tx,
+	struct ahg_ib_header *ahdr,
+	u64 pbc)
+{
+	int ret = 0;
+	struct hfi1_pio_header *phdr;
+	u16 hdrbytes = tx->hdr_dwords << 2;
+
+	phdr = &tx->phdr;
+	if (!ahdr->ahgcount) {
+		ret = sdma_txinit_ahg(
+			&tx->txreq,
+			ahdr->tx_flags,
+			hdrbytes + length,
+			ahdr->ahgidx,
+			0,
+			NULL,
+			0,
+			verbs_sdma_complete);
+		if (ret)
+			goto bail_txadd;
+		phdr->pbc = cpu_to_le64(pbc);
+		memcpy(&phdr->hdr, &ahdr->ibh, hdrbytes - sizeof(phdr->pbc));
+		/* add the header */
+		ret = sdma_txadd_kvaddr(
+			sde->dd,
+			&tx->txreq,
+			&tx->phdr,
+			tx->hdr_dwords << 2);
+		if (ret)
+			goto bail_txadd;
+	} else {
+		struct hfi1_other_headers *sohdr = &ahdr->ibh.u.oth;
+		struct hfi1_other_headers *dohdr = &phdr->hdr.u.oth;
+
+		/* needed in rc_send_complete() */
+		phdr->hdr.lrh[0] = ahdr->ibh.lrh[0];
+		if ((be16_to_cpu(phdr->hdr.lrh[0]) & 3) == HFI1_LRH_GRH) {
+			sohdr = &ahdr->ibh.u.l.oth;
+			dohdr = &phdr->hdr.u.l.oth;
+		}
+		/* opcode */
+		dohdr->bth[0] = sohdr->bth[0];
+		/* PSN/ACK  */
+		dohdr->bth[2] = sohdr->bth[2];
+		ret = sdma_txinit_ahg(
+			&tx->txreq,
+			ahdr->tx_flags,
+			length,
+			ahdr->ahgidx,
+			ahdr->ahgcount,
+			ahdr->ahgdesc,
+			hdrbytes,
+			verbs_sdma_complete);
+		if (ret)
+			goto bail_txadd;
+	}
+
+	/* add the ulp payload - if any.  ss can be NULL for acks */
+	if (ss)
+		ret = build_verbs_ulp_payload(sde, ss, length, tx);
+bail_txadd:
+	return ret;
+}
+
+int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+			u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+			u32 plen, u32 dwords, u64 pbc)
+{
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct verbs_txreq *tx;
+	struct sdma_txreq *stx;
+	u64 pbc_flags = 0;
+	struct sdma_engine *sde;
+	u8 sc5 = qp->s_sc;
+	int ret;
+
+	if (!list_empty(&qp->s_iowait.tx_head)) {
+		stx = list_first_entry(
+			&qp->s_iowait.tx_head,
+			struct sdma_txreq,
+			list);
+		list_del_init(&stx->list);
+		tx = container_of(stx, struct verbs_txreq, txreq);
+		ret = sdma_send_txreq(tx->sde, &qp->s_iowait, stx);
+		if (unlikely(ret == -ECOMM))
+			goto bail_ecomm;
+		return ret;
+	}
+
+	tx = get_txreq(dev, qp);
+	if (IS_ERR(tx))
+		goto bail_tx;
+
+	if (!qp->s_hdr->sde) {
+		tx->sde = sde = qp_to_sdma_engine(qp, sc5);
+		if (!sde)
+			goto bail_no_sde;
+	} else
+		tx->sde = sde = qp->s_hdr->sde;
+
+	if (likely(pbc == 0)) {
+		u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+		/* No vl15 here */
+		/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+		pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+
+		pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+	}
+	tx->wqe = qp->s_wqe;
+	tx->mr = qp->s_rdma_mr;
+	if (qp->s_rdma_mr)
+		qp->s_rdma_mr = NULL;
+	tx->hdr_dwords = hdrwords + 2;
+	ret = build_verbs_tx_desc(sde, ss, len, tx, ahdr, pbc);
+	if (unlikely(ret))
+		goto bail_build;
+	trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
+	ret =  sdma_send_txreq(sde, &qp->s_iowait, &tx->txreq);
+	if (unlikely(ret == -ECOMM))
+		goto bail_ecomm;
+	return ret;
+
+bail_no_sde:
+	hfi1_put_txreq(tx);
+bail_ecomm:
+	/* The current one got "sent" */
+	return 0;
+bail_build:
+	/* kmalloc or mapping fail */
+	hfi1_put_txreq(tx);
+	return wait_kmem(dev, qp);
+bail_tx:
+	return PTR_ERR(tx);
+}
+
+/*
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int no_bufs_available(struct hfi1_qp *qp, struct send_context *sc)
+{
+	struct hfi1_devdata *dd = sc->dd;
+	struct hfi1_ibdev *dev = &dd->verbs_dev;
+	unsigned long flags;
+	int ret = 0;
+
+	/*
+	 * Note that as soon as want_buffer() is called and
+	 * possibly before it returns, sc_piobufavail()
+	 * could be called. Therefore, put QP on the I/O wait list before
+	 * enabling the PIO avail interrupt.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+		write_seqlock(&dev->iowait_lock);
+		if (list_empty(&qp->s_iowait.list)) {
+			struct hfi1_ibdev *dev = &dd->verbs_dev;
+			int was_empty;
+
+			dev->n_piowait++;
+			qp->s_flags |= HFI1_S_WAIT_PIO;
+			was_empty = list_empty(&sc->piowait);
+			list_add_tail(&qp->s_iowait.list, &sc->piowait);
+			trace_hfi1_qpsleep(qp, HFI1_S_WAIT_PIO);
+			atomic_inc(&qp->refcount);
+			/* counting: only call wantpiobuf_intr if first user */
+			if (was_empty)
+				hfi1_sc_wantpiobuf_intr(sc, 1);
+		}
+		write_sequnlock(&dev->iowait_lock);
+		qp->s_flags &= ~HFI1_S_BUSY;
+		ret = -EBUSY;
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1);
+	u8 vl;
+
+	vl = sc_to_vlt(dd, sc5);
+	if (vl >= ppd->vls_supported && vl != 15)
+		return NULL;
+	return dd->vld[vl].sc;
+}
+
+int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+			u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+			u32 plen, u32 dwords, u64 pbc)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u32 *hdr = (u32 *)&ahdr->ibh;
+	u64 pbc_flags = 0;
+	u32 sc5;
+	unsigned long flags = 0;
+	struct send_context *sc;
+	struct pio_buf *pbuf;
+	int wc_status = IB_WC_SUCCESS;
+
+	/* vl15 special case taken care of in ud.c */
+	sc5 = qp->s_sc;
+	sc = qp_to_send_context(qp, sc5);
+
+	if (!sc)
+		return -EINVAL;
+	if (likely(pbc == 0)) {
+		u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+		/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+		pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+		pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+	}
+	pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+	if (unlikely(pbuf == NULL)) {
+		if (ppd->host_link_state != HLS_UP_ACTIVE) {
+			/*
+			 * If we have filled the PIO buffers to capacity and are
+			 * not in an active state this request is not going to
+			 * go out to so just complete it with an error or else a
+			 * ULP or the core may be stuck waiting.
+			 */
+			hfi1_cdbg(
+				PIO,
+				"alloc failed. state not active, completing");
+			wc_status = IB_WC_GENERAL_ERR;
+			goto pio_bail;
+		} else {
+			/*
+			 * This is a normal occurrence. The PIO buffs are full
+			 * up but we are still happily sending, well we could be
+			 * so lets continue to queue the request.
+			 */
+			hfi1_cdbg(PIO, "alloc failed. state active, queuing");
+			return no_bufs_available(qp, sc);
+		}
+	}
+
+	if (len == 0) {
+		pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
+	} else {
+		if (ss) {
+			seg_pio_copy_start(pbuf, pbc, hdr, hdrwords*4);
+			while (len) {
+				void *addr = ss->sge.vaddr;
+				u32 slen = ss->sge.length;
+
+				if (slen > len)
+					slen = len;
+				update_sge(ss, slen);
+				seg_pio_copy_mid(pbuf, addr, slen);
+				len -= slen;
+			}
+			seg_pio_copy_end(pbuf);
+		}
+	}
+
+	trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
+
+	if (qp->s_rdma_mr) {
+		hfi1_put_mr(qp->s_rdma_mr);
+		qp->s_rdma_mr = NULL;
+	}
+
+pio_bail:
+	if (qp->s_wqe) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		hfi1_send_complete(qp, qp->s_wqe, wc_status);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		hfi1_rc_send_complete(qp, &ahdr->ibh);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+	return 0;
+}
+/*
+ * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for egress partition keys
+ * specified in the OPAv1 spec., section 9.1l.7.
+ */
+static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+	u16 mkey = pkey & PKEY_LOW_15_MASK;
+	u16 ment = ent & PKEY_LOW_15_MASK;
+
+	if (mkey == ment) {
+		/*
+		 * If pkey[15] is set (full partition member),
+		 * is bit 15 in the corresponding table element
+		 * clear (limited member)?
+		 */
+		if (pkey & PKEY_MEMBER_MASK)
+			return !!(ent & PKEY_MEMBER_MASK);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * egress_pkey_check - return 0 if hdr's pkey matches according to the
+ * criteria in the OPAv1 spec., section 9.11.7.
+ */
+static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
+				    struct hfi1_ib_header *hdr,
+				    struct hfi1_qp *qp)
+{
+	struct hfi1_other_headers *ohdr;
+	struct hfi1_devdata *dd;
+	int i = 0;
+	u16 pkey;
+	u8 lnh, sc5 = qp->s_sc;
+
+	if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
+		return 0;
+
+	/* locate the pkey within the headers */
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh == HFI1_LRH_GRH)
+		ohdr = &hdr->u.l.oth;
+	else
+		ohdr = &hdr->u.oth;
+
+	pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+	/* If SC15, pkey[0:14] must be 0x7fff */
+	if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+		goto bad;
+
+
+	/* Is the pkey = 0x0, or 0x8000? */
+	if ((pkey & PKEY_LOW_15_MASK) == 0)
+		goto bad;
+
+	/* The most likely matching pkey has index qp->s_pkey_index */
+	if (unlikely(!egress_pkey_matches_entry(pkey,
+					ppd->pkeys[qp->s_pkey_index]))) {
+		/* no match - try the entire table */
+		for (; i < MAX_PKEY_VALUES; i++) {
+			if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+				break;
+		}
+	}
+
+	if (i < MAX_PKEY_VALUES)
+		return 0;
+bad:
+	incr_cntr64(&ppd->port_xmit_constraint_errors);
+	dd = ppd->dd;
+	if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
+		u16 slid = be16_to_cpu(hdr->lrh[3]);
+
+		dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
+		dd->err_info_xmit_constraint.slid = slid;
+		dd->err_info_xmit_constraint.pkey = pkey;
+	}
+	return 1;
+}
+
+/**
+ * hfi1_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @ahdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ *
+ * Return zero if packet is sent or queued OK.
+ * Return non-zero and clear qp->s_flags HFI1_S_BUSY otherwise.
+ */
+int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+		    u32 hdrwords, struct hfi1_sge_state *ss, u32 len)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	u32 plen;
+	int ret;
+	int pio = 0;
+	unsigned long flags = 0;
+	u32 dwords = (len + 3) >> 2;
+
+	/*
+	 * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+	 * can defer SDMA restart until link goes ACTIVE without
+	 * worrying about just how we got there.
+	 */
+	if ((qp->ibqp.qp_type == IB_QPT_SMI) ||
+	    !(dd->flags & HFI1_HAS_SEND_DMA))
+		pio = 1;
+
+	ret = egress_pkey_check(dd->pport, &ahdr->ibh, qp);
+	if (unlikely(ret)) {
+		/*
+		 * The value we are returning here does not get propagated to
+		 * the verbs caller. Thus we need to complete the request with
+		 * error otherwise the caller could be sitting waiting on the
+		 * completion event. Only do this for PIO. SDMA has its own
+		 * mechanism for handling the errors. So for SDMA we can just
+		 * return.
+		 */
+		if (pio) {
+			hfi1_cdbg(PIO, "%s() Failed. Completing with err",
+				  __func__);
+			spin_lock_irqsave(&qp->s_lock, flags);
+			hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+		}
+		return -EINVAL;
+	}
+
+	/*
+	 * Calculate the send buffer trigger address.
+	 * The +2 counts for the pbc control qword
+	 */
+	plen = hdrwords + dwords + 2;
+
+	if (pio) {
+		ret = dd->process_pio_send(
+			qp, ahdr, hdrwords, ss, len, plen, dwords, 0);
+	} else {
+#ifdef CONFIG_SDMA_VERBOSITY
+		dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n",
+			   slashstrip(__FILE__), __LINE__, __func__);
+		dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", hdrwords, len);
+#endif
+		ret = dd->process_dma_send(
+			qp, ahdr, hdrwords, ss, len, plen, dwords, 0);
+	}
+
+	return ret;
+}
+
+static int query_device(struct ib_device *ibdev,
+			struct ib_device_attr *props,
+			struct ib_udata *uhw)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_ibdev *dev = to_idev(ibdev);
+
+	if (uhw->inlen || uhw->outlen)
+		return -EINVAL;
+	memset(props, 0, sizeof(*props));
+
+	props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+		IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+		IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+
+	props->page_size_cap = PAGE_SIZE;
+	props->vendor_id =
+		dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
+	props->vendor_part_id = dd->pcidev->device;
+	props->hw_ver = dd->minrev;
+	props->sys_image_guid = ib_hfi1_sys_image_guid;
+	props->max_mr_size = ~0ULL;
+	props->max_qp = hfi1_max_qps;
+	props->max_qp_wr = hfi1_max_qp_wrs;
+	props->max_sge = hfi1_max_sges;
+	props->max_sge_rd = hfi1_max_sges;
+	props->max_cq = hfi1_max_cqs;
+	props->max_ah = hfi1_max_ahs;
+	props->max_cqe = hfi1_max_cqes;
+	props->max_mr = dev->lk_table.max;
+	props->max_fmr = dev->lk_table.max;
+	props->max_map_per_fmr = 32767;
+	props->max_pd = hfi1_max_pds;
+	props->max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
+	props->max_qp_init_rd_atom = 255;
+	/* props->max_res_rd_atom */
+	props->max_srq = hfi1_max_srqs;
+	props->max_srq_wr = hfi1_max_srq_wrs;
+	props->max_srq_sge = hfi1_max_srq_sges;
+	/* props->local_ca_ack_delay */
+	props->atomic_cap = IB_ATOMIC_GLOB;
+	props->max_pkeys = hfi1_get_npkeys(dd);
+	props->max_mcast_grp = hfi1_max_mcast_grps;
+	props->max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+		props->max_mcast_grp;
+
+	return 0;
+}
+
+static inline u16 opa_speed_to_ib(u16 in)
+{
+	u16 out = 0;
+
+	if (in & OPA_LINK_SPEED_25G)
+		out |= IB_SPEED_EDR;
+	if (in & OPA_LINK_SPEED_12_5G)
+		out |= IB_SPEED_FDR;
+
+	return out;
+}
+
+/*
+ * Convert a single OPA link width (no multiple flags) to an IB value.
+ * A zero OPA link width means link down, which means the IB width value
+ * is a don't care.
+ */
+static inline u16 opa_width_to_ib(u16 in)
+{
+	switch (in) {
+	case OPA_LINK_WIDTH_1X:
+	/* map 2x and 3x to 1x as they don't exist in IB */
+	case OPA_LINK_WIDTH_2X:
+	case OPA_LINK_WIDTH_3X:
+		return IB_WIDTH_1X;
+	default: /* link down or unknown, return our largest width */
+	case OPA_LINK_WIDTH_4X:
+		return IB_WIDTH_4X;
+	}
+}
+
+static int query_port(struct ib_device *ibdev, u8 port,
+		      struct ib_port_attr *props)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	u16 lid = ppd->lid;
+
+	memset(props, 0, sizeof(*props));
+	props->lid = lid ? lid : 0;
+	props->lmc = ppd->lmc;
+	props->sm_lid = ibp->sm_lid;
+	props->sm_sl = ibp->sm_sl;
+	/* OPA logical states match IB logical states */
+	props->state = driver_lstate(ppd);
+	props->phys_state = hfi1_ibphys_portstate(ppd);
+	props->port_cap_flags = ibp->port_cap_flags;
+	props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
+	props->max_msg_sz = 0x80000000;
+	props->pkey_tbl_len = hfi1_get_npkeys(dd);
+	props->bad_pkey_cntr = ibp->pkey_violations;
+	props->qkey_viol_cntr = ibp->qkey_violations;
+	props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
+	/* see rate_show() in ib core/sysfs.c */
+	props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
+	props->max_vl_num = ppd->vls_supported;
+	props->init_type_reply = 0;
+
+	/* Once we are a "first class" citizen and have added the OPA MTUs to
+	 * the core we can advertise the larger MTU enum to the ULPs, for now
+	 * advertise only 4K.
+	 *
+	 * Those applications which are either OPA aware or pass the MTU enum
+	 * from the Path Records to us will get the new 8k MTU.  Those that
+	 * attempt to process the MTU enum may fail in various ways.
+	 */
+	props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
+				      4096 : hfi1_max_mtu), IB_MTU_4096);
+	props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
+		mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
+	props->subnet_timeout = ibp->subnet_timeout;
+
+	return 0;
+}
+
+static int port_immutable(struct ib_device *ibdev, u8 port_num,
+			  struct ib_port_immutable *immutable)
+{
+	struct ib_port_attr attr;
+	int err;
+
+	err = query_port(ibdev, port_num, &attr);
+	if (err)
+		return err;
+
+	memset(immutable, 0, sizeof(*immutable));
+
+	immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	immutable->gid_tbl_len = attr.gid_tbl_len;
+	immutable->core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
+	immutable->max_mad_size = OPA_MGMT_MAD_SIZE;
+
+	return 0;
+}
+
+static int modify_device(struct ib_device *device,
+			 int device_modify_mask,
+			 struct ib_device_modify *device_modify)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(device);
+	unsigned i;
+	int ret;
+
+	if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+				   IB_DEVICE_MODIFY_NODE_DESC)) {
+		ret = -EOPNOTSUPP;
+		goto bail;
+	}
+
+	if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+		memcpy(device->node_desc, device_modify->node_desc, 64);
+		for (i = 0; i < dd->num_pports; i++) {
+			struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+			hfi1_node_desc_chg(ibp);
+		}
+	}
+
+	if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+		ib_hfi1_sys_image_guid =
+			cpu_to_be64(device_modify->sys_image_guid);
+		for (i = 0; i < dd->num_pports; i++) {
+			struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+			hfi1_sys_guid_chg(ibp);
+		}
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int modify_port(struct ib_device *ibdev, u8 port,
+		       int port_modify_mask, struct ib_port_modify *props)
+{
+	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	int ret = 0;
+
+	ibp->port_cap_flags |= props->set_port_cap_mask;
+	ibp->port_cap_flags &= ~props->clr_port_cap_mask;
+	if (props->set_port_cap_mask || props->clr_port_cap_mask)
+		hfi1_cap_mask_chg(ibp);
+	if (port_modify_mask & IB_PORT_SHUTDOWN) {
+		set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
+		  OPA_LINKDOWN_REASON_UNKNOWN);
+		ret = set_link_state(ppd, HLS_DN_DOWNDEF);
+	}
+	if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+		ibp->qkey_violations = 0;
+	return ret;
+}
+
+static int query_gid(struct ib_device *ibdev, u8 port,
+		     int index, union ib_gid *gid)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	int ret = 0;
+
+	if (!port || port > dd->num_pports)
+		ret = -EINVAL;
+	else {
+		struct hfi1_ibport *ibp = to_iport(ibdev, port);
+		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+		gid->global.subnet_prefix = ibp->gid_prefix;
+		if (index == 0)
+			gid->global.interface_id = cpu_to_be64(ppd->guid);
+		else if (index < HFI1_GUIDS_PER_PORT)
+			gid->global.interface_id = ibp->guids[index - 1];
+		else
+			ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static struct ib_pd *alloc_pd(struct ib_device *ibdev,
+			      struct ib_ucontext *context,
+			      struct ib_udata *udata)
+{
+	struct hfi1_ibdev *dev = to_idev(ibdev);
+	struct hfi1_pd *pd;
+	struct ib_pd *ret;
+
+	/*
+	 * This is actually totally arbitrary.  Some correctness tests
+	 * assume there's a maximum number of PDs that can be allocated.
+	 * We don't actually have this limit, but we fail the test if
+	 * we allow allocations of more than we report for this value.
+	 */
+
+	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	spin_lock(&dev->n_pds_lock);
+	if (dev->n_pds_allocated == hfi1_max_pds) {
+		spin_unlock(&dev->n_pds_lock);
+		kfree(pd);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	dev->n_pds_allocated++;
+	spin_unlock(&dev->n_pds_lock);
+
+	/* ib_alloc_pd() will initialize pd->ibpd. */
+	pd->user = udata != NULL;
+
+	ret = &pd->ibpd;
+
+bail:
+	return ret;
+}
+
+static int dealloc_pd(struct ib_pd *ibpd)
+{
+	struct hfi1_pd *pd = to_ipd(ibpd);
+	struct hfi1_ibdev *dev = to_idev(ibpd->device);
+
+	spin_lock(&dev->n_pds_lock);
+	dev->n_pds_allocated--;
+	spin_unlock(&dev->n_pds_lock);
+
+	kfree(pd);
+
+	return 0;
+}
+
+/*
+ * convert ah port,sl to sc
+ */
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
+{
+	struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
+
+	return ibp->sl_to_sc[ah->sl];
+}
+
+int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
+{
+	struct hfi1_ibport *ibp;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_devdata *dd;
+	u8 sc5;
+
+	/* A multicast address requires a GRH (see ch. 8.4.1). */
+	if (ah_attr->dlid >= HFI1_MULTICAST_LID_BASE &&
+	    ah_attr->dlid != HFI1_PERMISSIVE_LID &&
+	    !(ah_attr->ah_flags & IB_AH_GRH))
+		goto bail;
+	if ((ah_attr->ah_flags & IB_AH_GRH) &&
+	    ah_attr->grh.sgid_index >= HFI1_GUIDS_PER_PORT)
+		goto bail;
+	if (ah_attr->dlid == 0)
+		goto bail;
+	if (ah_attr->port_num < 1 ||
+	    ah_attr->port_num > ibdev->phys_port_cnt)
+		goto bail;
+	if (ah_attr->static_rate != IB_RATE_PORT_CURRENT &&
+	    ib_rate_to_mbps(ah_attr->static_rate) < 0)
+		goto bail;
+	if (ah_attr->sl >= OPA_MAX_SLS)
+		goto bail;
+	/* test the mapping for validity */
+	ibp = to_iport(ibdev, ah_attr->port_num);
+	ppd = ppd_from_ibp(ibp);
+	sc5 = ibp->sl_to_sc[ah_attr->sl];
+	dd = dd_from_ppd(ppd);
+	if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
+		goto bail;
+	return 0;
+bail:
+	return -EINVAL;
+}
+
+/**
+ * create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *create_ah(struct ib_pd *pd,
+			       struct ib_ah_attr *ah_attr)
+{
+	struct hfi1_ah *ah;
+	struct ib_ah *ret;
+	struct hfi1_ibdev *dev = to_idev(pd->device);
+	unsigned long flags;
+
+	if (hfi1_check_ah(pd->device, ah_attr)) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	ah = kmalloc(sizeof(*ah), GFP_ATOMIC);
+	if (!ah) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dev->n_ahs_lock, flags);
+	if (dev->n_ahs_allocated == hfi1_max_ahs) {
+		spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+		kfree(ah);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	dev->n_ahs_allocated++;
+	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+	/* ib_create_ah() will initialize ah->ibah. */
+	ah->attr = *ah_attr;
+	atomic_set(&ah->refcount, 0);
+
+	ret = &ah->ibah;
+
+bail:
+	return ret;
+}
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
+{
+	struct ib_ah_attr attr;
+	struct ib_ah *ah = ERR_PTR(-EINVAL);
+	struct hfi1_qp *qp0;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.dlid = dlid;
+	attr.port_num = ppd_from_ibp(ibp)->port;
+	rcu_read_lock();
+	qp0 = rcu_dereference(ibp->qp[0]);
+	if (qp0)
+		ah = ib_create_ah(qp0->ibqp.pd, &attr);
+	rcu_read_unlock();
+	return ah;
+}
+
+/**
+ * destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int destroy_ah(struct ib_ah *ibah)
+{
+	struct hfi1_ibdev *dev = to_idev(ibah->device);
+	struct hfi1_ah *ah = to_iah(ibah);
+	unsigned long flags;
+
+	if (atomic_read(&ah->refcount) != 0)
+		return -EBUSY;
+
+	spin_lock_irqsave(&dev->n_ahs_lock, flags);
+	dev->n_ahs_allocated--;
+	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+	kfree(ah);
+
+	return 0;
+}
+
+static int modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct hfi1_ah *ah = to_iah(ibah);
+
+	if (hfi1_check_ah(ibah->device, ah_attr))
+		return -EINVAL;
+
+	ah->attr = *ah_attr;
+
+	return 0;
+}
+
+static int query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct hfi1_ah *ah = to_iah(ibah);
+
+	*ah_attr = ah->attr;
+
+	return 0;
+}
+
+/**
+ * hfi1_get_npkeys - return the size of the PKEY table for context 0
+ * @dd: the hfi1_ib device
+ */
+unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
+{
+	return ARRAY_SIZE(dd->pport[0].pkeys);
+}
+
+static int query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+		      u16 *pkey)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+	int ret;
+
+	if (index >= hfi1_get_npkeys(dd)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	*pkey = hfi1_get_pkey(to_iport(ibdev, port), index);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * alloc_ucontext - allocate a ucontest
+ * @ibdev: the infiniband device
+ * @udata: not used by the driver
+ */
+
+static struct ib_ucontext *alloc_ucontext(struct ib_device *ibdev,
+					  struct ib_udata *udata)
+{
+	struct hfi1_ucontext *context;
+	struct ib_ucontext *ret;
+
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	ret = &context->ibucontext;
+
+bail:
+	return ret;
+}
+
+static int dealloc_ucontext(struct ib_ucontext *context)
+{
+	kfree(to_iucontext(context));
+	return 0;
+}
+
+static void init_ibport(struct hfi1_pportdata *ppd)
+{
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
+	int i;
+
+	for (i = 0; i < sz; i++) {
+		ibp->sl_to_sc[i] = i;
+		ibp->sc_to_sl[i] = i;
+	}
+
+	spin_lock_init(&ibp->lock);
+	/* Set the prefix to the default value (see ch. 4.1.1) */
+	ibp->gid_prefix = IB_DEFAULT_GID_PREFIX;
+	ibp->sm_lid = 0;
+	/* Below should only set bits defined in OPA PortInfo.CapabilityMask */
+	ibp->port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
+		IB_PORT_CAP_MASK_NOTICE_SUP;
+	ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+	ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+	ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+	ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+	ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+	RCU_INIT_POINTER(ibp->qp[0], NULL);
+	RCU_INIT_POINTER(ibp->qp[1], NULL);
+}
+
+static void verbs_txreq_kmem_cache_ctor(void *obj)
+{
+	struct verbs_txreq *tx = (struct verbs_txreq *)obj;
+
+	memset(tx, 0, sizeof(*tx));
+}
+
+/**
+ * hfi1_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return 0 if successful, errno if unsuccessful.
+ */
+int hfi1_register_ib_device(struct hfi1_devdata *dd)
+{
+	struct hfi1_ibdev *dev = &dd->verbs_dev;
+	struct ib_device *ibdev = &dev->ibdev;
+	struct hfi1_pportdata *ppd = dd->pport;
+	unsigned i, lk_tab_size;
+	int ret;
+	size_t lcpysz = IB_DEVICE_NAME_MAX;
+	u16 descq_cnt;
+
+	ret = hfi1_qp_init(dev);
+	if (ret)
+		goto err_qp_init;
+
+
+	for (i = 0; i < dd->num_pports; i++)
+		init_ibport(ppd + i);
+
+	/* Only need to initialize non-zero fields. */
+	spin_lock_init(&dev->n_pds_lock);
+	spin_lock_init(&dev->n_ahs_lock);
+	spin_lock_init(&dev->n_cqs_lock);
+	spin_lock_init(&dev->n_qps_lock);
+	spin_lock_init(&dev->n_srqs_lock);
+	spin_lock_init(&dev->n_mcast_grps_lock);
+	init_timer(&dev->mem_timer);
+	dev->mem_timer.function = mem_timer;
+	dev->mem_timer.data = (unsigned long) dev;
+
+	/*
+	 * The top hfi1_lkey_table_size bits are used to index the
+	 * table.  The lower 8 bits can be owned by the user (copied from
+	 * the LKEY).  The remaining bits act as a generation number or tag.
+	 */
+	spin_lock_init(&dev->lk_table.lock);
+	dev->lk_table.max = 1 << hfi1_lkey_table_size;
+	/* ensure generation is at least 4 bits (keys.c) */
+	if (hfi1_lkey_table_size > MAX_LKEY_TABLE_BITS) {
+		dd_dev_warn(dd, "lkey bits %u too large, reduced to %u\n",
+			      hfi1_lkey_table_size, MAX_LKEY_TABLE_BITS);
+		hfi1_lkey_table_size = MAX_LKEY_TABLE_BITS;
+	}
+	lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
+	dev->lk_table.table = (struct hfi1_mregion __rcu **)
+		vmalloc(lk_tab_size);
+	if (dev->lk_table.table == NULL) {
+		ret = -ENOMEM;
+		goto err_lk;
+	}
+	RCU_INIT_POINTER(dev->dma_mr, NULL);
+	for (i = 0; i < dev->lk_table.max; i++)
+		RCU_INIT_POINTER(dev->lk_table.table[i], NULL);
+	INIT_LIST_HEAD(&dev->pending_mmaps);
+	spin_lock_init(&dev->pending_lock);
+	seqlock_init(&dev->iowait_lock);
+	dev->mmap_offset = PAGE_SIZE;
+	spin_lock_init(&dev->mmap_offset_lock);
+	INIT_LIST_HEAD(&dev->txwait);
+	INIT_LIST_HEAD(&dev->memwait);
+
+	descq_cnt = sdma_get_descq_cnt();
+
+	/* SLAB_HWCACHE_ALIGN for AHG */
+	dev->verbs_txreq_cache = kmem_cache_create("hfi1_vtxreq_cache",
+						   sizeof(struct verbs_txreq),
+						   0, SLAB_HWCACHE_ALIGN,
+						   verbs_txreq_kmem_cache_ctor);
+	if (!dev->verbs_txreq_cache) {
+		ret = -ENOMEM;
+		goto err_verbs_txreq;
+	}
+
+	/*
+	 * The system image GUID is supposed to be the same for all
+	 * HFIs in a single system but since there can be other
+	 * device types in the system, we can't be sure this is unique.
+	 */
+	if (!ib_hfi1_sys_image_guid)
+		ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
+	lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
+	strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
+	ibdev->owner = THIS_MODULE;
+	ibdev->node_guid = cpu_to_be64(ppd->guid);
+	ibdev->uverbs_abi_ver = HFI1_UVERBS_ABI_VERSION;
+	ibdev->uverbs_cmd_mask =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
+		(1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_AH)           |
+		(1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
+		(1ull << IB_USER_VERBS_CMD_REG_MR)              |
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
+		(1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
+		(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
+		(1ull << IB_USER_VERBS_CMD_POST_SEND)           |
+		(1ull << IB_USER_VERBS_CMD_POST_RECV)           |
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
+		(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+	ibdev->node_type = RDMA_NODE_IB_CA;
+	ibdev->phys_port_cnt = dd->num_pports;
+	ibdev->num_comp_vectors = 1;
+	ibdev->dma_device = &dd->pcidev->dev;
+	ibdev->query_device = query_device;
+	ibdev->modify_device = modify_device;
+	ibdev->query_port = query_port;
+	ibdev->modify_port = modify_port;
+	ibdev->query_pkey = query_pkey;
+	ibdev->query_gid = query_gid;
+	ibdev->alloc_ucontext = alloc_ucontext;
+	ibdev->dealloc_ucontext = dealloc_ucontext;
+	ibdev->alloc_pd = alloc_pd;
+	ibdev->dealloc_pd = dealloc_pd;
+	ibdev->create_ah = create_ah;
+	ibdev->destroy_ah = destroy_ah;
+	ibdev->modify_ah = modify_ah;
+	ibdev->query_ah = query_ah;
+	ibdev->create_srq = hfi1_create_srq;
+	ibdev->modify_srq = hfi1_modify_srq;
+	ibdev->query_srq = hfi1_query_srq;
+	ibdev->destroy_srq = hfi1_destroy_srq;
+	ibdev->create_qp = hfi1_create_qp;
+	ibdev->modify_qp = hfi1_modify_qp;
+	ibdev->query_qp = hfi1_query_qp;
+	ibdev->destroy_qp = hfi1_destroy_qp;
+	ibdev->post_send = post_send;
+	ibdev->post_recv = post_receive;
+	ibdev->post_srq_recv = hfi1_post_srq_receive;
+	ibdev->create_cq = hfi1_create_cq;
+	ibdev->destroy_cq = hfi1_destroy_cq;
+	ibdev->resize_cq = hfi1_resize_cq;
+	ibdev->poll_cq = hfi1_poll_cq;
+	ibdev->req_notify_cq = hfi1_req_notify_cq;
+	ibdev->get_dma_mr = hfi1_get_dma_mr;
+	ibdev->reg_phys_mr = hfi1_reg_phys_mr;
+	ibdev->reg_user_mr = hfi1_reg_user_mr;
+	ibdev->dereg_mr = hfi1_dereg_mr;
+	ibdev->alloc_fast_reg_page_list = hfi1_alloc_fast_reg_page_list;
+	ibdev->free_fast_reg_page_list = hfi1_free_fast_reg_page_list;
+	ibdev->alloc_fmr = hfi1_alloc_fmr;
+	ibdev->map_phys_fmr = hfi1_map_phys_fmr;
+	ibdev->unmap_fmr = hfi1_unmap_fmr;
+	ibdev->dealloc_fmr = hfi1_dealloc_fmr;
+	ibdev->attach_mcast = hfi1_multicast_attach;
+	ibdev->detach_mcast = hfi1_multicast_detach;
+	ibdev->process_mad = hfi1_process_mad;
+	ibdev->mmap = hfi1_mmap;
+	ibdev->dma_ops = &hfi1_dma_mapping_ops;
+	ibdev->get_port_immutable = port_immutable;
+
+	strncpy(ibdev->node_desc, init_utsname()->nodename,
+		sizeof(ibdev->node_desc));
+
+	ret = ib_register_device(ibdev, hfi1_create_port_files);
+	if (ret)
+		goto err_reg;
+
+	ret = hfi1_create_agents(dev);
+	if (ret)
+		goto err_agents;
+
+	ret = hfi1_verbs_register_sysfs(dd);
+	if (ret)
+		goto err_class;
+
+	goto bail;
+
+err_class:
+	hfi1_free_agents(dev);
+err_agents:
+	ib_unregister_device(ibdev);
+err_reg:
+err_verbs_txreq:
+	kmem_cache_destroy(dev->verbs_txreq_cache);
+	vfree(dev->lk_table.table);
+err_lk:
+	hfi1_qp_exit(dev);
+err_qp_init:
+	dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+bail:
+	return ret;
+}
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
+{
+	struct hfi1_ibdev *dev = &dd->verbs_dev;
+	struct ib_device *ibdev = &dev->ibdev;
+
+	hfi1_verbs_unregister_sysfs(dd);
+
+	hfi1_free_agents(dev);
+
+	ib_unregister_device(ibdev);
+
+	if (!list_empty(&dev->txwait))
+		dd_dev_err(dd, "txwait list not empty!\n");
+	if (!list_empty(&dev->memwait))
+		dd_dev_err(dd, "memwait list not empty!\n");
+	if (dev->dma_mr)
+		dd_dev_err(dd, "DMA MR not NULL!\n");
+
+	hfi1_qp_exit(dev);
+	del_timer_sync(&dev->mem_timer);
+	kmem_cache_destroy(dev->verbs_txreq_cache);
+	vfree(dev->lk_table.table);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_schedule_send(struct hfi1_qp *qp)
+{
+	if (hfi1_send_ok(qp)) {
+		struct hfi1_ibport *ibp =
+			to_iport(qp->ibqp.device, qp->port_num);
+		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+		iowait_schedule(&qp->s_iowait, ppd->hfi1_wq);
+	}
+}
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+
+	if (packet->qp->ibqp.qp_type == IB_QPT_UC)
+		hfi1_uc_rcv(packet);
+	else if (packet->qp->ibqp.qp_type == IB_QPT_UD)
+		hfi1_ud_rcv(packet);
+	else
+		ibp->n_pkt_drops++;
+}
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
new file mode 100644
index 0000000..8125361
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -0,0 +1,1149 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_H
+#define HFI1_VERBS_H
+
+#include <linux/types.h>
+#include <linux/seqlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_mad.h>
+
+struct hfi1_ctxtdata;
+struct hfi1_pportdata;
+struct hfi1_devdata;
+struct hfi1_packet;
+
+#include "iowait.h"
+
+#define HFI1_MAX_RDMA_ATOMIC     16
+#define HFI1_GUIDS_PER_PORT	5
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define HFI1_UVERBS_ABI_VERSION       2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE      (IB_CQ_NEXT_COMP + 1)
+
+#define IB_SEQ_NAK	(3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK                      0x20
+#define IB_NAK_PSN_ERROR                0x60
+#define IB_NAK_INVALID_REQUEST          0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR      0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST       0x64
+
+/* Flags for checking QP state (see ib_hfi1_state_ops[]) */
+#define HFI1_POST_SEND_OK                0x01
+#define HFI1_POST_RECV_OK                0x02
+#define HFI1_PROCESS_RECV_OK             0x04
+#define HFI1_PROCESS_SEND_OK             0x08
+#define HFI1_PROCESS_NEXT_SEND_OK        0x10
+#define HFI1_FLUSH_SEND			0x20
+#define HFI1_FLUSH_RECV			0x40
+#define HFI1_PROCESS_OR_FLUSH_SEND \
+	(HFI1_PROCESS_SEND_OK | HFI1_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE       0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED    0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING    0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA   cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA    cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS   cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS    cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT   cpu_to_be16(0x0005)
+
+#define HFI1_VENDOR_IPG		cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK		(1 << 31)
+#define IB_BTH_SOLICITED	(1 << 23)
+#define IB_BTH_MIG_REQ		(1 << 22)
+
+#define IB_GRH_VERSION		6
+#define IB_GRH_VERSION_MASK	0xF
+#define IB_GRH_VERSION_SHIFT	28
+#define IB_GRH_TCLASS_MASK	0xFF
+#define IB_GRH_TCLASS_SHIFT	20
+#define IB_GRH_FLOW_MASK	0xFFFFF
+#define IB_GRH_FLOW_SHIFT	0
+#define IB_GRH_NEXT_HDR		0x1B
+
+#define IB_DEFAULT_GID_PREFIX	cpu_to_be64(0xfe80000000000000ULL)
+
+/* flags passed by hfi1_ib_rcv() */
+enum {
+	HFI1_HAS_GRH = (1 << 0),
+};
+
+struct ib_reth {
+	__be64 vaddr;
+	__be32 rkey;
+	__be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+	__be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
+	__be32 rkey;
+	__be64 swap_data;
+	__be64 compare_data;
+} __packed;
+
+union ib_ehdrs {
+	struct {
+		__be32 deth[2];
+		__be32 imm_data;
+	} ud;
+	struct {
+		struct ib_reth reth;
+		__be32 imm_data;
+	} rc;
+	struct {
+		__be32 aeth;
+		__be32 atomic_ack_eth[2];
+	} at;
+	__be32 imm_data;
+	__be32 aeth;
+	struct ib_atomic_eth atomic_eth;
+}  __packed;
+
+struct hfi1_other_headers {
+	__be32 bth[3];
+	union ib_ehdrs u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct hfi1_ib_header {
+	__be16 lrh[4];
+	union {
+		struct {
+			struct ib_grh grh;
+			struct hfi1_other_headers oth;
+		} l;
+		struct hfi1_other_headers oth;
+	} u;
+} __packed;
+
+struct ahg_ib_header {
+	struct sdma_engine *sde;
+	u32 ahgdesc[2];
+	u16 tx_flags;
+	u8 ahgcount;
+	u8 ahgidx;
+	struct hfi1_ib_header ibh;
+};
+
+struct hfi1_pio_header {
+	__le64 pbc;
+	struct hfi1_ib_header hdr;
+} __packed;
+
+/*
+ * used for force cacheline alignment for AHG
+ */
+struct tx_pio_header {
+	struct hfi1_pio_header phdr;
+} ____cacheline_aligned;
+
+/*
+ * There is one struct hfi1_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct hfi1_mcast_qp.
+ */
+struct hfi1_mcast_qp {
+	struct list_head list;
+	struct hfi1_qp *qp;
+};
+
+struct hfi1_mcast {
+	struct rb_node rb_node;
+	union ib_gid mgid;
+	struct list_head qp_list;
+	wait_queue_head_t wait;
+	atomic_t refcount;
+	int n_attached;
+};
+
+/* Protection domain */
+struct hfi1_pd {
+	struct ib_pd ibpd;
+	int user;               /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct hfi1_ah {
+	struct ib_ah ibah;
+	struct ib_ah_attr attr;
+	atomic_t refcount;
+};
+
+/*
+ * This structure is used by hfi1_mmap() to validate an offset
+ * when an mmap() request is made.  The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct hfi1_mmap_info {
+	struct list_head pending_mmaps;
+	struct ib_ucontext *context;
+	void *obj;
+	__u64 offset;
+	struct kref ref;
+	unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct hfi1_cq_wc {
+	u32 head;               /* index of next entry to fill */
+	u32 tail;               /* index of next ib_poll_cq() entry */
+	union {
+		/* these are actually size ibcq.cqe + 1 */
+		struct ib_uverbs_wc uqueue[0];
+		struct ib_wc kqueue[0];
+	};
+};
+
+/*
+ * The completion queue structure.
+ */
+struct hfi1_cq {
+	struct ib_cq ibcq;
+	struct kthread_work comptask;
+	struct hfi1_devdata *dd;
+	spinlock_t lock; /* protect changes in this struct */
+	u8 notify;
+	u8 triggered;
+	struct hfi1_cq_wc *queue;
+	struct hfi1_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * Used by the verbs layer.
+ */
+struct hfi1_seg {
+	void *vaddr;
+	size_t length;
+};
+
+/* The number of hfi1_segs that fit in a page. */
+#define HFI1_SEGSZ     (PAGE_SIZE / sizeof(struct hfi1_seg))
+
+struct hfi1_segarray {
+	struct hfi1_seg segs[HFI1_SEGSZ];
+};
+
+struct hfi1_mregion {
+	struct ib_pd *pd;       /* shares refcnt of ibmr.pd */
+	u64 user_base;          /* User's address for this region */
+	u64 iova;               /* IB start address of this region */
+	size_t length;
+	u32 lkey;
+	u32 offset;             /* offset (bytes) to start of region */
+	int access_flags;
+	u32 max_segs;           /* number of hfi1_segs in all the arrays */
+	u32 mapsz;              /* size of the map array */
+	u8  page_shift;         /* 0 - non unform/non powerof2 sizes */
+	u8  lkey_published;     /* in global table */
+	struct completion comp; /* complete when refcount goes to zero */
+	atomic_t refcount;
+	struct hfi1_segarray *map[0];    /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct hfi1_sge {
+	struct hfi1_mregion *mr;
+	void *vaddr;            /* kernel virtual address of segment */
+	u32 sge_length;         /* length of the SGE */
+	u32 length;             /* remaining length of the segment */
+	u16 m;                  /* current index: mr->map[m] */
+	u16 n;                  /* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct hfi1_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	struct hfi1_mregion mr;  /* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct hfi1_swqe {
+	struct ib_send_wr wr;   /* don't use wr.sg_list */
+	u32 psn;                /* first packet sequence number */
+	u32 lpsn;               /* last packet sequence number */
+	u32 ssn;                /* send sequence number */
+	u32 length;             /* total length of data in sg_list */
+	struct hfi1_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct hfi1_rwqe {
+	u64 wr_id;
+	u8 num_sge;
+	struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct hfi1_rwq {
+	u32 head;               /* new work requests posted to the head */
+	u32 tail;               /* receives pull requests from here. */
+	struct hfi1_rwqe wq[0];
+};
+
+struct hfi1_rq {
+	struct hfi1_rwq *wq;
+	u32 size;               /* size of RWQE array */
+	u8 max_sge;
+	/* protect changes in this struct */
+	spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+struct hfi1_srq {
+	struct ib_srq ibsrq;
+	struct hfi1_rq rq;
+	struct hfi1_mmap_info *ip;
+	/* send signal when number of RWQEs < limit */
+	u32 limit;
+};
+
+struct hfi1_sge_state {
+	struct hfi1_sge *sg_list;      /* next SGE to be used if any */
+	struct hfi1_sge sge;   /* progress state for the current SGE */
+	u32 total_len;
+	u8 num_sge;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct hfi1_ack_entry {
+	u8 opcode;
+	u8 sent;
+	u32 psn;
+	u32 lpsn;
+	union {
+		struct hfi1_sge rdma_sge;
+		u64 atomic_data;
+	};
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct hfi1_qp {
+	struct ib_qp ibqp;
+	/* read mostly fields above and below */
+	struct ib_ah_attr remote_ah_attr;
+	struct ib_ah_attr alt_ah_attr;
+	struct hfi1_qp __rcu *next;           /* link list for QPN hash table */
+	struct hfi1_swqe *s_wq;  /* send work queue */
+	struct hfi1_mmap_info *ip;
+	struct ahg_ib_header *s_hdr;     /* next packet header to send */
+	u8 s_sc;			/* SC[0..4] for next packet */
+	unsigned long timeout_jiffies;  /* computed from timeout */
+
+	enum ib_mtu path_mtu;
+	int srate_mbps;		/* s_srate (below) converted to Mbit/s */
+	u32 remote_qpn;
+	u32 pmtu;		/* decoded from path_mtu */
+	u32 qkey;               /* QKEY for this QP (for UD or RD) */
+	u32 s_size;             /* send work queue size */
+	u32 s_rnr_timeout;      /* number of milliseconds for RNR timeout */
+	u32 s_ahgpsn;           /* set to the psn in the copy of the header */
+
+	u8 state;               /* QP state */
+	u8 allowed_ops;		/* high order bits of allowed opcodes */
+	u8 qp_access_flags;
+	u8 alt_timeout;         /* Alternate path timeout for this QP */
+	u8 timeout;             /* Timeout for this QP */
+	u8 s_srate;
+	u8 s_mig_state;
+	u8 port_num;
+	u8 s_pkey_index;        /* PKEY index to use */
+	u8 s_alt_pkey_index;    /* Alternate path PKEY index to use */
+	u8 r_max_rd_atomic;     /* max number of RDMA read/atomic to receive */
+	u8 s_max_rd_atomic;     /* max number of RDMA read/atomic to send */
+	u8 s_retry_cnt;         /* number of times to retry */
+	u8 s_rnr_retry_cnt;
+	u8 r_min_rnr_timer;     /* retry timeout value for RNR NAKs */
+	u8 s_max_sge;           /* size of s_wq->sg_list */
+	u8 s_draining;
+
+	/* start of read/write fields */
+	atomic_t refcount ____cacheline_aligned_in_smp;
+	wait_queue_head_t wait;
+
+
+	struct hfi1_ack_entry s_ack_queue[HFI1_MAX_RDMA_ATOMIC + 1]
+		____cacheline_aligned_in_smp;
+	struct hfi1_sge_state s_rdma_read_sge;
+
+	spinlock_t r_lock ____cacheline_aligned_in_smp;      /* used for APM */
+	unsigned long r_aflags;
+	u64 r_wr_id;            /* ID for current receive WQE */
+	u32 r_ack_psn;          /* PSN for next ACK or atomic ACK */
+	u32 r_len;              /* total length of r_sge */
+	u32 r_rcv_len;          /* receive data len processed */
+	u32 r_psn;              /* expected rcv packet sequence number */
+	u32 r_msn;              /* message sequence number */
+
+	u8 r_state;             /* opcode of last packet received */
+	u8 r_flags;
+	u8 r_head_ack_queue;    /* index into s_ack_queue[] */
+
+	struct list_head rspwait;       /* link for waiting to respond */
+
+	struct hfi1_sge_state r_sge;     /* current receive data */
+	struct hfi1_rq r_rq;             /* receive work queue */
+
+	spinlock_t s_lock ____cacheline_aligned_in_smp;
+	struct hfi1_sge_state *s_cur_sge;
+	u32 s_flags;
+	struct hfi1_swqe *s_wqe;
+	struct hfi1_sge_state s_sge;     /* current send request data */
+	struct hfi1_mregion *s_rdma_mr;
+	struct sdma_engine *s_sde; /* current sde */
+	u32 s_cur_size;         /* size of send packet in bytes */
+	u32 s_len;              /* total length of s_sge */
+	u32 s_rdma_read_len;    /* total length of s_rdma_read_sge */
+	u32 s_next_psn;         /* PSN for next request */
+	u32 s_last_psn;         /* last response PSN processed */
+	u32 s_sending_psn;      /* lowest PSN that is being sent */
+	u32 s_sending_hpsn;     /* highest PSN that is being sent */
+	u32 s_psn;              /* current packet sequence number */
+	u32 s_ack_rdma_psn;     /* PSN for sending RDMA read responses */
+	u32 s_ack_psn;          /* PSN for acking sends and RDMA writes */
+	u32 s_head;             /* new entries added here */
+	u32 s_tail;             /* next entry to process */
+	u32 s_cur;              /* current work queue entry */
+	u32 s_acked;            /* last un-ACK'ed entry */
+	u32 s_last;             /* last completed entry */
+	u32 s_ssn;              /* SSN of tail entry */
+	u32 s_lsn;              /* limit sequence number (credit) */
+	u16 s_hdrwords;         /* size of s_hdr in 32 bit words */
+	u16 s_rdma_ack_cnt;
+	s8 s_ahgidx;
+	u8 s_state;             /* opcode of last packet sent */
+	u8 s_ack_state;         /* opcode of packet to ACK */
+	u8 s_nak_state;         /* non-zero if NAK is pending */
+	u8 r_nak_state;         /* non-zero if NAK is pending */
+	u8 s_retry;             /* requester retry counter */
+	u8 s_rnr_retry;         /* requester RNR retry counter */
+	u8 s_num_rd_atomic;     /* number of RDMA read/atomic pending */
+	u8 s_tail_ack_queue;    /* index into s_ack_queue[] */
+
+	struct hfi1_sge_state s_ack_rdma_sge;
+	struct timer_list s_timer;
+
+	struct iowait s_iowait;
+
+	struct hfi1_sge r_sg_list[0] /* verified SGEs */
+		____cacheline_aligned_in_smp;
+};
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define HFI1_R_WRID_VALID        0
+#define HFI1_R_REWIND_SGE        1
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define HFI1_R_REUSE_SGE 0x01
+#define HFI1_R_RDMAR_SEQ 0x02
+#define HFI1_R_RSP_NAK   0x04
+#define HFI1_R_RSP_SEND  0x08
+#define HFI1_R_COMM_EST  0x10
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * HFI1_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled
+ * HFI1_S_BUSY - send tasklet is processing the QP
+ * HFI1_S_TIMER - the RC retry timer is active
+ * HFI1_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics
+ * HFI1_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs
+ *                         before processing the next SWQE
+ * HFI1_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete
+ *                         before processing the next SWQE
+ * HFI1_S_WAIT_RNR - waiting for RNR timeout
+ * HFI1_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * HFI1_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ *                  next send completion entry not via send DMA
+ * HFI1_S_WAIT_PIO - waiting for a send buffer to be available
+ * HFI1_S_WAIT_TX - waiting for a struct verbs_txreq to be available
+ * HFI1_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
+ * HFI1_S_WAIT_KMEM - waiting for kernel memory to be available
+ * HFI1_S_WAIT_PSN - waiting for a packet to exit the send DMA queue
+ * HFI1_S_WAIT_ACK - waiting for an ACK packet before sending more requests
+ * HFI1_S_SEND_ONE - send one packet, request ACK, then wait for ACK
+ * HFI1_S_ECN - a BECN was queued to the send engine
+ */
+#define HFI1_S_SIGNAL_REQ_WR	0x0001
+#define HFI1_S_BUSY		0x0002
+#define HFI1_S_TIMER		0x0004
+#define HFI1_S_RESP_PENDING	0x0008
+#define HFI1_S_ACK_PENDING	0x0010
+#define HFI1_S_WAIT_FENCE	0x0020
+#define HFI1_S_WAIT_RDMAR	0x0040
+#define HFI1_S_WAIT_RNR		0x0080
+#define HFI1_S_WAIT_SSN_CREDIT	0x0100
+#define HFI1_S_WAIT_DMA		0x0200
+#define HFI1_S_WAIT_PIO		0x0400
+#define HFI1_S_WAIT_TX		0x0800
+#define HFI1_S_WAIT_DMA_DESC	0x1000
+#define HFI1_S_WAIT_KMEM		0x2000
+#define HFI1_S_WAIT_PSN		0x4000
+#define HFI1_S_WAIT_ACK		0x8000
+#define HFI1_S_SEND_ONE		0x10000
+#define HFI1_S_UNLIMITED_CREDIT	0x20000
+#define HFI1_S_AHG_VALID		0x40000
+#define HFI1_S_AHG_CLEAR		0x80000
+#define HFI1_S_ECN		0x100000
+
+/*
+ * Wait flags that would prevent any packet type from being sent.
+ */
+#define HFI1_S_ANY_WAIT_IO (HFI1_S_WAIT_PIO | HFI1_S_WAIT_TX | \
+	HFI1_S_WAIT_DMA_DESC | HFI1_S_WAIT_KMEM)
+
+/*
+ * Wait flags that would prevent send work requests from making progress.
+ */
+#define HFI1_S_ANY_WAIT_SEND (HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR | \
+	HFI1_S_WAIT_RNR | HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_DMA | \
+	HFI1_S_WAIT_PSN | HFI1_S_WAIT_ACK)
+
+#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | HFI1_S_ANY_WAIT_SEND)
+
+#define HFI1_PSN_CREDIT  16
+
+/*
+ * Since struct hfi1_swqe is not a fixed size, we can't simply index into
+ * struct hfi1_qp.s_wq.  This function does the array index computation.
+ */
+static inline struct hfi1_swqe *get_swqe_ptr(struct hfi1_qp *qp,
+					     unsigned n)
+{
+	return (struct hfi1_swqe *)((char *)qp->s_wq +
+				     (sizeof(struct hfi1_swqe) +
+				      qp->s_max_sge *
+				      sizeof(struct hfi1_sge)) * n);
+}
+
+/*
+ * Since struct hfi1_rwqe is not a fixed size, we can't simply index into
+ * struct hfi1_rwq.wq.  This function does the array index computation.
+ */
+static inline struct hfi1_rwqe *get_rwqe_ptr(struct hfi1_rq *rq, unsigned n)
+{
+	return (struct hfi1_rwqe *)
+		((char *) rq->wq->wq +
+		 (sizeof(struct hfi1_rwqe) +
+		  rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+#define MAX_LKEY_TABLE_BITS 23
+
+struct hfi1_lkey_table {
+	spinlock_t lock; /* protect changes in this struct */
+	u32 next;               /* next unused index (speeds search) */
+	u32 gen;                /* generation count */
+	u32 max;                /* size of the table */
+	struct hfi1_mregion __rcu **table;
+};
+
+struct hfi1_opcode_stats {
+	u64 n_packets;          /* number of packets */
+	u64 n_bytes;            /* total number of bytes */
+};
+
+struct hfi1_opcode_stats_perctx {
+	struct hfi1_opcode_stats stats[256];
+};
+
+static inline void inc_opstats(
+	u32 tlen,
+	struct hfi1_opcode_stats *stats)
+{
+#ifdef CONFIG_DEBUG_FS
+	stats->n_bytes += tlen;
+	stats->n_packets++;
+#endif
+}
+
+struct hfi1_ibport {
+	struct hfi1_qp __rcu *qp[2];
+	struct ib_mad_agent *send_agent;	/* agent for SMI (traps) */
+	struct hfi1_ah *sm_ah;
+	struct hfi1_ah *smi_ah;
+	struct rb_root mcast_tree;
+	spinlock_t lock;		/* protect changes in this struct */
+
+	/* non-zero when timer is set */
+	unsigned long mkey_lease_timeout;
+	unsigned long trap_timeout;
+	__be64 gid_prefix;      /* in network order */
+	__be64 mkey;
+	__be64 guids[HFI1_GUIDS_PER_PORT	- 1];	/* writable GUIDs */
+	u64 tid;		/* TID for traps */
+	u64 n_rc_resends;
+	u64 n_seq_naks;
+	u64 n_rdma_seq;
+	u64 n_rnr_naks;
+	u64 n_other_naks;
+	u64 n_loop_pkts;
+	u64 n_pkt_drops;
+	u64 n_vl15_dropped;
+	u64 n_rc_timeouts;
+	u64 n_dmawait;
+	u64 n_unaligned;
+	u64 n_rc_dupreq;
+	u64 n_rc_seqnak;
+
+	/* Hot-path per CPU counters to avoid cacheline trading to update */
+	u64 z_rc_acks;
+	u64 z_rc_qacks;
+	u64 z_rc_delayed_comp;
+	u64 __percpu *rc_acks;
+	u64 __percpu *rc_qacks;
+	u64 __percpu *rc_delayed_comp;
+
+	u32 port_cap_flags;
+	u32 pma_sample_start;
+	u32 pma_sample_interval;
+	__be16 pma_counter_select[5];
+	u16 pma_tag;
+	u16 pkey_violations;
+	u16 qkey_violations;
+	u16 mkey_violations;
+	u16 mkey_lease_period;
+	u16 sm_lid;
+	u16 repress_traps;
+	u8 sm_sl;
+	u8 mkeyprot;
+	u8 subnet_timeout;
+	u8 vl_high_limit;
+	/* the first 16 entries are sl_to_vl for !OPA */
+	u8 sl_to_sc[32];
+	u8 sc_to_sl[32];
+};
+
+
+struct hfi1_qp_ibdev;
+struct hfi1_ibdev {
+	struct ib_device ibdev;
+	struct list_head pending_mmaps;
+	spinlock_t mmap_offset_lock; /* protect mmap_offset */
+	u32 mmap_offset;
+	struct hfi1_mregion __rcu *dma_mr;
+
+	struct hfi1_qp_ibdev *qp_dev;
+
+	/* QP numbers are shared by all IB ports */
+	struct hfi1_lkey_table lk_table;
+	/* protect wait lists */
+	seqlock_t iowait_lock;
+	struct list_head txwait;        /* list for wait verbs_txreq */
+	struct list_head memwait;       /* list for wait kernel memory */
+	struct list_head txreq_free;
+	struct kmem_cache *verbs_txreq_cache;
+	struct timer_list mem_timer;
+
+	/* other waiters */
+	spinlock_t pending_lock;
+
+	u64 n_piowait;
+	u64 n_txwait;
+	u64 n_kmem_wait;
+
+	u32 n_pds_allocated;    /* number of PDs allocated for device */
+	spinlock_t n_pds_lock;
+	u32 n_ahs_allocated;    /* number of AHs allocated for device */
+	spinlock_t n_ahs_lock;
+	u32 n_cqs_allocated;    /* number of CQs allocated for device */
+	spinlock_t n_cqs_lock;
+	u32 n_qps_allocated;    /* number of QPs allocated for device */
+	spinlock_t n_qps_lock;
+	u32 n_srqs_allocated;   /* number of SRQs allocated for device */
+	spinlock_t n_srqs_lock;
+	u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+	spinlock_t n_mcast_grps_lock;
+#ifdef CONFIG_DEBUG_FS
+	/* per HFI debugfs */
+	struct dentry *hfi1_ibdev_dbg;
+	/* per HFI symlinks to above */
+	struct dentry *hfi1_ibdev_link;
+#endif
+};
+
+struct hfi1_verbs_counters {
+	u64 symbol_error_counter;
+	u64 link_error_recovery_counter;
+	u64 link_downed_counter;
+	u64 port_rcv_errors;
+	u64 port_rcv_remphys_errors;
+	u64 port_xmit_discards;
+	u64 port_xmit_data;
+	u64 port_rcv_data;
+	u64 port_xmit_packets;
+	u64 port_rcv_packets;
+	u32 local_link_integrity_errors;
+	u32 excessive_buffer_overrun_errors;
+	u32 vl15_dropped;
+};
+
+static inline struct hfi1_mr *to_imr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct hfi1_mr, ibmr);
+}
+
+static inline struct hfi1_pd *to_ipd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct hfi1_pd, ibpd);
+}
+
+static inline struct hfi1_ah *to_iah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct hfi1_ah, ibah);
+}
+
+static inline struct hfi1_cq *to_icq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct hfi1_cq, ibcq);
+}
+
+static inline struct hfi1_srq *to_isrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct hfi1_srq, ibsrq);
+}
+
+static inline struct hfi1_qp *to_iqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct hfi1_qp, ibqp);
+}
+
+static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct hfi1_ibdev, ibdev);
+}
+
+/*
+ * Send if not busy or waiting for I/O and either
+ * a RC response is pending or we can process send work requests.
+ */
+static inline int hfi1_send_ok(struct hfi1_qp *qp)
+{
+	return !(qp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT_IO)) &&
+		(qp->s_hdrwords || (qp->s_flags & HFI1_S_RESP_PENDING) ||
+		 !(qp->s_flags & HFI1_S_ANY_WAIT_SEND));
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_schedule_send(struct hfi1_qp *qp);
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+		    u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
+void hfi1_cap_mask_chg(struct hfi1_ibport *ibp);
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+		     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		     const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+		     struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+		     u16 *out_mad_pkey_index);
+int hfi1_create_agents(struct hfi1_ibdev *dev);
+void hfi1_free_agents(struct hfi1_ibdev *dev);
+
+/*
+ * The PSN_MASK and PSN_SHIFT allow for
+ * 1) comparing two PSNs
+ * 2) returning the PSN with any upper bits masked
+ * 3) returning the difference between to PSNs
+ *
+ * The number of significant bits in the PSN must
+ * necessarily be at least one bit less than
+ * the container holding the PSN.
+ */
+#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
+#define PSN_MASK 0xFFFFFF
+#define PSN_SHIFT 8
+#else
+#define PSN_MASK 0x7FFFFFFF
+#define PSN_SHIFT 1
+#endif
+#define PSN_MODIFY_MASK 0xFFFFFF
+
+/* Number of bits to pay attention to in the opcode for checking qp type */
+#define OPCODE_QP_MASK 0xE0
+
+/*
+ * Compare the lower 24 bits of the msn values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_msn(u32 a, u32 b)
+{
+	return (((int) a) - ((int) b)) << 8;
+}
+
+/*
+ * Compare two PSNs
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_psn(u32 a, u32 b)
+{
+	return (((int) a) - ((int) b)) << PSN_SHIFT;
+}
+
+/*
+ * Return masked PSN
+ */
+static inline u32 mask_psn(u32 a)
+{
+	return a & PSN_MASK;
+}
+
+/*
+ * Return delta between two PSNs
+ */
+static inline u32 delta_psn(u32 a, u32 b)
+{
+	return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
+}
+
+struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid);
+
+int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp);
+
+struct verbs_txreq;
+void hfi1_put_txreq(struct verbs_txreq *tx);
+
+int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+		    u32 hdrwords, struct hfi1_sge_state *ss, u32 len);
+
+void hfi1_copy_sge(struct hfi1_sge_state *ss, void *data, u32 length,
+		   int release);
+
+void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release);
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet);
+
+void hfi1_uc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_hdrerr(
+	struct hfi1_ctxtdata *rcd,
+	struct hfi1_ib_header *hdr,
+	u32 rcv_flags,
+	struct hfi1_qp *qp);
+
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
+
+void hfi1_rc_rnr_retry(unsigned long arg);
+
+void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr);
+
+void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err);
+
+void hfi1_ud_rcv(struct hfi1_packet *packet);
+
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
+
+int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region);
+
+void hfi1_free_lkey(struct hfi1_mregion *mr);
+
+int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd,
+		 struct hfi1_sge *isge, struct ib_sge *sge, int acc);
+
+int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge,
+		 u32 len, u64 vaddr, u32 rkey, int acc);
+
+int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr);
+
+struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd,
+			       struct ib_srq_init_attr *srq_init_attr,
+			       struct ib_udata *udata);
+
+int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		    enum ib_srq_attr_mask attr_mask,
+		    struct ib_udata *udata);
+
+int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int hfi1_destroy_srq(struct ib_srq *ibsrq);
+
+int hfi1_cq_init(struct hfi1_devdata *dd);
+
+void hfi1_cq_exit(struct hfi1_devdata *dd);
+
+void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int sig);
+
+int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *hfi1_create_cq(
+	struct ib_device *ibdev,
+	const struct ib_cq_init_attr *attr,
+	struct ib_ucontext *context,
+	struct ib_udata *udata);
+
+int hfi1_destroy_cq(struct ib_cq *ibcq);
+
+int hfi1_req_notify_cq(
+	struct ib_cq *ibcq,
+	enum ib_cq_notify_flags notify_flags);
+
+int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
+			       struct ib_phys_buf *buffer_list,
+			       int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			       u64 virt_addr, int mr_access_flags,
+			       struct ib_udata *udata);
+
+int hfi1_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_mr *hfi1_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+struct ib_fast_reg_page_list *hfi1_alloc_fast_reg_page_list(
+				struct ib_device *ibdev, int page_list_len);
+
+void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl);
+
+int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr);
+
+struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			      struct ib_fmr_attr *fmr_attr);
+
+int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		      int list_len, u64 iova);
+
+int hfi1_unmap_fmr(struct list_head *fmr_list);
+
+int hfi1_dealloc_fmr(struct ib_fmr *ibfmr);
+
+static inline void hfi1_get_mr(struct hfi1_mregion *mr)
+{
+	atomic_inc(&mr->refcount);
+}
+
+static inline void hfi1_put_mr(struct hfi1_mregion *mr)
+{
+	if (unlikely(atomic_dec_and_test(&mr->refcount)))
+		complete(&mr->comp);
+}
+
+static inline void hfi1_put_ss(struct hfi1_sge_state *ss)
+{
+	while (ss->num_sge) {
+		hfi1_put_mr(ss->sge.mr);
+		if (--ss->num_sge)
+			ss->sge = *ss->sg_list++;
+	}
+}
+
+void hfi1_release_mmap_info(struct kref *ref);
+
+struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, u32 size,
+					     struct ib_ucontext *context,
+					     void *obj);
+
+void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip,
+			   u32 size, void *obj);
+
+int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only);
+
+void hfi1_migrate_qp(struct hfi1_qp *qp);
+
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+		       int has_grh, struct hfi1_qp *qp, u32 bth0);
+
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+		  struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void clear_ahg(struct hfi1_qp *qp);
+
+void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
+			  u32 bth0, u32 bth2, int middle);
+
+void hfi1_do_send(struct work_struct *work);
+
+void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe,
+			enum ib_wc_status status);
+
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct hfi1_qp *qp, int is_fecn);
+
+int hfi1_make_rc_req(struct hfi1_qp *qp);
+
+int hfi1_make_uc_req(struct hfi1_qp *qp);
+
+int hfi1_make_ud_req(struct hfi1_qp *qp);
+
+int hfi1_register_ib_device(struct hfi1_devdata *);
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *);
+
+void hfi1_ib_rcv(struct hfi1_packet *packet);
+
+unsigned hfi1_get_npkeys(struct hfi1_devdata *);
+
+int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *hdr,
+			u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+			u32 plen, u32 dwords, u64 pbc);
+
+int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *hdr,
+			u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+			u32 plen, u32 dwords, u64 pbc);
+
+struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5);
+
+extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
+
+extern const u8 hdr_len_by_opcode[];
+
+extern const int ib_hfi1_state_ops[];
+
+extern __be64 ib_hfi1_sys_image_guid;    /* in network order */
+
+extern unsigned int hfi1_lkey_table_size;
+
+extern unsigned int hfi1_max_cqes;
+
+extern unsigned int hfi1_max_cqs;
+
+extern unsigned int hfi1_max_qp_wrs;
+
+extern unsigned int hfi1_max_qps;
+
+extern unsigned int hfi1_max_sges;
+
+extern unsigned int hfi1_max_mcast_grps;
+
+extern unsigned int hfi1_max_mcast_qp_attached;
+
+extern unsigned int hfi1_max_srqs;
+
+extern unsigned int hfi1_max_srq_sges;
+
+extern unsigned int hfi1_max_srq_wrs;
+
+extern const u32 ib_hfi1_rnr_table[];
+
+extern struct ib_dma_mapping_ops hfi1_dma_mapping_ops;
+
+#endif                          /* HFI1_VERBS_H */
diff --git a/drivers/staging/rdma/hfi1/verbs_mcast.c b/drivers/staging/rdma/hfi1/verbs_mcast.c
new file mode 100644
index 0000000..afc6b4c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs_mcast.c
@@ -0,0 +1,385 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/rculist.h>
+
+#include "hfi.h"
+
+/**
+ * mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct hfi1_mcast_qp *mcast_qp_alloc(struct hfi1_qp *qp)
+{
+	struct hfi1_mcast_qp *mqp;
+
+	mqp = kmalloc(sizeof(*mqp), GFP_KERNEL);
+	if (!mqp)
+		goto bail;
+
+	mqp->qp = qp;
+	atomic_inc(&qp->refcount);
+
+bail:
+	return mqp;
+}
+
+static void mcast_qp_free(struct hfi1_mcast_qp *mqp)
+{
+	struct hfi1_qp *qp = mqp->qp;
+
+	/* Notify hfi1_destroy_qp() if it is waiting. */
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+
+	kfree(mqp);
+}
+
+/**
+ * mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct hfi1_mcast *mcast_alloc(union ib_gid *mgid)
+{
+	struct hfi1_mcast *mcast;
+
+	mcast = kmalloc(sizeof(*mcast), GFP_KERNEL);
+	if (!mcast)
+		goto bail;
+
+	mcast->mgid = *mgid;
+	INIT_LIST_HEAD(&mcast->qp_list);
+	init_waitqueue_head(&mcast->wait);
+	atomic_set(&mcast->refcount, 0);
+	mcast->n_attached = 0;
+
+bail:
+	return mcast;
+}
+
+static void mcast_free(struct hfi1_mcast *mcast)
+{
+	struct hfi1_mcast_qp *p, *tmp;
+
+	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+		mcast_qp_free(p);
+
+	kfree(mcast);
+}
+
+/**
+ * hfi1_mcast_find - search the global table for the given multicast GID
+ * @ibp: the IB port structure
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid)
+{
+	struct rb_node *n;
+	unsigned long flags;
+	struct hfi1_mcast *mcast;
+
+	spin_lock_irqsave(&ibp->lock, flags);
+	n = ibp->mcast_tree.rb_node;
+	while (n) {
+		int ret;
+
+		mcast = rb_entry(n, struct hfi1_mcast, rb_node);
+
+		ret = memcmp(mgid->raw, mcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else {
+			atomic_inc(&mcast->refcount);
+			spin_unlock_irqrestore(&ibp->lock, flags);
+			goto bail;
+		}
+	}
+	spin_unlock_irqrestore(&ibp->lock, flags);
+
+	mcast = NULL;
+
+bail:
+	return mcast;
+}
+
+/**
+ * mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added.  Return EEXIST if the GID was already in
+ * the table but the QP was added.  Return ESRCH if the QP was already
+ * attached and neither structure was added.
+ */
+static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp,
+		     struct hfi1_mcast *mcast, struct hfi1_mcast_qp *mqp)
+{
+	struct rb_node **n = &ibp->mcast_tree.rb_node;
+	struct rb_node *pn = NULL;
+	int ret;
+
+	spin_lock_irq(&ibp->lock);
+
+	while (*n) {
+		struct hfi1_mcast *tmcast;
+		struct hfi1_mcast_qp *p;
+
+		pn = *n;
+		tmcast = rb_entry(pn, struct hfi1_mcast, rb_node);
+
+		ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0) {
+			n = &pn->rb_left;
+			continue;
+		}
+		if (ret > 0) {
+			n = &pn->rb_right;
+			continue;
+		}
+
+		/* Search the QP list to see if this is already there. */
+		list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+			if (p->qp == mqp->qp) {
+				ret = ESRCH;
+				goto bail;
+			}
+		}
+		if (tmcast->n_attached == hfi1_max_mcast_qp_attached) {
+			ret = ENOMEM;
+			goto bail;
+		}
+
+		tmcast->n_attached++;
+
+		list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+		ret = EEXIST;
+		goto bail;
+	}
+
+	spin_lock(&dev->n_mcast_grps_lock);
+	if (dev->n_mcast_grps_allocated == hfi1_max_mcast_grps) {
+		spin_unlock(&dev->n_mcast_grps_lock);
+		ret = ENOMEM;
+		goto bail;
+	}
+
+	dev->n_mcast_grps_allocated++;
+	spin_unlock(&dev->n_mcast_grps_lock);
+
+	mcast->n_attached++;
+
+	list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+	atomic_inc(&mcast->refcount);
+	rb_link_node(&mcast->rb_node, pn, n);
+	rb_insert_color(&mcast->rb_node, &ibp->mcast_tree);
+
+	ret = 0;
+
+bail:
+	spin_unlock_irq(&ibp->lock);
+
+	return ret;
+}
+
+int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct hfi1_qp *qp = to_iqp(ibqp);
+	struct hfi1_ibdev *dev = to_idev(ibqp->device);
+	struct hfi1_ibport *ibp;
+	struct hfi1_mcast *mcast;
+	struct hfi1_mcast_qp *mqp;
+	int ret;
+
+	if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Allocate data structures since its better to do this outside of
+	 * spin locks and it will most likely be needed.
+	 */
+	mcast = mcast_alloc(gid);
+	if (mcast == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	mqp = mcast_qp_alloc(qp);
+	if (mqp == NULL) {
+		mcast_free(mcast);
+		ret = -ENOMEM;
+		goto bail;
+	}
+	ibp = to_iport(ibqp->device, qp->port_num);
+	switch (mcast_add(dev, ibp, mcast, mqp)) {
+	case ESRCH:
+		/* Neither was used: OK to attach the same QP twice. */
+		mcast_qp_free(mqp);
+		mcast_free(mcast);
+		break;
+
+	case EEXIST:            /* The mcast wasn't used */
+		mcast_free(mcast);
+		break;
+
+	case ENOMEM:
+		/* Exceeded the maximum number of mcast groups. */
+		mcast_qp_free(mqp);
+		mcast_free(mcast);
+		ret = -ENOMEM;
+		goto bail;
+
+	default:
+		break;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct hfi1_qp *qp = to_iqp(ibqp);
+	struct hfi1_ibdev *dev = to_idev(ibqp->device);
+	struct hfi1_ibport *ibp = to_iport(ibqp->device, qp->port_num);
+	struct hfi1_mcast *mcast = NULL;
+	struct hfi1_mcast_qp *p, *tmp;
+	struct rb_node *n;
+	int last = 0;
+	int ret;
+
+	if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	spin_lock_irq(&ibp->lock);
+
+	/* Find the GID in the mcast table. */
+	n = ibp->mcast_tree.rb_node;
+	while (1) {
+		if (n == NULL) {
+			spin_unlock_irq(&ibp->lock);
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		mcast = rb_entry(n, struct hfi1_mcast, rb_node);
+		ret = memcmp(gid->raw, mcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			break;
+	}
+
+	/* Search the QP list. */
+	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+		if (p->qp != qp)
+			continue;
+		/*
+		 * We found it, so remove it, but don't poison the forward
+		 * link until we are sure there are no list walkers.
+		 */
+		list_del_rcu(&p->list);
+		mcast->n_attached--;
+
+		/* If this was the last attached QP, remove the GID too. */
+		if (list_empty(&mcast->qp_list)) {
+			rb_erase(&mcast->rb_node, &ibp->mcast_tree);
+			last = 1;
+		}
+		break;
+	}
+
+	spin_unlock_irq(&ibp->lock);
+
+	if (p) {
+		/*
+		 * Wait for any list walkers to finish before freeing the
+		 * list element.
+		 */
+		wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+		mcast_qp_free(p);
+	}
+	if (last) {
+		atomic_dec(&mcast->refcount);
+		wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+		mcast_free(mcast);
+		spin_lock_irq(&dev->n_mcast_grps_lock);
+		dev->n_mcast_grps_allocated--;
+		spin_unlock_irq(&dev->n_mcast_grps_lock);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp)
+{
+	return ibp->mcast_tree.rb_node == NULL;
+}
-- 
cgit v0.10.2


From a724648e8a8f7ff874f0d58491b39517718c7237 Mon Sep 17 00:00:00 2001
From: Jeff Becker <jeffrey.c.becker@nasa.gov>
Date: Fri, 21 Aug 2015 12:26:22 -0700
Subject: staging/hfi1: replace indent spaces with tabs

Running checkpatch.pl on mad.c produces several
"ERROR: code indent should use tabs where possible" messages.
This patch fixes these.

Signed-off-by: Jeff Becker <Jeffrey.C.Becker@nasa.gov>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c
index 0a18fee..37269eb 100644
--- a/drivers/staging/rdma/hfi1/mad.c
+++ b/drivers/staging/rdma/hfi1/mad.c
@@ -4075,11 +4075,11 @@ bail:
 }
 
 static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
-			        u8 port, const struct ib_wc *in_wc,
-			        const struct ib_grh *in_grh,
-			        const struct opa_mad *in_mad,
-			        struct opa_mad *out_mad, size_t *out_mad_size,
-			        u16 *out_mad_pkey_index)
+				u8 port, const struct ib_wc *in_wc,
+				const struct ib_grh *in_grh,
+				const struct opa_mad *in_mad,
+				struct opa_mad *out_mad, size_t *out_mad_size,
+				u16 *out_mad_pkey_index)
 {
 	int ret;
 	int pkey_idx;
-- 
cgit v0.10.2


From 18ebd40773bf500054a5e079e71cd13aafd1b6eb Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Mon, 27 Jul 2015 18:10:01 -0500
Subject: mlx4, mlx5, mthca: Expose max_sge_rd correctly

Applications must not assume that max_sge and max_sge_rd are the same,
Hence expose max_sge_rd correctly as well.

Reported-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 8be6db8..05166b7 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -229,6 +229,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 	props->max_qp_wr	   = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
 	props->max_sge		   = min(dev->dev->caps.max_sq_sg,
 					 dev->dev->caps.max_rq_sg);
+	props->max_sge_rd = props->max_sge;
 	props->max_cq		   = dev->dev->quotas.cq;
 	props->max_cqe		   = dev->dev->caps.max_cqes;
 	props->max_mr		   = dev->dev->quotas.mpt;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 1ece3a7..bac5f98 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -274,6 +274,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		     sizeof(struct mlx5_wqe_ctrl_seg)) /
 		     sizeof(struct mlx5_wqe_data_seg);
 	props->max_sge = min(max_rq_sg, max_sq_sg);
+	props->max_sge_rd = props->max_sge;
 	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
 	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1;
 	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 93ae51d..dc2d48c 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -97,6 +97,7 @@ static int mthca_query_device(struct ib_device *ibdev, struct ib_device_attr *pr
 	props->max_qp              = mdev->limits.num_qps - mdev->limits.reserved_qps;
 	props->max_qp_wr           = mdev->limits.max_wqes;
 	props->max_sge             = mdev->limits.max_sg;
+	props->max_sge_rd          = props->max_sge;
 	props->max_cq              = mdev->limits.num_cqs - mdev->limits.reserved_cqs;
 	props->max_cqe             = mdev->limits.max_cqes;
 	props->max_mr              = mdev->limits.num_mpts - mdev->limits.reserved_mrws;
-- 
cgit v0.10.2


From aaae91f4f05c39f02e36e89b00ad84465f8eb02b Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Mon, 27 Jul 2015 18:10:07 -0500
Subject: ipath,qib: Expose max_sge_rd correctly

Applications must not assume that max_sge and max_sge_rd are the same,
Hence expose max_sge_rd correctly as well.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Acked-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 77e981a..bc0599c 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -1575,6 +1575,7 @@ static int qib_query_device(struct ib_device *ibdev, struct ib_device_attr *prop
 	props->max_qp = ib_qib_max_qps;
 	props->max_qp_wr = ib_qib_max_qp_wrs;
 	props->max_sge = ib_qib_max_sges;
+	props->max_sge_rd = ib_qib_max_sges;
 	props->max_cq = ib_qib_max_cqs;
 	props->max_ah = ib_qib_max_ahs;
 	props->max_cqe = ib_qib_max_cqes;
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c
index 30ba49c..ed2bbc2 100644
--- a/drivers/staging/rdma/ipath/ipath_verbs.c
+++ b/drivers/staging/rdma/ipath/ipath_verbs.c
@@ -1521,6 +1521,7 @@ static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *pr
 	props->max_qp = ib_ipath_max_qps;
 	props->max_qp_wr = ib_ipath_max_qp_wrs;
 	props->max_sge = ib_ipath_max_sges;
+	props->max_sge_rd = ib_ipath_max_sges;
 	props->max_cq = ib_ipath_max_cqs;
 	props->max_ah = ib_ipath_max_ahs;
 	props->max_cqe = ib_ipath_max_cqes;
-- 
cgit v0.10.2


From bc3fe2e3769874dfa8674791e84c4a901ba9e48b Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Mon, 27 Jul 2015 18:10:12 -0500
Subject: svcrdma: Use max_sge_rd for destination read depths

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index cb94ee4..83211bc 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -132,6 +132,7 @@ struct svcxprt_rdma {
 	struct list_head     sc_accept_q;	/* Conn. waiting accept */
 	int		     sc_ord;		/* RDMA read limit */
 	int                  sc_max_sge;
+	int                  sc_max_sge_rd;	/* max sge for read target */
 
 	int                  sc_sq_depth;	/* Depth of SQ */
 	atomic_t             sc_sq_count;	/* Number of SQ WR on queue */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2e1348b..cb51742 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	rqstp->rq_arg.tail[0].iov_len = 0;
 }
 
-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
-{
-	if (!rdma_cap_read_multi_sge(xprt->sc_cm_id->device,
-				     xprt->sc_cm_id->port_num))
-		return 1;
-	else
-		return min_t(int, sge_count, xprt->sc_max_sge);
-}
-
 /* Issue an RDMA_READ using the local lkey to map the data sink */
 int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 			struct svc_rqst *rqstp,
@@ -144,8 +135,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 
 	ctxt->direction = DMA_FROM_DEVICE;
 	ctxt->read_hdr = head;
-	pages_needed =
-		min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
+	pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd);
 	read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
 
 	for (pno = 0; pno < pages_needed; pno++) {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 6b36279..fdc850f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -872,6 +872,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	 * capabilities of this particular device */
 	newxprt->sc_max_sge = min((size_t)devattr.max_sge,
 				  (size_t)RPCSVC_MAXPAGES);
+	newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+				       RPCSVC_MAXPAGES);
 	newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
 				   (size_t)svcrdma_max_requests);
 	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
@@ -1046,6 +1048,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		"    remote_ip       : %pI4\n"
 		"    remote_port     : %d\n"
 		"    max_sge         : %d\n"
+		"    max_sge_rd      : %d\n"
 		"    sq_depth        : %d\n"
 		"    max_requests    : %d\n"
 		"    ord             : %d\n",
@@ -1059,6 +1062,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
 		       route.addr.dst_addr)->sin_port),
 		newxprt->sc_max_sge,
+		newxprt->sc_max_sge_rd,
 		newxprt->sc_sq_depth,
 		newxprt->sc_max_requests,
 		newxprt->sc_ord);
-- 
cgit v0.10.2


From 3403051ebbd486a342272a404f16e7f1aca8758e Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Mon, 27 Jul 2015 18:10:18 -0500
Subject: RDMA/Core: remove rdma_cap_read_multi_sge() helper

This functionality already exists via the max_sge_rd
device capability.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b0f898e..7448a27 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2071,34 +2071,6 @@ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
 }
 
 /**
- * rdma_cap_read_multi_sge - Check if the port of device has the capability
- * RDMA Read Multiple Scatter-Gather Entries.
- * @device: Device to check
- * @port_num: Port number to check
- *
- * iWARP has a restriction that RDMA READ requests may only have a single
- * Scatter/Gather Entry (SGE) in the work request.
- *
- * NOTE: although the linux kernel currently assumes all devices are either
- * single SGE RDMA READ devices or identical SGE maximums for RDMA READs and
- * WRITEs, according to Tom Talpey, this is not accurate.  There are some
- * devices out there that support more than a single SGE on RDMA READ
- * requests, but do not support the same number of SGEs as they do on
- * RDMA WRITE requests.  The linux kernel would need rearchitecting to
- * support these imbalanced READ/WRITE SGEs allowed devices.  So, for now,
- * suffice with either the device supports the same READ/WRITE SGEs, or
- * it only gets one READ sge.
- *
- * Return: true for any device that allows more than one SGE in RDMA READ
- * requests.
- */
-static inline bool rdma_cap_read_multi_sge(struct ib_device *device,
-					   u8 port_num)
-{
-	return !(device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP);
-}
-
-/**
  * rdma_max_mad_size - Return the max MAD size required by this RDMA Port.
  *
  * @device: Device
-- 
cgit v0.10.2


From 5aa44bb90f047662c12c44be1b6de454658632d0 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:13 +0300
Subject: IB/core: Add rwsem to allow reading device list or client list

Currently the RDMA subsystem's device list and client list are protected by
a single mutex. This prevents adding user-facing APIs that iterate these
lists, since using them may cause a deadlock. The patch attempts to solve
this problem by adding a read-write semaphore to protect the lists. Readers
now don't need the mutex, and are safe just by read-locking the semaphore.

The ib_register_device, ib_register_client, ib_unregister_device, and
ib_unregister_client functions are modified to lock the semaphore for write
during their respective list modification. Also, in order to make sure
client callbacks are called only between add() and remove() calls, the code
is changed to only add items to the lists after the add() calls and remove
from the lists before the remove() calls.

This patch attempts to solve a similar need [1] that was seen in the RoCE
v2 patch series.

[1] http://www.spinics.net/lists/linux-rdma/msg24733.html

Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Matan Barak <matanb@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9567756..0c8fa78 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -55,17 +55,24 @@ struct ib_client_data {
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
+/* The device_list and client_list contain devices and clients after their
+ * registration has completed, and the devices and clients are removed
+ * during unregistration. */
 static LIST_HEAD(device_list);
 static LIST_HEAD(client_list);
 
 /*
- * device_mutex protects access to both device_list and client_list.
- * There's no real point to using multiple locks or something fancier
- * like an rwsem: we always access both lists, and we're always
- * modifying one list or the other list.  In any case this is not a
- * hot path so there's no point in trying to optimize.
+ * device_mutex and lists_rwsem protect access to both device_list and
+ * client_list.  device_mutex protects writer access by device and client
+ * registration / de-registration.  lists_rwsem protects reader access to
+ * these lists.  Iterators of these lists must lock it for read, while updates
+ * to the lists must be done with a write lock. A special case is when the
+ * device_mutex is locked. In this case locking the lists for read access is
+ * not necessary as the device_mutex implies it.
  */
 static DEFINE_MUTEX(device_mutex);
+static DECLARE_RWSEM(lists_rwsem);
+
 
 static int ib_device_check_mandatory(struct ib_device *device)
 {
@@ -305,8 +312,6 @@ int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
-	list_add_tail(&device->core_list, &device_list);
-
 	device->reg_state = IB_DEV_REGISTERED;
 
 	{
@@ -317,7 +322,10 @@ int ib_register_device(struct ib_device *device,
 				client->add(device);
 	}
 
- out:
+	down_write(&lists_rwsem);
+	list_add_tail(&device->core_list, &device_list);
+	up_write(&lists_rwsem);
+out:
 	mutex_unlock(&device_mutex);
 	return ret;
 }
@@ -337,12 +345,14 @@ void ib_unregister_device(struct ib_device *device)
 
 	mutex_lock(&device_mutex);
 
+	down_write(&lists_rwsem);
+	list_del(&device->core_list);
+	up_write(&lists_rwsem);
+
 	list_for_each_entry_reverse(client, &client_list, list)
 		if (client->remove)
 			client->remove(device);
 
-	list_del(&device->core_list);
-
 	mutex_unlock(&device_mutex);
 
 	ib_device_unregister_sysfs(device);
@@ -375,11 +385,14 @@ int ib_register_client(struct ib_client *client)
 
 	mutex_lock(&device_mutex);
 
-	list_add_tail(&client->list, &client_list);
 	list_for_each_entry(device, &device_list, core_list)
 		if (client->add && !add_client_context(device, client))
 			client->add(device);
 
+	down_write(&lists_rwsem);
+	list_add_tail(&client->list, &client_list);
+	up_write(&lists_rwsem);
+
 	mutex_unlock(&device_mutex);
 
 	return 0;
@@ -402,6 +415,10 @@ void ib_unregister_client(struct ib_client *client)
 
 	mutex_lock(&device_mutex);
 
+	down_write(&lists_rwsem);
+	list_del(&client->list);
+	up_write(&lists_rwsem);
+
 	list_for_each_entry(device, &device_list, core_list) {
 		if (client->remove)
 			client->remove(device);
@@ -414,7 +431,6 @@ void ib_unregister_client(struct ib_client *client)
 			}
 		spin_unlock_irqrestore(&device->client_data_lock, flags);
 	}
-	list_del(&client->list);
 
 	mutex_unlock(&device_mutex);
 }
-- 
cgit v0.10.2


From 7c1eb45a22d76bb99236e7485958f87ef7c449cf Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:14 +0300
Subject: IB/core: lock client data with lists_rwsem

An ib_client callback that is called with the lists_rwsem locked only for
read is protected from changes to the IB client lists, but not from
ib_unregister_device() freeing its client data. This is because
ib_unregister_device() will remove the device from the device list with
lists_rwsem locked for write, but perform the rest of the cleanup,
including the call to remove() without that lock.

Mark client data that is undergoing de-registration with a new going_down
flag in the client data context. Lock the client data list with lists_rwsem
for write in addition to using the spinlock, so that functions calling the
callback would be able to lock only lists_rwsem for read and let callbacks
sleep.

Since ib_unregister_client() now marks the client data context, no need for
remove() to search the context again, so pass the client data directly to
remove() callbacks.

Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 871da83..c93af66 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -394,7 +394,7 @@ err:
 	kfree(device->cache.lmc_cache);
 }
 
-static void ib_cache_cleanup_one(struct ib_device *device)
+static void ib_cache_cleanup_one(struct ib_device *device, void *client_data)
 {
 	int p;
 
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3a972eb..82d5c43 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static void cm_add_one(struct ib_device *device);
-static void cm_remove_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client cm_client = {
 	.name   = "cm",
@@ -3886,9 +3886,9 @@ free:
 	kfree(cm_dev);
 }
 
-static void cm_remove_one(struct ib_device *ib_device)
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 {
-	struct cm_device *cm_dev;
+	struct cm_device *cm_dev = client_data;
 	struct cm_port *port;
 	struct ib_port_modify port_modify = {
 		.clr_port_cap_mask = IB_PORT_CM_SUP
@@ -3896,7 +3896,6 @@ static void cm_remove_one(struct ib_device *ib_device)
 	unsigned long flags;
 	int i;
 
-	cm_dev = ib_get_client_data(ib_device, &cm_client);
 	if (!cm_dev)
 		return;
 
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 4e72e4c..9664131 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -94,7 +94,7 @@ const char *rdma_event_msg(enum rdma_cm_event_type event)
 EXPORT_SYMBOL(rdma_event_msg);
 
 static void cma_add_one(struct ib_device *device);
-static void cma_remove_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client cma_client = {
 	.name   = "cma",
@@ -3554,11 +3554,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
 	wait_for_completion(&cma_dev->comp);
 }
 
-static void cma_remove_one(struct ib_device *device)
+static void cma_remove_one(struct ib_device *device, void *client_data)
 {
-	struct cma_device *cma_dev;
+	struct cma_device *cma_dev = client_data;
 
-	cma_dev = ib_get_client_data(device, &cma_client);
 	if (!cma_dev)
 		return;
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 0c8fa78..ce317e6 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -50,6 +50,9 @@ struct ib_client_data {
 	struct list_head  list;
 	struct ib_client *client;
 	void *            data;
+	/* The device or client is going down. Do not call client or device
+	 * callbacks other than remove(). */
+	bool		  going_down;
 };
 
 struct workqueue_struct *ib_wq;
@@ -69,6 +72,8 @@ static LIST_HEAD(client_list);
  * to the lists must be done with a write lock. A special case is when the
  * device_mutex is locked. In this case locking the lists for read access is
  * not necessary as the device_mutex implies it.
+ *
+ * lists_rwsem also protects access to the client data list.
  */
 static DEFINE_MUTEX(device_mutex);
 static DECLARE_RWSEM(lists_rwsem);
@@ -210,10 +215,13 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
 
 	context->client = client;
 	context->data   = NULL;
+	context->going_down = false;
 
+	down_write(&lists_rwsem);
 	spin_lock_irqsave(&device->client_data_lock, flags);
 	list_add(&context->list, &device->client_data_list);
 	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	up_write(&lists_rwsem);
 
 	return 0;
 }
@@ -339,7 +347,6 @@ EXPORT_SYMBOL(ib_register_device);
  */
 void ib_unregister_device(struct ib_device *device)
 {
-	struct ib_client *client;
 	struct ib_client_data *context, *tmp;
 	unsigned long flags;
 
@@ -347,20 +354,29 @@ void ib_unregister_device(struct ib_device *device)
 
 	down_write(&lists_rwsem);
 	list_del(&device->core_list);
-	up_write(&lists_rwsem);
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+		context->going_down = true;
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	downgrade_write(&lists_rwsem);
 
-	list_for_each_entry_reverse(client, &client_list, list)
-		if (client->remove)
-			client->remove(device);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list,
+				 list) {
+		if (context->client->remove)
+			context->client->remove(device, context->data);
+	}
+	up_read(&lists_rwsem);
 
 	mutex_unlock(&device_mutex);
 
 	ib_device_unregister_sysfs(device);
 
+	down_write(&lists_rwsem);
 	spin_lock_irqsave(&device->client_data_lock, flags);
 	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
 		kfree(context);
 	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	up_write(&lists_rwsem);
 
 	device->reg_state = IB_DEV_UNREGISTERED;
 }
@@ -420,16 +436,35 @@ void ib_unregister_client(struct ib_client *client)
 	up_write(&lists_rwsem);
 
 	list_for_each_entry(device, &device_list, core_list) {
-		if (client->remove)
-			client->remove(device);
+		struct ib_client_data *found_context = NULL;
 
+		down_write(&lists_rwsem);
 		spin_lock_irqsave(&device->client_data_lock, flags);
 		list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
 			if (context->client == client) {
-				list_del(&context->list);
-				kfree(context);
+				context->going_down = true;
+				found_context = context;
+				break;
 			}
 		spin_unlock_irqrestore(&device->client_data_lock, flags);
+		up_write(&lists_rwsem);
+
+		if (client->remove)
+			client->remove(device, found_context ?
+					       found_context->data : NULL);
+
+		if (!found_context) {
+			pr_warn("No client context found for %s/%s\n",
+				device->name, client->name);
+			continue;
+		}
+
+		down_write(&lists_rwsem);
+		spin_lock_irqsave(&device->client_data_lock, flags);
+		list_del(&found_context->list);
+		kfree(found_context);
+		spin_unlock_irqrestore(&device->client_data_lock, flags);
+		up_write(&lists_rwsem);
 	}
 
 	mutex_unlock(&device_mutex);
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 786fc51..66b4b3e 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -3335,7 +3335,7 @@ error:
 	}
 }
 
-static void ib_mad_remove_device(struct ib_device *device)
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
 {
 	int i;
 
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index 2cb865c..d38d8b2 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -43,7 +43,7 @@
 #include "sa.h"
 
 static void mcast_add_one(struct ib_device *device);
-static void mcast_remove_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client mcast_client = {
 	.name   = "ib_multicast",
@@ -840,13 +840,12 @@ static void mcast_add_one(struct ib_device *device)
 	ib_register_event_handler(&dev->event_handler);
 }
 
-static void mcast_remove_one(struct ib_device *device)
+static void mcast_remove_one(struct ib_device *device, void *client_data)
 {
-	struct mcast_device *dev;
+	struct mcast_device *dev = client_data;
 	struct mcast_port *port;
 	int i;
 
-	dev = ib_get_client_data(device, &mcast_client);
 	if (!dev)
 		return;
 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index ca919f4..d40be36 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -107,7 +107,7 @@ struct ib_sa_mcmember_query {
 };
 
 static void ib_sa_add_one(struct ib_device *device);
-static void ib_sa_remove_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client sa_client = {
 	.name   = "sa",
@@ -1221,9 +1221,9 @@ free:
 	return;
 }
 
-static void ib_sa_remove_one(struct ib_device *device)
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
 {
-	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_device *sa_dev = client_data;
 	int i;
 
 	if (!sa_dev)
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 0094810..8cde48b 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -109,7 +109,7 @@ enum {
 #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
 
 static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client ucm_client = {
 	.name   = "ucm",
@@ -1310,9 +1310,9 @@ err:
 	return;
 }
 
-static void ib_ucm_remove_one(struct ib_device *device)
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
 {
-	struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+	struct ib_ucm_device *ucm_dev = client_data;
 
 	if (!ucm_dev)
 		return;
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 35567ff..57f281f 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -133,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock);
 static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
 
 static void ib_umad_add_one(struct ib_device *device);
-static void ib_umad_remove_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
 
 static void ib_umad_release_dev(struct kobject *kobj)
 {
@@ -1322,9 +1322,9 @@ free:
 	kobject_put(&umad_dev->kobj);
 }
 
-static void ib_umad_remove_one(struct ib_device *device)
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
 {
-	struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+	struct ib_umad_device *umad_dev = client_data;
 	int i;
 
 	if (!umad_dev)
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index f6eef2d..46c9229 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -128,7 +128,7 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
-static void ib_uverbs_remove_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
 
 static void ib_uverbs_release_dev(struct kref *ref)
 {
@@ -948,9 +948,9 @@ err:
 	return;
 }
 
-static void ib_uverbs_remove_one(struct ib_device *device)
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
 {
-	struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+	struct ib_uverbs_device *uverbs_dev = client_data;
 
 	if (!uverbs_dev)
 		return;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index b2943c8..cca1a0c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -89,7 +89,7 @@ struct workqueue_struct *ipoib_workqueue;
 struct ib_sa_client ipoib_sa_client;
 
 static void ipoib_add_one(struct ib_device *device);
-static void ipoib_remove_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device, void *client_data);
 static void ipoib_neigh_reclaim(struct rcu_head *rp);
 
 static struct ib_client ipoib_client = {
@@ -1715,12 +1715,11 @@ static void ipoib_add_one(struct ib_device *device)
 	ib_set_client_data(device, &ipoib_client, dev_list);
 }
 
-static void ipoib_remove_one(struct ib_device *device)
+static void ipoib_remove_one(struct ib_device *device, void *client_data)
 {
 	struct ipoib_dev_priv *priv, *tmp;
-	struct list_head *dev_list;
+	struct list_head *dev_list = client_data;
 
-	dev_list = ib_get_client_data(device, &ipoib_client);
 	if (!dev_list)
 		return;
 
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 31a20b4..7755df4 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -131,7 +131,7 @@ MODULE_PARM_DESC(ch_count,
 		 "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA.");
 
 static void srp_add_one(struct ib_device *device);
-static void srp_remove_one(struct ib_device *device);
+static void srp_remove_one(struct ib_device *device, void *client_data);
 static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr);
 static void srp_send_completion(struct ib_cq *cq, void *ch_ptr);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
@@ -3460,13 +3460,13 @@ free_attr:
 	kfree(dev_attr);
 }
 
-static void srp_remove_one(struct ib_device *device)
+static void srp_remove_one(struct ib_device *device, void *client_data)
 {
 	struct srp_device *srp_dev;
 	struct srp_host *host, *tmp_host;
 	struct srp_target_port *target;
 
-	srp_dev = ib_get_client_data(device, &srp_client);
+	srp_dev = client_data;
 	if (!srp_dev)
 		return;
 
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 60ff0a2..4c59ceb 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -3326,12 +3326,11 @@ err:
 /**
  * srpt_remove_one() - InfiniBand device removal callback function.
  */
-static void srpt_remove_one(struct ib_device *device)
+static void srpt_remove_one(struct ib_device *device, void *client_data)
 {
-	struct srpt_device *sdev;
+	struct srpt_device *sdev = client_data;
 	int i;
 
-	sdev = ib_get_client_data(device, &srpt_client);
 	if (!sdev) {
 		pr_info("%s(%s): nothing to do.\n", __func__, device->name);
 		return;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 7448a27..449609b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1550,6 +1550,8 @@ struct ib_device {
 
 	spinlock_t                    client_data_lock;
 	struct list_head              core_list;
+	/* Access to the client_data_list is protected by the client_data_lock
+	 * spinlock and the lists_rwsem read-write semaphore */
 	struct list_head              client_data_list;
 
 	struct ib_cache               cache;
@@ -1761,7 +1763,7 @@ struct ib_device {
 struct ib_client {
 	char  *name;
 	void (*add)   (struct ib_device *);
-	void (*remove)(struct ib_device *);
+	void (*remove)(struct ib_device *, void *client_data);
 
 	struct list_head list;
 };
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..348ac37 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -230,11 +230,10 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
  *
  * This can be called at any time and can be racing with any other RDS path.
  */
-static void rds_ib_remove_one(struct ib_device *device)
+static void rds_ib_remove_one(struct ib_device *device, void *client_data)
 {
-	struct rds_ib_device *rds_ibdev;
+	struct rds_ib_device *rds_ibdev = client_data;
 
-	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
 	if (!rds_ibdev)
 		return;
 
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 5899356..7cc2f32 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -125,12 +125,11 @@ free_attr:
 	kfree(dev_attr);
 }
 
-static void rds_iw_remove_one(struct ib_device *device)
+static void rds_iw_remove_one(struct ib_device *device, void *client_data)
 {
-	struct rds_iw_device *rds_iwdev;
+	struct rds_iw_device *rds_iwdev = client_data;
 	struct rds_iw_cm_id *i_cm_id, *next;
 
-	rds_iwdev = ib_get_client_data(device, &rds_iw_client);
 	if (!rds_iwdev)
 		return;
 
-- 
cgit v0.10.2


From 9268f72dcb24348c8b4cf9bcf8afeb24035157a5 Mon Sep 17 00:00:00 2001
From: Yotam Kenneth <yotamke@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:15 +0300
Subject: IB/core: Find the network device matching connection parameters

In the case of IPoIB, and maybe in other cases, the network device is
managed by an upper-layer protocol (ULP). In order to expose this
network device to other users of the IB device, let ULPs implement
a callback that returns network device according to connection parameters.

The IB device and port, together with the P_Key and the GID should
be enough to uniquely identify the ULP net device. However, in current
kernels there can be multiple IPoIB interfaces created with the same GID.
Furthermore, such configuration may be desireable to support ipvlan-like
configurations for RDMA CM with IPoIB.  To resolve the device in these
cases the code will also take the IP address as an additional input.

Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index ce317e6..a9a2781 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
+#include <linux/netdevice.h>
 #include <rdma/rdma_netlink.h>
 
 #include "core_priv.h"
@@ -780,6 +781,51 @@ int ib_find_pkey(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_find_pkey);
 
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev:	An RDMA device on which the request has been received.
+ * @port:	Port number on the RDMA device.
+ * @pkey:	The Pkey the request came on.
+ * @gid:	A GID that the net_dev uses to communicate.
+ * @addr:	Contains the IP address that the request specified as its
+ *		destination.
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+					    u8 port,
+					    u16 pkey,
+					    const union ib_gid *gid,
+					    const struct sockaddr *addr)
+{
+	struct net_device *net_dev = NULL;
+	struct ib_client_data *context;
+
+	if (!rdma_protocol_ib(dev, port))
+		return NULL;
+
+	down_read(&lists_rwsem);
+
+	list_for_each_entry(context, &dev->client_data_list, list) {
+		struct ib_client *client = context->client;
+
+		if (context->going_down)
+			continue;
+
+		if (client->get_net_dev_by_params) {
+			net_dev = client->get_net_dev_by_params(dev, port, pkey,
+								gid, addr,
+								context->data);
+			if (net_dev)
+				break;
+		}
+	}
+
+	up_read(&lists_rwsem);
+
+	return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
 static int __init ib_core_init(void)
 {
 	int ret;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 449609b..fd7a695 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -48,6 +48,7 @@
 #include <linux/rwsem.h>
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
+#include <linux/socket.h>
 #include <uapi/linux/if_ether.h>
 
 #include <linux/atomic.h>
@@ -1765,6 +1766,28 @@ struct ib_client {
 	void (*add)   (struct ib_device *);
 	void (*remove)(struct ib_device *, void *client_data);
 
+	/* Returns the net_dev belonging to this ib_client and matching the
+	 * given parameters.
+	 * @dev:	 An RDMA device that the net_dev use for communication.
+	 * @port:	 A physical port number on the RDMA device.
+	 * @pkey:	 P_Key that the net_dev uses if applicable.
+	 * @gid:	 A GID that the net_dev uses to communicate.
+	 * @addr:	 An IP address the net_dev is configured with.
+	 * @client_data: The device's client data set by ib_set_client_data().
+	 *
+	 * An ib_client that implements a net_dev on top of RDMA devices
+	 * (such as IP over IB) should implement this callback, allowing the
+	 * rdma_cm module to find the right net_dev for a given request.
+	 *
+	 * The caller is responsible for calling dev_put on the returned
+	 * netdev. */
+	struct net_device *(*get_net_dev_by_params)(
+			struct ib_device *dev,
+			u8 port,
+			u16 pkey,
+			const union ib_gid *gid,
+			const struct sockaddr *addr,
+			void *client_data);
 	struct list_head list;
 };
 
@@ -3014,4 +3037,8 @@ static inline int ib_check_mr_access(int flags)
 int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 		       struct ib_mr_status *mr_status);
 
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
+					    u16 pkey, const union ib_gid *gid,
+					    const struct sockaddr *addr);
+
 #endif /* IB_VERBS_H */
-- 
cgit v0.10.2


From ddde896e561a51ae5023e531d66dc6a140a95ec3 Mon Sep 17 00:00:00 2001
From: Guy Shapiro <guysh@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:16 +0300
Subject: IB/ipoib: Return IPoIB devices matching connection parameters

Implement the get_net_device_by_port_pkey_ip callback that returns network
device to ib_core according to connection parameters. Check the ipoib
device and iterate over all child devices to look for a match.

For each IPoIB device we iterate through all upper devices when searching
for a matching IP, in order to support bonding.

Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index cca1a0c..36536ce 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,9 @@
 
 #include <linux/jhash.h>
 #include <net/arp.h>
+#include <net/addrconf.h>
+#include <linux/inetdevice.h>
+#include <rdma/ib_cache.h>
 
 #define DRV_VERSION "1.0.0"
 
@@ -91,11 +94,16 @@ struct ib_sa_client ipoib_sa_client;
 static void ipoib_add_one(struct ib_device *device);
 static void ipoib_remove_one(struct ib_device *device, void *client_data);
 static void ipoib_neigh_reclaim(struct rcu_head *rp);
+static struct net_device *ipoib_get_net_dev_by_params(
+		struct ib_device *dev, u8 port, u16 pkey,
+		const union ib_gid *gid, const struct sockaddr *addr,
+		void *client_data);
 
 static struct ib_client ipoib_client = {
 	.name   = "ipoib",
 	.add    = ipoib_add_one,
-	.remove = ipoib_remove_one
+	.remove = ipoib_remove_one,
+	.get_net_dev_by_params = ipoib_get_net_dev_by_params,
 };
 
 int ipoib_open(struct net_device *dev)
@@ -222,6 +230,225 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+/* Called with an RCU read lock taken */
+static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
+					struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct in_device *in_dev;
+	struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
+	struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
+	__be32 ret_addr;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		in_dev = in_dev_get(dev);
+		if (!in_dev)
+			return false;
+
+		ret_addr = inet_confirm_addr(net, in_dev, 0,
+					     addr_in->sin_addr.s_addr,
+					     RT_SCOPE_HOST);
+		in_dev_put(in_dev);
+		if (ret_addr)
+			return true;
+
+		break;
+	case AF_INET6:
+		if (IS_ENABLED(CONFIG_IPV6) &&
+		    ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
+			return true;
+
+		break;
+	}
+	return false;
+}
+
+/**
+ * Find the master net_device on top of the given net_device.
+ * @dev: base IPoIB net_device
+ *
+ * Returns the master net_device with a reference held, or the same net_device
+ * if no master exists.
+ */
+static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
+{
+	struct net_device *master;
+
+	rcu_read_lock();
+	master = netdev_master_upper_dev_get_rcu(dev);
+	if (master)
+		dev_hold(master);
+	rcu_read_unlock();
+
+	if (master)
+		return master;
+
+	dev_hold(dev);
+	return dev;
+}
+
+/**
+ * Find a net_device matching the given address, which is an upper device of
+ * the given net_device.
+ * @addr: IP address to look for.
+ * @dev: base IPoIB net_device
+ *
+ * If found, returns the net_device with a reference held. Otherwise return
+ * NULL.
+ */
+static struct net_device *ipoib_get_net_dev_match_addr(
+		const struct sockaddr *addr, struct net_device *dev)
+{
+	struct net_device *upper,
+			  *result = NULL;
+	struct list_head *iter;
+
+	rcu_read_lock();
+	if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
+		dev_hold(dev);
+		result = dev;
+		goto out;
+	}
+
+	netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
+		if (ipoib_is_dev_match_addr_rcu(addr, upper)) {
+			dev_hold(upper);
+			result = upper;
+			break;
+		}
+	}
+out:
+	rcu_read_unlock();
+	return result;
+}
+
+/* returns the number of IPoIB netdevs on top a given ipoib device matching a
+ * pkey_index and address, if one exists.
+ *
+ * @found_net_dev: contains a matching net_device if the return value >= 1,
+ * with a reference held. */
+static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
+				     const union ib_gid *gid,
+				     u16 pkey_index,
+				     const struct sockaddr *addr,
+				     int nesting,
+				     struct net_device **found_net_dev)
+{
+	struct ipoib_dev_priv *child_priv;
+	struct net_device *net_dev = NULL;
+	int matches = 0;
+
+	if (priv->pkey_index == pkey_index &&
+	    (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
+		if (!addr) {
+			net_dev = ipoib_get_master_net_dev(priv->dev);
+		} else {
+			/* Verify the net_device matches the IP address, as
+			 * IPoIB child devices currently share a GID. */
+			net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
+		}
+		if (net_dev) {
+			if (!*found_net_dev)
+				*found_net_dev = net_dev;
+			else
+				dev_put(net_dev);
+			++matches;
+		}
+	}
+
+	/* Check child interfaces */
+	down_read_nested(&priv->vlan_rwsem, nesting);
+	list_for_each_entry(child_priv, &priv->child_intfs, list) {
+		matches += ipoib_match_gid_pkey_addr(child_priv, gid,
+						    pkey_index, addr,
+						    nesting + 1,
+						    found_net_dev);
+		if (matches > 1)
+			break;
+	}
+	up_read(&priv->vlan_rwsem);
+
+	return matches;
+}
+
+/* Returns the number of matching net_devs found (between 0 and 2). Also
+ * return the matching net_device in the @net_dev parameter, holding a
+ * reference to the net_device, if the number of matches >= 1 */
+static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
+					 u16 pkey_index,
+					 const union ib_gid *gid,
+					 const struct sockaddr *addr,
+					 struct net_device **net_dev)
+{
+	struct ipoib_dev_priv *priv;
+	int matches = 0;
+
+	*net_dev = NULL;
+
+	list_for_each_entry(priv, dev_list, list) {
+		if (priv->port != port)
+			continue;
+
+		matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
+						     addr, 0, net_dev);
+		if (matches > 1)
+			break;
+	}
+
+	return matches;
+}
+
+static struct net_device *ipoib_get_net_dev_by_params(
+		struct ib_device *dev, u8 port, u16 pkey,
+		const union ib_gid *gid, const struct sockaddr *addr,
+		void *client_data)
+{
+	struct net_device *net_dev;
+	struct list_head *dev_list = client_data;
+	u16 pkey_index;
+	int matches;
+	int ret;
+
+	if (!rdma_protocol_ib(dev, port))
+		return NULL;
+
+	ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
+	if (ret)
+		return NULL;
+
+	if (!dev_list)
+		return NULL;
+
+	/* See if we can find a unique device matching the L2 parameters */
+	matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
+						gid, NULL, &net_dev);
+
+	switch (matches) {
+	case 0:
+		return NULL;
+	case 1:
+		return net_dev;
+	}
+
+	dev_put(net_dev);
+
+	/* Couldn't find a unique device with L2 parameters only. Use L3
+	 * address to uniquely match the net device */
+	matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
+						gid, addr, &net_dev);
+	switch (matches) {
+	case 0:
+		return NULL;
+	default:
+		dev_warn_ratelimited(&dev->dev,
+				     "duplicate IP address detected\n");
+		/* Fall through */
+	case 1:
+		return net_dev;
+	}
+}
+
 int ipoib_set_mode(struct net_device *dev, const char *buf)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-- 
cgit v0.10.2


From 15865e7dab62a58407f1b7decdafd89dd0a8b063 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:17 +0300
Subject: IB/cm: Expose service ID in request events

Expose the service ID on an incoming CM or SIDR request to the event
handler. This will allow the RDMA CM module to de-multiplex connection
requests based on the information encoded in the service ID.

Acked-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 82d5c43..93e9e2f 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1268,6 +1268,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
 	primary_path->packet_life_time =
 		cm_req_get_primary_local_ack_timeout(req_msg);
 	primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+	primary_path->service_id = req_msg->service_id;
 
 	if (req_msg->alt_local_lid) {
 		memset(alt_path, 0, sizeof *alt_path);
@@ -1289,6 +1290,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
 		alt_path->packet_life_time =
 			cm_req_get_alt_local_ack_timeout(req_msg);
 		alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+		alt_path->service_id = req_msg->service_id;
 	}
 }
 
@@ -2992,6 +2994,7 @@ static void cm_format_sidr_req_event(struct cm_work *work,
 	param = &work->cm_event.param.sidr_req_rcvd;
 	param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
 	param->listen_id = listen_id;
+	param->service_id = sidr_req_msg->service_id;
 	param->port = work->port->port_num;
 	work->cm_event.private_data = &sidr_req_msg->private_data;
 }
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 39ed2d2..1b567bb 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -223,6 +223,7 @@ struct ib_cm_apr_event_param {
 
 struct ib_cm_sidr_req_event_param {
 	struct ib_cm_id		*listen_id;
+	__be64			service_id;
 	u8			port;
 	u16			pkey;
 };
-- 
cgit v0.10.2


From 067b171b8679f99e170feae2e1d2eae319736420 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:18 +0300
Subject: IB/cm: Share listening CM IDs

Enabling network namespaces for RDMA CM will allow processes on different
namespaces to listen on the same port. In order to leave namespace support
out of the CM layer, this requires that multiple RDMA CM IDs will be able
to share a single CM ID.

This patch adds infrastructure to retrieve an existing listening ib_cm_id,
based on its device and service ID, or create a new one if one does not
already exist. It also adds a reference count for such instances
(cm_id_private.listen_sharecount), and prevents cm_destroy_id from
destroying a CM if it is still shared. See the relevant discussion [1].

[1] Re: [PATCH v3 for-next 05/13] IB/cm: Reference count ib_cm_ids
    http://www.spinics.net/lists/netdev/msg328860.html

Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 93e9e2f..fa3d3e7 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -213,6 +213,9 @@ struct cm_id_private {
 	spinlock_t lock;	/* Do not acquire inside cm.lock */
 	struct completion comp;
 	atomic_t refcount;
+	/* Number of clients sharing this ib_cm_id. Only valid for listeners.
+	 * Protected by the cm.lock spinlock. */
+	int listen_sharecount;
 
 	struct ib_mad_send_buf *msg;
 	struct cm_timewait_info *timewait_info;
@@ -859,9 +862,15 @@ retest:
 	spin_lock_irq(&cm_id_priv->lock);
 	switch (cm_id->state) {
 	case IB_CM_LISTEN:
-		cm_id->state = IB_CM_IDLE;
 		spin_unlock_irq(&cm_id_priv->lock);
+
 		spin_lock_irq(&cm.lock);
+		if (--cm_id_priv->listen_sharecount > 0) {
+			/* The id is still shared. */
+			cm_deref_id(cm_id_priv);
+			spin_unlock_irq(&cm.lock);
+			return;
+		}
 		rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
 		spin_unlock_irq(&cm.lock);
 		break;
@@ -941,11 +950,32 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
 }
 EXPORT_SYMBOL(ib_destroy_cm_id);
 
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
-		 struct ib_cm_compare_data *compare_data)
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ *   connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests.  The service ID should be specified
+ *   network-byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ *   range of service IDs.  If set to 0, the service ID is matched
+ *   exactly.  This parameter is ignored if %service_id is set to
+ *   IB_CM_ASSIGN_SERVICE_ID.
+ * @compare_data: This parameter is optional.  It specifies data that must
+ *   appear in the private data of a connection request for the specified
+ *   listen request.
+ * @lock: If set, lock the cm.lock spin-lock when adding the id to the
+ *   listener tree. When false, the caller must already hold the spin-lock,
+ *   and compare_data must be NULL.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+			  __be64 service_mask,
+			  struct ib_cm_compare_data *compare_data,
+			  bool lock)
 {
 	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
-	unsigned long flags;
+	unsigned long flags = 0;
 	int ret = 0;
 
 	service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -970,8 +1000,10 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
 	}
 
 	cm_id->state = IB_CM_LISTEN;
+	if (lock)
+		spin_lock_irqsave(&cm.lock, flags);
 
-	spin_lock_irqsave(&cm.lock, flags);
+	++cm_id_priv->listen_sharecount;
 	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
 		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
 		cm_id->service_mask = ~cpu_to_be64(0);
@@ -980,18 +1012,96 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
 		cm_id->service_mask = service_mask;
 	}
 	cur_cm_id_priv = cm_insert_listen(cm_id_priv);
-	spin_unlock_irqrestore(&cm.lock, flags);
 
 	if (cur_cm_id_priv) {
 		cm_id->state = IB_CM_IDLE;
+		--cm_id_priv->listen_sharecount;
 		kfree(cm_id_priv->compare_data);
 		cm_id_priv->compare_data = NULL;
 		ret = -EBUSY;
 	}
+
+	if (lock)
+		spin_unlock_irqrestore(&cm.lock, flags);
+
 	return ret;
 }
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+		 struct ib_cm_compare_data *compare_data)
+{
+	return __ib_cm_listen(cm_id, service_id, service_mask, compare_data,
+			      true);
+}
 EXPORT_SYMBOL(ib_cm_listen);
 
+/**
+ * Create a new listening ib_cm_id and listen on the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id.  All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests.  The service ID should be specified
+ *   network-byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+				     ib_cm_handler cm_handler,
+				     __be64 service_id)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_cm_id *cm_id;
+	unsigned long flags;
+	int err = 0;
+
+	/* Create an ID in advance, since the creation may sleep */
+	cm_id = ib_create_cm_id(device, cm_handler, NULL);
+	if (IS_ERR(cm_id))
+		return cm_id;
+
+	spin_lock_irqsave(&cm.lock, flags);
+
+	if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+		goto new_id;
+
+	/* Find an existing ID */
+	cm_id_priv = cm_find_listen(device, service_id, NULL);
+	if (cm_id_priv) {
+		if (cm_id->cm_handler != cm_handler || cm_id->context) {
+			/* Sharing an ib_cm_id with different handlers is not
+			 * supported */
+			spin_unlock_irqrestore(&cm.lock, flags);
+			return ERR_PTR(-EINVAL);
+		}
+		atomic_inc(&cm_id_priv->refcount);
+		++cm_id_priv->listen_sharecount;
+		spin_unlock_irqrestore(&cm.lock, flags);
+
+		ib_destroy_cm_id(cm_id);
+		cm_id = &cm_id_priv->id;
+		return cm_id;
+	}
+
+new_id:
+	/* Use newly created ID */
+	err = __ib_cm_listen(cm_id, service_id, 0, NULL, false);
+
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	if (err) {
+		ib_destroy_cm_id(cm_id);
+		return ERR_PTR(err);
+	}
+	return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
 static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
 			  enum cm_msg_sequence msg_seq)
 {
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 1b567bb..9cc496e 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -362,6 +362,10 @@ struct ib_cm_compare_data {
 int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
 		 struct ib_cm_compare_data *compare_data);
 
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+				     ib_cm_handler cm_handler,
+				     __be64 service_id);
+
 struct ib_cm_req_param {
 	struct ib_sa_path_rec	*primary_path;
 	struct ib_sa_path_rec	*alternate_path;
-- 
cgit v0.10.2


From 0c505f70a28d943e15a6702ca75bea4f332a03ed Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:19 +0300
Subject: IB/cma: Refactor RDMA IP CM private-data parsing code

When receiving a connection request, rdma_cm needs to associate the request
with a network device, in order to disambiguate requests. To do this, it
needs to know the request's destination IP. For this the module needs to
allow getting this information from the private data in the request packet,
instead of relying on the information already being in the listening RDMA
CM ID.

When creating a new incoming connection ID, the code in
cma_save_ip{4,6}_info can no longer rely on the listener's private data to
find the port number, so it reads it from the requested service ID.

Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9664131..249944c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -870,107 +870,138 @@ static inline int cma_any_port(struct sockaddr *addr)
 	return !cma_port(addr);
 }
 
-static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+static void cma_save_ib_info(struct sockaddr *src_addr,
+			     struct sockaddr *dst_addr,
+			     struct rdma_cm_id *listen_id,
 			     struct ib_sa_path_rec *path)
 {
 	struct sockaddr_ib *listen_ib, *ib;
 
 	listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
-	ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
-	ib->sib_family = listen_ib->sib_family;
-	if (path) {
-		ib->sib_pkey = path->pkey;
-		ib->sib_flowinfo = path->flow_label;
-		memcpy(&ib->sib_addr, &path->sgid, 16);
-	} else {
-		ib->sib_pkey = listen_ib->sib_pkey;
-		ib->sib_flowinfo = listen_ib->sib_flowinfo;
-		ib->sib_addr = listen_ib->sib_addr;
-	}
-	ib->sib_sid = listen_ib->sib_sid;
-	ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
-	ib->sib_scope_id = listen_ib->sib_scope_id;
-
-	if (path) {
-		ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
-		ib->sib_family = listen_ib->sib_family;
-		ib->sib_pkey = path->pkey;
-		ib->sib_flowinfo = path->flow_label;
-		memcpy(&ib->sib_addr, &path->dgid, 16);
+	if (src_addr) {
+		ib = (struct sockaddr_ib *)src_addr;
+		ib->sib_family = AF_IB;
+		if (path) {
+			ib->sib_pkey = path->pkey;
+			ib->sib_flowinfo = path->flow_label;
+			memcpy(&ib->sib_addr, &path->sgid, 16);
+			ib->sib_sid = path->service_id;
+			ib->sib_scope_id = 0;
+		} else {
+			ib->sib_pkey = listen_ib->sib_pkey;
+			ib->sib_flowinfo = listen_ib->sib_flowinfo;
+			ib->sib_addr = listen_ib->sib_addr;
+			ib->sib_sid = listen_ib->sib_sid;
+			ib->sib_scope_id = listen_ib->sib_scope_id;
+		}
+		ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+	}
+	if (dst_addr) {
+		ib = (struct sockaddr_ib *)dst_addr;
+		ib->sib_family = AF_IB;
+		if (path) {
+			ib->sib_pkey = path->pkey;
+			ib->sib_flowinfo = path->flow_label;
+			memcpy(&ib->sib_addr, &path->dgid, 16);
+		}
 	}
 }
 
-static __be16 ss_get_port(const struct sockaddr_storage *ss)
-{
-	if (ss->ss_family == AF_INET)
-		return ((struct sockaddr_in *)ss)->sin_port;
-	else if (ss->ss_family == AF_INET6)
-		return ((struct sockaddr_in6 *)ss)->sin6_port;
-	BUG();
-}
-
-static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-			      struct cma_hdr *hdr)
+static void cma_save_ip4_info(struct sockaddr *src_addr,
+			      struct sockaddr *dst_addr,
+			      struct cma_hdr *hdr,
+			      __be16 local_port)
 {
 	struct sockaddr_in *ip4;
 
-	ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
-	ip4->sin_family = AF_INET;
-	ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
-	ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+	if (src_addr) {
+		ip4 = (struct sockaddr_in *)src_addr;
+		ip4->sin_family = AF_INET;
+		ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+		ip4->sin_port = local_port;
+	}
 
-	ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
-	ip4->sin_family = AF_INET;
-	ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
-	ip4->sin_port = hdr->port;
+	if (dst_addr) {
+		ip4 = (struct sockaddr_in *)dst_addr;
+		ip4->sin_family = AF_INET;
+		ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+		ip4->sin_port = hdr->port;
+	}
 }
 
-static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-			      struct cma_hdr *hdr)
+static void cma_save_ip6_info(struct sockaddr *src_addr,
+			      struct sockaddr *dst_addr,
+			      struct cma_hdr *hdr,
+			      __be16 local_port)
 {
 	struct sockaddr_in6 *ip6;
 
-	ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
-	ip6->sin6_family = AF_INET6;
-	ip6->sin6_addr = hdr->dst_addr.ip6;
-	ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+	if (src_addr) {
+		ip6 = (struct sockaddr_in6 *)src_addr;
+		ip6->sin6_family = AF_INET6;
+		ip6->sin6_addr = hdr->dst_addr.ip6;
+		ip6->sin6_port = local_port;
+	}
 
-	ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
-	ip6->sin6_family = AF_INET6;
-	ip6->sin6_addr = hdr->src_addr.ip6;
-	ip6->sin6_port = hdr->port;
+	if (dst_addr) {
+		ip6 = (struct sockaddr_in6 *)dst_addr;
+		ip6->sin6_family = AF_INET6;
+		ip6->sin6_addr = hdr->src_addr.ip6;
+		ip6->sin6_port = hdr->port;
+	}
 }
 
-static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-			     struct ib_cm_event *ib_event)
+static u16 cma_port_from_service_id(__be64 service_id)
 {
-	struct cma_hdr *hdr;
+	return (u16)be64_to_cpu(service_id);
+}
 
-	if (listen_id->route.addr.src_addr.ss_family == AF_IB) {
-		if (ib_event->event == IB_CM_REQ_RECEIVED)
-			cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
-		else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
-			cma_save_ib_info(id, listen_id, NULL);
-		return 0;
-	}
+static int cma_save_ip_info(struct sockaddr *src_addr,
+			    struct sockaddr *dst_addr,
+			    struct ib_cm_event *ib_event,
+			    __be64 service_id)
+{
+	struct cma_hdr *hdr;
+	__be16 port;
 
 	hdr = ib_event->private_data;
 	if (hdr->cma_version != CMA_VERSION)
 		return -EINVAL;
 
+	port = htons(cma_port_from_service_id(service_id));
+
 	switch (cma_get_ip_ver(hdr)) {
 	case 4:
-		cma_save_ip4_info(id, listen_id, hdr);
+		cma_save_ip4_info(src_addr, dst_addr, hdr, port);
 		break;
 	case 6:
-		cma_save_ip6_info(id, listen_id, hdr);
+		cma_save_ip6_info(src_addr, dst_addr, hdr, port);
 		break;
 	default:
 		return -EINVAL;
 	}
+
 	return 0;
 }
 
+static int cma_save_net_info(struct sockaddr *src_addr,
+			     struct sockaddr *dst_addr,
+			     struct rdma_cm_id *listen_id,
+			     struct ib_cm_event *ib_event,
+			     sa_family_t sa_family, __be64 service_id)
+{
+	if (sa_family == AF_IB) {
+		if (ib_event->event == IB_CM_REQ_RECEIVED)
+			cma_save_ib_info(src_addr, dst_addr, listen_id,
+					 ib_event->param.req_rcvd.primary_path);
+		else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+			cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
+		return 0;
+	}
+
+	return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
 static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
 {
 	return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -1221,6 +1252,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 	struct rdma_id_private *id_priv;
 	struct rdma_cm_id *id;
 	struct rdma_route *rt;
+	const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+	const __be64 service_id =
+		      ib_event->param.req_rcvd.primary_path->service_id;
 	int ret;
 
 	id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1229,7 +1263,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 		return NULL;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (cma_save_net_info(id, listen_id, ib_event))
+	if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+			      (struct sockaddr *)&id->route.addr.dst_addr,
+			      listen_id, ib_event, ss_family, service_id))
 		goto err;
 
 	rt = &id->route;
@@ -1267,6 +1303,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
 {
 	struct rdma_id_private *id_priv;
 	struct rdma_cm_id *id;
+	const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
 	int ret;
 
 	id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1275,7 +1312,10 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
 		return NULL;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (cma_save_net_info(id, listen_id, ib_event))
+	if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+			      (struct sockaddr *)&id->route.addr.dst_addr,
+			      listen_id, ib_event, ss_family,
+			      ib_event->param.sidr_req_rcvd.service_id))
 		goto err;
 
 	if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
-- 
cgit v0.10.2


From aac978e15230fccb7a3e9190eb58732925019300 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:20 +0300
Subject: IB/cma: Helper functions to access port space IDRs

Add helper functions to access the IDRs by port-space and port number.

Pass around the port-space enum in cma.c instead of using pointers to
port-space IDRs.

Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 249944c..a7886ac 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -113,6 +113,22 @@ static DEFINE_IDR(udp_ps);
 static DEFINE_IDR(ipoib_ps);
 static DEFINE_IDR(ib_ps);
 
+static struct idr *cma_idr(enum rdma_port_space ps)
+{
+	switch (ps) {
+	case RDMA_PS_TCP:
+		return &tcp_ps;
+	case RDMA_PS_UDP:
+		return &udp_ps;
+	case RDMA_PS_IPOIB:
+		return &ipoib_ps;
+	case RDMA_PS_IB:
+		return &ib_ps;
+	default:
+		return NULL;
+	}
+}
+
 struct cma_device {
 	struct list_head	list;
 	struct ib_device	*device;
@@ -122,11 +138,33 @@ struct cma_device {
 };
 
 struct rdma_bind_list {
-	struct idr		*ps;
+	enum rdma_port_space	ps;
 	struct hlist_head	owners;
 	unsigned short		port;
 };
 
+static int cma_ps_alloc(enum rdma_port_space ps,
+			struct rdma_bind_list *bind_list, int snum)
+{
+	struct idr *idr = cma_idr(ps);
+
+	return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(enum rdma_port_space ps, int snum)
+{
+	struct idr *idr = cma_idr(ps);
+
+	return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(enum rdma_port_space ps, int snum)
+{
+	struct idr *idr = cma_idr(ps);
+
+	idr_remove(idr, snum);
+}
+
 enum {
 	CMA_OPTION_AFONLY,
 };
@@ -1069,7 +1107,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
 	mutex_lock(&lock);
 	hlist_del(&id_priv->node);
 	if (hlist_empty(&bind_list->owners)) {
-		idr_remove(bind_list->ps, bind_list->port);
+		cma_ps_remove(bind_list->ps, bind_list->port);
 		kfree(bind_list);
 	}
 	mutex_unlock(&lock);
@@ -2368,8 +2406,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,
 	hlist_add_head(&id_priv->node, &bind_list->owners);
 }
 
-static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
-			  unsigned short snum)
+static int cma_alloc_port(enum rdma_port_space ps,
+			  struct rdma_id_private *id_priv, unsigned short snum)
 {
 	struct rdma_bind_list *bind_list;
 	int ret;
@@ -2378,7 +2416,7 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
 	if (!bind_list)
 		return -ENOMEM;
 
-	ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+	ret = cma_ps_alloc(ps, bind_list, snum);
 	if (ret < 0)
 		goto err;
 
@@ -2391,7 +2429,8 @@ err:
 	return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
 }
 
-static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_alloc_any_port(enum rdma_port_space ps,
+			      struct rdma_id_private *id_priv)
 {
 	static unsigned int last_used_port;
 	int low, high, remaining;
@@ -2402,7 +2441,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
 	rover = prandom_u32() % remaining + low;
 retry:
 	if (last_used_port != rover &&
-	    !idr_find(ps, (unsigned short) rover)) {
+	    !cma_ps_find(ps, (unsigned short)rover)) {
 		int ret = cma_alloc_port(ps, id_priv, rover);
 		/*
 		 * Remember previously used port number in order to avoid
@@ -2457,7 +2496,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
 	return 0;
 }
 
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_use_port(enum rdma_port_space ps,
+			struct rdma_id_private *id_priv)
 {
 	struct rdma_bind_list *bind_list;
 	unsigned short snum;
@@ -2467,7 +2507,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
 	if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
-	bind_list = idr_find(ps, snum);
+	bind_list = cma_ps_find(ps, snum);
 	if (!bind_list) {
 		ret = cma_alloc_port(ps, id_priv, snum);
 	} else {
@@ -2490,25 +2530,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
 	return ret;
 }
 
-static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_inet_ps(
+		struct rdma_id_private *id_priv)
 {
 	switch (id_priv->id.ps) {
 	case RDMA_PS_TCP:
-		return &tcp_ps;
 	case RDMA_PS_UDP:
-		return &udp_ps;
 	case RDMA_PS_IPOIB:
-		return &ipoib_ps;
 	case RDMA_PS_IB:
-		return &ib_ps;
+		return id_priv->id.ps;
 	default:
-		return NULL;
+
+		return 0;
 	}
 }
 
-static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
 {
-	struct idr *ps = NULL;
+	enum rdma_port_space ps = 0;
 	struct sockaddr_ib *sib;
 	u64 sid_ps, mask, sid;
 
@@ -2518,15 +2557,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
 
 	if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
 		sid_ps = RDMA_IB_IP_PS_IB;
-		ps = &ib_ps;
+		ps = RDMA_PS_IB;
 	} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
 		   (sid == (RDMA_IB_IP_PS_TCP & mask))) {
 		sid_ps = RDMA_IB_IP_PS_TCP;
-		ps = &tcp_ps;
+		ps = RDMA_PS_TCP;
 	} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
 		   (sid == (RDMA_IB_IP_PS_UDP & mask))) {
 		sid_ps = RDMA_IB_IP_PS_UDP;
-		ps = &udp_ps;
+		ps = RDMA_PS_UDP;
 	}
 
 	if (ps) {
@@ -2539,7 +2578,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
 
 static int cma_get_port(struct rdma_id_private *id_priv)
 {
-	struct idr *ps;
+	enum rdma_port_space ps;
 	int ret;
 
 	if (cma_family(id_priv) != AF_IB)
-- 
cgit v0.10.2


From 24cad9a7e8bfd4cf1ace7ac2a2b3f696a0e70420 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:21 +0300
Subject: IB/cm: Expose BTH P_Key in CM and SIDR request events

The rdma_cm module will later use the P_Key from the BTH to de-mux
requests.

See discussion at:
  http://www.spinics.net/lists/netdev/msg336067.html

Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Liran Liss <liranl@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index fa3d3e7..d2b2c83 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1404,6 +1404,24 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
 	}
 }
 
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+	struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+	u8 port_num = work->port->port_num;
+	u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+	u16 pkey;
+	int ret;
+
+	ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+	if (ret) {
+		dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+				     port_num, pkey_index, ret);
+		return 0;
+	}
+
+	return pkey;
+}
+
 static void cm_format_req_event(struct cm_work *work,
 				struct cm_id_private *cm_id_priv,
 				struct ib_cm_id *listen_id)
@@ -1414,6 +1432,7 @@ static void cm_format_req_event(struct cm_work *work,
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
 	param = &work->cm_event.param.req_rcvd;
 	param->listen_id = listen_id;
+	param->bth_pkey = cm_get_bth_pkey(work);
 	param->port = cm_id_priv->av.port->port_num;
 	param->primary_path = &work->path[0];
 	if (req_msg->alt_local_lid)
@@ -3105,6 +3124,7 @@ static void cm_format_sidr_req_event(struct cm_work *work,
 	param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
 	param->listen_id = listen_id;
 	param->service_id = sidr_req_msg->service_id;
+	param->bth_pkey = cm_get_bth_pkey(work);
 	param->port = work->port->port_num;
 	work->cm_event.private_data = &sidr_req_msg->private_data;
 }
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 9cc496e..e3f4863 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -113,6 +113,10 @@ struct ib_cm_id;
 
 struct ib_cm_req_event_param {
 	struct ib_cm_id		*listen_id;
+
+	/* P_Key that was used by the GMP's BTH header */
+	u16			bth_pkey;
+
 	u8			port;
 
 	struct ib_sa_path_rec	*primary_path;
@@ -224,6 +228,8 @@ struct ib_cm_apr_event_param {
 struct ib_cm_sidr_req_event_param {
 	struct ib_cm_id		*listen_id;
 	__be64			service_id;
+	/* P_Key that was used by the GMP's BTH header */
+	u16			bth_pkey;
 	u8			port;
 	u16			pkey;
 };
-- 
cgit v0.10.2


From 4c21b5bcef73e6649429c1d9b39f5065e756d857 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:22 +0300
Subject: IB/cma: Add net_dev and private data checks to RDMA CM

Instead of relying on a the ib_cm module to check an incoming CM request's
private data header, add these checks to the RDMA CM module. This allows a
following patch to to clean up the ib_cm interface and remove the code that
looks into the private headers. It will also allow supporting namespaces in
RDMA CM by making these checks namespace aware later on.

Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index a7886ac..ed6abdb 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -263,6 +263,15 @@ struct cma_hdr {
 
 #define CMA_VERSION 0x00
 
+struct cma_req_info {
+	struct ib_device *device;
+	int port;
+	union ib_gid local_gid;
+	__be64 service_id;
+	u16 pkey;
+	bool has_gid:1;
+};
+
 static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
 {
 	unsigned long flags;
@@ -300,7 +309,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
 	return old;
 }
 
-static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
 {
 	return hdr->ip_version >> 4;
 }
@@ -1016,7 +1025,7 @@ static int cma_save_ip_info(struct sockaddr *src_addr,
 		cma_save_ip6_info(src_addr, dst_addr, hdr, port);
 		break;
 	default:
-		return -EINVAL;
+		return -EAFNOSUPPORT;
 	}
 
 	return 0;
@@ -1040,6 +1049,176 @@ static int cma_save_net_info(struct sockaddr *src_addr,
 	return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
 }
 
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+			     struct cma_req_info *req)
+{
+	const struct ib_cm_req_event_param *req_param =
+		&ib_event->param.req_rcvd;
+	const struct ib_cm_sidr_req_event_param *sidr_param =
+		&ib_event->param.sidr_req_rcvd;
+
+	switch (ib_event->event) {
+	case IB_CM_REQ_RECEIVED:
+		req->device	= req_param->listen_id->device;
+		req->port	= req_param->port;
+		memcpy(&req->local_gid, &req_param->primary_path->sgid,
+		       sizeof(req->local_gid));
+		req->has_gid	= true;
+		req->service_id	= req_param->primary_path->service_id;
+		req->pkey	= req_param->bth_pkey;
+		break;
+	case IB_CM_SIDR_REQ_RECEIVED:
+		req->device	= sidr_param->listen_id->device;
+		req->port	= sidr_param->port;
+		req->has_gid	= false;
+		req->service_id	= sidr_param->service_id;
+		req->pkey	= sidr_param->bth_pkey;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
+					  const struct cma_req_info *req)
+{
+	struct sockaddr_storage listen_addr_storage;
+	struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage;
+	struct net_device *net_dev;
+	const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+	int err;
+
+	err = cma_save_ip_info(listen_addr, NULL, ib_event, req->service_id);
+	if (err)
+		return ERR_PTR(err);
+
+	net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey,
+					   gid, listen_addr);
+	if (!net_dev)
+		return ERR_PTR(-ENODEV);
+
+	return net_dev;
+}
+
+static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+	return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+				   const struct cma_hdr *hdr)
+{
+	struct sockaddr *addr = cma_src_addr(id_priv);
+	__be32 ip4_addr;
+	struct in6_addr ip6_addr;
+
+	if (cma_any_addr(addr) && !id_priv->afonly)
+		return true;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+		if (cma_get_ip_ver(hdr) != 4)
+			return false;
+		if (!cma_any_addr(addr) &&
+		    hdr->dst_addr.ip4.addr != ip4_addr)
+			return false;
+		break;
+	case AF_INET6:
+		ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+		if (cma_get_ip_ver(hdr) != 6)
+			return false;
+		if (!cma_any_addr(addr) &&
+		    memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+			return false;
+		break;
+	case AF_IB:
+		return true;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool cma_match_net_dev(const struct rdma_id_private *id_priv,
+			      const struct net_device *net_dev)
+{
+	const struct rdma_addr *addr = &id_priv->id.route.addr;
+
+	if (!net_dev)
+		/* This request is an AF_IB request */
+		return addr->src_addr.ss_family == AF_IB;
+
+	return !addr->dev_addr.bound_dev_if ||
+	       (net_eq(dev_net(net_dev), &init_net) &&
+		addr->dev_addr.bound_dev_if == net_dev->ifindex);
+}
+
+static struct rdma_id_private *cma_find_listener(
+		const struct rdma_bind_list *bind_list,
+		const struct ib_cm_id *cm_id,
+		const struct ib_cm_event *ib_event,
+		const struct cma_req_info *req,
+		const struct net_device *net_dev)
+{
+	struct rdma_id_private *id_priv, *id_priv_dev;
+
+	if (!bind_list)
+		return ERR_PTR(-EINVAL);
+
+	hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+		if (cma_match_private_data(id_priv, ib_event->private_data)) {
+			if (id_priv->id.device == cm_id->device &&
+			    cma_match_net_dev(id_priv, net_dev))
+				return id_priv;
+			list_for_each_entry(id_priv_dev,
+					    &id_priv->listen_list,
+					    listen_list) {
+				if (id_priv_dev->id.device == cm_id->device &&
+				    cma_match_net_dev(id_priv_dev, net_dev))
+					return id_priv_dev;
+			}
+		}
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
+						 struct ib_cm_event *ib_event)
+{
+	struct cma_req_info req;
+	struct rdma_bind_list *bind_list;
+	struct rdma_id_private *id_priv;
+	struct net_device *net_dev;
+	int err;
+
+	err = cma_save_req_info(ib_event, &req);
+	if (err)
+		return ERR_PTR(err);
+
+	net_dev = cma_get_net_dev(ib_event, &req);
+	if (IS_ERR(net_dev)) {
+		if (PTR_ERR(net_dev) == -EAFNOSUPPORT) {
+			/* Assuming the protocol is AF_IB */
+			net_dev = NULL;
+		} else {
+			return ERR_CAST(net_dev);
+		}
+	}
+
+	bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id),
+				cma_port_from_service_id(req.service_id));
+	id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, net_dev);
+
+	dev_put(net_dev);
+
+	return id_priv;
+}
+
 static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
 {
 	return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -1399,7 +1578,10 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	struct rdma_cm_event event;
 	int offset, ret;
 
-	listen_id = cm_id->context;
+	listen_id = cma_id_from_event(cm_id, ib_event);
+	if (IS_ERR(listen_id))
+		return PTR_ERR(listen_id);
+
 	if (!cma_check_req_qp_type(&listen_id->id, ib_event))
 		return -EINVAL;
 
-- 
cgit v0.10.2


From f887f2ac87c25124a90adf97d76b258eba6295cb Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:23 +0300
Subject: IB/cma: Validate routing of incoming requests

Pass incoming request parameters through the relevant IPv4/IPv6 routing
tables and make sure the network stack is configured to handle such
requests.

Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index ed6abdb..951ff96 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,6 +46,8 @@
 
 #include <net/tcp.h>
 #include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
 
 #include <rdma/rdma_cm.h>
 #include <rdma/rdma_cm_ib.h>
@@ -1081,16 +1083,98 @@ static int cma_save_req_info(const struct ib_cm_event *ib_event,
 	return 0;
 }
 
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+				  const struct sockaddr_in *dst_addr,
+				  const struct sockaddr_in *src_addr)
+{
+	__be32 daddr = dst_addr->sin_addr.s_addr,
+	       saddr = src_addr->sin_addr.s_addr;
+	struct fib_result res;
+	struct flowi4 fl4;
+	int err;
+	bool ret;
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+	    ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+	    ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+	    ipv4_is_loopback(saddr))
+		return false;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.flowi4_iif = net_dev->ifindex;
+	fl4.daddr = daddr;
+	fl4.saddr = saddr;
+
+	rcu_read_lock();
+	err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
+	if (err)
+		return false;
+
+	ret = FIB_RES_DEV(res) == net_dev;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+				  const struct sockaddr_in6 *dst_addr,
+				  const struct sockaddr_in6 *src_addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
+			   IPV6_ADDR_LINKLOCAL;
+	struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
+					 &src_addr->sin6_addr, net_dev->ifindex,
+					 strict);
+	bool ret;
+
+	if (!rt)
+		return false;
+
+	ret = rt->rt6i_idev->dev == net_dev;
+	ip6_rt_put(rt);
+
+	return ret;
+#else
+	return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+			     const struct sockaddr *daddr,
+			     const struct sockaddr *saddr)
+{
+	const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+	const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+	const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+	const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+	switch (daddr->sa_family) {
+	case AF_INET:
+		return saddr->sa_family == AF_INET &&
+		       validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+	case AF_INET6:
+		return saddr->sa_family == AF_INET6 &&
+		       validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+	default:
+		return false;
+	}
+}
+
 static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
 					  const struct cma_req_info *req)
 {
-	struct sockaddr_storage listen_addr_storage;
-	struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage;
+	struct sockaddr_storage listen_addr_storage, src_addr_storage;
+	struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage,
+			*src_addr = (struct sockaddr *)&src_addr_storage;
 	struct net_device *net_dev;
 	const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
 	int err;
 
-	err = cma_save_ip_info(listen_addr, NULL, ib_event, req->service_id);
+	err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+			       req->service_id);
 	if (err)
 		return ERR_PTR(err);
 
@@ -1099,6 +1183,11 @@ static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
 	if (!net_dev)
 		return ERR_PTR(-ENODEV);
 
+	if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
+		dev_put(net_dev);
+		return ERR_PTR(-EHOSTUNREACH);
+	}
+
 	return net_dev;
 }
 
-- 
cgit v0.10.2


From 0b3ca768fcb07338c99df5e3ccec99119435e321 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:24 +0300
Subject: IB/cma: Use found net_dev for passive connections

When receiving a new connection in cma_req_handler, we actually already
know the net_dev that is used for the connection's creation. Instead of
calling cma_translate_addr to resolve the new connection id's source
address, just use the net_dev that was found.

Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 951ff96..0a7475c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1277,33 +1277,31 @@ static struct rdma_id_private *cma_find_listener(
 }
 
 static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
-						 struct ib_cm_event *ib_event)
+						 struct ib_cm_event *ib_event,
+						 struct net_device **net_dev)
 {
 	struct cma_req_info req;
 	struct rdma_bind_list *bind_list;
 	struct rdma_id_private *id_priv;
-	struct net_device *net_dev;
 	int err;
 
 	err = cma_save_req_info(ib_event, &req);
 	if (err)
 		return ERR_PTR(err);
 
-	net_dev = cma_get_net_dev(ib_event, &req);
-	if (IS_ERR(net_dev)) {
-		if (PTR_ERR(net_dev) == -EAFNOSUPPORT) {
+	*net_dev = cma_get_net_dev(ib_event, &req);
+	if (IS_ERR(*net_dev)) {
+		if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
 			/* Assuming the protocol is AF_IB */
-			net_dev = NULL;
+			*net_dev = NULL;
 		} else {
-			return ERR_CAST(net_dev);
+			return ERR_CAST(*net_dev);
 		}
 	}
 
 	bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id),
 				cma_port_from_service_id(req.service_id));
-	id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, net_dev);
-
-	dev_put(net_dev);
+	id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
 
 	return id_priv;
 }
@@ -1553,7 +1551,8 @@ out:
 }
 
 static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
-					       struct ib_cm_event *ib_event)
+					       struct ib_cm_event *ib_event,
+					       struct net_device *net_dev)
 {
 	struct rdma_id_private *id_priv;
 	struct rdma_cm_id *id;
@@ -1585,14 +1584,16 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 	if (rt->num_paths == 2)
 		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
 
-	if (cma_any_addr(cma_src_addr(id_priv))) {
-		rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
-		rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
-		ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
-	} else {
-		ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+	if (net_dev) {
+		ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
 		if (ret)
 			goto err;
+	} else {
+		/* An AF_IB connection */
+		WARN_ON_ONCE(ss_family != AF_IB);
+
+		cma_translate_ib((struct sockaddr_ib *)cma_src_addr(id_priv),
+				 &rt->addr.dev_addr);
 	}
 	rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
 
@@ -1605,7 +1606,8 @@ err:
 }
 
 static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
-					      struct ib_cm_event *ib_event)
+					      struct ib_cm_event *ib_event,
+					      struct net_device *net_dev)
 {
 	struct rdma_id_private *id_priv;
 	struct rdma_cm_id *id;
@@ -1624,10 +1626,18 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
 			      ib_event->param.sidr_req_rcvd.service_id))
 		goto err;
 
-	if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
-		ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr);
+	if (net_dev) {
+		ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
 		if (ret)
 			goto err;
+	} else {
+		/* An AF_IB connection */
+		WARN_ON_ONCE(ss_family != AF_IB);
+
+		if (!cma_any_addr(cma_src_addr(id_priv)))
+			cma_translate_ib((struct sockaddr_ib *)
+						cma_src_addr(id_priv),
+					 &id->route.addr.dev_addr);
 	}
 
 	id_priv->state = RDMA_CM_CONNECT;
@@ -1665,28 +1675,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 {
 	struct rdma_id_private *listen_id, *conn_id;
 	struct rdma_cm_event event;
+	struct net_device *net_dev;
 	int offset, ret;
 
-	listen_id = cma_id_from_event(cm_id, ib_event);
+	listen_id = cma_id_from_event(cm_id, ib_event, &net_dev);
 	if (IS_ERR(listen_id))
 		return PTR_ERR(listen_id);
 
-	if (!cma_check_req_qp_type(&listen_id->id, ib_event))
-		return -EINVAL;
+	if (!cma_check_req_qp_type(&listen_id->id, ib_event)) {
+		ret = -EINVAL;
+		goto net_dev_put;
+	}
 
-	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
-		return -ECONNABORTED;
+	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) {
+		ret = -ECONNABORTED;
+		goto net_dev_put;
+	}
 
 	memset(&event, 0, sizeof event);
 	offset = cma_user_data_offset(listen_id);
 	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
 	if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
-		conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+		conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);
 		event.param.ud.private_data = ib_event->private_data + offset;
 		event.param.ud.private_data_len =
 				IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
 	} else {
-		conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+		conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);
 		cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
 				       ib_event->private_data, offset);
 	}
@@ -1724,6 +1739,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	mutex_unlock(&conn_id->handler_mutex);
 	mutex_unlock(&listen_id->handler_mutex);
 	cma_deref_id(conn_id);
+	if (net_dev)
+		dev_put(net_dev);
 	return 0;
 
 err3:
@@ -1737,6 +1754,11 @@ err1:
 	mutex_unlock(&listen_id->handler_mutex);
 	if (conn_id)
 		rdma_destroy_id(&conn_id->id);
+
+net_dev_put:
+	if (net_dev)
+		dev_put(net_dev);
+
 	return ret;
 }
 
-- 
cgit v0.10.2


From 51efe394bcab3a0c511634f7ae58bb88b1686983 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:25 +0300
Subject: IB/cma: Share ib_cm_ids between rdma_cm_ids

Use ib_cm_insert_listen to create listening IB CM IDs or share existing
ones if needed. When given a request on a specific CM ID, the code now
matches the request to the RDMA CM ID based on the request parameters, so
it no longer needs to rely on the ib_cm's private data matching
capabilities.

Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 0a7475c..9b306d7 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1771,42 +1771,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
 }
 EXPORT_SYMBOL(rdma_get_service_id);
 
-static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
-				 struct ib_cm_compare_data *compare)
-{
-	struct cma_hdr *cma_data, *cma_mask;
-	__be32 ip4_addr;
-	struct in6_addr ip6_addr;
-
-	memset(compare, 0, sizeof *compare);
-	cma_data = (void *) compare->data;
-	cma_mask = (void *) compare->mask;
-
-	switch (addr->sa_family) {
-	case AF_INET:
-		ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
-		cma_set_ip_ver(cma_data, 4);
-		cma_set_ip_ver(cma_mask, 0xF);
-		if (!cma_any_addr(addr)) {
-			cma_data->dst_addr.ip4.addr = ip4_addr;
-			cma_mask->dst_addr.ip4.addr = htonl(~0);
-		}
-		break;
-	case AF_INET6:
-		ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
-		cma_set_ip_ver(cma_data, 6);
-		cma_set_ip_ver(cma_mask, 0xF);
-		if (!cma_any_addr(addr)) {
-			cma_data->dst_addr.ip6 = ip6_addr;
-			memset(&cma_mask->dst_addr.ip6, 0xFF,
-			       sizeof cma_mask->dst_addr.ip6);
-		}
-		break;
-	default:
-		break;
-	}
-}
-
 static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 {
 	struct rdma_id_private *id_priv = iw_id->context;
@@ -1960,33 +1924,18 @@ out:
 
 static int cma_ib_listen(struct rdma_id_private *id_priv)
 {
-	struct ib_cm_compare_data compare_data;
 	struct sockaddr *addr;
 	struct ib_cm_id	*id;
 	__be64 svc_id;
-	int ret;
 
-	id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+	addr = cma_src_addr(id_priv);
+	svc_id = rdma_get_service_id(&id_priv->id, addr);
+	id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);
 	if (IS_ERR(id))
 		return PTR_ERR(id);
-
 	id_priv->cm_id.ib = id;
 
-	addr = cma_src_addr(id_priv);
-	svc_id = rdma_get_service_id(&id_priv->id, addr);
-	if (cma_any_addr(addr) && !id_priv->afonly)
-		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
-	else {
-		cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
-		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
-	}
-
-	if (ret) {
-		ib_destroy_cm_id(id_priv->cm_id.ib);
-		id_priv->cm_id.ib = NULL;
-	}
-
-	return ret;
+	return 0;
 }
 
 static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
-- 
cgit v0.10.2


From 73fec7fd04a2ad6c879c93881cba9a40d551b3fd Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 30 Jul 2015 17:50:26 +0300
Subject: IB/cm: Remove compare_data checks

Now that there are no ib_cm clients using the compare_data feature for
matching IB CM requests' private data, remove the compare_data parameter of
ib_cm_listen and remove the code implementing the feature.

Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index d2b2c83..ea4db9c 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -222,7 +222,6 @@ struct cm_id_private {
 	/* todo: use alternate port on send failure */
 	struct cm_av av;
 	struct cm_av alt_av;
-	struct ib_cm_compare_data *compare_data;
 
 	void *private_data;
 	__be64 tid;
@@ -443,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
 	return cm_id_priv;
 }
 
-static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
-{
-	int i;
-
-	for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
-		dst[i] = src[i] & mask[i];
-}
-
-static int cm_compare_data(struct ib_cm_compare_data *src_data,
-			   struct ib_cm_compare_data *dst_data)
-{
-	u32 src[IB_CM_COMPARE_SIZE];
-	u32 dst[IB_CM_COMPARE_SIZE];
-
-	if (!src_data || !dst_data)
-		return 0;
-
-	cm_mask_copy(src, src_data->data, dst_data->mask);
-	cm_mask_copy(dst, dst_data->data, src_data->mask);
-	return memcmp(src, dst, sizeof(src));
-}
-
-static int cm_compare_private_data(u32 *private_data,
-				   struct ib_cm_compare_data *dst_data)
-{
-	u32 src[IB_CM_COMPARE_SIZE];
-
-	if (!dst_data)
-		return 0;
-
-	cm_mask_copy(src, private_data, dst_data->mask);
-	return memcmp(src, dst_data->data, sizeof(src));
-}
-
 /*
  * Trivial helpers to strip endian annotation and compare; the
  * endianness doesn't actually matter since we just need a stable
@@ -509,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 	struct cm_id_private *cur_cm_id_priv;
 	__be64 service_id = cm_id_priv->id.service_id;
 	__be64 service_mask = cm_id_priv->id.service_mask;
-	int data_cmp;
 
 	while (*link) {
 		parent = *link;
 		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
 					  service_node);
-		data_cmp = cm_compare_data(cm_id_priv->compare_data,
-					   cur_cm_id_priv->compare_data);
 		if ((cur_cm_id_priv->id.service_mask & service_id) ==
 		    (service_mask & cur_cm_id_priv->id.service_id) &&
-		    (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
-		    !data_cmp)
+		    (cm_id_priv->id.device == cur_cm_id_priv->id.device))
 			return cur_cm_id_priv;
 
 		if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -531,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 			link = &(*link)->rb_left;
 		else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
 			link = &(*link)->rb_right;
-		else if (data_cmp < 0)
-			link = &(*link)->rb_left;
 		else
 			link = &(*link)->rb_right;
 	}
@@ -542,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 }
 
 static struct cm_id_private * cm_find_listen(struct ib_device *device,
-					     __be64 service_id,
-					     u32 *private_data)
+					     __be64 service_id)
 {
 	struct rb_node *node = cm.listen_service_table.rb_node;
 	struct cm_id_private *cm_id_priv;
-	int data_cmp;
 
 	while (node) {
 		cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
-		data_cmp = cm_compare_private_data(private_data,
-						   cm_id_priv->compare_data);
 		if ((cm_id_priv->id.service_mask & service_id) ==
 		     cm_id_priv->id.service_id &&
-		    (cm_id_priv->id.device == device) && !data_cmp)
+		    (cm_id_priv->id.device == device))
 			return cm_id_priv;
 
 		if (device < cm_id_priv->id.device)
@@ -566,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
 			node = node->rb_left;
 		else if (be64_gt(service_id, cm_id_priv->id.service_id))
 			node = node->rb_right;
-		else if (data_cmp < 0)
-			node = node->rb_left;
 		else
 			node = node->rb_right;
 	}
@@ -939,7 +892,6 @@ retest:
 	wait_for_completion(&cm_id_priv->comp);
 	while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
 		cm_free_work(work);
-	kfree(cm_id_priv->compare_data);
 	kfree(cm_id_priv->private_data);
 	kfree(cm_id_priv);
 }
@@ -962,20 +914,11 @@ EXPORT_SYMBOL(ib_destroy_cm_id);
  *   range of service IDs.  If set to 0, the service ID is matched
  *   exactly.  This parameter is ignored if %service_id is set to
  *   IB_CM_ASSIGN_SERVICE_ID.
- * @compare_data: This parameter is optional.  It specifies data that must
- *   appear in the private data of a connection request for the specified
- *   listen request.
- * @lock: If set, lock the cm.lock spin-lock when adding the id to the
- *   listener tree. When false, the caller must already hold the spin-lock,
- *   and compare_data must be NULL.
  */
 static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
-			  __be64 service_mask,
-			  struct ib_cm_compare_data *compare_data,
-			  bool lock)
+			  __be64 service_mask)
 {
 	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
-	unsigned long flags = 0;
 	int ret = 0;
 
 	service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -988,22 +931,9 @@ static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
 	if (cm_id->state != IB_CM_IDLE)
 		return -EINVAL;
 
-	if (compare_data) {
-		cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
-						   GFP_KERNEL);
-		if (!cm_id_priv->compare_data)
-			return -ENOMEM;
-		cm_mask_copy(cm_id_priv->compare_data->data,
-			     compare_data->data, compare_data->mask);
-		memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
-		       sizeof(compare_data->mask));
-	}
-
 	cm_id->state = IB_CM_LISTEN;
-	if (lock)
-		spin_lock_irqsave(&cm.lock, flags);
-
 	++cm_id_priv->listen_sharecount;
+
 	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
 		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
 		cm_id->service_mask = ~cpu_to_be64(0);
@@ -1016,22 +946,21 @@ static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
 	if (cur_cm_id_priv) {
 		cm_id->state = IB_CM_IDLE;
 		--cm_id_priv->listen_sharecount;
-		kfree(cm_id_priv->compare_data);
-		cm_id_priv->compare_data = NULL;
 		ret = -EBUSY;
 	}
-
-	if (lock)
-		spin_unlock_irqrestore(&cm.lock, flags);
-
 	return ret;
 }
 
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
-		 struct ib_cm_compare_data *compare_data)
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
 {
-	return __ib_cm_listen(cm_id, service_id, service_mask, compare_data,
-			      true);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	ret = __ib_cm_listen(cm_id, service_id, service_mask);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	return ret;
 }
 EXPORT_SYMBOL(ib_cm_listen);
 
@@ -1071,7 +1000,7 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
 		goto new_id;
 
 	/* Find an existing ID */
-	cm_id_priv = cm_find_listen(device, service_id, NULL);
+	cm_id_priv = cm_find_listen(device, service_id);
 	if (cm_id_priv) {
 		if (cm_id->cm_handler != cm_handler || cm_id->context) {
 			/* Sharing an ib_cm_id with different handlers is not
@@ -1090,7 +1019,7 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
 
 new_id:
 	/* Use newly created ID */
-	err = __ib_cm_listen(cm_id, service_id, 0, NULL, false);
+	err = __ib_cm_listen(cm_id, service_id, 0);
 
 	spin_unlock_irqrestore(&cm.lock, flags);
 
@@ -1615,8 +1544,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
 
 	/* Find matching listen request. */
 	listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
-					   req_msg->service_id,
-					   req_msg->private_data);
+					   req_msg->service_id);
 	if (!listen_cm_id_priv) {
 		cm_cleanup_timewait(cm_id_priv->timewait_info);
 		spin_unlock_irq(&cm.lock);
@@ -3164,8 +3092,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	}
 	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
 	cur_cm_id_priv = cm_find_listen(cm_id->device,
-					sidr_req_msg->service_id,
-					sidr_req_msg->private_data);
+					sidr_req_msg->service_id);
 	if (!cur_cm_id_priv) {
 		spin_unlock_irq(&cm.lock);
 		cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 8cde48b..6b4e8a0 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
 	if (result)
 		goto out;
 
-	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
-			      NULL);
+	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
 out:
 	ib_ucm_ctx_put(ctx);
 	return result;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index ee39be6..9d32157 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -848,7 +848,7 @@ int ipoib_cm_dev_open(struct net_device *dev)
 	}
 
 	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
-			   0, NULL);
+			   0);
 	if (ret) {
 		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
 		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 4c59ceb..3ab015b 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -3250,7 +3250,7 @@ static void srpt_add_one(struct ib_device *device)
 	 * in the system as service_id; therefore, the target_id will change
 	 * if this HCA is gone bad and replaced by different HCA
 	 */
-	if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL))
+	if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0))
 		goto err_cm;
 
 	INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device,
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index e3f4863..92a7d85 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -105,8 +105,6 @@ enum ib_cm_data_size {
 	IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
 	IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
 	IB_CM_SIDR_REP_INFO_LENGTH	 = 72,
-	/* compare done u32 at a time */
-	IB_CM_COMPARE_SIZE		 = (64 / sizeof(u32))
 };
 
 struct ib_cm_id;
@@ -344,11 +342,6 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id);
 #define IB_SDP_SERVICE_ID	cpu_to_be64(0x0000000000010000ULL)
 #define IB_SDP_SERVICE_ID_MASK	cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
 
-struct ib_cm_compare_data {
-	u32  data[IB_CM_COMPARE_SIZE];
-	u32  mask[IB_CM_COMPARE_SIZE];
-};
-
 /**
  * ib_cm_listen - Initiates listening on the specified service ID for
  *   connection and service ID resolution requests.
@@ -361,12 +354,9 @@ struct ib_cm_compare_data {
  *   range of service IDs.  If set to 0, the service ID is matched
  *   exactly.  This parameter is ignored if %service_id is set to
  *   IB_CM_ASSIGN_SERVICE_ID.
- * @compare_data: This parameter is optional.  It specifies data that must
- *   appear in the private data of a connection request for the specified
- *   listen request.
  */
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
-		 struct ib_cm_compare_data *compare_data);
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+		 __be64 service_mask);
 
 struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
 				     ib_cm_handler cm_handler,
-- 
cgit v0.10.2


From be688195bd08b1c045f89d72c07c7e3ef6516f38 Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Thu, 27 Aug 2015 15:55:15 +0300
Subject: IB/cma: Fix net_dev reference leak with failed requests

When no matching listening ID is found for a given request, the net_dev
that was used to find the request isn't released.

Fixes: 0b3ca768fcb0 ("IB/cma: Use found net_dev for passive connections")
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9b306d7..b1ab13f 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1302,6 +1302,10 @@ static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
 	bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id),
 				cma_port_from_service_id(req.service_id));
 	id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+	if (IS_ERR(id_priv)) {
+		dev_put(*net_dev);
+		*net_dev = NULL;
+	}
 
 	return id_priv;
 }
-- 
cgit v0.10.2


From 8b91ffc1cf67d3f0834197c80c5182890c8d508d Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:34 +0300
Subject: IB/core: Get rid of redundant verb ib_destroy_mr

This was added in a thought of uniting all mr allocation
and deallocation routines but the fact is we have a single
deallocation routine already, ib_dereg_mr.

And, move mlx5_ib_destroy_mr specific logic into mlx5_ib_dereg_mr
(includes only signature stuff for now).

And, fixup the only callers (iser/isert) accordingly.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index bac3fb4..8197ce7 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1257,23 +1257,6 @@ struct ib_mr *ib_create_mr(struct ib_pd *pd,
 }
 EXPORT_SYMBOL(ib_create_mr);
 
-int ib_destroy_mr(struct ib_mr *mr)
-{
-	struct ib_pd *pd;
-	int ret;
-
-	if (atomic_read(&mr->usecnt))
-		return -EBUSY;
-
-	pd = mr->pd;
-	ret = mr->device->destroy_mr(mr);
-	if (!ret)
-		atomic_dec(&pd->usecnt);
-
-	return ret;
-}
-EXPORT_SYMBOL(ib_destroy_mr);
-
 struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
 {
 	struct ib_mr *mr;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index bac5f98..48f02da 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1499,7 +1499,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
 	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
 	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
-	dev->ib_dev.destroy_mr		= mlx5_ib_destroy_mr;
 	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
 	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
 	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7cae098..29c74e9 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -573,7 +573,6 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
 		       int npages, int zap);
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
-int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
 struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
 				struct ib_mr_init_attr *mr_init_attr);
 struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 10d2b21..9559ee8 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1170,6 +1170,19 @@ static int clean_mr(struct mlx5_ib_mr *mr)
 	int umred = mr->umred;
 	int err;
 
+	if (mr->sig) {
+		if (mlx5_core_destroy_psv(dev->mdev,
+					  mr->sig->psv_memory.psv_idx))
+			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
+				     mr->sig->psv_memory.psv_idx);
+		if (mlx5_core_destroy_psv(dev->mdev,
+					  mr->sig->psv_wire.psv_idx))
+			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
+				     mr->sig->psv_wire.psv_idx);
+		kfree(mr->sig);
+		mr->sig = NULL;
+	}
+
 	if (!umred) {
 		err = destroy_mkey(dev, mr);
 		if (err) {
@@ -1317,36 +1330,6 @@ err_free:
 	return ERR_PTR(err);
 }
 
-int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
-{
-	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
-	struct mlx5_ib_mr *mr = to_mmr(ibmr);
-	int err;
-
-	if (mr->sig) {
-		if (mlx5_core_destroy_psv(dev->mdev,
-					  mr->sig->psv_memory.psv_idx))
-			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
-				     mr->sig->psv_memory.psv_idx);
-		if (mlx5_core_destroy_psv(dev->mdev,
-					  mr->sig->psv_wire.psv_idx))
-			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
-				     mr->sig->psv_wire.psv_idx);
-		kfree(mr->sig);
-	}
-
-	err = destroy_mkey(dev, mr);
-	if (err) {
-		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
-			     mr->mmr.key, err);
-		return err;
-	}
-
-	kfree(mr);
-
-	return err;
-}
-
 struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
 					int max_page_list_len)
 {
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 5c9f565..7122186 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -334,7 +334,7 @@ iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
 {
 	ib_free_fast_reg_page_list(pi_ctx->prot_frpl);
 	ib_dereg_mr(pi_ctx->prot_mr);
-	ib_destroy_mr(pi_ctx->sig_mr);
+	ib_dereg_mr(pi_ctx->sig_mr);
 	kfree(pi_ctx);
 }
 
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index d851e18..c054af1 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -491,7 +491,7 @@ isert_conn_free_fastreg_pool(struct isert_conn *isert_conn)
 		if (fr_desc->pi_ctx) {
 			ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl);
 			ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
-			ib_destroy_mr(fr_desc->pi_ctx->sig_mr);
+			ib_dereg_mr(fr_desc->pi_ctx->sig_mr);
 			kfree(fr_desc->pi_ctx);
 		}
 		kfree(fr_desc);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index fd7a695..521c205 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1671,7 +1671,6 @@ struct ib_device {
 	int                        (*query_mr)(struct ib_mr *mr,
 					       struct ib_mr_attr *mr_attr);
 	int                        (*dereg_mr)(struct ib_mr *mr);
-	int                        (*destroy_mr)(struct ib_mr *mr);
 	struct ib_mr *		   (*create_mr)(struct ib_pd *pd,
 						struct ib_mr_init_attr *mr_init_attr);
 	struct ib_mr *		   (*alloc_fast_reg_mr)(struct ib_pd *pd,
@@ -2829,15 +2828,6 @@ struct ib_mr *ib_create_mr(struct ib_pd *pd,
 			   struct ib_mr_init_attr *mr_init_attr);
 
 /**
- * ib_destroy_mr - Destroys a memory region that was created using
- *     ib_create_mr and removes it from HW translation tables.
- * @mr: The memory region to destroy.
- *
- * This function can fail, if the memory region has memory windows bound to it.
- */
-int ib_destroy_mr(struct ib_mr *mr);
-
-/**
  * ib_alloc_fast_reg_mr - Allocates memory region usable with the
  *   IB_WR_FAST_REG_MR send work request.
  * @pd: The protection domain associated with the region.
-- 
cgit v0.10.2


From 9bee178b4f6b3e122ed8eda990450a638706e271 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:35 +0300
Subject: IB: Modify ib_create_mr API

Use ib_alloc_mr with specific parameters.
Change the existing callers.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 8197ce7..f96dd45 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1235,15 +1235,32 @@ int ib_dereg_mr(struct ib_mr *mr)
 }
 EXPORT_SYMBOL(ib_dereg_mr);
 
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
-			   struct ib_mr_init_attr *mr_init_attr)
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd:            protection domain associated with the region
+ * @mr_type:       memory region type
+ * @max_num_sg:    maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registeration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+			  enum ib_mr_type mr_type,
+			  u32 max_num_sg)
 {
 	struct ib_mr *mr;
 
-	if (!pd->device->create_mr)
-		return ERR_PTR(-ENOSYS);
-
-	mr = pd->device->create_mr(pd, mr_init_attr);
+	if (pd->device->alloc_mr) {
+		mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
+	} else {
+		if (mr_type != IB_MR_TYPE_MEM_REG ||
+		    !pd->device->alloc_fast_reg_mr)
+			return ERR_PTR(-ENOSYS);
+		mr = pd->device->alloc_fast_reg_mr(pd, max_num_sg);
+	}
 
 	if (!IS_ERR(mr)) {
 		mr->device  = pd->device;
@@ -1255,7 +1272,7 @@ struct ib_mr *ib_create_mr(struct ib_pd *pd,
 
 	return mr;
 }
-EXPORT_SYMBOL(ib_create_mr);
+EXPORT_SYMBOL(ib_alloc_mr);
 
 struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
 {
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 48f02da..82a371f 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1502,7 +1502,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
 	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
 	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
-	dev->ib_dev.create_mr		= mlx5_ib_create_mr;
+	dev->ib_dev.alloc_mr		= mlx5_ib_alloc_mr;
 	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;
 	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
 	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 29c74e9..31b50a4 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -573,8 +573,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
 		       int npages, int zap);
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
-struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
-				struct ib_mr_init_attr *mr_init_attr);
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
+			       enum ib_mr_type mr_type,
+			       u32 max_num_sg);
 struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
 					int max_page_list_len);
 struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9559ee8..26817a2 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1244,14 +1244,15 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
 	return 0;
 }
 
-struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
-				struct ib_mr_init_attr *mr_init_attr)
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
+			       enum ib_mr_type mr_type,
+			       u32 max_num_sg)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_create_mkey_mbox_in *in;
 	struct mlx5_ib_mr *mr;
 	int access_mode, err;
-	int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4);
+	int ndescs = roundup(max_num_sg, 4);
 
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
@@ -1267,9 +1268,11 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
 	in->seg.xlt_oct_size = cpu_to_be32(ndescs);
 	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
 	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
-	access_mode = MLX5_ACCESS_MODE_MTT;
 
-	if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) {
+	if (mr_type == IB_MR_TYPE_MEM_REG) {
+		access_mode = MLX5_ACCESS_MODE_MTT;
+		in->seg.log2_page_size = PAGE_SHIFT;
+	} else if (mr_type == IB_MR_TYPE_SIGNATURE) {
 		u32 psv_index[2];
 
 		in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) |
@@ -1295,6 +1298,10 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
 		mr->sig->sig_err_exists = false;
 		/* Next UMR, Arm SIGERR */
 		++mr->sig->sigerr_count;
+	} else {
+		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
+		err = -EINVAL;
+		goto err_free_in;
 	}
 
 	in->seg.flags = MLX5_PERM_UMR_EN | access_mode;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 7122186..32d73c1 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -284,9 +284,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
 		  struct fast_reg_descriptor *desc)
 {
 	struct iser_pi_context *pi_ctx = NULL;
-	struct ib_mr_init_attr mr_init_attr = {.max_reg_descriptors = 2,
-					       .flags = IB_MR_SIGNATURE_EN};
-	int ret = 0;
+	int ret;
 
 	desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
 	if (!desc->pi_ctx)
@@ -309,7 +307,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
 	}
 	desc->reg_indicators |= ISER_PROT_KEY_VALID;
 
-	pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
+	pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
 	if (IS_ERR(pi_ctx->sig_mr)) {
 		ret = PTR_ERR(pi_ctx->sig_mr);
 		goto sig_mr_failure;
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index c054af1..fd0aa93c 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -508,7 +508,6 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
 		    struct ib_device *device,
 		    struct ib_pd *pd)
 {
-	struct ib_mr_init_attr mr_init_attr;
 	struct pi_context *pi_ctx;
 	int ret;
 
@@ -536,10 +535,7 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
 	}
 	desc->ind |= ISERT_PROT_KEY_VALID;
 
-	memset(&mr_init_attr, 0, sizeof(mr_init_attr));
-	mr_init_attr.max_reg_descriptors = 2;
-	mr_init_attr.flags |= IB_MR_SIGNATURE_EN;
-	pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
+	pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
 	if (IS_ERR(pi_ctx->sig_mr)) {
 		isert_err("Failed to allocate signature enabled mr err=%ld\n",
 			  PTR_ERR(pi_ctx->sig_mr));
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 521c205..aba5143 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -557,20 +557,18 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate);
  */
 __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
 
-enum ib_mr_create_flags {
-	IB_MR_SIGNATURE_EN = 1,
-};
 
 /**
- * ib_mr_init_attr - Memory region init attributes passed to routine
- *     ib_create_mr.
- * @max_reg_descriptors: max number of registration descriptors that
- *     may be used with registration work requests.
- * @flags: MR creation flags bit mask.
+ * enum ib_mr_type - memory region type
+ * @IB_MR_TYPE_MEM_REG:       memory region that is used for
+ *                            normal registration
+ * @IB_MR_TYPE_SIGNATURE:     memory region that is used for
+ *                            signature operations (data-integrity
+ *                            capable regions)
  */
-struct ib_mr_init_attr {
-	int	    max_reg_descriptors;
-	u32	    flags;
+enum ib_mr_type {
+	IB_MR_TYPE_MEM_REG,
+	IB_MR_TYPE_SIGNATURE,
 };
 
 /**
@@ -1671,8 +1669,9 @@ struct ib_device {
 	int                        (*query_mr)(struct ib_mr *mr,
 					       struct ib_mr_attr *mr_attr);
 	int                        (*dereg_mr)(struct ib_mr *mr);
-	struct ib_mr *		   (*create_mr)(struct ib_pd *pd,
-						struct ib_mr_init_attr *mr_init_attr);
+	struct ib_mr *		   (*alloc_mr)(struct ib_pd *pd,
+					       enum ib_mr_type mr_type,
+					       u32 max_num_sg);
 	struct ib_mr *		   (*alloc_fast_reg_mr)(struct ib_pd *pd,
 					       int max_page_list_len);
 	struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
@@ -2817,15 +2816,9 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
  */
 int ib_dereg_mr(struct ib_mr *mr);
 
-
-/**
- * ib_create_mr - Allocates a memory region that may be used for
- *     signature handover operations.
- * @pd: The protection domain associated with the region.
- * @mr_init_attr: memory region init attributes.
- */
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
-			   struct ib_mr_init_attr *mr_init_attr);
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+			  enum ib_mr_type mr_type,
+			  u32 max_num_sg);
 
 /**
  * ib_alloc_fast_reg_mr - Allocates memory region usable with the
-- 
cgit v0.10.2


From 34780f012ceeb41ec6d44b3877396042e750862c Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:36 +0300
Subject: IB/iser: Convert to ib_alloc_mr

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 32d73c1..8f24728 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -299,8 +299,8 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
 		goto prot_frpl_failure;
 	}
 
-	pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd,
-					ISCSI_ISER_SG_TABLESIZE + 1);
+	pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+				      ISCSI_ISER_SG_TABLESIZE + 1);
 	if (IS_ERR(pi_ctx->prot_mr)) {
 		ret = PTR_ERR(pi_ctx->prot_mr);
 		goto prot_mr_failure;
@@ -351,7 +351,8 @@ iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
 		return PTR_ERR(desc->data_frpl);
 	}
 
-	desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1);
+	desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+				    ISCSI_ISER_SG_TABLESIZE + 1);
 	if (IS_ERR(desc->data_mr)) {
 		ret = PTR_ERR(desc->data_mr);
 		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
-- 
cgit v0.10.2


From a89be2cc51f6602e803f2f9a9c8c4f59a0bd58b3 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:37 +0300
Subject: iser-target: Convert to ib_alloc_mr

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index fd0aa93c..0ebbaa5 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -526,7 +526,8 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
 		goto err_pi_ctx;
 	}
 
-	pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE);
+	pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+				      ISCSI_ISER_SG_TABLESIZE);
 	if (IS_ERR(pi_ctx->prot_mr)) {
 		isert_err("Failed to allocate prot frmr err=%ld\n",
 			  PTR_ERR(pi_ctx->prot_mr));
@@ -573,7 +574,8 @@ isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd,
 		return PTR_ERR(fr_desc->data_frpl);
 	}
 
-	fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE);
+	fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+				       ISCSI_ISER_SG_TABLESIZE);
 	if (IS_ERR(fr_desc->data_mr)) {
 		isert_err("Failed to allocate data frmr err=%ld\n",
 			  PTR_ERR(fr_desc->data_mr));
-- 
cgit v0.10.2


From 563b67c5f90abeae1038008e6c9c187fb36ad35c Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:38 +0300
Subject: IB/srp: Convert to ib_alloc_mr

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 7755df4..4326560 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -378,7 +378,8 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device,
 	INIT_LIST_HEAD(&pool->free_list);
 
 	for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) {
-		mr = ib_alloc_fast_reg_mr(pd, max_page_list_len);
+		mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+				 max_page_list_len);
 		if (IS_ERR(mr)) {
 			ret = PTR_ERR(mr);
 			goto destroy_pool;
-- 
cgit v0.10.2


From 0410e38eca85e042f5d5a281dbcc792db701ed44 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:39 +0300
Subject: xprtrdma, svcrdma: Convert to ib_alloc_mr

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 04ea914..5318951 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -117,7 +117,7 @@ __frwr_recovery_worker(struct work_struct *work)
 	if (ib_dereg_mr(r->r.frmr.fr_mr))
 		goto out_fail;
 
-	r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+	r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
 	if (IS_ERR(r->r.frmr.fr_mr))
 		goto out_fail;
 
@@ -148,7 +148,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
 	struct rpcrdma_frmr *f = &r->r.frmr;
 	int rc;
 
-	f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+	f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
 	if (IS_ERR(f->fr_mr))
 		goto out_mr_err;
 	f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
@@ -158,7 +158,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
 
 out_mr_err:
 	rc = PTR_ERR(f->fr_mr);
-	dprintk("RPC:       %s: ib_alloc_fast_reg_mr status %i\n",
+	dprintk("RPC:       %s: ib_alloc_mr status %i\n",
 		__func__, rc);
 	return rc;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fdc850f..8752a2d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -738,7 +738,7 @@ static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
 	if (!frmr)
 		goto err;
 
-	mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+	mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, RPCSVC_MAXPAGES);
 	if (IS_ERR(mr))
 		goto err_free_frmr;
 
-- 
cgit v0.10.2


From 9ac07501e1918b8d1140adcc360e8d8c7f5a2f7c Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Fri, 7 Aug 2015 11:11:20 -0500
Subject: svcrdma: limit FRMR page list lengths to device max

Svcrdma was incorrectly allocating fastreg MRs and page lists using
RPCSVC_MAXPAGES, which can exceed the device capabilities.  So limit
the depth to the minimum of RPCSVC_MAXPAGES and xprt->sc_frmr_pg_list_len.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 8752a2d..11d5133 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -733,17 +733,19 @@ static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
 	struct ib_mr *mr;
 	struct ib_fast_reg_page_list *pl;
 	struct svc_rdma_fastreg_mr *frmr;
+	u32 num_sg;
 
 	frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
 	if (!frmr)
 		goto err;
 
-	mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, RPCSVC_MAXPAGES);
+	num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len);
+	mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg);
 	if (IS_ERR(mr))
 		goto err_free_frmr;
 
 	pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
-					 RPCSVC_MAXPAGES);
+					 num_sg);
 	if (IS_ERR(pl))
 		goto err_free_mr;
 
-- 
cgit v0.10.2


From fc27995942960c894bc4725435dce8750e44cd64 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:40 +0300
Subject: RDS: Convert to ib_alloc_mr

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index dba8d08..6a8fbd6 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -667,11 +667,12 @@ static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
 	struct ib_mr *mr;
 	int err;
 
-	mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+	mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG,
+			 pool->max_message_size);
 	if (IS_ERR(mr)) {
 		err = PTR_ERR(mr);
 
-		printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+		printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err);
 		return err;
 	}
 
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 334fe98..86152ec 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -153,9 +153,10 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic)
 		sge->length = sizeof(struct rds_header);
 		sge->lkey = 0;
 
-		send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+		send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG,
+					 fastreg_message_size);
 		if (IS_ERR(send->s_mr)) {
-			printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
+			printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n");
 			break;
 		}
 
-- 
cgit v0.10.2


From b3778ba8ded0aafbd820b68be0686ff0c6026eff Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:41 +0300
Subject: mlx5: Drop mlx5_ib_alloc_fast_reg_mr

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 82a371f..ce75875 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1503,7 +1503,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
 	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
 	dev->ib_dev.alloc_mr		= mlx5_ib_alloc_mr;
-	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;
 	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
 	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
 	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 31b50a4..62b06ae 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -576,8 +576,6 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
 			       enum ib_mr_type mr_type,
 			       u32 max_num_sg);
-struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
-					int max_page_list_len);
 struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
 							       int page_list_len);
 void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 26817a2..dad82db 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1337,50 +1337,6 @@ err_free:
 	return ERR_PTR(err);
 }
 
-struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
-					int max_page_list_len)
-{
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct mlx5_create_mkey_mbox_in *in;
-	struct mlx5_ib_mr *mr;
-	int err;
-
-	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-	if (!mr)
-		return ERR_PTR(-ENOMEM);
-
-	in = kzalloc(sizeof(*in), GFP_KERNEL);
-	if (!in) {
-		err = -ENOMEM;
-		goto err_free;
-	}
-
-	in->seg.status = MLX5_MKEY_STATUS_FREE;
-	in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
-	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
-	in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
-	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
-	/*
-	 * TBD not needed - issue 197292 */
-	in->seg.log2_page_size = PAGE_SHIFT;
-
-	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
-				    NULL, NULL);
-	kfree(in);
-	if (err)
-		goto err_free;
-
-	mr->ibmr.lkey = mr->mmr.key;
-	mr->ibmr.rkey = mr->mmr.key;
-	mr->umem = NULL;
-
-	return &mr->ibmr;
-
-err_free:
-	kfree(mr);
-	return ERR_PTR(err);
-}
-
 struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
 							       int page_list_len)
 {
-- 
cgit v0.10.2


From 679e34d1d050fc67f2ab157ebf8553dddc216c55 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:42 +0300
Subject: mlx4: Support ib_alloc_mr verb

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 05166b7..9ab73a4 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2294,7 +2294,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.reg_user_mr	= mlx4_ib_reg_user_mr;
 	ibdev->ib_dev.rereg_user_mr	= mlx4_ib_rereg_user_mr;
 	ibdev->ib_dev.dereg_mr		= mlx4_ib_dereg_mr;
-	ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
+	ibdev->ib_dev.alloc_mr		= mlx4_ib_alloc_mr;
 	ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
 	ibdev->ib_dev.free_fast_reg_page_list  = mlx4_ib_free_fast_reg_page_list;
 	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 334387f..3b85f04 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -680,8 +680,9 @@ struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
 int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
 		    struct ib_mw_bind *mw_bind);
 int mlx4_ib_dealloc_mw(struct ib_mw *mw);
-struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
-					int max_page_list_len);
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
+			       enum ib_mr_type mr_type,
+			       u32 max_num_sg);
 struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
 							       int page_list_len);
 void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index e0d2717..2542fd3 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -350,19 +350,24 @@ int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
 	return 0;
 }
 
-struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
-					int max_page_list_len)
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
+			       enum ib_mr_type mr_type,
+			       u32 max_num_sg)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 	struct mlx4_ib_mr *mr;
 	int err;
 
+	if (mr_type != IB_MR_TYPE_MEM_REG ||
+	    max_num_sg > MLX4_MAX_FAST_REG_PAGES)
+		return ERR_PTR(-EINVAL);
+
 	mr = kmalloc(sizeof *mr, GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
 	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
-			    max_page_list_len, 0, &mr->mmr);
+			    max_num_sg, 0, &mr->mmr);
 	if (err)
 		goto err_free;
 
-- 
cgit v0.10.2


From cacb7d59bed3fd9f65d6ba1a4ea948ce8baa9126 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:43 +0300
Subject: ocrdma: Support ib_alloc_mr verb

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index b119a34..81ed8a3 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -309,7 +309,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 	dev->ibdev.dereg_mr = ocrdma_dereg_mr;
 	dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
 
-	dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr;
+	dev->ibdev.alloc_mr = ocrdma_alloc_mr;
 	dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list;
 	dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list;
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index bc84cd4..8e5fb44 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -2998,21 +2998,26 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)
 	return 0;
 }
 
-struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len)
+struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd,
+			      enum ib_mr_type mr_type,
+			      u32 max_num_sg)
 {
 	int status;
 	struct ocrdma_mr *mr;
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 
-	if (max_page_list_len > dev->attr.max_pages_per_frmr)
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EINVAL);
+
+	if (max_num_sg > dev->attr.max_pages_per_frmr)
 		return ERR_PTR(-EINVAL);
 
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	status = ocrdma_get_pbl_info(dev, mr, max_page_list_len);
+	status = ocrdma_get_pbl_info(dev, mr, max_num_sg);
 	if (status)
 		goto pbl_err;
 	mr->hwmr.fr_mr = 1;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index eaccb2d..68e026b 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -111,7 +111,9 @@ struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *,
 				   int num_phys_buf, int acc, u64 *iova_start);
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
 				 u64 virt, int acc, struct ib_udata *);
-struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len);
+struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
+			      enum ib_mr_type mr_type,
+			      u32 max_num_sg);
 struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device
 							*ibdev,
 							int page_list_len);
-- 
cgit v0.10.2


From a21640347a01ba2f96dfc887b8e33cce462780fd Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:44 +0300
Subject: iw_cxgb4: Support ib_alloc_mr verb

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index cc77844..c7bb38c 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -970,7 +970,9 @@ void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list);
 struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(
 					struct ib_device *device,
 					int page_list_len);
-struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth);
+struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
+			    enum ib_mr_type mr_type,
+			    u32 max_num_sg);
 int c4iw_dealloc_mw(struct ib_mw *mw);
 struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
 struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index cff815b..026b91e 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -853,7 +853,9 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
 	return 0;
 }
 
-struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
+			    enum ib_mr_type mr_type,
+			    u32 max_num_sg)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
@@ -862,6 +864,10 @@ struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
 	u32 stag = 0;
 	int ret = 0;
 
+	if (mr_type != IB_MR_TYPE_MEM_REG ||
+	    max_num_sg > t4_max_fr_depth(use_dsgl))
+		return ERR_PTR(-EINVAL);
+
 	php = to_c4iw_pd(pd);
 	rhp = php->rhp;
 	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
@@ -871,10 +877,10 @@ struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
 	}
 
 	mhp->rhp = rhp;
-	ret = alloc_pbl(mhp, pbl_depth);
+	ret = alloc_pbl(mhp, max_num_sg);
 	if (ret)
 		goto err1;
-	mhp->attr.pbl_size = pbl_depth;
+	mhp->attr.pbl_size = max_num_sg;
 	ret = allocate_stag(&rhp->rdev, &stag, php->pdid,
 				 mhp->attr.pbl_size, mhp->attr.pbl_addr);
 	if (ret)
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 6eee3d3..7746113 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -556,7 +556,7 @@ int c4iw_register_device(struct c4iw_dev *dev)
 	dev->ibdev.alloc_mw = c4iw_alloc_mw;
 	dev->ibdev.bind_mw = c4iw_bind_mw;
 	dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
-	dev->ibdev.alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr;
+	dev->ibdev.alloc_mr = c4iw_alloc_mr;
 	dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl;
 	dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl;
 	dev->ibdev.attach_mcast = c4iw_multicast_attach;
-- 
cgit v0.10.2


From f683d3bdbb37baa49510e32789164e5d33d76ba1 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:45 +0300
Subject: cxgb3: Support ib_alloc_mr verb

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index bbbe018..93308c4 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -800,7 +800,9 @@ static int iwch_dealloc_mw(struct ib_mw *mw)
 	return 0;
 }
 
-static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd,
+				   enum ib_mr_type mr_type,
+				   u32 max_num_sg)
 {
 	struct iwch_dev *rhp;
 	struct iwch_pd *php;
@@ -809,6 +811,10 @@ static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
 	u32 stag = 0;
 	int ret = 0;
 
+	if (mr_type != IB_MR_TYPE_MEM_REG ||
+	    max_num_sg > T3_MAX_FASTREG_DEPTH)
+		return ERR_PTR(-EINVAL);
+
 	php = to_iwch_pd(pd);
 	rhp = php->rhp;
 	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
@@ -816,10 +822,10 @@ static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
 		goto err;
 
 	mhp->rhp = rhp;
-	ret = iwch_alloc_pbl(mhp, pbl_depth);
+	ret = iwch_alloc_pbl(mhp, max_num_sg);
 	if (ret)
 		goto err1;
-	mhp->attr.pbl_size = pbl_depth;
+	mhp->attr.pbl_size = max_num_sg;
 	ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid,
 				 mhp->attr.pbl_size, mhp->attr.pbl_addr);
 	if (ret)
@@ -1443,7 +1449,7 @@ int iwch_register_device(struct iwch_dev *dev)
 	dev->ibdev.alloc_mw = iwch_alloc_mw;
 	dev->ibdev.bind_mw = iwch_bind_mw;
 	dev->ibdev.dealloc_mw = iwch_dealloc_mw;
-	dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr;
+	dev->ibdev.alloc_mr = iwch_alloc_mr;
 	dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl;
 	dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl;
 	dev->ibdev.attach_mcast = iwch_multicast_attach;
-- 
cgit v0.10.2


From e02e4d554d2b0c20b1f191c5d98bbf5da500a285 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:46 +0300
Subject: nes: Support ib_alloc_mr verb

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index fbc43e5..44cb513 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -375,9 +375,11 @@ static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 }
 
 /*
- * nes_alloc_fast_reg_mr
+ * nes_alloc_mr
  */
-static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len)
+static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd,
+				  enum ib_mr_type mr_type,
+				  u32 max_num_sg)
 {
 	struct nes_pd *nespd = to_nespd(ibpd);
 	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
@@ -393,11 +395,18 @@ static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list
 	u32 stag;
 	int ret;
 	struct ib_mr *ibmr;
+
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EINVAL);
+
+	if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
+		return ERR_PTR(-E2BIG);
+
 /*
  * Note:  Set to always use a fixed length single page entry PBL.  This is to allow
  *	 for the fast_reg_mr operation to always know the size of the PBL.
  */
-	if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
+	if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
 		return ERR_PTR(-E2BIG);
 
 	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
@@ -424,7 +433,7 @@ static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list
 	nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n",
 		  stag, stag_index);
 
-	ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len);
+	ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_num_sg);
 
 	if (ret == 0) {
 		nesmr->ibmr.rkey = stag;
@@ -3929,7 +3938,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
 	nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
 	nesibdev->ibdev.bind_mw = nes_bind_mw;
 
-	nesibdev->ibdev.alloc_fast_reg_mr = nes_alloc_fast_reg_mr;
+	nesibdev->ibdev.alloc_mr = nes_alloc_mr;
 	nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list;
 	nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list;
 
-- 
cgit v0.10.2


From 1302f8452bfbede92c5e984afd64d91eb5459ee7 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:47 +0300
Subject: qib: Support ib_alloc_mr verb

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c
index c4473db..19220dc 100644
--- a/drivers/infiniband/hw/qib/qib_mr.c
+++ b/drivers/infiniband/hw/qib/qib_mr.c
@@ -327,11 +327,16 @@ out:
  *
  * Return the memory region on success, otherwise return an errno.
  */
-struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+struct ib_mr *qib_alloc_mr(struct ib_pd *pd,
+			   enum ib_mr_type mr_type,
+			   u32 max_num_sg)
 {
 	struct qib_mr *mr;
 
-	mr = alloc_mr(max_page_list_len, pd);
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EINVAL);
+
+	mr = alloc_mr(max_num_sg, pd);
 	if (IS_ERR(mr))
 		return (struct ib_mr *)mr;
 
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index bc0599c..3dcc498 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -2243,7 +2243,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
 	ibdev->reg_phys_mr = qib_reg_phys_mr;
 	ibdev->reg_user_mr = qib_reg_user_mr;
 	ibdev->dereg_mr = qib_dereg_mr;
-	ibdev->alloc_fast_reg_mr = qib_alloc_fast_reg_mr;
+	ibdev->alloc_mr = qib_alloc_mr;
 	ibdev->alloc_fast_reg_page_list = qib_alloc_fast_reg_page_list;
 	ibdev->free_fast_reg_page_list = qib_free_fast_reg_page_list;
 	ibdev->alloc_fmr = qib_alloc_fmr;
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index bce0fa5..a08df70 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -1034,7 +1034,9 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 int qib_dereg_mr(struct ib_mr *ibmr);
 
-struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+struct ib_mr *qib_alloc_mr(struct ib_pd *pd,
+			   enum ib_mr_type mr_type,
+			   u32 max_entries);
 
 struct ib_fast_reg_page_list *qib_alloc_fast_reg_page_list(
 				struct ib_device *ibdev, int page_list_len);
-- 
cgit v0.10.2


From d9fe6dd7afaff529124dd7f49c2da89ef789a56f Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Fri, 7 Aug 2015 10:51:25 -0400
Subject: IB/hfi1: Support ib_alloc_mr verb

Ported from upstream qib commit
68c02e232b8a ("qib: Support ib_alloc_mr verb")

Tested-by: Jubin John <jubin.john@intel.com>
Reviewed-by: Jubin John <jubin.john@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c
index 23567f8..bd64e4f 100644
--- a/drivers/staging/rdma/hfi1/mr.c
+++ b/drivers/staging/rdma/hfi1/mr.c
@@ -348,11 +348,16 @@ out:
  *
  * Return the memory region on success, otherwise return an errno.
  */
-struct ib_mr *hfi1_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd,
+			    enum ib_mr_type mr_type,
+			    u32 max_num_sg)
 {
 	struct hfi1_mr *mr;
 
-	mr = alloc_mr(max_page_list_len, pd);
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EINVAL);
+
+	mr = alloc_mr(max_num_sg, pd);
 	if (IS_ERR(mr))
 		return (struct ib_mr *)mr;
 
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index 5f4b661..53ac214 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -2045,6 +2045,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	ibdev->reg_phys_mr = hfi1_reg_phys_mr;
 	ibdev->reg_user_mr = hfi1_reg_user_mr;
 	ibdev->dereg_mr = hfi1_dereg_mr;
+	ibdev->alloc_mr = hfi1_alloc_mr;
 	ibdev->alloc_fast_reg_page_list = hfi1_alloc_fast_reg_page_list;
 	ibdev->free_fast_reg_page_list = hfi1_free_fast_reg_page_list;
 	ibdev->alloc_fmr = hfi1_alloc_fmr;
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
index 8125361..ed903a9 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -1016,7 +1016,9 @@ struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 int hfi1_dereg_mr(struct ib_mr *ibmr);
 
-struct ib_mr *hfi1_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd,
+			    enum ib_mr_type mr_type,
+			    u32 max_entries);
 
 struct ib_fast_reg_page_list *hfi1_alloc_fast_reg_page_list(
 				struct ib_device *ibdev, int page_list_len);
-- 
cgit v0.10.2


From d9f272c523db47a56a64942eb6f25361c400de66 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 30 Jul 2015 10:32:48 +0300
Subject: IB/core: Drop ib_alloc_fast_reg_mr

Fully replaced by a more generic and suitable
ib_alloc_mr.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index f96dd45..c80ed17 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1253,36 +1253,10 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
 {
 	struct ib_mr *mr;
 
-	if (pd->device->alloc_mr) {
-		mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
-	} else {
-		if (mr_type != IB_MR_TYPE_MEM_REG ||
-		    !pd->device->alloc_fast_reg_mr)
-			return ERR_PTR(-ENOSYS);
-		mr = pd->device->alloc_fast_reg_mr(pd, max_num_sg);
-	}
-
-	if (!IS_ERR(mr)) {
-		mr->device  = pd->device;
-		mr->pd      = pd;
-		mr->uobject = NULL;
-		atomic_inc(&pd->usecnt);
-		atomic_set(&mr->usecnt, 0);
-	}
-
-	return mr;
-}
-EXPORT_SYMBOL(ib_alloc_mr);
-
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
-{
-	struct ib_mr *mr;
-
-	if (!pd->device->alloc_fast_reg_mr)
+	if (!pd->device->alloc_mr)
 		return ERR_PTR(-ENOSYS);
 
-	mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
-
+	mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
 	if (!IS_ERR(mr)) {
 		mr->device  = pd->device;
 		mr->pd      = pd;
@@ -1293,7 +1267,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
 
 	return mr;
 }
-EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+EXPORT_SYMBOL(ib_alloc_mr);
 
 struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
 							  int max_page_list_len)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index aba5143..c3540da 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1672,8 +1672,6 @@ struct ib_device {
 	struct ib_mr *		   (*alloc_mr)(struct ib_pd *pd,
 					       enum ib_mr_type mr_type,
 					       u32 max_num_sg);
-	struct ib_mr *		   (*alloc_fast_reg_mr)(struct ib_pd *pd,
-					       int max_page_list_len);
 	struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
 								   int page_list_len);
 	void			   (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
@@ -2821,15 +2819,6 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
 			  u32 max_num_sg);
 
 /**
- * ib_alloc_fast_reg_mr - Allocates memory region usable with the
- *   IB_WR_FAST_REG_MR send work request.
- * @pd: The protection domain associated with the region.
- * @max_page_list_len: requested max physical buffer list length to be
- *   used with fast register work requests for this MR.
- */
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
-
-/**
  * ib_alloc_fast_reg_page_list - Allocates a page list array
  * @device - ib device pointer.
  * @page_list_len - size of the page list array to be allocated.
-- 
cgit v0.10.2


From 399e6f95811bd36fb64b3d30cf8529d633884b4c Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:22 +0300
Subject: net/ipv6: Export addrconf_ifid_eui48

For loopback purposes, RoCE devices should have a default GID in the
port GID table, even when the interface is down. In order to do so,
we use the IPv6 link local address which would have been genenrated
for the related Ethernet netdevice when it goes up as a default GID.

addrconf_ifid_eui48 is used to gernerate this address, export it.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index def59d3..431fdfa 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -91,6 +91,37 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 
+static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
+{
+	if (dev->addr_len != ETH_ALEN)
+		return -1;
+	memcpy(eui, dev->dev_addr, 3);
+	memcpy(eui + 5, dev->dev_addr + 3, 3);
+
+	/*
+	 * The zSeries OSA network cards can be shared among various
+	 * OS instances, but the OSA cards have only one MAC address.
+	 * This leads to duplicate address conflicts in conjunction
+	 * with IPv6 if more than one instance uses the same card.
+	 *
+	 * The driver for these cards can deliver a unique 16-bit
+	 * identifier for each instance sharing the same card.  It is
+	 * placed instead of 0xFFFE in the interface identifier.  The
+	 * "u" bit of the interface identifier is not inverted in this
+	 * case.  Hence the resulting interface identifier has local
+	 * scope according to RFC2373.
+	 */
+	if (dev->dev_id) {
+		eui[3] = (dev->dev_id >> 8) & 0xFF;
+		eui[4] = dev->dev_id & 0xFF;
+	} else {
+		eui[3] = 0xFF;
+		eui[4] = 0xFE;
+		eui[0] ^= 2;
+	}
+	return 0;
+}
+
 static inline unsigned long addrconf_timeout_fixup(u32 timeout,
 						   unsigned int unit)
 {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 21c2c81..5b0c041 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1845,37 +1845,6 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
 	__ipv6_dev_ac_dec(ifp->idev, &addr);
 }
 
-static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
-{
-	if (dev->addr_len != ETH_ALEN)
-		return -1;
-	memcpy(eui, dev->dev_addr, 3);
-	memcpy(eui + 5, dev->dev_addr + 3, 3);
-
-	/*
-	 * The zSeries OSA network cards can be shared among various
-	 * OS instances, but the OSA cards have only one MAC address.
-	 * This leads to duplicate address conflicts in conjunction
-	 * with IPv6 if more than one instance uses the same card.
-	 *
-	 * The driver for these cards can deliver a unique 16-bit
-	 * identifier for each instance sharing the same card.  It is
-	 * placed instead of 0xFFFE in the interface identifier.  The
-	 * "u" bit of the interface identifier is not inverted in this
-	 * case.  Hence the resulting interface identifier has local
-	 * scope according to RFC2373.
-	 */
-	if (dev->dev_id) {
-		eui[3] = (dev->dev_id >> 8) & 0xFF;
-		eui[4] = dev->dev_id & 0xFF;
-	} else {
-		eui[3] = 0xFF;
-		eui[4] = 0xFE;
-		eui[0] ^= 2;
-	}
-	return 0;
-}
-
 static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev)
 {
 	if (dev->addr_len != IEEE802154_ADDR_LEN)
-- 
cgit v0.10.2


From 816dd19b3d191da88bc034fb85e21ed09a3ed320 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:23 +0300
Subject: net: Add info for NETDEV_CHANGEUPPER event

Some consumers of NETDEV_CHANGEUPPER event would like to know which
upper device was linked/unlinked and what operation was carried.

Add information in the notifier info block for that purpose.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e20979d..2b7fe4e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3556,6 +3556,20 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 				    netdev_features_t features);
 
+enum netdev_changeupper_event {
+	NETDEV_CHANGEUPPER_LINK,
+	NETDEV_CHANGEUPPER_UNLINK,
+};
+
+struct netdev_changeupper_info {
+	struct netdev_notifier_info	info; /* must be first */
+	enum netdev_changeupper_event	event;
+	struct net_device		*upper;
+};
+
+void netdev_changeupper_info_change(struct net_device *dev,
+				    struct netdev_changeupper_info *info);
+
 struct netdev_bonding_info {
 	ifslave	slave;
 	ifbond	master;
diff --git a/net/core/dev.c b/net/core/dev.c
index a8e4dd4..6e6f14e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5302,6 +5302,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 				   void *private)
 {
 	struct netdev_adjacent *i, *j, *to_i, *to_j;
+	struct netdev_changeupper_info changeupper_info;
 	int ret = 0;
 
 	ASSERT_RTNL();
@@ -5357,7 +5358,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 			goto rollback_lower_mesh;
 	}
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	changeupper_info.event = NETDEV_CHANGEUPPER_LINK;
+	changeupper_info.upper = upper_dev;
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 	return 0;
 
 rollback_lower_mesh:
@@ -5453,6 +5457,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
 	struct netdev_adjacent *i, *j;
+	struct netdev_changeupper_info changeupper_info;
 	ASSERT_RTNL();
 
 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
@@ -5474,7 +5479,10 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
 		__netdev_adjacent_dev_unlink(dev, i->dev);
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	changeupper_info.event = NETDEV_CHANGEUPPER_UNLINK;
+	changeupper_info.upper = upper_dev;
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-- 
cgit v0.10.2


From e999869548b9ab97b7dffa053ba2fee81c31d069 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:24 +0300
Subject: net/bonding: Export bond_option_active_slave_get_rcu

Some consumers of the netdev events API would like to know who is the
active slave when a NETDEV_CHANGEUPPER or NETDEV_BONDING_FAILOVER
events occur. For example, when managing RoCE GIDs, GIDs based on the
bond's ips should only be set on the port which corresponds to active
slave netdevice.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index e9c624d..28bd005 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -730,19 +730,6 @@ static int bond_option_mode_set(struct bonding *bond,
 	return 0;
 }
 
-static struct net_device *__bond_option_active_slave_get(struct bonding *bond,
-							 struct slave *slave)
-{
-	return bond_uses_primary(bond) && slave ? slave->dev : NULL;
-}
-
-struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
-{
-	struct slave *slave = rcu_dereference(bond->curr_active_slave);
-
-	return __bond_option_active_slave_get(bond, slave);
-}
-
 static int bond_option_active_slave_set(struct bonding *bond,
 					const struct bond_opt_value *newval)
 {
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 20defc0..c1740a2 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -310,6 +310,13 @@ static inline bool bond_uses_primary(struct bonding *bond)
 	return bond_mode_uses_primary(BOND_MODE(bond));
 }
 
+static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
+{
+	struct slave *slave = rcu_dereference(bond->curr_active_slave);
+
+	return bond_uses_primary(bond) && slave ? slave->dev : NULL;
+}
+
 static inline bool bond_slave_is_up(struct slave *slave)
 {
 	return netif_running(slave->dev) && netif_carrier_ok(slave->dev);
-- 
cgit v0.10.2


From 55aeed06544f675f25aef06a8c47b0b6b8850f4f Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Tue, 4 Aug 2015 15:23:34 -0600
Subject: IB/core: Make ib_alloc_device init the kobject

This gets rid of the weird in-between state where struct ib_device
was allocated but the kobject didn't work.

Consequently ib_device_release is now guaranteed to be called in
all situations and we needn't duplicate its kfrees on error paths.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index c93af66..a6d5025 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -351,10 +351,10 @@ static void ib_cache_setup_one(struct ib_device *device)
 	rwlock_init(&device->cache.lock);
 
 	device->cache.pkey_cache =
-		kmalloc(sizeof *device->cache.pkey_cache *
+		kzalloc(sizeof *device->cache.pkey_cache *
 			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
 	device->cache.gid_cache =
-		kmalloc(sizeof *device->cache.gid_cache *
+		kzalloc(sizeof *device->cache.gid_cache *
 			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
 
 	device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
@@ -369,11 +369,8 @@ static void ib_cache_setup_one(struct ib_device *device)
 		goto err;
 	}
 
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-		device->cache.pkey_cache[p] = NULL;
-		device->cache.gid_cache [p] = NULL;
+	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
 		ib_cache_update(device, p + rdma_start_port(device));
-	}
 
 	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
 			      device, ib_cache_event);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 87d1936..950583a 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -43,9 +43,6 @@ int  ib_device_register_sysfs(struct ib_device *device,
 						   u8, struct kobject *));
 void ib_device_unregister_sysfs(struct ib_device *device);
 
-int  ib_sysfs_setup(void);
-void ib_sysfs_cleanup(void);
-
 int  ib_cache_setup(void);
 void ib_cache_cleanup(void);
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a9a2781..a4a914a 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -165,6 +165,35 @@ static int alloc_name(char *name)
 	return 0;
 }
 
+static void ib_device_release(struct device *device)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	kfree(dev->port_immutable);
+	kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+			    struct kobj_uevent_env *env)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	if (add_uevent_var(env, "NAME=%s", dev->name))
+		return -ENOMEM;
+
+	/*
+	 * It would be nice to pass the node GUID with the event...
+	 */
+
+	return 0;
+}
+
+static struct class ib_class = {
+	.name    = "infiniband",
+	.dev_release = ib_device_release,
+	.dev_uevent = ib_device_uevent,
+};
+
 /**
  * ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
@@ -177,9 +206,27 @@ static int alloc_name(char *name)
  */
 struct ib_device *ib_alloc_device(size_t size)
 {
-	BUG_ON(size < sizeof (struct ib_device));
+	struct ib_device *device;
+
+	if (WARN_ON(size < sizeof(struct ib_device)))
+		return NULL;
+
+	device = kzalloc(size, GFP_KERNEL);
+	if (!device)
+		return NULL;
+
+	device->dev.class = &ib_class;
+	device_initialize(&device->dev);
+
+	dev_set_drvdata(&device->dev, device);
+
+	INIT_LIST_HEAD(&device->event_handler_list);
+	spin_lock_init(&device->event_handler_lock);
+	spin_lock_init(&device->client_data_lock);
+	INIT_LIST_HEAD(&device->client_data_list);
+	INIT_LIST_HEAD(&device->port_list);
 
-	return kzalloc(size, GFP_KERNEL);
+	return device;
 }
 EXPORT_SYMBOL(ib_alloc_device);
 
@@ -191,13 +238,8 @@ EXPORT_SYMBOL(ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
-	if (device->reg_state == IB_DEV_UNINITIALIZED) {
-		kfree(device);
-		return;
-	}
-
-	BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
-
+	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
+		device->reg_state != IB_DEV_UNINITIALIZED);
 	kobject_put(&device->dev.kobj);
 }
 EXPORT_SYMBOL(ib_dealloc_device);
@@ -235,7 +277,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
 
 static int read_port_immutable(struct ib_device *device)
 {
-	int ret = -ENOMEM;
+	int ret;
 	u8 start_port = rdma_start_port(device);
 	u8 end_port = rdma_end_port(device);
 	u8 port;
@@ -251,26 +293,18 @@ static int read_port_immutable(struct ib_device *device)
 					 * (end_port + 1),
 					 GFP_KERNEL);
 	if (!device->port_immutable)
-		goto err;
+		return -ENOMEM;
 
 	for (port = start_port; port <= end_port; ++port) {
 		ret = device->get_port_immutable(device, port,
 						 &device->port_immutable[port]);
 		if (ret)
-			goto err;
+			return ret;
 
-		if (verify_immutable(device, port)) {
-			ret = -EINVAL;
-			goto err;
-		}
+		if (verify_immutable(device, port))
+			return -EINVAL;
 	}
-
-	ret = 0;
-	goto out;
-err:
-	kfree(device->port_immutable);
-out:
-	return ret;
+	return 0;
 }
 
 /**
@@ -301,11 +335,6 @@ int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
-	INIT_LIST_HEAD(&device->event_handler_list);
-	INIT_LIST_HEAD(&device->client_data_list);
-	spin_lock_init(&device->event_handler_lock);
-	spin_lock_init(&device->client_data_lock);
-
 	ret = read_port_immutable(device);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't create per port immutable data %s\n",
@@ -317,7 +346,6 @@ int ib_register_device(struct ib_device *device,
 	if (ret) {
 		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
 		       device->name);
-		kfree(device->port_immutable);
 		goto out;
 	}
 
@@ -834,7 +862,7 @@ static int __init ib_core_init(void)
 	if (!ib_wq)
 		return -ENOMEM;
 
-	ret = ib_sysfs_setup();
+	ret = class_register(&ib_class);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
 		goto err;
@@ -858,7 +886,7 @@ err_nl:
 	ibnl_cleanup();
 
 err_sysfs:
-	ib_sysfs_cleanup();
+	class_unregister(&ib_class);
 
 err:
 	destroy_workqueue(ib_wq);
@@ -869,7 +897,7 @@ static void __exit ib_core_cleanup(void)
 {
 	ib_cache_cleanup();
 	ibnl_cleanup();
-	ib_sysfs_cleanup();
+	class_unregister(&ib_class);
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
 }
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 0b84a9c..34cdd74 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -457,29 +457,6 @@ static struct kobj_type port_type = {
 	.default_attrs = port_default_attrs
 };
 
-static void ib_device_release(struct device *device)
-{
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-	kfree(dev->port_immutable);
-	kfree(dev);
-}
-
-static int ib_device_uevent(struct device *device,
-			    struct kobj_uevent_env *env)
-{
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-	if (add_uevent_var(env, "NAME=%s", dev->name))
-		return -ENOMEM;
-
-	/*
-	 * It would be nice to pass the node GUID with the event...
-	 */
-
-	return 0;
-}
-
 static struct attribute **
 alloc_group_attrs(ssize_t (*show)(struct ib_port *,
 				  struct port_attribute *, char *buf),
@@ -702,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = {
 	&dev_attr_node_desc
 };
 
-static struct class ib_class = {
-	.name    = "infiniband",
-	.dev_release = ib_device_release,
-	.dev_uevent = ib_device_uevent,
-};
-
 /* Show a given an attribute in the statistics group */
 static ssize_t show_protocol_stat(const struct device *device,
 			    struct device_attribute *attr, char *buf,
@@ -846,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device,
 	int ret;
 	int i;
 
-	class_dev->class      = &ib_class;
-	class_dev->parent     = device->dma_device;
-	dev_set_name(class_dev, "%s", device->name);
-	dev_set_drvdata(class_dev, device);
-
-	INIT_LIST_HEAD(&device->port_list);
+	device->dev.parent = device->dma_device;
+	ret = dev_set_name(class_dev, "%s", device->name);
+	if (ret)
+		return ret;
 
-	ret = device_register(class_dev);
+	ret = device_add(class_dev);
 	if (ret)
 		goto err;
 
@@ -916,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device)
 
 	device_unregister(&device->dev);
 }
-
-int ib_sysfs_setup(void)
-{
-	return class_register(&ib_class);
-}
-
-void ib_sysfs_cleanup(void)
-{
-	class_unregister(&ib_class);
-}
-- 
cgit v0.10.2


From 03db3a2d81e6e84f3ed3cb9e087cae17d762642b Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:26 +0300
Subject: IB/core: Add RoCE GID table management

RoCE GIDs are based on IP addresses configured on Ethernet net-devices
which relate to the RDMA (RoCE) device port.

Currently, each of the low-level drivers that support RoCE (ocrdma,
mlx4) manages its own RoCE port GID table. As there's nothing which is
essentially vendor specific, we generalize that, and enhance the RDMA
core GID cache to do this job.

In order to populate the GID table, we listen for events:

(a) netdev up/down/change_addr events - if a netdev is built onto
    our RoCE device, we need to add/delete its IPs. This involves
    adding all GIDs related to this ndev, add default GIDs, etc.

(b) inet events - add new GIDs (according to the IP addresses)
    to the table.

For programming the port RoCE GID table, providers must implement
the add_gid and del_gid callbacks.

RoCE GID management requires us to state the associated net_device
alongside the GID. This information is necessary in order to manage
the GID table. For example, when a net_device is removed, its
associated GIDs need to be removed as well.

RoCE mandates generating a default GID for each port, based on the
related net-device's IPv6 link local. In contrast to the GID based on
the regular IPv6 link-local (as we generate GID per IP address),
the default GID is also available when the net device is down (in
order to support loopback).

Locking is done as follows:
The patch modify the GID table code both for new RoCE drivers
implementing the add_gid/del_gid callbacks and for current RoCE and
IB drivers that do not. The flows for updating the table are
different, so the locking requirements are too.

While updating RoCE GID table, protection against multiple writers is
achieved via mutex_lock(&table->lock). Since writing to a table
requires us to find an entry (possible a free entry) in the table and
then modify it, this mutex protects both the find_gid and write_gid
ensuring the atomicity of the action.
Each entry in the GID cache is protected by rwlock. In RoCE, writing
(usually results from netdev notifier) involves invoking the vendor's
add_gid and del_gid callbacks, which could sleep.
Therefore, an invalid flag is added for each entry. Updates for RoCE are
done via a workqueue, thus sleeping is permitted.

In IB, updates are done in write_lock_irq(&device->cache.lock), thus
write_gid isn't allowed to sleep and add_gid/del_gid are not called.

When passing net-device into/out-of the GID cache, the device
is always passed held (dev_hold).

The code uses a single work item for updating all RDMA devices,
following a netdev or inet notifier.

The patch moves the cache from being a client (which was incorrect,
as the cache is part of the IB infrastructure) to being explicitly
initialized/freed when a device is registered/removed.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index acf7367..d43a899 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 					$(user_access-y)
 
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
-				device.o fmr_pool.o cache.o netlink.o
+				device.o fmr_pool.o cache.o netlink.o \
+				roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index a6d5025..fc39a2f 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -37,6 +37,8 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
 
 #include <rdma/ib_cache.h>
 
@@ -47,76 +49,620 @@ struct ib_pkey_cache {
 	u16             table[0];
 };
 
-struct ib_gid_cache {
-	int             table_len;
-	union ib_gid    table[0];
-};
-
 struct ib_update_work {
 	struct work_struct work;
 	struct ib_device  *device;
 	u8                 port_num;
 };
 
-int ib_get_cached_gid(struct ib_device *device,
-		      u8                port_num,
-		      int               index,
-		      union ib_gid     *gid)
+static union ib_gid zgid;
+
+static const struct ib_gid_attr zattr;
+
+enum gid_attr_find_mask {
+	GID_ATTR_FIND_MASK_GID          = 1UL << 0,
+	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 1,
+	GID_ATTR_FIND_MASK_DEFAULT	= 1UL << 2,
+};
+
+enum gid_table_entry_props {
+	GID_TABLE_ENTRY_INVALID		= 1UL << 0,
+	GID_TABLE_ENTRY_DEFAULT		= 1UL << 1,
+};
+
+enum gid_table_write_action {
+	GID_TABLE_WRITE_ACTION_ADD,
+	GID_TABLE_WRITE_ACTION_DEL,
+	/* MODIFY only updates the GID table. Currently only used by
+	 * ib_cache_update.
+	 */
+	GID_TABLE_WRITE_ACTION_MODIFY
+};
+
+struct ib_gid_table_entry {
+	/* This lock protects an entry from being
+	 * read and written simultaneously.
+	 */
+	rwlock_t	    lock;
+	unsigned long	    props;
+	union ib_gid        gid;
+	struct ib_gid_attr  attr;
+	void		   *context;
+};
+
+struct ib_gid_table {
+	int                  sz;
+	/* In RoCE, adding a GID to the table requires:
+	 * (a) Find if this GID is already exists.
+	 * (b) Find a free space.
+	 * (c) Write the new GID
+	 *
+	 * Delete requires different set of operations:
+	 * (a) Find the GID
+	 * (b) Delete it.
+	 *
+	 * Add/delete should be carried out atomically.
+	 * This is done by locking this mutex from multiple
+	 * writers. We don't need this lock for IB, as the MAD
+	 * layer replaces all entries. All data_vec entries
+	 * are locked by this lock.
+	 **/
+	struct mutex         lock;
+	struct ib_gid_table_entry *data_vec;
+};
+
+static int write_gid(struct ib_device *ib_dev, u8 port,
+		     struct ib_gid_table *table, int ix,
+		     const union ib_gid *gid,
+		     const struct ib_gid_attr *attr,
+		     enum gid_table_write_action action,
+		     bool  default_gid)
 {
-	struct ib_gid_cache *cache;
+	int ret = 0;
+	struct net_device *old_net_dev;
 	unsigned long flags;
+
+	/* in rdma_cap_roce_gid_table, this funciton should be protected by a
+	 * sleep-able lock.
+	 */
+	write_lock_irqsave(&table->data_vec[ix].lock, flags);
+
+	if (rdma_cap_roce_gid_table(ib_dev, port)) {
+		table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+		write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+		/* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
+		 * RoCE providers and thus only updates the cache.
+		 */
+		if (action == GID_TABLE_WRITE_ACTION_ADD)
+			ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
+					      &table->data_vec[ix].context);
+		else if (action == GID_TABLE_WRITE_ACTION_DEL)
+			ret = ib_dev->del_gid(ib_dev, port, ix,
+					      &table->data_vec[ix].context);
+		write_lock_irqsave(&table->data_vec[ix].lock, flags);
+	}
+
+	old_net_dev = table->data_vec[ix].attr.ndev;
+	if (old_net_dev && old_net_dev != attr->ndev)
+		dev_put(old_net_dev);
+	/* if modify_gid failed, just delete the old gid */
+	if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
+		gid = &zgid;
+		attr = &zattr;
+		table->data_vec[ix].context = NULL;
+	}
+	if (default_gid)
+		table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+	memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
+	memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
+	if (table->data_vec[ix].attr.ndev &&
+	    table->data_vec[ix].attr.ndev != old_net_dev)
+		dev_hold(table->data_vec[ix].attr.ndev);
+
+	table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
+
+	write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+
+	if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
+		struct ib_event event;
+
+		event.device		= ib_dev;
+		event.element.port_num	= port;
+		event.event		= IB_EVENT_GID_CHANGE;
+
+		ib_dispatch_event(&event);
+	}
+	return ret;
+}
+
+static int add_gid(struct ib_device *ib_dev, u8 port,
+		   struct ib_gid_table *table, int ix,
+		   const union ib_gid *gid,
+		   const struct ib_gid_attr *attr,
+		   bool  default_gid) {
+	return write_gid(ib_dev, port, table, ix, gid, attr,
+			 GID_TABLE_WRITE_ACTION_ADD, default_gid);
+}
+
+static int modify_gid(struct ib_device *ib_dev, u8 port,
+		      struct ib_gid_table *table, int ix,
+		      const union ib_gid *gid,
+		      const struct ib_gid_attr *attr,
+		      bool  default_gid) {
+	return write_gid(ib_dev, port, table, ix, gid, attr,
+			 GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
+}
+
+static int del_gid(struct ib_device *ib_dev, u8 port,
+		   struct ib_gid_table *table, int ix,
+		   bool  default_gid) {
+	return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
+			 GID_TABLE_WRITE_ACTION_DEL, default_gid);
+}
+
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+		    const struct ib_gid_attr *val, bool default_gid,
+		    unsigned long mask)
+{
+	int i;
+
+	for (i = 0; i < table->sz; i++) {
+		unsigned long flags;
+		struct ib_gid_attr *attr = &table->data_vec[i].attr;
+
+		read_lock_irqsave(&table->data_vec[i].lock, flags);
+
+		if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+			goto next;
+
+		if (mask & GID_ATTR_FIND_MASK_GID &&
+		    memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+			goto next;
+
+		if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+		    attr->ndev != val->ndev)
+			goto next;
+
+		if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+		    !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
+		    default_gid)
+			goto next;
+
+		read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+		return i;
+next:
+		read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+	}
+
+	return -1;
+}
+
+static void make_default_gid(struct  net_device *dev, union ib_gid *gid)
+{
+	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+	addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	int ix;
 	int ret = 0;
+	struct net_device *idev;
 
-	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+	table = ports_table[port - rdma_start_port(ib_dev)];
+
+	if (!memcmp(gid, &zgid, sizeof(*gid)))
 		return -EINVAL;
 
-	read_lock_irqsave(&device->cache.lock, flags);
+	if (ib_dev->get_netdev) {
+		idev = ib_dev->get_netdev(ib_dev, port);
+		if (idev && attr->ndev != idev) {
+			union ib_gid default_gid;
 
-	cache = device->cache.gid_cache[port_num - rdma_start_port(device)];
+			/* Adding default GIDs in not permitted */
+			make_default_gid(idev, &default_gid);
+			if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+				dev_put(idev);
+				return -EPERM;
+			}
+		}
+		if (idev)
+			dev_put(idev);
+	}
 
-	if (index < 0 || index >= cache->table_len)
-		ret = -EINVAL;
-	else
-		*gid = cache->table[index];
+	mutex_lock(&table->lock);
 
-	read_unlock_irqrestore(&device->cache.lock, flags);
+	ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
+		      GID_ATTR_FIND_MASK_NETDEV);
+	if (ix >= 0)
+		goto out_unlock;
 
+	ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
+		      GID_ATTR_FIND_MASK_DEFAULT);
+	if (ix < 0) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
+	add_gid(ib_dev, port, table, ix, gid, attr, false);
+
+out_unlock:
+	mutex_unlock(&table->lock);
 	return ret;
 }
-EXPORT_SYMBOL(ib_get_cached_gid);
 
-int ib_find_cached_gid(struct ib_device   *device,
-		       const union ib_gid *gid,
-		       u8                 *port_num,
-		       u16                *index)
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr)
 {
-	struct ib_gid_cache *cache;
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	int ix;
+
+	table = ports_table[port - rdma_start_port(ib_dev)];
+
+	mutex_lock(&table->lock);
+
+	ix = find_gid(table, gid, attr, false,
+		      GID_ATTR_FIND_MASK_GID	  |
+		      GID_ATTR_FIND_MASK_NETDEV	  |
+		      GID_ATTR_FIND_MASK_DEFAULT);
+	if (ix < 0)
+		goto out_unlock;
+
+	del_gid(ib_dev, port, table, ix, false);
+
+out_unlock:
+	mutex_unlock(&table->lock);
+	return 0;
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+				     struct net_device *ndev)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	int ix;
+
+	table  = ports_table[port - rdma_start_port(ib_dev)];
+
+	mutex_lock(&table->lock);
+
+	for (ix = 0; ix < table->sz; ix++)
+		if (table->data_vec[ix].attr.ndev == ndev)
+			del_gid(ib_dev, port, table, ix, false);
+
+	mutex_unlock(&table->lock);
+	return 0;
+}
+
+static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
+			      union ib_gid *gid, struct ib_gid_attr *attr)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
 	unsigned long flags;
-	int p, i;
-	int ret = -ENOENT;
 
-	*port_num = -1;
-	if (index)
-		*index = -1;
+	table = ports_table[port - rdma_start_port(ib_dev)];
 
-	read_lock_irqsave(&device->cache.lock, flags);
+	if (index < 0 || index >= table->sz)
+		return -EINVAL;
 
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-		cache = device->cache.gid_cache[p];
-		for (i = 0; i < cache->table_len; ++i) {
-			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
-				*port_num = p + rdma_start_port(device);
-				if (index)
-					*index = i;
-				ret = 0;
-				goto found;
-			}
+	read_lock_irqsave(&table->data_vec[index].lock, flags);
+	if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
+		read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+		return -EAGAIN;
+	}
+
+	memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+	if (attr) {
+		memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+		if (attr->ndev)
+			dev_hold(attr->ndev);
+	}
+
+	read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+	return 0;
+}
+
+static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
+				    const union ib_gid *gid,
+				    const struct ib_gid_attr *val,
+				    unsigned long mask,
+				    u8 *port, u16 *index)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	u8 p;
+	int local_index;
+
+	for (p = 0; p < ib_dev->phys_port_cnt; p++) {
+		table = ports_table[p];
+		local_index = find_gid(table, gid, val, false, mask);
+		if (local_index >= 0) {
+			if (index)
+				*index = local_index;
+			if (port)
+				*port = p + rdma_start_port(ib_dev);
+			return 0;
 		}
 	}
-found:
-	read_unlock_irqrestore(&device->cache.lock, flags);
 
-	return ret;
+	return -ENOENT;
+}
+
+static int ib_cache_gid_find(struct ib_device *ib_dev,
+			     const union ib_gid *gid,
+			     struct net_device *ndev, u8 *port,
+			     u16 *index)
+{
+	unsigned long mask = GID_ATTR_FIND_MASK_GID;
+	struct ib_gid_attr gid_attr_val = {.ndev = ndev};
+
+	if (ndev)
+		mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+	return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
+					mask, port, index);
+}
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+			      const union ib_gid *gid,
+			      u8 port, struct net_device *ndev,
+			      u16 *index)
+{
+	int local_index;
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	unsigned long mask = GID_ATTR_FIND_MASK_GID;
+	struct ib_gid_attr val = {.ndev = ndev};
+
+	if (port < rdma_start_port(ib_dev) ||
+	    port > rdma_end_port(ib_dev))
+		return -ENOENT;
+
+	table = ports_table[port - rdma_start_port(ib_dev)];
+
+	if (ndev)
+		mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+	local_index = find_gid(table, gid, &val, false, mask);
+	if (local_index >= 0) {
+		if (index)
+			*index = local_index;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+	unsigned int i;
+	struct ib_gid_table *table =
+		kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+	if (!table->data_vec)
+		goto err_free_table;
+
+	mutex_init(&table->lock);
+
+	table->sz = sz;
+
+	for (i = 0; i < sz; i++)
+		rwlock_init(&table->data_vec[i].lock);
+
+	return table;
+
+err_free_table:
+	kfree(table);
+	return NULL;
+}
+
+static void release_gid_table(struct ib_gid_table *table)
+{
+	if (table) {
+		kfree(table->data_vec);
+		kfree(table);
+	}
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+				   struct ib_gid_table *table)
+{
+	int i;
+
+	if (!table)
+		return;
+
+	for (i = 0; i < table->sz; ++i) {
+		if (memcmp(&table->data_vec[i].gid, &zgid,
+			   sizeof(table->data_vec[i].gid)))
+			del_gid(ib_dev, port, table, i,
+				table->data_vec[i].props &
+				GID_ATTR_FIND_MASK_DEFAULT);
+	}
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+				  struct net_device *ndev,
+				  enum ib_cache_gid_default_mode mode)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	union ib_gid gid;
+	struct ib_gid_attr gid_attr;
+	struct ib_gid_table *table;
+	int ix;
+	union ib_gid current_gid;
+	struct ib_gid_attr current_gid_attr = {};
+
+	table  = ports_table[port - rdma_start_port(ib_dev)];
+
+	make_default_gid(ndev, &gid);
+	memset(&gid_attr, 0, sizeof(gid_attr));
+	gid_attr.ndev = ndev;
+
+	ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
+
+	/* Coudn't find default GID location */
+	WARN_ON(ix < 0);
+
+	mutex_lock(&table->lock);
+	if (!__ib_cache_gid_get(ib_dev, port, ix,
+				&current_gid, &current_gid_attr) &&
+	    mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+	    !memcmp(&gid, &current_gid, sizeof(gid)) &&
+	    !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+		goto unlock;
+
+	if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+	     memcmp(&current_gid_attr, &zattr,
+		    sizeof(current_gid_attr))) &&
+	    del_gid(ib_dev, port, table, ix, true)) {
+		pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+			ix, gid.raw);
+		goto unlock;
+	}
+
+	if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
+		if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+			pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+				gid.raw);
+
+unlock:
+	if (current_gid_attr.ndev)
+		dev_put(current_gid_attr.ndev);
+	mutex_unlock(&table->lock);
+}
+
+static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+				     struct ib_gid_table *table)
+{
+	if (rdma_protocol_roce(ib_dev, port)) {
+		struct ib_gid_table_entry *entry = &table->data_vec[0];
+
+		entry->props |= GID_TABLE_ENTRY_DEFAULT;
+	}
+
+	return 0;
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+	u8 port;
+	struct ib_gid_table **table;
+	int err = 0;
+
+	table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL);
+
+	if (!table) {
+		pr_warn("failed to allocate ib gid cache for %s\n",
+			ib_dev->name);
+		return -ENOMEM;
+	}
+
+	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+		u8 rdma_port = port + rdma_start_port(ib_dev);
+
+		table[port] =
+			alloc_gid_table(
+				ib_dev->port_immutable[rdma_port].gid_tbl_len);
+		if (!table[port]) {
+			err = -ENOMEM;
+			goto rollback_table_setup;
+		}
+
+		err = gid_table_reserve_default(ib_dev,
+						port + rdma_start_port(ib_dev),
+						table[port]);
+		if (err)
+			goto rollback_table_setup;
+	}
+
+	ib_dev->cache.gid_cache = table;
+	return 0;
+
+rollback_table_setup:
+	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+				       table[port]);
+		release_gid_table(table[port]);
+	}
+
+	kfree(table);
+	return err;
+}
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+	struct ib_gid_table **table = ib_dev->cache.gid_cache;
+	u8 port;
+
+	if (!table)
+		return;
+
+	for (port = 0; port < ib_dev->phys_port_cnt; port++)
+		release_gid_table(table[port]);
+
+	kfree(table);
+	ib_dev->cache.gid_cache = NULL;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+	struct ib_gid_table **table = ib_dev->cache.gid_cache;
+	u8 port;
+
+	if (!table)
+		return;
+
+	for (port = 0; port < ib_dev->phys_port_cnt; port++)
+		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+				       table[port]);
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+	int err;
+
+	err = _gid_table_setup_one(ib_dev);
+
+	if (err)
+		return err;
+
+	err = roce_rescan_device(ib_dev);
+
+	if (err) {
+		gid_table_cleanup_one(ib_dev);
+		gid_table_release_one(ib_dev);
+	}
+
+	return err;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+		      u8                port_num,
+		      int               index,
+		      union ib_gid     *gid)
+{
+	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+		return -EINVAL;
+
+	return __ib_cache_gid_get(device, port_num, index, gid, NULL);
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+		       const union ib_gid *gid,
+		       u8               *port_num,
+		       u16              *index)
+{
+	return ib_cache_gid_find(device, gid, NULL, port_num, index);
 }
 EXPORT_SYMBOL(ib_find_cached_gid);
 
@@ -243,9 +789,21 @@ static void ib_cache_update(struct ib_device *device,
 {
 	struct ib_port_attr       *tprops = NULL;
 	struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
-	struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache;
+	struct ib_gid_cache {
+		int             table_len;
+		union ib_gid    table[0];
+	}			  *gid_cache = NULL;
 	int                        i;
 	int                        ret;
+	struct ib_gid_table	  *table;
+	struct ib_gid_table	 **ports_table = device->cache.gid_cache;
+	bool			   use_roce_gid_table =
+					rdma_cap_roce_gid_table(device, port);
+
+	if (port < rdma_start_port(device) || port > rdma_end_port(device))
+		return;
+
+	table = ports_table[port - rdma_start_port(device)];
 
 	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
 	if (!tprops)
@@ -265,12 +823,14 @@ static void ib_cache_update(struct ib_device *device,
 
 	pkey_cache->table_len = tprops->pkey_tbl_len;
 
-	gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
-			    sizeof *gid_cache->table, GFP_KERNEL);
-	if (!gid_cache)
-		goto err;
+	if (!use_roce_gid_table) {
+		gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
+			    sizeof(*gid_cache->table), GFP_KERNEL);
+		if (!gid_cache)
+			goto err;
 
-	gid_cache->table_len = tprops->gid_tbl_len;
+		gid_cache->table_len = tprops->gid_tbl_len;
+	}
 
 	for (i = 0; i < pkey_cache->table_len; ++i) {
 		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
@@ -281,29 +841,36 @@ static void ib_cache_update(struct ib_device *device,
 		}
 	}
 
-	for (i = 0; i < gid_cache->table_len; ++i) {
-		ret = ib_query_gid(device, port, i, gid_cache->table + i);
-		if (ret) {
-			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
-			       ret, device->name, i);
-			goto err;
+	if (!use_roce_gid_table) {
+		for (i = 0;  i < gid_cache->table_len; ++i) {
+			ret = ib_query_gid(device, port, i,
+					   gid_cache->table + i);
+			if (ret) {
+				printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+				       ret, device->name, i);
+				goto err;
+			}
 		}
 	}
 
 	write_lock_irq(&device->cache.lock);
 
 	old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)];
-	old_gid_cache  = device->cache.gid_cache [port - rdma_start_port(device)];
 
 	device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
-	device->cache.gid_cache [port - rdma_start_port(device)] = gid_cache;
+	if (!use_roce_gid_table) {
+		for (i = 0; i < gid_cache->table_len; i++) {
+			modify_gid(device, port, table, i, gid_cache->table + i,
+				   &zattr, false);
+		}
+	}
 
 	device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
 
 	write_unlock_irq(&device->cache.lock);
 
+	kfree(gid_cache);
 	kfree(old_pkey_cache);
-	kfree(old_gid_cache);
 	kfree(tprops);
 	return;
 
@@ -344,82 +911,88 @@ static void ib_cache_event(struct ib_event_handler *handler,
 	}
 }
 
-static void ib_cache_setup_one(struct ib_device *device)
+int ib_cache_setup_one(struct ib_device *device)
 {
 	int p;
+	int err;
 
 	rwlock_init(&device->cache.lock);
 
 	device->cache.pkey_cache =
 		kzalloc(sizeof *device->cache.pkey_cache *
 			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-	device->cache.gid_cache =
-		kzalloc(sizeof *device->cache.gid_cache *
-			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-
 	device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
 					  (rdma_end_port(device) -
 					   rdma_start_port(device) + 1),
 					  GFP_KERNEL);
-
-	if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+	if (!device->cache.pkey_cache ||
 	    !device->cache.lmc_cache) {
 		printk(KERN_WARNING "Couldn't allocate cache "
 		       "for %s\n", device->name);
-		goto err;
+		return -ENOMEM;
 	}
 
+	err = gid_table_setup_one(device);
+	if (err)
+		/* Allocated memory will be cleaned in the release function */
+		return err;
+
 	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
 		ib_cache_update(device, p + rdma_start_port(device));
 
 	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
 			      device, ib_cache_event);
-	if (ib_register_event_handler(&device->cache.event_handler))
-		goto err_cache;
-
-	return;
+	err = ib_register_event_handler(&device->cache.event_handler);
+	if (err)
+		goto err;
 
-err_cache:
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-		kfree(device->cache.pkey_cache[p]);
-		kfree(device->cache.gid_cache[p]);
-	}
+	return 0;
 
 err:
-	kfree(device->cache.pkey_cache);
-	kfree(device->cache.gid_cache);
-	kfree(device->cache.lmc_cache);
+	gid_table_cleanup_one(device);
+	return err;
 }
 
-static void ib_cache_cleanup_one(struct ib_device *device, void *client_data)
+void ib_cache_release_one(struct ib_device *device)
 {
 	int p;
 
-	ib_unregister_event_handler(&device->cache.event_handler);
-	flush_workqueue(ib_wq);
-
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-		kfree(device->cache.pkey_cache[p]);
-		kfree(device->cache.gid_cache[p]);
-	}
-
+	/*
+	 * The release function frees all the cache elements.
+	 * This function should be called as part of freeing
+	 * all the device's resources when the cache could no
+	 * longer be accessed.
+	 */
+	if (device->cache.pkey_cache)
+		for (p = 0;
+		     p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+			kfree(device->cache.pkey_cache[p]);
+
+	gid_table_release_one(device);
 	kfree(device->cache.pkey_cache);
-	kfree(device->cache.gid_cache);
 	kfree(device->cache.lmc_cache);
 }
 
-static struct ib_client cache_client = {
-	.name   = "cache",
-	.add    = ib_cache_setup_one,
-	.remove = ib_cache_cleanup_one
-};
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+	/* The cleanup function unregisters the event handler,
+	 * waits for all in-progress workqueue elements and cleans
+	 * up the GID cache. This function should be called after
+	 * the device was removed from the devices list and all
+	 * clients were removed, so the cache exists but is
+	 * non-functional and shouldn't be updated anymore.
+	 */
+	ib_unregister_event_handler(&device->cache.event_handler);
+	flush_workqueue(ib_wq);
+	gid_table_cleanup_one(device);
+}
 
-int __init ib_cache_setup(void)
+void __init ib_cache_setup(void)
 {
-	return ib_register_client(&cache_client);
+	roce_gid_mgmt_init();
 }
 
 void __exit ib_cache_cleanup(void)
 {
-	ib_unregister_client(&cache_client);
+	roce_gid_mgmt_cleanup();
 }
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 950583a..70bb36e 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -43,9 +43,58 @@ int  ib_device_register_sysfs(struct ib_device *device,
 						   u8, struct kobject *));
 void ib_device_unregister_sysfs(struct ib_device *device);
 
-int  ib_cache_setup(void);
+void ib_cache_setup(void);
 void ib_cache_cleanup(void);
 
 int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
 			    struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+	      struct net_device *idev, void *cookie);
+
+typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
+	     struct net_device *idev, void *cookie);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+			 roce_netdev_filter filter,
+			 void *filter_cookie,
+			 roce_netdev_callback cb,
+			 void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+			      void *filter_cookie,
+			      roce_netdev_callback cb,
+			      void *cookie);
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+			      const union ib_gid *gid,
+			      u8 port, struct net_device *ndev,
+			      u16 *index);
+
+enum ib_cache_gid_default_mode {
+	IB_CACHE_GID_DEFAULT_MODE_SET,
+	IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+				  struct net_device *ndev,
+				  enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+				     struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+int roce_rescan_device(struct ib_device *ib_dev);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a4a914a..dfa2c57 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -40,6 +40,8 @@
 #include <linux/mutex.h>
 #include <linux/netdevice.h>
 #include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include "core_priv.h"
 
@@ -169,6 +171,7 @@ static void ib_device_release(struct device *device)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 
+	ib_cache_release_one(dev);
 	kfree(dev->port_immutable);
 	kfree(dev);
 }
@@ -342,10 +345,17 @@ int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
+	ret = ib_cache_setup_one(device);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+		goto out;
+	}
+
 	ret = ib_device_register_sysfs(device, port_callback);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
 		       device->name);
+		ib_cache_cleanup_one(device);
 		goto out;
 	}
 
@@ -399,6 +409,7 @@ void ib_unregister_device(struct ib_device *device)
 	mutex_unlock(&device_mutex);
 
 	ib_device_unregister_sysfs(device);
+	ib_cache_cleanup_one(device);
 
 	down_write(&lists_rwsem);
 	spin_lock_irqsave(&device->client_data_lock, flags);
@@ -670,11 +681,80 @@ EXPORT_SYMBOL(ib_query_port);
 int ib_query_gid(struct ib_device *device,
 		 u8 port_num, int index, union ib_gid *gid)
 {
+	if (rdma_cap_roce_gid_table(device, port_num))
+		return ib_get_cached_gid(device, port_num, index, gid);
+
 	return device->query_gid(device, port_num, index, gid);
 }
 EXPORT_SYMBOL(ib_query_gid);
 
 /**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev : IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports of ib_dev
+ * which are related to netdevice and calls callback() on each
+ * device for which filter() function returns non zero.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+			 roce_netdev_filter filter,
+			 void *filter_cookie,
+			 roce_netdev_callback cb,
+			 void *cookie)
+{
+	u8 port;
+
+	for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
+	     port++)
+		if (rdma_protocol_roce(ib_dev, port)) {
+			struct net_device *idev = NULL;
+
+			if (ib_dev->get_netdev)
+				idev = ib_dev->get_netdev(ib_dev, port);
+
+			if (idev &&
+			    idev->reg_state >= NETREG_UNREGISTERED) {
+				dev_put(idev);
+				idev = NULL;
+			}
+
+			if (filter(ib_dev, port, idev, filter_cookie))
+				cb(ib_dev, port, idev, cookie);
+
+			if (idev)
+				dev_put(idev);
+		}
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all RoCE devices' physical ports which are related
+ * to netdevices and calls callback() on each device for which
+ * filter() function returns non zero.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+			      void *filter_cookie,
+			      roce_netdev_callback cb,
+			      void *cookie)
+{
+	struct ib_device *dev;
+
+	down_read(&lists_rwsem);
+	list_for_each_entry(dev, &device_list, core_list)
+		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+	up_read(&lists_rwsem);
+}
+
+/**
  * ib_query_pkey - Get P_Key table entry
  * @device:Device to query
  * @port_num:Port number to query
@@ -753,6 +833,13 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
 	int ret, port, i;
 
 	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+		if (rdma_cap_roce_gid_table(device, port)) {
+			if (!ib_cache_gid_find_by_port(device, gid, port,
+						       NULL, index))
+				*port_num = port;
+				return 0;
+		}
+
 		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
 			ret = ib_query_gid(device, port, i, &tmp_gid);
 			if (ret)
@@ -874,17 +961,10 @@ static int __init ib_core_init(void)
 		goto err_sysfs;
 	}
 
-	ret = ib_cache_setup();
-	if (ret) {
-		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
-		goto err_nl;
-	}
+	ib_cache_setup();
 
 	return 0;
 
-err_nl:
-	ibnl_cleanup();
-
 err_sysfs:
 	class_unregister(&ib_class);
 
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 0000000..7bf4798
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+enum gid_op_type {
+	GID_DEL = 0,
+	GID_ADD
+};
+
+struct update_gid_event_work {
+	struct work_struct work;
+	union ib_gid       gid;
+	struct ib_gid_attr gid_attr;
+	enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ		2
+struct netdev_event_work_cmd {
+	roce_netdev_callback	cb;
+	roce_netdev_filter	filter;
+};
+
+struct netdev_event_work {
+	struct work_struct		work;
+	struct netdev_event_work_cmd	cmds[ROCE_NETDEV_CALLBACK_SZ];
+	struct net_device		*ndev;
+};
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+		       u8 port, union ib_gid *gid,
+		       struct ib_gid_attr *gid_attr)
+{
+	switch (gid_op) {
+	case GID_ADD:
+		ib_cache_gid_add(ib_dev, port, gid, gid_attr);
+		break;
+	case GID_DEL:
+		ib_cache_gid_del(ib_dev, port, gid, gid_attr);
+		break;
+	}
+}
+
+static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
+				 struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *real_dev;
+	struct net_device *master_dev;
+	struct net_device *event_ndev = (struct net_device *)cookie;
+	int res;
+
+	if (!rdma_ndev)
+		return 0;
+
+	rcu_read_lock();
+	master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+	real_dev = rdma_vlan_dev_real_dev(event_ndev);
+	res = (real_dev ? real_dev : event_ndev) ==
+		(master_dev ? master_dev : rdma_ndev);
+	rcu_read_unlock();
+
+	return res;
+}
+
+static int pass_all_filter(struct ib_device *ib_dev, u8 port,
+			   struct net_device *rdma_ndev, void *cookie)
+{
+	return 1;
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+			  struct ib_device *ib_dev,
+			  u8 port, struct net_device *ndev,
+			  struct sockaddr *addr)
+{
+	union ib_gid gid;
+	struct ib_gid_attr gid_attr;
+
+	rdma_ip2gid(addr, &gid);
+	memset(&gid_attr, 0, sizeof(gid_attr));
+	gid_attr.ndev = ndev;
+
+	update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void enum_netdev_default_gids(struct ib_device *ib_dev,
+				     u8 port, struct net_device *event_ndev,
+				     struct net_device *rdma_ndev)
+{
+	if (rdma_ndev != event_ndev)
+		return;
+
+	ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+				     IB_CACHE_GID_DEFAULT_MODE_SET);
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+				 u8 port, struct net_device *ndev)
+{
+	struct in_device *in_dev;
+
+	if (ndev->reg_state >= NETREG_UNREGISTERING)
+		return;
+
+	in_dev = in_dev_get(ndev);
+	if (!in_dev)
+		return;
+
+	for_ifa(in_dev) {
+		struct sockaddr_in ip;
+
+		ip.sin_family = AF_INET;
+		ip.sin_addr.s_addr = ifa->ifa_address;
+		update_gid_ip(GID_ADD, ib_dev, port, ndev,
+			      (struct sockaddr *)&ip);
+	}
+	endfor_ifa(in_dev);
+
+	in_dev_put(in_dev);
+}
+
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+				 u8 port, struct net_device *ndev)
+{
+	struct inet6_ifaddr *ifp;
+	struct inet6_dev *in6_dev;
+	struct sin6_list {
+		struct list_head	list;
+		struct sockaddr_in6	sin6;
+	};
+	struct sin6_list *sin6_iter;
+	struct sin6_list *sin6_temp;
+	struct ib_gid_attr gid_attr = {.ndev = ndev};
+	LIST_HEAD(sin6_list);
+
+	if (ndev->reg_state >= NETREG_UNREGISTERING)
+		return;
+
+	in6_dev = in6_dev_get(ndev);
+	if (!in6_dev)
+		return;
+
+	read_lock_bh(&in6_dev->lock);
+	list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+		struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+		if (!entry) {
+			pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n");
+			continue;
+		}
+
+		entry->sin6.sin6_family = AF_INET6;
+		entry->sin6.sin6_addr = ifp->addr;
+		list_add_tail(&entry->list, &sin6_list);
+	}
+	read_unlock_bh(&in6_dev->lock);
+
+	in6_dev_put(in6_dev);
+
+	list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+		union ib_gid	gid;
+
+		rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
+		update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+		list_del(&sin6_iter->list);
+		kfree(sin6_iter);
+	}
+}
+
+static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+			   struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+
+	enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+	enum_netdev_ipv4_ips(ib_dev, port, event_ndev);
+	if (IS_ENABLED(CONFIG_IPV6))
+		enum_netdev_ipv6_ips(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+			   struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+
+	ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+				    u8 port,
+				    struct net_device *rdma_ndev,
+				    void *cookie)
+{
+	struct net *net;
+	struct net_device *ndev;
+
+	/* Lock the rtnl to make sure the netdevs does not move under
+	 * our feet
+	 */
+	rtnl_lock();
+	for_each_net(net)
+		for_each_netdev(net, ndev)
+			if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev))
+				add_netdev_ips(ib_dev, port, rdma_ndev, ndev);
+	rtnl_unlock();
+}
+
+/* This function will rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices. */
+int roce_rescan_device(struct ib_device *ib_dev)
+{
+	ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+			    enum_all_gids_of_dev_cb, NULL);
+
+	return 0;
+}
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+					      u8 port,
+					      struct net_device *rdma_ndev,
+					      void *cookie)
+{
+	struct update_gid_event_work *parsed = cookie;
+
+	return update_gid(parsed->gid_op, device,
+			  port, &parsed->gid,
+			  &parsed->gid_attr);
+}
+
+/* The following functions operate on all IB devices. netdevice_event and
+ * addr_event execute ib_enum_all_roce_netdevs through a work.
+ * ib_enum_all_roce_netdevs iterates through all IB devices.
+ */
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+	struct netdev_event_work *work =
+		container_of(_work, struct netdev_event_work, work);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++)
+		ib_enum_all_roce_netdevs(work->cmds[i].filter, work->ndev,
+					 work->cmds[i].cb, work->ndev);
+
+	dev_put(work->ndev);
+	kfree(work);
+}
+
+static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
+				struct net_device *ndev)
+{
+	struct netdev_event_work *ndev_work =
+		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+
+	if (!ndev_work) {
+		pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n");
+		return NOTIFY_DONE;
+	}
+
+	memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+	ndev_work->ndev = ndev;
+	dev_hold(ndev);
+	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+	queue_work(ib_wq, &ndev_work->work);
+
+	return NOTIFY_DONE;
+}
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	static const struct netdev_event_work_cmd add_cmd = {
+		.cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
+	static const struct netdev_event_work_cmd del_cmd = {
+		.cb = del_netdev_ips, .filter = pass_all_filter};
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+	if (ndev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+	case NETDEV_UP:
+		cmds[0] = add_cmd;
+		break;
+
+	case NETDEV_UNREGISTER:
+		if (ndev->reg_state < NETREG_UNREGISTERED)
+			cmds[0] = del_cmd;
+		else
+			return NOTIFY_DONE;
+		break;
+
+	case NETDEV_CHANGEADDR:
+		cmds[0] = del_cmd;
+		cmds[1] = add_cmd;
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return netdevice_queue_work(cmds, ndev);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+	struct update_gid_event_work *work =
+		container_of(_work, struct update_gid_event_work, work);
+
+	ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev,
+				 callback_for_addr_gid_device_scan, work);
+
+	dev_put(work->gid_attr.ndev);
+	kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+		      struct sockaddr *sa, struct net_device *ndev)
+{
+	struct update_gid_event_work *work;
+	enum gid_op_type gid_op;
+
+	if (ndev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		gid_op = GID_ADD;
+		break;
+
+	case NETDEV_DOWN:
+		gid_op = GID_DEL;
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work) {
+		pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
+		return NOTIFY_DONE;
+	}
+
+	INIT_WORK(&work->work, update_gid_event_work_handler);
+
+	rdma_ip2gid(sa, &work->gid);
+	work->gid_op = gid_op;
+
+	memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+	dev_hold(ndev);
+	work->gid_attr.ndev   = ndev;
+
+	queue_work(ib_wq, &work->work);
+
+	return NOTIFY_DONE;
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+			  void *ptr)
+{
+	struct sockaddr_in	in;
+	struct net_device	*ndev;
+	struct in_ifaddr	*ifa = ptr;
+
+	in.sin_family = AF_INET;
+	in.sin_addr.s_addr = ifa->ifa_address;
+	ndev = ifa->ifa_dev->dev;
+
+	return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	struct sockaddr_in6	in6;
+	struct net_device	*ndev;
+	struct inet6_ifaddr	*ifa6 = ptr;
+
+	in6.sin6_family = AF_INET6;
+	in6.sin6_addr = ifa6->addr;
+	ndev = ifa6->idev->dev;
+
+	return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
+
+static struct notifier_block nb_netdevice = {
+	.notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+	.notifier_call = inetaddr_event
+};
+
+static struct notifier_block nb_inet6addr = {
+	.notifier_call = inet6addr_event
+};
+
+int __init roce_gid_mgmt_init(void)
+{
+	register_inetaddr_notifier(&nb_inetaddr);
+	if (IS_ENABLED(CONFIG_IPV6))
+		register_inet6addr_notifier(&nb_inet6addr);
+	/* We relay on the netdevice notifier to enumerate all
+	 * existing devices in the system. Register to this notifier
+	 * last to make sure we will not miss any IP add/del
+	 * callbacks.
+	 */
+	register_netdevice_notifier(&nb_netdevice);
+
+	return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+	if (IS_ENABLED(CONFIG_IPV6))
+		unregister_inet6addr_notifier(&nb_inet6addr);
+	unregister_inetaddr_notifier(&nb_inetaddr);
+	unregister_netdevice_notifier(&nb_netdevice);
+	/* Ensure all gid deletion tasks complete before we go down,
+	 * to avoid any reference to free'd memory. By the time
+	 * ib-core is removed, all physical devices have been removed,
+	 * so no issue with remaining hardware contexts.
+	 */
+}
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index c3540da..2de5683 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -65,6 +65,10 @@ union ib_gid {
 	} global;
 };
 
+struct ib_gid_attr {
+	struct net_device	*ndev;
+};
+
 enum rdma_node_type {
 	/* IB values map to NodeInfo:NodeType. */
 	RDMA_NODE_IB_CA 	= 1,
@@ -285,7 +289,7 @@ enum ib_port_cap_flags {
 	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
 	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
 	IB_PORT_CLIENT_REG_SUP			= 1 << 25,
-	IB_PORT_IP_BASED_GIDS			= 1 << 26
+	IB_PORT_IP_BASED_GIDS			= 1 << 26,
 };
 
 enum ib_port_width {
@@ -1487,7 +1491,7 @@ struct ib_cache {
 	rwlock_t                lock;
 	struct ib_event_handler event_handler;
 	struct ib_pkey_cache  **pkey_cache;
-	struct ib_gid_cache   **gid_cache;
+	struct ib_gid_table   **gid_cache;
 	u8                     *lmc_cache;
 };
 
@@ -1573,9 +1577,47 @@ struct ib_device {
 						 struct ib_port_attr *port_attr);
 	enum rdma_link_layer	   (*get_link_layer)(struct ib_device *device,
 						     u8 port_num);
+	/* When calling get_netdev, the HW vendor's driver should return the
+	 * net device of device @device at port @port_num or NULL if such
+	 * a net device doesn't exist. The vendor driver should call dev_hold
+	 * on this net device. The HW vendor's device driver must guarantee
+	 * that this function returns NULL before the net device reaches
+	 * NETDEV_UNREGISTER_FINAL state.
+	 */
+	struct net_device	  *(*get_netdev)(struct ib_device *device,
+						 u8 port_num);
 	int		           (*query_gid)(struct ib_device *device,
 						u8 port_num, int index,
 						union ib_gid *gid);
+	/* When calling add_gid, the HW vendor's driver should
+	 * add the gid of device @device at gid index @index of
+	 * port @port_num to be @gid. Meta-info of that gid (for example,
+	 * the network device related to this gid is available
+	 * at @attr. @context allows the HW vendor driver to store extra
+	 * information together with a GID entry. The HW vendor may allocate
+	 * memory to contain this information and store it in @context when a
+	 * new GID entry is written to. Params are consistent until the next
+	 * call of add_gid or delete_gid. The function should return 0 on
+	 * success or error otherwise. The function could be called
+	 * concurrently for different ports. This function is only called
+	 * when roce_gid_table is used.
+	 */
+	int		           (*add_gid)(struct ib_device *device,
+					      u8 port_num,
+					      unsigned int index,
+					      const union ib_gid *gid,
+					      const struct ib_gid_attr *attr,
+					      void **context);
+	/* When calling del_gid, the HW vendor's driver should delete the
+	 * gid of device @device at gid index @index of port @port_num.
+	 * Upon the deletion of a GID entry, the HW vendor must free any
+	 * allocated memory. The caller will clear @context afterwards.
+	 * This function is only called when roce_gid_table is used.
+	 */
+	int		           (*del_gid)(struct ib_device *device,
+					      u8 port_num,
+					      unsigned int index,
+					      void **context);
 	int		           (*query_pkey)(struct ib_device *device,
 						 u8 port_num, u16 index, u16 *pkey);
 	int		           (*modify_device)(struct ib_device *device,
@@ -2108,6 +2150,26 @@ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_n
 	return device->port_immutable[port_num].max_mad_size;
 }
 
+/**
+ * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * RoCE GID table mechanism manages the various GIDs for a device.
+ *
+ * NOTE: if allocating the port's GID table has failed, this call will still
+ * return true, but any RoCE GID table API will fail.
+ *
+ * Return: true if the port uses RoCE GID table mechanism in order to manage
+ * its GIDs.
+ */
+static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
+					   u8 port_num)
+{
+	return rdma_protocol_roce(device, port_num) &&
+		device->add_gid && device->del_gid;
+}
+
 int ib_query_gid(struct ib_device *device,
 		 u8 port_num, int index, union ib_gid *gid);
 
-- 
cgit v0.10.2


From 98d25afa970d134024d8652360569e3bd74782b3 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 18 Aug 2015 12:22:10 +0300
Subject: IB/core: missing curly braces in ib_find_gid()

Smatch says that, based on the indenting, we should probably add curly
braces here.

Fixes: 03db3a2d81e6 ('IB/core: Add RoCE GID table management')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index dfa2c57..7a4ee79 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -835,9 +835,10 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
 	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
 		if (rdma_cap_roce_gid_table(device, port)) {
 			if (!ib_cache_gid_find_by_port(device, gid, port,
-						       NULL, index))
+						       NULL, index)) {
 				*port_num = port;
 				return 0;
+			}
 		}
 
 		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
-- 
cgit v0.10.2


From 238fdf48f2b54a01cedb5774c3a1e81c94e1a3a0 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:27 +0300
Subject: IB/core: Add RoCE table bonding support

Handling bonding and other devices require us to all all GIDs of the
net-devices which are upper-devices of the RoCE port related
net-device.

Active-backup configurations imposes even more challenges as the
default GID should only be set on the active devices (this is
necessary as otherwise the same MAC could be used for several
slaves and thus several slaves will have identical GIDs).

Managing these configurations are done by listening to:
(a) NETDEV_CHANGEUPPER event
	(1) if a related net-device is linked, delete all inactive
	    slaves default GIDs and add the upper device GIDs.
	(2) if a related net-device is unlinked, delete all upper GIDs
	    and add the default GIDs.
(b) NETDEV_BONDING_FAILOVER:
	(1) delete the bond GIDs from inactive slaves
	(2) delete the inactive slave's default GIDs
	(3) Add the bond GIDs to the active slave.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index 7bf4798..6eecdfb 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -37,6 +37,7 @@
 
 /* For in6_dev_get/in6_dev_put */
 #include <net/addrconf.h>
+#include <net/bonding.h>
 
 #include <rdma/ib_cache.h>
 #include <rdma/ib_addr.h>
@@ -53,16 +54,17 @@ struct update_gid_event_work {
 	enum gid_op_type gid_op;
 };
 
-#define ROCE_NETDEV_CALLBACK_SZ		2
+#define ROCE_NETDEV_CALLBACK_SZ		3
 struct netdev_event_work_cmd {
 	roce_netdev_callback	cb;
 	roce_netdev_filter	filter;
+	struct net_device	*ndev;
+	struct net_device	*filter_ndev;
 };
 
 struct netdev_event_work {
 	struct work_struct		work;
 	struct netdev_event_work_cmd	cmds[ROCE_NETDEV_CALLBACK_SZ];
-	struct net_device		*ndev;
 };
 
 static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
@@ -79,12 +81,70 @@ static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
 	}
 }
 
+enum bonding_slave_state {
+	BONDING_SLAVE_STATE_ACTIVE	= 1UL << 0,
+	BONDING_SLAVE_STATE_INACTIVE	= 1UL << 1,
+	/* No primary slave or the device isn't a slave in bonding */
+	BONDING_SLAVE_STATE_NA		= 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
+								   struct net_device *upper)
+{
+	if (upper && netif_is_bond_master(upper)) {
+		struct net_device *pdev =
+			bond_option_active_slave_get_rcu(netdev_priv(upper));
+
+		if (pdev)
+			return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+				BONDING_SLAVE_STATE_INACTIVE;
+	}
+
+	return BONDING_SLAVE_STATE_NA;
+}
+
+static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
+{
+	struct net_device *_upper = NULL;
+	struct list_head *iter;
+
+	netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
+		if (_upper == upper)
+			break;
+
+	return _upper == upper;
+}
+
+#define REQUIRED_BOND_STATES		(BONDING_SLAVE_STATE_ACTIVE |	\
+					 BONDING_SLAVE_STATE_NA)
 static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
 				 struct net_device *rdma_ndev, void *cookie)
 {
+	struct net_device *event_ndev = (struct net_device *)cookie;
 	struct net_device *real_dev;
+	int res;
+
+	if (!rdma_ndev)
+		return 0;
+
+	rcu_read_lock();
+	real_dev = rdma_vlan_dev_real_dev(event_ndev);
+	if (!real_dev)
+		real_dev = event_ndev;
+
+	res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+	       (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
+		REQUIRED_BOND_STATES)) ||
+	       real_dev == rdma_ndev);
+
+	rcu_read_unlock();
+	return res;
+}
+
+static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port,
+				      struct net_device *rdma_ndev, void *cookie)
+{
 	struct net_device *master_dev;
-	struct net_device *event_ndev = (struct net_device *)cookie;
 	int res;
 
 	if (!rdma_ndev)
@@ -92,9 +152,8 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
 
 	rcu_read_lock();
 	master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
-	real_dev = rdma_vlan_dev_real_dev(event_ndev);
-	res = (real_dev ? real_dev : event_ndev) ==
-		(master_dev ? master_dev : rdma_ndev);
+	res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
+		BONDING_SLAVE_STATE_INACTIVE;
 	rcu_read_unlock();
 
 	return res;
@@ -106,6 +165,25 @@ static int pass_all_filter(struct ib_device *ib_dev, u8 port,
 	return 1;
 }
 
+static int upper_device_filter(struct ib_device *ib_dev, u8 port,
+			       struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+	int res;
+
+	if (!rdma_ndev)
+		return 0;
+
+	if (rdma_ndev == event_ndev)
+		return 1;
+
+	rcu_read_lock();
+	res = is_upper_dev_rcu(rdma_ndev, event_ndev);
+	rcu_read_unlock();
+
+	return res;
+}
+
 static void update_gid_ip(enum gid_op_type gid_op,
 			  struct ib_device *ib_dev,
 			  u8 port, struct net_device *ndev,
@@ -125,13 +203,49 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev,
 				     u8 port, struct net_device *event_ndev,
 				     struct net_device *rdma_ndev)
 {
-	if (rdma_ndev != event_ndev)
+	rcu_read_lock();
+	if (!rdma_ndev ||
+	    ((rdma_ndev != event_ndev &&
+	      !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+	     is_eth_active_slave_of_bonding_rcu(rdma_ndev,
+						netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
+	     BONDING_SLAVE_STATE_INACTIVE)) {
+		rcu_read_unlock();
 		return;
+	}
+	rcu_read_unlock();
 
 	ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
 				     IB_CACHE_GID_DEFAULT_MODE_SET);
 }
 
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+					    u8 port,
+					    struct net_device *event_ndev,
+					    struct net_device *rdma_ndev)
+{
+	struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
+
+	if (!rdma_ndev)
+		return;
+
+	if (!real_dev)
+		real_dev = event_ndev;
+
+	rcu_read_lock();
+
+	if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+	    is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
+	    BONDING_SLAVE_STATE_INACTIVE) {
+		rcu_read_unlock();
+
+		ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+					     IB_CACHE_GID_DEFAULT_MODE_DELETE);
+	} else {
+		rcu_read_unlock();
+	}
+}
+
 static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
 				 u8 port, struct net_device *ndev)
 {
@@ -205,15 +319,21 @@ static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
 	}
 }
 
+static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
+			    struct net_device *ndev)
+{
+	enum_netdev_ipv4_ips(ib_dev, port, ndev);
+	if (IS_ENABLED(CONFIG_IPV6))
+		enum_netdev_ipv6_ips(ib_dev, port, ndev);
+}
+
 static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
 			   struct net_device *rdma_ndev, void *cookie)
 {
 	struct net_device *event_ndev = (struct net_device *)cookie;
 
 	enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
-	enum_netdev_ipv4_ips(ib_dev, port, event_ndev);
-	if (IS_ENABLED(CONFIG_IPV6))
-		enum_netdev_ipv6_ips(ib_dev, port, event_ndev);
+	_add_netdev_ips(ib_dev, port, event_ndev);
 }
 
 static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
@@ -265,6 +385,94 @@ static void callback_for_addr_gid_device_scan(struct ib_device *device,
 			  &parsed->gid_attr);
 }
 
+static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
+				void *cookie,
+				void (*handle_netdev)(struct ib_device *ib_dev,
+						      u8 port,
+						      struct net_device *ndev))
+{
+	struct net_device *ndev = (struct net_device *)cookie;
+	struct upper_list {
+		struct list_head list;
+		struct net_device *upper;
+	};
+	struct net_device *upper;
+	struct list_head *iter;
+	struct upper_list *upper_iter;
+	struct upper_list *upper_temp;
+	LIST_HEAD(upper_list);
+
+	rcu_read_lock();
+	netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
+		struct upper_list *entry = kmalloc(sizeof(*entry),
+						   GFP_ATOMIC);
+
+		if (!entry) {
+			pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
+			continue;
+		}
+
+		list_add_tail(&entry->list, &upper_list);
+		dev_hold(upper);
+		entry->upper = upper;
+	}
+	rcu_read_unlock();
+
+	handle_netdev(ib_dev, port, ndev);
+	list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+				 list) {
+		handle_netdev(ib_dev, port, upper_iter->upper);
+		dev_put(upper_iter->upper);
+		list_del(&upper_iter->list);
+		kfree(upper_iter);
+	}
+}
+
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+				      struct net_device *event_ndev)
+{
+	ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+				 struct net_device *rdma_ndev, void *cookie)
+{
+	handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+}
+
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+				 struct net_device *rdma_ndev, void *cookie)
+{
+	handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+					struct net_device *rdma_ndev,
+					void *cookie)
+{
+	struct net_device *master_ndev;
+
+	rcu_read_lock();
+	master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+	if (master_ndev)
+		dev_hold(master_ndev);
+	rcu_read_unlock();
+
+	if (master_ndev) {
+		bond_delete_netdev_default_gids(ib_dev, port, master_ndev,
+						rdma_ndev);
+		dev_put(master_ndev);
+	}
+}
+
+static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port,
+				   struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+
+	bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+}
+
 /* The following functions operate on all IB devices. netdevice_event and
  * addr_event execute ib_enum_all_roce_netdevs through a work.
  * ib_enum_all_roce_netdevs iterates through all IB devices.
@@ -276,17 +484,22 @@ static void netdevice_event_work_handler(struct work_struct *_work)
 		container_of(_work, struct netdev_event_work, work);
 	unsigned int i;
 
-	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++)
-		ib_enum_all_roce_netdevs(work->cmds[i].filter, work->ndev,
-					 work->cmds[i].cb, work->ndev);
+	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+		ib_enum_all_roce_netdevs(work->cmds[i].filter,
+					 work->cmds[i].filter_ndev,
+					 work->cmds[i].cb,
+					 work->cmds[i].ndev);
+		dev_put(work->cmds[i].ndev);
+		dev_put(work->cmds[i].filter_ndev);
+	}
 
-	dev_put(work->ndev);
 	kfree(work);
 }
 
 static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
 				struct net_device *ndev)
 {
+	unsigned int i;
 	struct netdev_event_work *ndev_work =
 		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
 
@@ -296,8 +509,14 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
 	}
 
 	memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
-	ndev_work->ndev = ndev;
-	dev_hold(ndev);
+	for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+		if (!ndev_work->cmds[i].ndev)
+			ndev_work->cmds[i].ndev = ndev;
+		if (!ndev_work->cmds[i].filter_ndev)
+			ndev_work->cmds[i].filter_ndev = ndev;
+		dev_hold(ndev_work->cmds[i].ndev);
+		dev_hold(ndev_work->cmds[i].filter_ndev);
+	}
 	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
 
 	queue_work(ib_wq, &ndev_work->work);
@@ -305,13 +524,45 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
 	return NOTIFY_DONE;
 }
 
+static const struct netdev_event_work_cmd add_cmd = {
+	.cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
+static const struct netdev_event_work_cmd add_cmd_upper_ips = {
+	.cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev};
+
+static void netdevice_event_changeupper(struct netdev_changeupper_info *changeupper_info,
+					struct netdev_event_work_cmd *cmds)
+{
+	static const struct netdev_event_work_cmd upper_ips_del_cmd = {
+		.cb = del_netdev_upper_ips, .filter = upper_device_filter};
+	static const struct netdev_event_work_cmd bonding_default_del_cmd = {
+		.cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave};
+
+	if (changeupper_info->event ==
+	    NETDEV_CHANGEUPPER_UNLINK) {
+		cmds[0] = upper_ips_del_cmd;
+		cmds[0].ndev = changeupper_info->upper;
+		cmds[1] = add_cmd;
+	} else if (changeupper_info->event ==
+		   NETDEV_CHANGEUPPER_LINK) {
+		cmds[0] = bonding_default_del_cmd;
+		cmds[0].ndev = changeupper_info->upper;
+		cmds[1] = add_cmd_upper_ips;
+		cmds[1].ndev = changeupper_info->upper;
+		cmds[1].filter_ndev = changeupper_info->upper;
+	}
+}
+
 static int netdevice_event(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
-	static const struct netdev_event_work_cmd add_cmd = {
-		.cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
 	static const struct netdev_event_work_cmd del_cmd = {
 		.cb = del_netdev_ips, .filter = pass_all_filter};
+	static const struct netdev_event_work_cmd bonding_default_del_cmd_join = {
+		.cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave};
+	static const struct netdev_event_work_cmd default_del_cmd = {
+		.cb = del_netdev_default_ips, .filter = pass_all_filter};
+	static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+		.cb = del_netdev_upper_ips, .filter = upper_device_filter};
 	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
 	struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
 
@@ -321,7 +572,8 @@ static int netdevice_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_REGISTER:
 	case NETDEV_UP:
-		cmds[0] = add_cmd;
+		cmds[0] = bonding_default_del_cmd_join;
+		cmds[1] = add_cmd;
 		break;
 
 	case NETDEV_UNREGISTER:
@@ -332,9 +584,22 @@ static int netdevice_event(struct notifier_block *this, unsigned long event,
 		break;
 
 	case NETDEV_CHANGEADDR:
-		cmds[0] = del_cmd;
+		cmds[0] = default_del_cmd;
 		cmds[1] = add_cmd;
 		break;
+
+	case NETDEV_CHANGEUPPER:
+		netdevice_event_changeupper(
+			container_of(ptr, struct netdev_changeupper_info, info),
+			cmds);
+		break;
+
+	case NETDEV_BONDING_FAILOVER:
+		cmds[0] = bonding_event_ips_del_cmd;
+		cmds[1] = bonding_default_del_cmd_join;
+		cmds[2] = add_cmd_upper_ips;
+		break;
+
 	default:
 		return NOTIFY_DONE;
 	}
-- 
cgit v0.10.2


From 79857cd31fe70145ff007d4e968557af342c8ccd Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:28 +0300
Subject: net/mlx4: Postpone the registration of net_device

The mlx4 network driver was registered in the context of the 'add'
function of the core driver (called when HW should be registered).
This makes the netdev event NETDEV_REGISTER to be sent in a context
where the answer to get_protocol_dev() callback returns NULL. This may
be confusing to listeners of netdev events.
This patch is a preparation to the patch that implements the
get_netdev() callback in the IB/mlx4 driver.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c
index 913b716..a946e4b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c
@@ -224,6 +224,26 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
 	kfree(mdev);
 }
 
+static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
+{
+	int i;
+	struct mlx4_en_dev *mdev = ctx;
+
+	/* Create a netdev for each port */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Activating port:%d\n", i);
+		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
+			mdev->pndev[i] = NULL;
+	}
+
+	/* register notifier */
+	mdev->nb.notifier_call = mlx4_en_netdev_event;
+	if (register_netdevice_notifier(&mdev->nb)) {
+		mdev->nb.notifier_call = NULL;
+		mlx4_err(mdev, "Failed to create notifier\n");
+	}
+}
+
 static void *mlx4_en_add(struct mlx4_dev *dev)
 {
 	struct mlx4_en_dev *mdev;
@@ -297,21 +317,6 @@ static void *mlx4_en_add(struct mlx4_dev *dev)
 	mutex_init(&mdev->state_lock);
 	mdev->device_up = true;
 
-	/* Setup ports */
-
-	/* Create a netdev for each port */
-	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
-		mlx4_info(mdev, "Activating port:%d\n", i);
-		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
-			mdev->pndev[i] = NULL;
-	}
-	/* register notifier */
-	mdev->nb.notifier_call = mlx4_en_netdev_event;
-	if (register_netdevice_notifier(&mdev->nb)) {
-		mdev->nb.notifier_call = NULL;
-		mlx4_err(mdev, "Failed to create notifier\n");
-	}
-
 	return mdev;
 
 err_mr:
@@ -335,6 +340,7 @@ static struct mlx4_interface mlx4_en_interface = {
 	.event		= mlx4_en_event,
 	.get_dev	= mlx4_en_get_netdev,
 	.protocol	= MLX4_PROT_ETH,
+	.activate	= mlx4_en_activate,
 };
 
 static void mlx4_en_verify_params(void)
diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
index 0d80aed..0472941 100644
--- a/drivers/net/ethernet/mellanox/mlx4/intf.c
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -63,8 +63,11 @@ static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
 		spin_lock_irq(&priv->ctx_lock);
 		list_add_tail(&dev_ctx->list, &priv->ctx_list);
 		spin_unlock_irq(&priv->ctx_lock);
+		if (intf->activate)
+			intf->activate(&priv->dev, dev_ctx->context);
 	} else
 		kfree(dev_ctx);
+
 }
 
 static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index 9553a73..5a06d96 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -59,6 +59,7 @@ struct mlx4_interface {
 	void			(*event) (struct mlx4_dev *dev, void *context,
 					  enum mlx4_dev_event event, unsigned long param);
 	void *			(*get_dev)(struct mlx4_dev *dev, void *context, u8 port);
+	void			(*activate)(struct mlx4_dev *dev, void *context);
 	struct list_head	list;
 	enum mlx4_protocol	protocol;
 	int			flags;
-- 
cgit v0.10.2


From e26be1bfef81a2314a075f54dd8930cf5e8656df Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:29 +0300
Subject: IB/mlx4: Implement ib_device callbacks

get_netdev: get the net_device on the physical port of the IB transport port. In
port aggregation mode it is required to return the netdev of the active port.

modify_gid: note for a change in the RoCE gid cache. Handle this by writing to
the harsware GID table. It is possible that indexes in cahce and hardware tables
won't match so a translation is required when modifying a QP or creating an
address handle.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index fc39a2f..8f66c67 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -55,7 +55,8 @@ struct ib_update_work {
 	u8                 port_num;
 };
 
-static union ib_gid zgid;
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
 
 static const struct ib_gid_attr zattr;
 
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 9ab73a4..5286ea7 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -45,6 +45,9 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+
+#include <net/bonding.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/cmd.h>
@@ -93,8 +96,6 @@ static void init_query_mad(struct ib_smp *mad)
 	mad->method	   = IB_MGMT_METHOD_GET;
 }
 
-static union ib_gid zgid;
-
 static int check_flow_steering_support(struct mlx4_dev *dev)
 {
 	int eth_num_ports = 0;
@@ -131,6 +132,237 @@ static int num_ib_ports(struct mlx4_dev *dev)
 	return ib_ports;
 }
 
+static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
+
+	if (dev) {
+		if (mlx4_is_bonded(ibdev->dev)) {
+			struct net_device *upper = NULL;
+
+			upper = netdev_master_upper_dev_get_rcu(dev);
+			if (upper) {
+				struct net_device *active;
+
+				active = bond_option_active_slave_get_rcu(netdev_priv(upper));
+				if (active)
+					dev = active;
+			}
+		}
+	}
+	if (dev)
+		dev_hold(dev);
+
+	rcu_read_unlock();
+	return dev;
+}
+
+static int mlx4_ib_update_gids(struct gid_entry *gids,
+			       struct mlx4_ib_dev *ibdev,
+			       u8 port_num)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	struct mlx4_dev *dev = ibdev->dev;
+	int i;
+	union ib_gid *gid_tbl;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+
+	gid_tbl = mailbox->buf;
+
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+		memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid));
+
+	err = mlx4_cmd(dev, mailbox->dma,
+		       MLX4_SET_PORT_GID_TABLE << 8 | port_num,
+		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+	if (mlx4_is_bonded(dev))
+		err += mlx4_cmd(dev, mailbox->dma,
+				MLX4_SET_PORT_GID_TABLE << 8 | 2,
+				1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static int mlx4_ib_add_gid(struct ib_device *device,
+			   u8 port_num,
+			   unsigned int index,
+			   const union ib_gid *gid,
+			   const struct ib_gid_attr *attr,
+			   void **context)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	struct mlx4_port_gid_table   *port_gid_table;
+	int free = -1, found = -1;
+	int ret = 0;
+	int hw_update = 0;
+	int i;
+	struct gid_entry *gids = NULL;
+
+	if (!rdma_cap_roce_gid_table(device, port_num))
+		return -EINVAL;
+
+	if (port_num > MLX4_MAX_PORTS)
+		return -EINVAL;
+
+	if (!context)
+		return -EINVAL;
+
+	port_gid_table = &iboe->gids[port_num - 1];
+	spin_lock_bh(&iboe->lock);
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+		if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) {
+			found = i;
+			break;
+		}
+		if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid)))
+			free = i; /* HW has space */
+	}
+
+	if (found < 0) {
+		if (free < 0) {
+			ret = -ENOSPC;
+		} else {
+			port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_ATOMIC);
+			if (!port_gid_table->gids[free].ctx) {
+				ret = -ENOMEM;
+			} else {
+				*context = port_gid_table->gids[free].ctx;
+				memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid));
+				port_gid_table->gids[free].ctx->real_index = free;
+				port_gid_table->gids[free].ctx->refcount = 1;
+				hw_update = 1;
+			}
+		}
+	} else {
+		struct gid_cache_context *ctx = port_gid_table->gids[found].ctx;
+		*context = ctx;
+		ctx->refcount++;
+	}
+	if (!ret && hw_update) {
+		gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC);
+		if (!gids) {
+			ret = -ENOMEM;
+		} else {
+			for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
+				memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+		}
+	}
+	spin_unlock_bh(&iboe->lock);
+
+	if (!ret && hw_update) {
+		ret = mlx4_ib_update_gids(gids, ibdev, port_num);
+		kfree(gids);
+	}
+
+	return ret;
+}
+
+static int mlx4_ib_del_gid(struct ib_device *device,
+			   u8 port_num,
+			   unsigned int index,
+			   void **context)
+{
+	struct gid_cache_context *ctx = *context;
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	struct mlx4_port_gid_table   *port_gid_table;
+	int ret = 0;
+	int hw_update = 0;
+	struct gid_entry *gids = NULL;
+
+	if (!rdma_cap_roce_gid_table(device, port_num))
+		return -EINVAL;
+
+	if (port_num > MLX4_MAX_PORTS)
+		return -EINVAL;
+
+	port_gid_table = &iboe->gids[port_num - 1];
+	spin_lock_bh(&iboe->lock);
+	if (ctx) {
+		ctx->refcount--;
+		if (!ctx->refcount) {
+			unsigned int real_index = ctx->real_index;
+
+			memcpy(&port_gid_table->gids[real_index].gid, &zgid, sizeof(zgid));
+			kfree(port_gid_table->gids[real_index].ctx);
+			port_gid_table->gids[real_index].ctx = NULL;
+			hw_update = 1;
+		}
+	}
+	if (!ret && hw_update) {
+		int i;
+
+		gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC);
+		if (!gids) {
+			ret = -ENOMEM;
+		} else {
+			for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
+				memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+		}
+	}
+	spin_unlock_bh(&iboe->lock);
+
+	if (!ret && hw_update) {
+		ret = mlx4_ib_update_gids(gids, ibdev, port_num);
+		kfree(gids);
+	}
+	return ret;
+}
+
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+				    u8 port_num, int index)
+{
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	struct gid_cache_context *ctx = NULL;
+	union ib_gid gid;
+	struct mlx4_port_gid_table   *port_gid_table;
+	int real_index = -EINVAL;
+	int i;
+	int ret;
+	unsigned long flags;
+
+	if (port_num > MLX4_MAX_PORTS)
+		return -EINVAL;
+
+	if (mlx4_is_bonded(ibdev->dev))
+		port_num = 1;
+
+	if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num))
+		return index;
+
+	ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid);
+	if (ret)
+		return ret;
+
+	if (!memcmp(&gid, &zgid, sizeof(gid)))
+		return -EINVAL;
+
+	spin_lock_irqsave(&iboe->lock, flags);
+	port_gid_table = &iboe->gids[port_num - 1];
+
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+		if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid))) {
+			ctx = port_gid_table->gids[i].ctx;
+			break;
+		}
+	if (ctx)
+		real_index = ctx->real_index;
+	spin_unlock_irqrestore(&iboe->lock, flags);
+	return real_index;
+}
+
 static int mlx4_ib_query_device(struct ib_device *ibdev,
 				struct ib_device_attr *props,
 				struct ib_udata *uhw)
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 3b85f04..87a720f 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -457,6 +457,20 @@ struct mlx4_ib_sriov {
 	struct idr pv_id_table;
 };
 
+struct gid_cache_context {
+	int real_index;
+	int refcount;
+};
+
+struct gid_entry {
+	union ib_gid	gid;
+	struct gid_cache_context *ctx;
+};
+
+struct mlx4_port_gid_table {
+	struct gid_entry gids[MLX4_MAX_PORT_GIDS];
+};
+
 struct mlx4_ib_iboe {
 	spinlock_t		lock;
 	struct net_device      *netdevs[MLX4_MAX_PORTS];
@@ -466,6 +480,7 @@ struct mlx4_ib_iboe {
 	struct notifier_block	nb_inet;
 	struct notifier_block	nb_inet6;
 	union ib_gid		gid_table[MLX4_MAX_PORTS][128];
+	struct mlx4_port_gid_table gids[MLX4_MAX_PORTS];
 };
 
 struct pkey_mgt {
@@ -839,5 +854,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 			  u64 start, u64 length, u64 virt_addr,
 			  int mr_access_flags, struct ib_pd *pd,
 			  struct ib_udata *udata);
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+				    u8 port_num, int index);
 
 #endif /* MLX4_IB_H */
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index fd13c1c..5c7687a 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -79,7 +79,8 @@ enum {
 
 enum {
 	MLX4_MAX_PORTS		= 2,
-	MLX4_MAX_PORT_PKEYS	= 128
+	MLX4_MAX_PORT_PKEYS	= 128,
+	MLX4_MAX_PORT_GIDS	= 128
 };
 
 /* base qkey for use in sriov tunnel-qp/proxy-qp communication.
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 2de5683..5eff55c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -65,6 +65,8 @@ union ib_gid {
 	} global;
 };
 
+extern union ib_gid zgid;
+
 struct ib_gid_attr {
 	struct net_device	*ndev;
 };
-- 
cgit v0.10.2


From 5070cd2239bd4b382c55c212f10b845ec2de31fc Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:30 +0300
Subject: IB/mlx4: Replace mechanism for RoCE GID management

Manage RoCE gid table with logic in IB/core, which is common to all
vendors, and remove the mechanism from the mlx4 IB driver.
Since management of the GID cache may lead to index mismatch with the
hardware GID table, a translation between indexes is required when
modifying a QP or creating an address handle.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index f50a546..7ad6f96 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -89,7 +89,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
 	if (vlan_tag < 0x1000)
 		vlan_tag |= (ah_attr->sl & 7) << 13;
 	ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
-	ah->av.eth.gid_index = ah_attr->grh.sgid_index;
+	ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index);
 	ah->av.eth.vlan = cpu_to_be16(vlan_tag);
 	if (ah_attr->static_rate) {
 		ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 5286ea7..1437ed5 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -77,13 +77,6 @@ static const char mlx4_ib_version[] =
 	DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
 	DRV_VERSION " (" DRV_RELDATE ")\n";
 
-struct update_gid_work {
-	struct work_struct	work;
-	union ib_gid		gids[128];
-	struct mlx4_ib_dev     *dev;
-	int			port;
-};
-
 static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
 
 static struct workqueue_struct *wq;
@@ -647,12 +640,13 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 	props->state		= IB_PORT_DOWN;
 	props->phys_state	= state_to_phys_state(props->state);
 	props->active_mtu	= IB_MTU_256;
-	if (is_bonded)
-		rtnl_lock(); /* required to get upper dev */
 	spin_lock_bh(&iboe->lock);
 	ndev = iboe->netdevs[port - 1];
-	if (ndev && is_bonded)
-		ndev = netdev_master_upper_dev_get(ndev);
+	if (ndev && is_bonded) {
+		rcu_read_lock(); /* required to get upper dev */
+		ndev = netdev_master_upper_dev_get_rcu(ndev);
+		rcu_read_unlock();
+	}
 	if (!ndev)
 		goto out_unlock;
 
@@ -664,8 +658,6 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 	props->phys_state	= state_to_phys_state(props->state);
 out_unlock:
 	spin_unlock_bh(&iboe->lock);
-	if (is_bonded)
-		rtnl_unlock();
 out:
 	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
 	return err;
@@ -748,23 +740,27 @@ out:
 	return err;
 }
 
-static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
-			  union ib_gid *gid)
-{
-	struct mlx4_ib_dev *dev = to_mdev(ibdev);
-
-	*gid = dev->iboe.gid_table[port - 1][index];
-
-	return 0;
-}
-
 static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 			     union ib_gid *gid)
 {
-	if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+	int ret;
+
+	if (rdma_protocol_ib(ibdev, port))
 		return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
-	else
-		return iboe_query_gid(ibdev, port, index, gid);
+
+	if (!rdma_protocol_roce(ibdev, port))
+		return -ENODEV;
+
+	if (!rdma_cap_roce_gid_table(ibdev, port))
+		return -ENODEV;
+
+	ret = ib_get_cached_gid(ibdev, port, index, gid);
+	if (ret == -EAGAIN) {
+		memcpy(gid, &zgid, sizeof(*gid));
+		return 0;
+	}
+
+	return ret;
 }
 
 int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -1780,272 +1776,6 @@ static struct device_attribute *mlx4_class_attributes[] = {
 	&dev_attr_board_id
 };
 
-static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id,
-				     struct net_device *dev)
-{
-	memcpy(eui, dev->dev_addr, 3);
-	memcpy(eui + 5, dev->dev_addr + 3, 3);
-	if (vlan_id < 0x1000) {
-		eui[3] = vlan_id >> 8;
-		eui[4] = vlan_id & 0xff;
-	} else {
-		eui[3] = 0xff;
-		eui[4] = 0xfe;
-	}
-	eui[0] ^= 2;
-}
-
-static void update_gids_task(struct work_struct *work)
-{
-	struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
-	struct mlx4_cmd_mailbox *mailbox;
-	union ib_gid *gids;
-	int err;
-	struct mlx4_dev	*dev = gw->dev->dev;
-	int is_bonded = mlx4_is_bonded(dev);
-
-	if (!gw->dev->ib_active)
-		return;
-
-	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox)) {
-		pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
-		return;
-	}
-
-	gids = mailbox->buf;
-	memcpy(gids, gw->gids, sizeof gw->gids);
-
-	err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
-		       MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
-		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
-	if (err)
-		pr_warn("set port command failed\n");
-	else
-		if ((gw->port == 1) || !is_bonded)
-			mlx4_ib_dispatch_event(gw->dev,
-					       is_bonded ? 1 : gw->port,
-					       IB_EVENT_GID_CHANGE);
-
-	mlx4_free_cmd_mailbox(dev, mailbox);
-	kfree(gw);
-}
-
-static void reset_gids_task(struct work_struct *work)
-{
-	struct update_gid_work *gw =
-			container_of(work, struct update_gid_work, work);
-	struct mlx4_cmd_mailbox *mailbox;
-	union ib_gid *gids;
-	int err;
-	struct mlx4_dev	*dev = gw->dev->dev;
-
-	if (!gw->dev->ib_active)
-		return;
-
-	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox)) {
-		pr_warn("reset gid table failed\n");
-		goto free;
-	}
-
-	gids = mailbox->buf;
-	memcpy(gids, gw->gids, sizeof(gw->gids));
-
-	if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) ==
-				    IB_LINK_LAYER_ETHERNET) {
-		err = mlx4_cmd(dev, mailbox->dma,
-			       MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
-			       MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
-			       MLX4_CMD_TIME_CLASS_B,
-			       MLX4_CMD_WRAPPED);
-		if (err)
-			pr_warn("set port %d command failed\n", gw->port);
-	}
-
-	mlx4_free_cmd_mailbox(dev, mailbox);
-free:
-	kfree(gw);
-}
-
-static int update_gid_table(struct mlx4_ib_dev *dev, int port,
-			    union ib_gid *gid, int clear,
-			    int default_gid)
-{
-	struct update_gid_work *work;
-	int i;
-	int need_update = 0;
-	int free = -1;
-	int found = -1;
-	int max_gids;
-
-	if (default_gid) {
-		free = 0;
-	} else {
-		max_gids = dev->dev->caps.gid_table_len[port];
-		for (i = 1; i < max_gids; ++i) {
-			if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid,
-				    sizeof(*gid)))
-				found = i;
-
-			if (clear) {
-				if (found >= 0) {
-					need_update = 1;
-					dev->iboe.gid_table[port - 1][found] =
-						zgid;
-					break;
-				}
-			} else {
-				if (found >= 0)
-					break;
-
-				if (free < 0 &&
-				    !memcmp(&dev->iboe.gid_table[port - 1][i],
-					    &zgid, sizeof(*gid)))
-					free = i;
-			}
-		}
-	}
-
-	if (found == -1 && !clear && free >= 0) {
-		dev->iboe.gid_table[port - 1][free] = *gid;
-		need_update = 1;
-	}
-
-	if (!need_update)
-		return 0;
-
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
-	if (!work)
-		return -ENOMEM;
-
-	memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids));
-	INIT_WORK(&work->work, update_gids_task);
-	work->port = port;
-	work->dev = dev;
-	queue_work(wq, &work->work);
-
-	return 0;
-}
-
-static void mlx4_make_default_gid(struct  net_device *dev, union ib_gid *gid)
-{
-	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
-	mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev);
-}
-
-
-static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port)
-{
-	struct update_gid_work *work;
-
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
-	if (!work)
-		return -ENOMEM;
-
-	memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids));
-	memset(work->gids, 0, sizeof(work->gids));
-	INIT_WORK(&work->work, reset_gids_task);
-	work->dev = dev;
-	work->port = port;
-	queue_work(wq, &work->work);
-	return 0;
-}
-
-static int mlx4_ib_addr_event(int event, struct net_device *event_netdev,
-			      struct mlx4_ib_dev *ibdev, union ib_gid *gid)
-{
-	struct mlx4_ib_iboe *iboe;
-	int port = 0;
-	struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ?
-				rdma_vlan_dev_real_dev(event_netdev) :
-				event_netdev;
-	union ib_gid default_gid;
-
-	mlx4_make_default_gid(real_dev, &default_gid);
-
-	if (!memcmp(gid, &default_gid, sizeof(*gid)))
-		return 0;
-
-	if (event != NETDEV_DOWN && event != NETDEV_UP)
-		return 0;
-
-	if ((real_dev != event_netdev) &&
-	    (event == NETDEV_DOWN) &&
-	    rdma_link_local_addr((struct in6_addr *)gid))
-		return 0;
-
-	iboe = &ibdev->iboe;
-	spin_lock_bh(&iboe->lock);
-
-	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
-		if ((netif_is_bond_master(real_dev) &&
-		     (real_dev == iboe->masters[port - 1])) ||
-		     (!netif_is_bond_master(real_dev) &&
-		     (real_dev == iboe->netdevs[port - 1])))
-			update_gid_table(ibdev, port, gid,
-					 event == NETDEV_DOWN, 0);
-
-	spin_unlock_bh(&iboe->lock);
-	return 0;
-
-}
-
-static u8 mlx4_ib_get_dev_port(struct net_device *dev,
-			       struct mlx4_ib_dev *ibdev)
-{
-	u8 port = 0;
-	struct mlx4_ib_iboe *iboe;
-	struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ?
-				rdma_vlan_dev_real_dev(dev) : dev;
-
-	iboe = &ibdev->iboe;
-
-	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
-		if ((netif_is_bond_master(real_dev) &&
-		     (real_dev == iboe->masters[port - 1])) ||
-		     (!netif_is_bond_master(real_dev) &&
-		     (real_dev == iboe->netdevs[port - 1])))
-			break;
-
-	if ((port == 0) || (port > ibdev->dev->caps.num_ports))
-		return 0;
-	else
-		return port;
-}
-
-static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event,
-				void *ptr)
-{
-	struct mlx4_ib_dev *ibdev;
-	struct in_ifaddr *ifa = ptr;
-	union ib_gid gid;
-	struct net_device *event_netdev = ifa->ifa_dev->dev;
-
-	ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
-
-	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet);
-
-	mlx4_ib_addr_event(event, event_netdev, ibdev, &gid);
-	return NOTIFY_DONE;
-}
-
-#if IS_ENABLED(CONFIG_IPV6)
-static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event,
-				void *ptr)
-{
-	struct mlx4_ib_dev *ibdev;
-	struct inet6_ifaddr *ifa = ptr;
-	union  ib_gid *gid = (union ib_gid *)&ifa->addr;
-	struct net_device *event_netdev = ifa->idev->dev;
-
-	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6);
-
-	mlx4_ib_addr_event(event, event_netdev, ibdev, gid);
-	return NOTIFY_DONE;
-}
-#endif
-
 #define MLX4_IB_INVALID_MAC	((u64)-1)
 static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
 			       struct net_device *dev,
@@ -2104,94 +1834,6 @@ unlock:
 	mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
 }
 
-static void mlx4_ib_get_dev_addr(struct net_device *dev,
-				 struct mlx4_ib_dev *ibdev, u8 port)
-{
-	struct in_device *in_dev;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct inet6_dev *in6_dev;
-	union ib_gid  *pgid;
-	struct inet6_ifaddr *ifp;
-	union ib_gid default_gid;
-#endif
-	union ib_gid gid;
-
-
-	if ((port == 0) || (port > ibdev->dev->caps.num_ports))
-		return;
-
-	/* IPv4 gids */
-	in_dev = in_dev_get(dev);
-	if (in_dev) {
-		for_ifa(in_dev) {
-			/*ifa->ifa_address;*/
-			ipv6_addr_set_v4mapped(ifa->ifa_address,
-					       (struct in6_addr *)&gid);
-			update_gid_table(ibdev, port, &gid, 0, 0);
-		}
-		endfor_ifa(in_dev);
-		in_dev_put(in_dev);
-	}
-#if IS_ENABLED(CONFIG_IPV6)
-	mlx4_make_default_gid(dev, &default_gid);
-	/* IPv6 gids */
-	in6_dev = in6_dev_get(dev);
-	if (in6_dev) {
-		read_lock_bh(&in6_dev->lock);
-		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
-			pgid = (union ib_gid *)&ifp->addr;
-			if (!memcmp(pgid, &default_gid, sizeof(*pgid)))
-				continue;
-			update_gid_table(ibdev, port, pgid, 0, 0);
-		}
-		read_unlock_bh(&in6_dev->lock);
-		in6_dev_put(in6_dev);
-	}
-#endif
-}
-
-static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev,
-				 struct  net_device *dev, u8 port)
-{
-	union ib_gid gid;
-	mlx4_make_default_gid(dev, &gid);
-	update_gid_table(ibdev, port, &gid, 0, 1);
-}
-
-static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
-{
-	struct	net_device *dev;
-	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
-	int i;
-	int err = 0;
-
-	for (i = 1; i <= ibdev->num_ports; ++i) {
-		if (rdma_port_get_link_layer(&ibdev->ib_dev, i) ==
-		    IB_LINK_LAYER_ETHERNET) {
-			err = reset_gid_table(ibdev, i);
-			if (err)
-				goto out;
-		}
-	}
-
-	read_lock(&dev_base_lock);
-	spin_lock_bh(&iboe->lock);
-
-	for_each_netdev(&init_net, dev) {
-		u8 port = mlx4_ib_get_dev_port(dev, ibdev);
-		/* port will be non-zero only for ETH ports */
-		if (port) {
-			mlx4_ib_set_default_gid(ibdev, dev, port);
-			mlx4_ib_get_dev_addr(dev, ibdev, port);
-		}
-	}
-
-	spin_unlock_bh(&iboe->lock);
-	read_unlock(&dev_base_lock);
-out:
-	return err;
-}
-
 static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 				 struct net_device *dev,
 				 unsigned long event)
@@ -2201,81 +1843,22 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 	int update_qps_port = -1;
 	int port;
 
+	ASSERT_RTNL();
+
 	iboe = &ibdev->iboe;
 
 	spin_lock_bh(&iboe->lock);
 	mlx4_foreach_ib_transport_port(port, ibdev->dev) {
-		enum ib_port_state	port_state = IB_PORT_NOP;
-		struct net_device *old_master = iboe->masters[port - 1];
-		struct net_device *curr_netdev;
-		struct net_device *curr_master;
 
 		iboe->netdevs[port - 1] =
 			mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
-		if (iboe->netdevs[port - 1])
-			mlx4_ib_set_default_gid(ibdev,
-						iboe->netdevs[port - 1], port);
-		curr_netdev = iboe->netdevs[port - 1];
-
-		if (iboe->netdevs[port - 1] &&
-		    netif_is_bond_slave(iboe->netdevs[port - 1])) {
-			iboe->masters[port - 1] = netdev_master_upper_dev_get(
-				iboe->netdevs[port - 1]);
-		} else {
-			iboe->masters[port - 1] = NULL;
-		}
-		curr_master = iboe->masters[port - 1];
 
 		if (dev == iboe->netdevs[port - 1] &&
 		    (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER ||
 		     event == NETDEV_UP || event == NETDEV_CHANGE))
 			update_qps_port = port;
 
-		if (curr_netdev) {
-			port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ?
-						IB_PORT_ACTIVE : IB_PORT_DOWN;
-			mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
-			if (curr_master) {
-				/* if using bonding/team and a slave port is down, we
-				 * don't want the bond IP based gids in the table since
-				 * flows that select port by gid may get the down port.
-				*/
-				if (port_state == IB_PORT_DOWN &&
-				    !mlx4_is_bonded(ibdev->dev)) {
-					reset_gid_table(ibdev, port);
-					mlx4_ib_set_default_gid(ibdev,
-								curr_netdev,
-								port);
-				} else {
-					/* gids from the upper dev (bond/team)
-					 * should appear in port's gid table
-					*/
-					mlx4_ib_get_dev_addr(curr_master,
-							     ibdev, port);
-				}
-			}
-			/* if bonding is used it is possible that we add it to
-			 * masters only after IP address is assigned to the
-			 * net bonding interface.
-			*/
-			if (curr_master && (old_master != curr_master)) {
-				reset_gid_table(ibdev, port);
-				mlx4_ib_set_default_gid(ibdev,
-							curr_netdev, port);
-				mlx4_ib_get_dev_addr(curr_master, ibdev, port);
-			}
-
-			if (!curr_master && (old_master != curr_master)) {
-				reset_gid_table(ibdev, port);
-				mlx4_ib_set_default_gid(ibdev,
-							curr_netdev, port);
-				mlx4_ib_get_dev_addr(curr_netdev, ibdev, port);
-			}
-		} else {
-			reset_gid_table(ibdev, port);
-		}
 	}
-
 	spin_unlock_bh(&iboe->lock);
 
 	if (update_qps_port > 0)
@@ -2458,6 +2041,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 						1 : ibdev->num_ports;
 	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
 	ibdev->ib_dev.dma_device	= &dev->persist->pdev->dev;
+	ibdev->ib_dev.get_netdev	= mlx4_ib_get_netdev;
+	ibdev->ib_dev.add_gid		= mlx4_ib_add_gid;
+	ibdev->ib_dev.del_gid		= mlx4_ib_del_gid;
 
 	if (dev->caps.userspace_caps)
 		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
@@ -2668,26 +2254,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 				goto err_notif;
 			}
 		}
-		if (!iboe->nb_inet.notifier_call) {
-			iboe->nb_inet.notifier_call = mlx4_ib_inet_event;
-			err = register_inetaddr_notifier(&iboe->nb_inet);
-			if (err) {
-				iboe->nb_inet.notifier_call = NULL;
-				goto err_notif;
-			}
-		}
-#if IS_ENABLED(CONFIG_IPV6)
-		if (!iboe->nb_inet6.notifier_call) {
-			iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event;
-			err = register_inet6addr_notifier(&iboe->nb_inet6);
-			if (err) {
-				iboe->nb_inet6.notifier_call = NULL;
-				goto err_notif;
-			}
-		}
-#endif
-		if (mlx4_ib_init_gid_table(ibdev))
-			goto err_notif;
 	}
 
 	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
@@ -2718,18 +2284,6 @@ err_notif:
 			pr_warn("failure unregistering notifier\n");
 		ibdev->iboe.nb.notifier_call = NULL;
 	}
-	if (ibdev->iboe.nb_inet.notifier_call) {
-		if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
-			pr_warn("failure unregistering notifier\n");
-		ibdev->iboe.nb_inet.notifier_call = NULL;
-	}
-#if IS_ENABLED(CONFIG_IPV6)
-	if (ibdev->iboe.nb_inet6.notifier_call) {
-		if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
-			pr_warn("failure unregistering notifier\n");
-		ibdev->iboe.nb_inet6.notifier_call = NULL;
-	}
-#endif
 	flush_workqueue(wq);
 
 	mlx4_ib_close_sriov(ibdev);
@@ -2855,19 +2409,6 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 		kfree(ibdev->ib_uc_qpns_bitmap);
 	}
 
-	if (ibdev->iboe.nb_inet.notifier_call) {
-		if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
-			pr_warn("failure unregistering notifier\n");
-		ibdev->iboe.nb_inet.notifier_call = NULL;
-	}
-#if IS_ENABLED(CONFIG_IPV6)
-	if (ibdev->iboe.nb_inet6.notifier_call) {
-		if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
-			pr_warn("failure unregistering notifier\n");
-		ibdev->iboe.nb_inet6.notifier_call = NULL;
-	}
-#endif
-
 	iounmap(ibdev->uar_map);
 	for (p = 0; p < ibdev->num_ports; ++p)
 		if (ibdev->counters[p].index != -1 &&
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 87a720f..cb47c2c 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -474,12 +474,8 @@ struct mlx4_port_gid_table {
 struct mlx4_ib_iboe {
 	spinlock_t		lock;
 	struct net_device      *netdevs[MLX4_MAX_PORTS];
-	struct net_device      *masters[MLX4_MAX_PORTS];
 	atomic64_t		mac[MLX4_MAX_PORTS];
 	struct notifier_block 	nb;
-	struct notifier_block	nb_inet;
-	struct notifier_block	nb_inet6;
-	union ib_gid		gid_table[MLX4_MAX_PORTS][128];
 	struct mlx4_port_gid_table gids[MLX4_MAX_PORTS];
 };
 
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index c5a3a5f..4ad9be3 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1292,14 +1292,18 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 		path->static_rate = 0;
 
 	if (ah->ah_flags & IB_AH_GRH) {
-		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+		int real_sgid_index = mlx4_ib_gid_index_to_real_index(dev,
+								      port,
+								      ah->grh.sgid_index);
+
+		if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
 			pr_err("sgid_index (%u) too large. max is %d\n",
-			       ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+			       real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
 			return -1;
 		}
 
 		path->grh_mylmc |= 1 << 7;
-		path->mgid_index = ah->grh.sgid_index;
+		path->mgid_index = real_sgid_index;
 		path->hop_limit  = ah->grh.hop_limit;
 		path->tclass_flowlabel =
 			cpu_to_be32((ah->grh.traffic_class << 20) |
-- 
cgit v0.10.2


From cc36929e736a30a291ab543b633046eb57d67e68 Mon Sep 17 00:00:00 2001
From: Somnath Kotur <somnath.kotur@avagotech.com>
Date: Thu, 30 Jul 2015 18:33:31 +0300
Subject: RDMA/ocrdma: Incorporate the moving of GID Table mgmt to IB/Core

1.Change query_gid hook to return value from IB/Core GID
  management APIs.
2.Get rid of all the netdev notifier chain subscription code as well
  as maintenance of SGID Table in memory.
3.Implement get_netdev hook in driver.

Signed-off-by: Somnath Kotur <somnath.kotur@avagotech.com>
Signed-off-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
index 6a36338..b4091ab 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -246,7 +246,6 @@ struct ocrdma_dev {
 	u16 base_eqid;
 	u16 max_eq;
 
-	union ib_gid *sgid_tbl;
 	/* provided synchronization to sgid table for
 	 * updating gid entries triggered by notifier.
 	 */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 81ed8a3..87aa55d 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -67,8 +67,6 @@ static LIST_HEAD(ocrdma_dev_list);
 static DEFINE_SPINLOCK(ocrdma_devlist_lock);
 static DEFINE_IDR(ocrdma_dev_id);
 
-static union ib_gid ocrdma_zero_sgid;
-
 void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
 {
 	u8 mac_addr[6];
@@ -83,135 +81,6 @@ void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
 	guid[6] = mac_addr[4];
 	guid[7] = mac_addr[5];
 }
-
-static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid)
-{
-	int i;
-	unsigned long flags;
-
-	memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid));
-
-
-	spin_lock_irqsave(&dev->sgid_lock, flags);
-	for (i = 0; i < OCRDMA_MAX_SGID; i++) {
-		if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid,
-			    sizeof(union ib_gid))) {
-			/* found free entry */
-			memcpy(&dev->sgid_tbl[i], new_sgid,
-			       sizeof(union ib_gid));
-			spin_unlock_irqrestore(&dev->sgid_lock, flags);
-			return true;
-		} else if (!memcmp(&dev->sgid_tbl[i], new_sgid,
-				   sizeof(union ib_gid))) {
-			/* entry already present, no addition is required. */
-			spin_unlock_irqrestore(&dev->sgid_lock, flags);
-			return false;
-		}
-	}
-	spin_unlock_irqrestore(&dev->sgid_lock, flags);
-	return false;
-}
-
-static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid)
-{
-	int found = false;
-	int i;
-	unsigned long flags;
-
-
-	spin_lock_irqsave(&dev->sgid_lock, flags);
-	/* first is default sgid, which cannot be deleted. */
-	for (i = 1; i < OCRDMA_MAX_SGID; i++) {
-		if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) {
-			/* found matching entry */
-			memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid));
-			found = true;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&dev->sgid_lock, flags);
-	return found;
-}
-
-static int ocrdma_addr_event(unsigned long event, struct net_device *netdev,
-			     union ib_gid *gid)
-{
-	struct ib_event gid_event;
-	struct ocrdma_dev *dev;
-	bool found = false;
-	bool updated = false;
-	bool is_vlan = false;
-
-	is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN;
-	if (is_vlan)
-		netdev = rdma_vlan_dev_real_dev(netdev);
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) {
-		if (dev->nic_info.netdev == netdev) {
-			found = true;
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	if (!found)
-		return NOTIFY_DONE;
-
-	mutex_lock(&dev->dev_lock);
-	switch (event) {
-	case NETDEV_UP:
-		updated = ocrdma_add_sgid(dev, gid);
-		break;
-	case NETDEV_DOWN:
-		updated = ocrdma_del_sgid(dev, gid);
-		break;
-	default:
-		break;
-	}
-	if (updated) {
-		/* GID table updated, notify the consumers about it */
-		gid_event.device = &dev->ibdev;
-		gid_event.element.port_num = 1;
-		gid_event.event = IB_EVENT_GID_CHANGE;
-		ib_dispatch_event(&gid_event);
-	}
-	mutex_unlock(&dev->dev_lock);
-	return NOTIFY_OK;
-}
-
-static int ocrdma_inetaddr_event(struct notifier_block *notifier,
-				  unsigned long event, void *ptr)
-{
-	struct in_ifaddr *ifa = ptr;
-	union ib_gid gid;
-	struct net_device *netdev = ifa->ifa_dev->dev;
-
-	ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
-	return ocrdma_addr_event(event, netdev, &gid);
-}
-
-static struct notifier_block ocrdma_inetaddr_notifier = {
-	.notifier_call = ocrdma_inetaddr_event
-};
-
-#if IS_ENABLED(CONFIG_IPV6)
-
-static int ocrdma_inet6addr_event(struct notifier_block *notifier,
-				  unsigned long event, void *ptr)
-{
-	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
-	union  ib_gid *gid = (union ib_gid *)&ifa->addr;
-	struct net_device *netdev = ifa->idev->dev;
-	return ocrdma_addr_event(event, netdev, gid);
-}
-
-static struct notifier_block ocrdma_inet6addr_notifier = {
-	.notifier_call = ocrdma_inet6addr_event
-};
-
-#endif /* IPV6 and VLAN */
-
 static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device,
 					      u8 port_num)
 {
@@ -280,6 +149,9 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 	dev->ibdev.query_port = ocrdma_query_port;
 	dev->ibdev.modify_port = ocrdma_modify_port;
 	dev->ibdev.query_gid = ocrdma_query_gid;
+	dev->ibdev.get_netdev = ocrdma_get_netdev;
+	dev->ibdev.add_gid = ocrdma_add_gid;
+	dev->ibdev.del_gid = ocrdma_del_gid;
 	dev->ibdev.get_link_layer = ocrdma_link_layer;
 	dev->ibdev.alloc_pd = ocrdma_alloc_pd;
 	dev->ibdev.dealloc_pd = ocrdma_dealloc_pd;
@@ -342,12 +214,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
 {
 	mutex_init(&dev->dev_lock);
-	dev->sgid_tbl = kzalloc(sizeof(union ib_gid) *
-				OCRDMA_MAX_SGID, GFP_KERNEL);
-	if (!dev->sgid_tbl)
-		goto alloc_err;
-	spin_lock_init(&dev->sgid_lock);
-
 	dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) *
 			      OCRDMA_MAX_CQ, GFP_KERNEL);
 	if (!dev->cq_tbl)
@@ -379,7 +245,6 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev)
 	kfree(dev->stag_arr);
 	kfree(dev->qp_tbl);
 	kfree(dev->cq_tbl);
-	kfree(dev->sgid_tbl);
 }
 
 /* OCRDMA sysfs interface */
@@ -425,68 +290,6 @@ static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
 		device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]);
 }
 
-static void ocrdma_add_default_sgid(struct ocrdma_dev *dev)
-{
-	/* GID Index 0 - Invariant manufacturer-assigned EUI-64 */
-	union ib_gid *sgid = &dev->sgid_tbl[0];
-
-	sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
-	ocrdma_get_guid(dev, &sgid->raw[8]);
-}
-
-static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev,
-				  struct net_device *net)
-{
-	struct in_device *in_dev;
-	union ib_gid gid;
-	in_dev = in_dev_get(net);
-	if (in_dev) {
-		for_ifa(in_dev) {
-			ipv6_addr_set_v4mapped(ifa->ifa_address,
-					       (struct in6_addr *)&gid);
-			ocrdma_add_sgid(dev, &gid);
-		}
-		endfor_ifa(in_dev);
-		in_dev_put(in_dev);
-	}
-}
-
-static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev,
-				  struct net_device *net)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	struct inet6_dev *in6_dev;
-	union ib_gid  *pgid;
-	struct inet6_ifaddr *ifp;
-	in6_dev = in6_dev_get(net);
-	if (in6_dev) {
-		read_lock_bh(&in6_dev->lock);
-		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
-			pgid = (union ib_gid *)&ifp->addr;
-			ocrdma_add_sgid(dev, pgid);
-		}
-		read_unlock_bh(&in6_dev->lock);
-		in6_dev_put(in6_dev);
-	}
-#endif
-}
-
-static void ocrdma_init_gid_table(struct ocrdma_dev *dev)
-{
-	struct  net_device *net_dev;
-
-	for_each_netdev(&init_net, net_dev) {
-		struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ?
-				rdma_vlan_dev_real_dev(net_dev) : net_dev;
-
-		if (real_dev == dev->nic_info.netdev) {
-			ocrdma_add_default_sgid(dev);
-			ocrdma_init_ipv4_gids(dev, net_dev);
-			ocrdma_init_ipv6_gids(dev, net_dev);
-		}
-	}
-}
-
 static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
 {
 	int status = 0, i;
@@ -515,7 +318,6 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
 		goto alloc_err;
 
 	ocrdma_init_service_level(dev);
-	ocrdma_init_gid_table(dev);
 	status = ocrdma_register_device(dev);
 	if (status)
 		goto alloc_err;
@@ -662,34 +464,12 @@ static struct ocrdma_driver ocrdma_drv = {
 	.be_abi_version		= OCRDMA_BE_ROCE_ABI_VERSION,
 };
 
-static void ocrdma_unregister_inet6addr_notifier(void)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier);
-#endif
-}
-
-static void ocrdma_unregister_inetaddr_notifier(void)
-{
-	unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier);
-}
-
 static int __init ocrdma_init_module(void)
 {
 	int status;
 
 	ocrdma_init_debugfs();
 
-	status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier);
-	if (status)
-		return status;
-
-#if IS_ENABLED(CONFIG_IPV6)
-	status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier);
-	if (status)
-		goto err_notifier6;
-#endif
-
 	status = be_roce_register_driver(&ocrdma_drv);
 	if (status)
 		goto err_be_reg;
@@ -697,19 +477,13 @@ static int __init ocrdma_init_module(void)
 	return 0;
 
 err_be_reg:
-#if IS_ENABLED(CONFIG_IPV6)
-	ocrdma_unregister_inet6addr_notifier();
-err_notifier6:
-#endif
-	ocrdma_unregister_inetaddr_notifier();
+
 	return status;
 }
 
 static void __exit ocrdma_exit_module(void)
 {
 	be_roce_unregister_driver(&ocrdma_drv);
-	ocrdma_unregister_inet6addr_notifier();
-	ocrdma_unregister_inetaddr_notifier();
 	ocrdma_rem_debugfs();
 	idr_destroy(&ocrdma_dev_id);
 }
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
index 80006b2..6a38268 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
@@ -140,6 +140,8 @@ enum {
 	OCRDMA_DB_RQ_SHIFT		= 24
 };
 
+#define OCRDMA_ROUDP_FLAGS_SHIFT	0x03
+
 #define OCRDMA_DB_CQ_RING_ID_MASK       0x3FF	/* bits 0 - 9 */
 #define OCRDMA_DB_CQ_RING_ID_EXT_MASK  0x0C00	/* bits 10-11 of qid at 12-11 */
 /* qid #2 msbits at 12-11 */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 8e5fb44..1f3affb 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -46,6 +46,7 @@
 #include <rdma/iw_cm.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include "ocrdma.h"
 #include "ocrdma_hw.h"
@@ -64,6 +65,7 @@ int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
 int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
 		     int index, union ib_gid *sgid)
 {
+	int ret;
 	struct ocrdma_dev *dev;
 
 	dev = get_ocrdma_dev(ibdev);
@@ -71,8 +73,28 @@ int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
 	if (index >= OCRDMA_MAX_SGID)
 		return -EINVAL;
 
-	memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid));
+	ret = ib_get_cached_gid(ibdev, port, index, sgid);
+	if (ret == -EAGAIN) {
+		memcpy(sgid, &zgid, sizeof(*sgid));
+		return 0;
+	}
+
+	return ret;
+}
 
+int ocrdma_add_gid(struct ib_device *device,
+		   u8 port_num,
+		   unsigned int index,
+		   const union ib_gid *gid,
+		   const struct ib_gid_attr *attr,
+		   void **context) {
+	return  0;
+}
+
+int  ocrdma_del_gid(struct ib_device *device,
+		    u8 port_num,
+		    unsigned int index,
+		    void **context) {
 	return 0;
 }
 
@@ -125,6 +147,24 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr,
 	return 0;
 }
 
+struct net_device *ocrdma_get_netdev(struct ib_device *ibdev, u8 port_num)
+{
+	struct ocrdma_dev *dev;
+	struct net_device *ndev = NULL;
+
+	rcu_read_lock();
+
+	dev = get_ocrdma_dev(ibdev);
+	if (dev)
+		ndev = dev->nic_info.netdev;
+	if (ndev)
+		dev_hold(ndev);
+
+	rcu_read_unlock();
+
+	return ndev;
+}
+
 static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
 					    u8 *ib_speed, u8 *ib_width)
 {
@@ -194,7 +234,8 @@ int ocrdma_query_port(struct ib_device *ibdev,
 	props->port_cap_flags =
 	    IB_PORT_CM_SUP |
 	    IB_PORT_REINIT_SUP |
-	    IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS;
+	    IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP |
+	    IB_PORT_IP_BASED_GIDS;
 	props->gid_tbl_len = OCRDMA_MAX_SGID;
 	props->pkey_tbl_len = 1;
 	props->bad_pkey_cntr = 0;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index 68e026b..308c168 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -63,6 +63,17 @@ ocrdma_query_protocol(struct ib_device *device, u8 port_num);
 void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
 int ocrdma_query_gid(struct ib_device *, u8 port,
 		     int index, union ib_gid *gid);
+struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num);
+int ocrdma_add_gid(struct ib_device *device,
+		   u8 port_num,
+		   unsigned int index,
+		   const union ib_gid *gid,
+		   const struct ib_gid_attr *attr,
+		   void **context);
+int  ocrdma_del_gid(struct ib_device *device,
+		    u8 port_num,
+		    unsigned int index,
+		    void **context);
 int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 
 struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
-- 
cgit v0.10.2


From b8071ad893841aba967b7c54e712179864cdf5c3 Mon Sep 17 00:00:00 2001
From: Doug Ledford <dledford@redhat.com>
Date: Sat, 15 Aug 2015 10:16:14 -0400
Subject: IB/core: Remove needless bracketization

Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7a4ee79..1763911 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -324,6 +324,7 @@ int ib_register_device(struct ib_device *device,
 					    u8, struct kobject *))
 {
 	int ret;
+	struct ib_client *client;
 
 	mutex_lock(&device_mutex);
 
@@ -361,13 +362,9 @@ int ib_register_device(struct ib_device *device,
 
 	device->reg_state = IB_DEV_REGISTERED;
 
-	{
-		struct ib_client *client;
-
-		list_for_each_entry(client, &client_list, list)
-			if (client->add && !add_client_context(device, client))
-				client->add(device);
-	}
+	list_for_each_entry(client, &client_list, list)
+		if (client->add && !add_client_context(device, client))
+			client->add(device);
 
 	down_write(&lists_rwsem);
 	list_add_tail(&device->core_list, &device_list);
-- 
cgit v0.10.2


From 90c1d8b6350cca9d8a234f03c77a317a7613bcee Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 30 Jul 2015 17:34:21 +0300
Subject: IB/mlx4: Fix potential deadlock when sending mad to wire

send_mad_to_wire takes the same spinlock that is taken in
the interrupt context.  Therefore, it needs irqsave/restore.

Fixes: b9c5d6a64358 ('IB/mlx4: Add multicast group (MCG) paravirtualization for SR-IOV')
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index ed327e6..a0559a8 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -206,15 +206,16 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
 {
 	struct mlx4_ib_dev *dev = ctx->dev;
 	struct ib_ah_attr	ah_attr;
+	unsigned long flags;
 
-	spin_lock(&dev->sm_lock);
+	spin_lock_irqsave(&dev->sm_lock, flags);
 	if (!dev->sm_ah[ctx->port - 1]) {
 		/* port is not yet Active, sm_ah not ready */
-		spin_unlock(&dev->sm_lock);
+		spin_unlock_irqrestore(&dev->sm_lock, flags);
 		return -EAGAIN;
 	}
 	mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
-	spin_unlock(&dev->sm_lock);
+	spin_unlock_irqrestore(&dev->sm_lock, flags);
 	return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
 				    ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
 				    &ah_attr, NULL, mad);
-- 
cgit v0.10.2


From 2cb8e7f86e8cabd1e8aa608fc2a44e5bfa1d81ca Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 30 Jul 2015 17:34:22 +0300
Subject: IB/mlx4: Demote mcg message from warning to debug

The mcg "too many pending requests" warning message fills the log
when OpenSM is downed. Demote the message from  warning level to
debug level.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index a0559a8..2d5bccd 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -51,6 +51,10 @@
 	pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
 	(group)->name, group->demux->port, ## arg)
 
+#define mcg_debug_group(group, format, arg...) \
+	pr_debug("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
+		 (group)->name, (group)->demux->port, ## arg)
+
 #define mcg_error_group(group, format, arg...) \
 	pr_err("  %16s: " format, (group)->name, ## arg)
 
@@ -962,8 +966,8 @@ int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
 		mutex_lock(&group->lock);
 		if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
 			mutex_unlock(&group->lock);
-			mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
-				       port, slave, MAX_PEND_REQS_PER_FUNC);
+			mcg_debug_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
+					port, slave, MAX_PEND_REQS_PER_FUNC);
 			release_group(group, 0);
 			kfree(req);
 			return -ENOMEM;
-- 
cgit v0.10.2


From 2b135db3e81301d0452e6aa107349abe67b097d6 Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 30 Jul 2015 17:34:23 +0300
Subject: IB/mlx4: Forbid using sysfs to change RoCE pkeys

The pkey mapping for RoCE must remain the default mapping:
VFs:
  virtual index 0 = mapped to real index 0 (0xFFFF)
  All others indices: mapped to a real pkey index containing an
                      invalid pkey.
PF:
  virtual index i = real index i.

Don't allow users to change these mappings using files found in
sysfs.

Fixes: c1e7e466120b ('IB/mlx4: Add iov directory in sysfs under the ib device')
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
index 6797108..69fb5ba 100644
--- a/drivers/infiniband/hw/mlx4/sysfs.c
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -640,6 +640,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
 	struct mlx4_port *p;
 	int i;
 	int ret;
+	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) ==
+			IB_LINK_LAYER_ETHERNET;
 
 	p = kzalloc(sizeof *p, GFP_KERNEL);
 	if (!p)
@@ -657,7 +659,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
 
 	p->pkey_group.name  = "pkey_idx";
 	p->pkey_group.attrs =
-		alloc_group_attrs(show_port_pkey, store_port_pkey,
+		alloc_group_attrs(show_port_pkey,
+				  is_eth ? NULL : store_port_pkey,
 				  dev->dev->caps.pkey_table_len[port_num]);
 	if (!p->pkey_group.attrs) {
 		ret = -ENOMEM;
-- 
cgit v0.10.2


From 5e99b139f1b68acd65e36515ca347b03856dfb5a Mon Sep 17 00:00:00 2001
From: Noa Osherovich <noaos@mellanox.com>
Date: Thu, 30 Jul 2015 17:34:24 +0300
Subject: IB/mlx4: Use correct SL on AH query under RoCE

The mlx4 IB driver implementation for ib_query_ah used a wrong offset
(28 instead of 29) when link type is Ethernet. Fixed to use the correct one.

Fixes: fa417f7b520e ('IB/mlx4: Add support for IBoE')
Signed-off-by: Shani Michaeli <shanim@mellanox.com>
Signed-off-by: Noa Osherovich <noaos@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index 7ad6f96..1688a17 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -148,9 +148,13 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
 	enum rdma_link_layer ll;
 
 	memset(ah_attr, 0, sizeof *ah_attr);
-	ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
 	ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
 	ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
+	if (ll == IB_LINK_LAYER_ETHERNET)
+		ah_attr->sl = be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29;
+	else
+		ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+
 	ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
 	if (ah->av.ib.stat_rate)
 		ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
-- 
cgit v0.10.2


From 799cdaf8a98f13d4fba3162e21e1e63f21045010 Mon Sep 17 00:00:00 2001
From: Ariel Nahum <arieln@mellanox.com>
Date: Sun, 9 Aug 2015 11:16:27 +0300
Subject: IB/mlx4: Fix incorrect cq flushing in error state

When handling a device internal error, the driver is responsible to
drain the completion queue with flush errors.

In case a completion queue was assigned to multiple send queues, the
driver iterates over the send queues and generates flush errors of
inflight wqes. The driver must correctly pass the wc array with an
offset as a result of the previous send queue iteration. Not doing so
will overwrite previously set completions and return a wrong number
of polled completions which includes ones which were not correctly set.

Fixes: 35f05dabf95a (IB/mlx4: Reset flow support for IB kernel ULPs)
Signed-off-by: Ariel Nahum <arieln@mellanox.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Cc: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 36eb3d0..2f42595 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -638,7 +638,7 @@ static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries,
 	 * simulated FLUSH_ERR completions
 	 */
 	list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) {
-		mlx4_ib_qp_sw_comp(qp, num_entries, wc, npolled, 1);
+		mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 1);
 		if (*npolled >= num_entries)
 			goto out;
 	}
-- 
cgit v0.10.2


From e6300cbd9b09e7bf12f5f7b79e77b58d62b9d990 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 31 Jul 2015 14:12:48 -0700
Subject: IB/srp: Constify a function argument

This patch does not change any functionality.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 4326560..a0b92a1 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -2172,7 +2172,7 @@ static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask)
 }
 
 static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
-			       struct srp_login_rsp *lrsp,
+			       const struct srp_login_rsp *lrsp,
 			       struct srp_rdma_ch *ch)
 {
 	struct srp_target_port *target = ch->target;
-- 
cgit v0.10.2


From c257ea6f9f9aed0b173e0c2932bb8dac5612cdc6 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 31 Jul 2015 14:13:22 -0700
Subject: IB/srp: Handle partial connection success correctly

Avoid that the following kernel warning is reported if the SRP
target system accepts fewer channels per connection than what
was requested by the initiator system:

WARNING: at drivers/infiniband/ulp/srp/ib_srp.c:617 srp_destroy_qp+0xb1/0x120 [ib_srp]()
Call Trace:
[<ffffffff8105d67f>] warn_slowpath_common+0x7f/0xc0
[<ffffffff8105d6da>] warn_slowpath_null+0x1a/0x20
[<ffffffffa05419e1>] srp_destroy_qp+0xb1/0x120 [ib_srp]
[<ffffffffa05445fb>] srp_create_ch_ib+0x19b/0x420 [ib_srp]
[<ffffffffa0545257>] srp_create_target+0x7d7/0xa94 [ib_srp]
[<ffffffff8138dac0>] dev_attr_store+0x20/0x30
[<ffffffff812079ef>] sysfs_write_file+0xef/0x170
[<ffffffff81191fc4>] vfs_write+0xb4/0x130
[<ffffffff8119276f>] sys_write+0x5f/0xa0
[<ffffffff815a0a59>] system_call_fastpath+0x16/0x1b

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: stable@vger.kernel.org
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index a0b92a1..ae13c96 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3263,7 +3263,7 @@ static ssize_t srp_create_target(struct device *dev,
 					srp_free_ch_ib(target, ch);
 					srp_free_req_data(target, ch);
 					target->ch_count = ch - target->ch;
-					break;
+					goto connected;
 				}
 			}
 
@@ -3273,6 +3273,7 @@ static ssize_t srp_create_target(struct device *dev,
 		node_idx++;
 	}
 
+connected:
 	target->scsi_host->nr_hw_queues = target->ch_count;
 
 	ret = srp_add_target(host, target);
-- 
cgit v0.10.2


From 713ef24e41757561c7f0bfc9bf4436f7e4a5b527 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 31 Jul 2015 14:13:52 -0700
Subject: IB/srp: Bump driver version and release date

Since version 1.0 e.g. scsi-mq has been added. Since this is
a significant change, bump the driver version and release date.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index ae13c96..0a18e03 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -55,8 +55,8 @@
 
 #define DRV_NAME	"ib_srp"
 #define PFX		DRV_NAME ": "
-#define DRV_VERSION	"1.0"
-#define DRV_RELDATE	"July 1, 2013"
+#define DRV_VERSION	"2.0"
+#define DRV_RELDATE	"July 26, 2015"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator");
-- 
cgit v0.10.2


From bc44bd1d864664f3658352c6aaaa02557d49165d Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 14 Aug 2015 11:01:09 -0700
Subject: IB/srp: Stop the scsi_eh_<n> and scsi_tmf_<n> threads if login fails

scsi_host_alloc() not only allocates memory for a SCSI host but also
creates the scsi_eh_<n> kernel thread and the scsi_tmf_<n> workqueue.
Stop these threads if login fails by calling scsi_host_put().

Reported-by: Konstantin Krotov <kkv@clodo.ru>
Fixes: fb49c8bbaae7 ("Remove an extraneous scsi_host_put() from an error path")
Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
Cc: <stable@vger.kernel.org> #v3.19
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 0a18e03..c9c4f64 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -2758,6 +2758,13 @@ static int srp_sdev_count(struct Scsi_Host *host)
 	return c;
 }
 
+/*
+ * Return values:
+ * < 0 upon failure. Caller is responsible for SRP target port cleanup.
+ * 0 and target->state == SRP_TARGET_REMOVED if asynchronous target port
+ *    removal has been scheduled.
+ * 0 and target->state != SRP_TARGET_REMOVED upon success.
+ */
 static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
 {
 	struct srp_rport_identifiers ids;
@@ -3296,6 +3303,8 @@ out:
 	mutex_unlock(&host->add_target_mutex);
 
 	scsi_host_put(target->scsi_host);
+	if (ret < 0)
+		scsi_host_put(target->scsi_host);
 
 	return ret;
 
-- 
cgit v0.10.2


From 6431eb87065ffd24dfc7c0b6954e80a4eb74e177 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Fri, 14 Aug 2015 08:52:06 -0400
Subject: IB/netlink: Add defines for local service requests through netlink

This patch adds netlink defines for local service client, local service
group, local service operations, and related attributes.

Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: John Fleck <john.fleck@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 6e4bb42..c19a5dc 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -7,12 +7,14 @@ enum {
 	RDMA_NL_RDMA_CM = 1,
 	RDMA_NL_NES,
 	RDMA_NL_C4IW,
+	RDMA_NL_LS,	/* RDMA Local Services */
 	RDMA_NL_NUM_CLIENTS
 };
 
 enum {
 	RDMA_NL_GROUP_CM = 1,
 	RDMA_NL_GROUP_IWPM,
+	RDMA_NL_GROUP_LS,
 	RDMA_NL_NUM_GROUPS
 };
 
@@ -128,5 +130,85 @@ enum {
 	IWPM_NLA_ERR_MAX
 };
 
+/*
+ * Local service operations:
+ *   RESOLVE - The client requests the local service to resolve a path.
+ *   SET_TIMEOUT - The local service requests the client to set the timeout.
+ */
+enum {
+	RDMA_NL_LS_OP_RESOLVE = 0,
+	RDMA_NL_LS_OP_SET_TIMEOUT,
+	RDMA_NL_LS_NUM_OPS
+};
+
+/* Local service netlink message flags */
+#define RDMA_NL_LS_F_ERR	0x0100	/* Failed response */
+
+/*
+ * Local service resolve operation family header.
+ * The layout for the resolve operation:
+ *    nlmsg header
+ *    family header
+ *    attributes
+ */
+
+/*
+ * Local service path use:
+ * Specify how the path(s) will be used.
+ *   ALL - For connected CM operation (6 pathrecords)
+ *   UNIDIRECTIONAL - For unidirectional UD (1 pathrecord)
+ *   GMP - For miscellaneous GMP like operation (at least 1 reversible
+ *         pathrecord)
+ */
+enum {
+	LS_RESOLVE_PATH_USE_ALL = 0,
+	LS_RESOLVE_PATH_USE_UNIDIRECTIONAL,
+	LS_RESOLVE_PATH_USE_GMP,
+	LS_RESOLVE_PATH_USE_MAX
+};
+
+#define LS_DEVICE_NAME_MAX 64
+
+struct rdma_ls_resolve_header {
+	__u8 device_name[LS_DEVICE_NAME_MAX];
+	__u8 port_num;
+	__u8 path_use;
+};
+
+/* Local service attribute type */
+#define RDMA_NLA_F_MANDATORY	(1 << 13)
+#define RDMA_NLA_TYPE_MASK	(~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \
+				  RDMA_NLA_F_MANDATORY))
+
+/*
+ * Local service attributes:
+ *   Attr Name       Size                       Byte order
+ *   -----------------------------------------------------
+ *   PATH_RECORD     struct ib_path_rec_data
+ *   TIMEOUT         u32                        cpu
+ *   SERVICE_ID      u64                        cpu
+ *   DGID            u8[16]                     BE
+ *   SGID            u8[16]                     BE
+ *   TCLASS          u8
+ *   PKEY            u16                        cpu
+ *   QOS_CLASS       u16                        cpu
+ */
+enum {
+	LS_NLA_TYPE_UNSPEC = 0,
+	LS_NLA_TYPE_PATH_RECORD,
+	LS_NLA_TYPE_TIMEOUT,
+	LS_NLA_TYPE_SERVICE_ID,
+	LS_NLA_TYPE_DGID,
+	LS_NLA_TYPE_SGID,
+	LS_NLA_TYPE_TCLASS,
+	LS_NLA_TYPE_PKEY,
+	LS_NLA_TYPE_QOS_CLASS,
+	LS_NLA_TYPE_MAX
+};
+
+/* Local service DGID/SGID attribute: big endian */
+struct rdma_nla_ls_gid {
+	__u8		gid[16];
+};
 
 #endif /* _UAPI_RDMA_NETLINK_H */
-- 
cgit v0.10.2


From bc10ed7d3d19ff61427007b4d7bf98d3e57bb333 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Fri, 14 Aug 2015 08:52:07 -0400
Subject: IB/core: Add rdma netlink helper functions

This patch adds a function to check if listeners for a netlink multicast
group are present. It also adds a function to receive netlink response
messages.

Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: John Fleck <john.fleck@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 23dd5a5..d47df93 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);
 static struct sock *nls;
 static LIST_HEAD(client_list);
 
+int ibnl_chk_listeners(unsigned int group)
+{
+	if (netlink_has_listeners(nls, group) == 0)
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL(ibnl_chk_listeners);
+
 int ibnl_add_client(int index, int nops,
 		    const struct ibnl_client_cbs cb_table[])
 {
@@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			    !client->cb_table[op].dump)
 				return -EINVAL;
 
+			/*
+			 * For response or local service set_timeout request,
+			 * there is no need to use netlink_dump_start.
+			 */
+			if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+			    (index == RDMA_NL_LS &&
+			     op == RDMA_NL_LS_OP_SET_TIMEOUT)) {
+				struct netlink_callback cb = {
+					.skb = skb,
+					.nlh = nlh,
+					.dump = client->cb_table[op].dump,
+					.module = client->cb_table[op].module,
+				};
+
+				return cb.dump(skb, &cb);
+			}
+
 			{
 				struct netlink_dump_control c = {
 					.dump = client->cb_table[op].dump,
@@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	return -EINVAL;
 }
 
+static void ibnl_rcv_reply_skb(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh;
+	int msglen;
+
+	/*
+	 * Process responses until there is no more message or the first
+	 * request. Generally speaking, it is not recommended to mix responses
+	 * with requests.
+	 */
+	while (skb->len >= nlmsg_total_size(0)) {
+		nlh = nlmsg_hdr(skb);
+
+		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+			return;
+
+		/* Handle response only */
+		if (nlh->nlmsg_flags & NLM_F_REQUEST)
+			return;
+
+		ibnl_rcv_msg(skb, nlh);
+
+		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (msglen > skb->len)
+			msglen = skb->len;
+		skb_pull(skb, msglen);
+	}
+}
+
 static void ibnl_rcv(struct sk_buff *skb)
 {
 	mutex_lock(&ibnl_mutex);
+	ibnl_rcv_reply_skb(skb);
 	netlink_rcv_skb(skb, &ibnl_rcv_msg);
 	mutex_unlock(&ibnl_mutex);
 }
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index 0790882..5852661 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -77,4 +77,11 @@ int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
 int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh,
 			unsigned int group, gfp_t flags);
 
+/**
+ * Check if there are any listeners to the netlink group
+ * @group: the netlink group ID
+ * Returns 0 on success or a negative for no listeners.
+ */
+int ibnl_chk_listeners(unsigned int group);
+
 #endif /* _RDMA_NETLINK_H */
-- 
cgit v0.10.2


From 5d2657708ec25b9fb3dd174443b1f647babcbe62 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Fri, 14 Aug 2015 08:52:08 -0400
Subject: IB/sa: Allocate SA query with kzalloc

Replace kmalloc with kzalloc so that all uninitialized fields in SA query
will be zero-ed out to avoid unintentional consequence. This prepares the
SA query structure to accept new fields in the future.

Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: John Fleck <john.fleck@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index d40be36..968c66f 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -740,7 +740,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -862,7 +862,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
 	    method != IB_SA_METHOD_DELETE)
 		return -EINVAL;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -954,7 +954,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -1051,7 +1051,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
-- 
cgit v0.10.2


From 2ca546b92a024d07adedd15b4c262b1c2c0786ec Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Fri, 14 Aug 2015 08:52:09 -0400
Subject: IB/sa: Route SA pathrecord query through netlink

This patch routes a SA pathrecord query to netlink first and processes the
response appropriately. If a failure is returned, the request will be sent
through IB. The decision whether to route the request to netlink first is
determined by the presence of a listener for the local service netlink
multicast group. If the user-space local service netlink multicast group
listener is not present, the request will be sent through IB, just like
what is currently being done.

Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: John Fleck <john.fleck@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 968c66f..edcf568 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,21 @@
 #include <uapi/linux/if_ether.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
 #include "sa.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand subnet administration query support");
 MODULE_LICENSE("Dual BSD/GPL");
 
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX		200000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
 struct ib_sa_sm_ah {
 	struct ib_ah        *ah;
 	struct kref          ref;
@@ -80,8 +89,16 @@ struct ib_sa_query {
 	struct ib_mad_send_buf *mad_buf;
 	struct ib_sa_sm_ah     *sm_ah;
 	int			id;
+	u32			flags;
+	struct list_head	list; /* Local svc request list */
+	u32			seq; /* Local svc request sequence number */
+	unsigned long		timeout; /* Local svc timeout */
+	u8			path_use; /* How will the pathrecord be used */
 };
 
+#define IB_SA_ENABLE_LOCAL_SERVICE	0x00000001
+#define IB_SA_CANCEL			0x00000002
+
 struct ib_sa_service_query {
 	void (*callback)(int, struct ib_sa_service_rec *, void *);
 	void *context;
@@ -106,6 +123,26 @@ struct ib_sa_mcmember_query {
 	struct ib_sa_query sa_query;
 };
 
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
+	[LS_NLA_TYPE_PATH_RECORD]	= {.type = NLA_BINARY,
+		.len = sizeof(struct ib_path_rec_data)},
+	[LS_NLA_TYPE_TIMEOUT]		= {.type = NLA_U32},
+	[LS_NLA_TYPE_SERVICE_ID]	= {.type = NLA_U64},
+	[LS_NLA_TYPE_DGID]		= {.type = NLA_BINARY,
+		.len = sizeof(struct rdma_nla_ls_gid)},
+	[LS_NLA_TYPE_SGID]		= {.type = NLA_BINARY,
+		.len = sizeof(struct rdma_nla_ls_gid)},
+	[LS_NLA_TYPE_TCLASS]		= {.type = NLA_U8},
+	[LS_NLA_TYPE_PKEY]		= {.type = NLA_U16},
+	[LS_NLA_TYPE_QOS_CLASS]		= {.type = NLA_U16},
+};
+
+
 static void ib_sa_add_one(struct ib_device *device);
 static void ib_sa_remove_one(struct ib_device *device, void *client_data);
 
@@ -381,6 +418,427 @@ static const struct ib_field guidinfo_rec_table[] = {
 	  .size_bits    = 512 },
 };
 
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+	query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
+static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
+{
+	return (query->flags & IB_SA_CANCEL);
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+				     struct ib_sa_query *query)
+{
+	struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1];
+	struct ib_sa_mad *mad = query->mad_buf->mad;
+	ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
+	u16 val16;
+	u64 val64;
+	struct rdma_ls_resolve_header *header;
+
+	query->mad_buf->context[1] = NULL;
+
+	/* Construct the family header first */
+	header = (struct rdma_ls_resolve_header *)
+		skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+	memcpy(header->device_name, query->port->agent->device->name,
+	       LS_DEVICE_NAME_MAX);
+	header->port_num = query->port->port_num;
+
+	if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+	    sa_rec->reversible != 0)
+		query->path_use = LS_RESOLVE_PATH_USE_GMP;
+	else
+		query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
+	header->path_use = query->path_use;
+
+	/* Now build the attributes */
+	if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
+		val64 = be64_to_cpu(sa_rec->service_id);
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
+			sizeof(val64), &val64);
+	}
+	if (comp_mask & IB_SA_PATH_REC_DGID)
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
+			sizeof(sa_rec->dgid), &sa_rec->dgid);
+	if (comp_mask & IB_SA_PATH_REC_SGID)
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
+			sizeof(sa_rec->sgid), &sa_rec->sgid);
+	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
+			sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
+
+	if (comp_mask & IB_SA_PATH_REC_PKEY) {
+		val16 = be16_to_cpu(sa_rec->pkey);
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
+			sizeof(val16), &val16);
+	}
+	if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
+		val16 = be16_to_cpu(sa_rec->qos_class);
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
+			sizeof(val16), &val16);
+	}
+}
+
+static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
+{
+	int len = 0;
+
+	if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+		len += nla_total_size(sizeof(u64));
+	if (comp_mask & IB_SA_PATH_REC_DGID)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+	if (comp_mask & IB_SA_PATH_REC_SGID)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+		len += nla_total_size(sizeof(u8));
+	if (comp_mask & IB_SA_PATH_REC_PKEY)
+		len += nla_total_size(sizeof(u16));
+	if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+		len += nla_total_size(sizeof(u16));
+
+	/*
+	 * Make sure that at least some of the required comp_mask bits are
+	 * set.
+	 */
+	if (WARN_ON(len == 0))
+		return len;
+
+	/* Add the family header */
+	len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
+
+	return len;
+}
+
+static int ib_nl_send_msg(struct ib_sa_query *query)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	void *data;
+	int ret = 0;
+	struct ib_sa_mad *mad;
+	int len;
+
+	mad = query->mad_buf->mad;
+	len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
+	if (len <= 0)
+		return -EMSGSIZE;
+
+	skb = nlmsg_new(len, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	/* Put nlmsg header only for now */
+	data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
+			    RDMA_NL_LS_OP_RESOLVE, (int) GFP_KERNEL);
+	if (!data) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	/* Add attributes */
+	ib_nl_set_path_rec_attrs(skb, query);
+
+	/* Repair the nlmsg header length */
+	nlmsg_end(skb, nlh);
+
+	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+	if (!ret)
+		ret = len;
+	else
+		ret = 0;
+
+	return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query)
+{
+	unsigned long flags;
+	unsigned long delay;
+	int ret;
+
+	INIT_LIST_HEAD(&query->list);
+	query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	ret = ib_nl_send_msg(query);
+	if (ret <= 0) {
+		ret = -EIO;
+		goto request_out;
+	} else {
+		ret = 0;
+	}
+
+	delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+	query->timeout = delay + jiffies;
+	list_add_tail(&query->list, &ib_nl_request_list);
+	/* Start the timeout if this is the only request */
+	if (ib_nl_request_list.next == &query->list)
+		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+request_out:
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return ret;
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_sa_query *wait_query;
+	int found = 0;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(wait_query, &ib_nl_request_list, list) {
+		/* Let the timeout to take care of the callback */
+		if (query == wait_query) {
+			query->flags |= IB_SA_CANCEL;
+			query->timeout = jiffies;
+			list_move(&query->list, &ib_nl_request_list);
+			found = 1;
+			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+					   const struct nlmsghdr *nlh)
+{
+	struct ib_mad_send_wc mad_send_wc;
+	struct ib_sa_mad *mad = NULL;
+	const struct nlattr *head, *curr;
+	struct ib_path_rec_data  *rec;
+	int len, rem;
+	u32 mask = 0;
+	int status = -EIO;
+
+	if (query->callback) {
+		head = (const struct nlattr *) nlmsg_data(nlh);
+		len = nlmsg_len(nlh);
+		switch (query->path_use) {
+		case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+			mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+			break;
+
+		case LS_RESOLVE_PATH_USE_ALL:
+		case LS_RESOLVE_PATH_USE_GMP:
+		default:
+			mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+				IB_PATH_BIDIRECTIONAL;
+			break;
+		}
+		nla_for_each_attr(curr, head, len, rem) {
+			if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+				rec = nla_data(curr);
+				/*
+				 * Get the first one. In the future, we may
+				 * need to get up to 6 pathrecords.
+				 */
+				if ((rec->flags & mask) == mask) {
+					mad = query->mad_buf->mad;
+					mad->mad_hdr.method |=
+						IB_MGMT_METHOD_RESP;
+					memcpy(mad->data, rec->path_rec,
+					       sizeof(rec->path_rec));
+					status = 0;
+					break;
+				}
+			}
+		}
+		query->callback(query, status, mad);
+	}
+
+	mad_send_wc.send_buf = query->mad_buf;
+	mad_send_wc.status = IB_WC_SUCCESS;
+	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+	unsigned long flags;
+	struct ib_sa_query *query;
+	unsigned long delay;
+	struct ib_mad_send_wc mad_send_wc;
+	int ret;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	while (!list_empty(&ib_nl_request_list)) {
+		query = list_entry(ib_nl_request_list.next,
+				   struct ib_sa_query, list);
+
+		if (time_after(query->timeout, jiffies)) {
+			delay = query->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+			break;
+		}
+
+		list_del(&query->list);
+		ib_sa_disable_local_svc(query);
+		/* Hold the lock to protect against query cancellation */
+		if (ib_sa_query_cancelled(query))
+			ret = -1;
+		else
+			ret = ib_post_send_mad(query->mad_buf, NULL);
+		if (ret) {
+			mad_send_wc.send_buf = query->mad_buf;
+			mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+			spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+			send_handler(query->port->agent, &mad_send_wc);
+			spin_lock_irqsave(&ib_nl_request_lock, flags);
+		}
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+static int ib_nl_handle_set_timeout(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+	int timeout, delta, abs_delta;
+	const struct nlattr *attr;
+	unsigned long flags;
+	struct ib_sa_query *query;
+	long delay = 0;
+	struct nlattr *tb[LS_NLA_TYPE_MAX];
+	int ret;
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+			nlmsg_len(nlh), ib_nl_policy);
+	attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
+	if (ret || !attr)
+		goto settimeout_out;
+
+	timeout = *(int *) nla_data(attr);
+	if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+	if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+	delta = timeout - sa_local_svc_timeout_ms;
+	if (delta < 0)
+		abs_delta = -delta;
+	else
+		abs_delta = delta;
+
+	if (delta != 0) {
+		spin_lock_irqsave(&ib_nl_request_lock, flags);
+		sa_local_svc_timeout_ms = timeout;
+		list_for_each_entry(query, &ib_nl_request_list, list) {
+			if (delta < 0 && abs_delta > query->timeout)
+				query->timeout = 0;
+			else
+				query->timeout += delta;
+
+			/* Get the new delay from the first entry */
+			if (!delay) {
+				delay = query->timeout - jiffies;
+				if (delay <= 0)
+					delay = 1;
+			}
+		}
+		if (delay)
+			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+					 (unsigned long)delay);
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+	}
+
+settimeout_out:
+	return skb->len;
+}
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+	struct nlattr *tb[LS_NLA_TYPE_MAX];
+	int ret;
+
+	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+		return 0;
+
+	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+			nlmsg_len(nlh), ib_nl_policy);
+	if (ret)
+		return 0;
+
+	return 1;
+}
+
+static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+	unsigned long flags;
+	struct ib_sa_query *query;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_send_wc mad_send_wc;
+	int found = 0;
+	int ret;
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(query, &ib_nl_request_list, list) {
+		/*
+		 * If the query is cancelled, let the timeout routine
+		 * take care of it.
+		 */
+		if (nlh->nlmsg_seq == query->seq) {
+			found = !ib_sa_query_cancelled(query);
+			if (found)
+				list_del(&query->list);
+			break;
+		}
+	}
+
+	if (!found) {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		goto resp_out;
+	}
+
+	send_buf = query->mad_buf;
+
+	if (!ib_nl_is_good_resolve_resp(nlh)) {
+		/* if the result is a failure, send out the packet via IB */
+		ib_sa_disable_local_svc(query);
+		ret = ib_post_send_mad(query->mad_buf, NULL);
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		if (ret) {
+			mad_send_wc.send_buf = send_buf;
+			mad_send_wc.status = IB_WC_GENERAL_ERR;
+			send_handler(query->port->agent, &mad_send_wc);
+		}
+	} else {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		ib_nl_process_good_resolve_rsp(query, nlh);
+	}
+
+resp_out:
+	return skb->len;
+}
+
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+	[RDMA_NL_LS_OP_RESOLVE] = {
+		.dump = ib_nl_handle_resolve_resp,
+		.module = THIS_MODULE },
+	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
+		.dump = ib_nl_handle_set_timeout,
+		.module = THIS_MODULE },
+};
+
 static void free_sm_ah(struct kref *kref)
 {
 	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +960,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 	mad_buf = query->mad_buf;
 	spin_unlock_irqrestore(&idr_lock, flags);
 
-	ib_cancel_mad(agent, mad_buf);
+	/*
+	 * If the query is still on the netlink request list, schedule
+	 * it to be cancelled by the timeout routine. Otherwise, it has been
+	 * sent to the MAD layer and has to be cancelled from there.
+	 */
+	if (!ib_nl_cancel_request(query))
+		ib_cancel_mad(agent, mad_buf);
 }
 EXPORT_SYMBOL(ib_sa_cancel_query);
 
@@ -639,6 +1103,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
 	query->mad_buf->context[0] = query;
 	query->id = id;
 
+	if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
+		if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
+			if (!ib_nl_make_request(query))
+				return id;
+		}
+		ib_sa_disable_local_svc(query);
+	}
+
 	ret = ib_post_send_mad(query->mad_buf, NULL);
 	if (ret) {
 		spin_lock_irqsave(&idr_lock, flags);
@@ -767,6 +1239,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 
 	*sa_query = &query->sa_query;
 
+	query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+	query->sa_query.mad_buf->context[1] = rec;
+
 	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
 	if (ret < 0)
 		goto err2;
@@ -1251,6 +1726,8 @@ static int __init ib_sa_init(void)
 
 	get_random_bytes(&tid, sizeof tid);
 
+	atomic_set(&ib_nl_sa_request_seq, 0);
+
 	ret = ib_register_client(&sa_client);
 	if (ret) {
 		printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1263,7 +1740,25 @@ static int __init ib_sa_init(void)
 		goto err2;
 	}
 
+	ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+	if (!ib_nl_wq) {
+		ret = -ENOMEM;
+		goto err3;
+	}
+
+	if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
+			    ib_sa_cb_table)) {
+		pr_err("Failed to add netlink callback\n");
+		ret = -EINVAL;
+		goto err4;
+	}
+	INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
 	return 0;
+err4:
+	destroy_workqueue(ib_nl_wq);
+err3:
+	mcast_cleanup();
 err2:
 	ib_unregister_client(&sa_client);
 err1:
@@ -1272,6 +1767,10 @@ err1:
 
 static void __exit ib_sa_cleanup(void)
 {
+	ibnl_remove_client(RDMA_NL_LS);
+	cancel_delayed_work(&ib_nl_timed_work);
+	flush_workqueue(ib_nl_wq);
+	destroy_workqueue(ib_nl_wq);
 	mcast_cleanup();
 	ib_unregister_client(&sa_client);
 	idr_destroy(&query_idr);
-- 
cgit v0.10.2


From db0a6cbd21d6ece6587c157b3d183521bc8a2781 Mon Sep 17 00:00:00 2001
From: Jenny Falkovich <jennyf@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:45 +0300
Subject: IB/iser: Change some module parameters to be RO

While we're at it, use permission defines instead
of octal values and rearrange a little bit.

Signed-off-by: Jenny Derzhavetz <jennyf@mellanox.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index de8730d..cbe013d 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -74,34 +74,33 @@
 
 #include "iscsi_iser.h"
 
+MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
+MODULE_VERSION(DRV_VER);
+
 static struct scsi_host_template iscsi_iser_sht;
 static struct iscsi_transport iscsi_iser_transport;
 static struct scsi_transport_template *iscsi_iser_scsi_transport;
+static struct workqueue_struct *release_wq;
+struct iser_global ig;
+
+int iser_debug_level = 0;
+module_param_named(debug_level, iser_debug_level, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
 
 static unsigned int iscsi_max_lun = 512;
 module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
+MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session (default:512");
 
-int iser_debug_level = 0;
 bool iser_pi_enable = false;
-int iser_pi_guard = 1;
-
-MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
-MODULE_VERSION(DRV_VER);
-
-module_param_named(debug_level, iser_debug_level, int, 0644);
-MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
-
-module_param_named(pi_enable, iser_pi_enable, bool, 0644);
+module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO);
 MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
 
-module_param_named(pi_guard, iser_pi_guard, int, 0644);
+int iser_pi_guard;
+module_param_named(pi_guard, iser_pi_guard, int, S_IRUGO);
 MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
 
-static struct workqueue_struct *release_wq;
-struct iser_global ig;
-
 /*
  * iscsi_iser_recv() - Process a successfull recv completion
  * @conn:         iscsi connection
-- 
cgit v0.10.2


From 74ce897b7c9be9f5913bbffafcac10f0871c503b Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:46 +0300
Subject: IB/iser: Change minor assignments and logging prints

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index cbe013d..eddf39d 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -847,10 +847,9 @@ failure:
 static int
 iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
 {
-	struct iser_conn *iser_conn;
+	struct iser_conn *iser_conn = ep->dd_data;
 	int rc;
 
-	iser_conn = ep->dd_data;
 	rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion,
 						       msecs_to_jiffies(timeout_ms));
 	/* if conn establishment failed, return error code to iscsi */
@@ -862,7 +861,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
 		mutex_unlock(&iser_conn->state_mutex);
 	}
 
-	iser_info("ib conn %p rc = %d\n", iser_conn, rc);
+	iser_info("iser conn %p rc = %d\n", iser_conn, rc);
 
 	if (rc > 0)
 		return 1; /* success, this is the equivalent of POLLOUT */
@@ -884,11 +883,9 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
 static void
 iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
 {
-	struct iser_conn *iser_conn;
+	struct iser_conn *iser_conn = ep->dd_data;
 
-	iser_conn = ep->dd_data;
-	iser_info("ep %p iser conn %p state %d\n",
-		  ep, iser_conn, iser_conn->state);
+	iser_info("ep %p iser conn %p\n", ep, iser_conn);
 
 	mutex_lock(&iser_conn->state_mutex);
 	iser_conn_terminate(iser_conn);
-- 
cgit v0.10.2


From 1156cc80f8fc31f8c52740a9c8051afd8d70faf3 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:47 +0300
Subject: IB/iser: Remove '.' from log message

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index eddf39d..e112cbe 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -905,6 +905,7 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
 		mutex_unlock(&iser_conn->state_mutex);
 		iser_conn_release(iser_conn);
 	}
+
 	iscsi_destroy_endpoint(ep);
 }
 
@@ -1079,7 +1080,7 @@ static void __exit iser_exit(void)
 
 	if (!connlist_empty) {
 		iser_err("Error cleanup stage completed but we still have iser "
-			 "connections, destroying them anyway.\n");
+			 "connections, destroying them anyway\n");
 		list_for_each_entry_safe(iser_conn, n, &ig.connlist,
 					 conn_list) {
 			iser_conn_release(iser_conn);
-- 
cgit v0.10.2


From d16739055bd1f562ae4d83e69f7f7f1cefcfbe16 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:48 +0300
Subject: IB/iser: Fix missing return status check in iser_send_data_out

Since commit "IB/iser: Fix race between iser connection teardown..."
iser_initialize_task_headers() might fail, so we need to check that.

Fixes: 7414dde0a6c3a958e (IB/iser: Fix race between iser connection ...)
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 3e2118e..0a47f42 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -454,7 +454,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
 	unsigned long buf_offset;
 	unsigned long data_seg_len;
 	uint32_t itt;
-	int err = 0;
+	int err;
 	struct ib_sge *tx_dsg;
 
 	itt = (__force uint32_t)hdr->itt;
@@ -475,7 +475,9 @@ int iser_send_data_out(struct iscsi_conn *conn,
 	memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
 
 	/* build the tx desc */
-	iser_initialize_task_headers(task, tx_desc);
+	err = iser_initialize_task_headers(task, tx_desc);
+	if (err)
+		goto send_data_out_error;
 
 	mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT];
 	tx_dsg = &tx_desc->tx_sg[1];
@@ -502,7 +504,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
 
 send_data_out_error:
 	kmem_cache_free(ig.desc_cache, tx_desc);
-	iser_err("conn %p failed err %d\n",conn, err);
+	iser_err("conn %p failed err %d\n", conn, err);
 	return err;
 }
 
-- 
cgit v0.10.2


From 02816a8b8881f9dea68883c9b72672e87cb91d36 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:49 +0300
Subject: IB/iser: Get rid of un-maintained counters

We don't update those anywhere in the code and they
seem pretty useless (no one seem to care about those).

qp_tx_queue_full: We never should get this
fmr_map_not_avail: We can never get to this
eh_abort_cnt: We don't monitor aborts

Go ahead and remove them.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index e112cbe..9cc7319 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -750,15 +750,9 @@ iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *s
 	stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */
 	stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
 	stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
-	stats->custom_length = 4;
-	strcpy(stats->custom[0].desc, "qp_tx_queue_full");
-	stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */
-	strcpy(stats->custom[1].desc, "fmr_map_not_avail");
-	stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */;
-	strcpy(stats->custom[2].desc, "eh_abort_cnt");
-	stats->custom[2].value = conn->eh_abort_cnt;
-	strcpy(stats->custom[3].desc, "fmr_unalign_cnt");
-	stats->custom[3].value = conn->fmr_unalign_cnt;
+	stats->custom_length = 1;
+	strcpy(stats->custom[0].desc, "fmr_unalign_cnt");
+	stats->custom[0].value = conn->fmr_unalign_cnt;
 }
 
 static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
-- 
cgit v0.10.2


From 8d5944d80359e645feb2ebd069a6f4caf7825e40 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:50 +0300
Subject: IB/iser: Fix possible bogus DMA unmapping

If iser_initialize_task_headers() routine failed before
dma mapping, we should not attempt to unmap in cleanup_task().

Fixes: 7414dde0a6c3a958e (IB/iser: Fix race between iser connection ...)
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 9cc7319..169cc3e 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -200,6 +200,7 @@ iser_initialize_task_headers(struct iscsi_task *task,
 		goto out;
 	}
 
+	tx_desc->mapped = true;
 	tx_desc->dma_addr = dma_addr;
 	tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
 	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
@@ -359,16 +360,19 @@ iscsi_iser_task_xmit(struct iscsi_task *task)
 static void iscsi_iser_cleanup_task(struct iscsi_task *task)
 {
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_tx_desc    *tx_desc   = &iser_task->desc;
-	struct iser_conn       *iser_conn	  = task->conn->dd_data;
+	struct iser_tx_desc *tx_desc = &iser_task->desc;
+	struct iser_conn *iser_conn = task->conn->dd_data;
 	struct iser_device *device = iser_conn->ib_conn.device;
 
 	/* DEVICE_REMOVAL event might have already released the device */
 	if (!device)
 		return;
 
-	ib_dma_unmap_single(device->ib_device,
-		tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
+	if (likely(tx_desc->mapped)) {
+		ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+				    ISER_HEADERS_LEN, DMA_TO_DEVICE);
+		tx_desc->mapped = false;
+	}
 
 	/* mgmt tasks do not need special cleanup */
 	if (!task->sc)
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 262ba1f..d2b6caf 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -270,6 +270,7 @@ enum iser_desc_type {
  *                 sg[1] optionally points to either of immediate data
  *                 unsolicited data-out or control
  * @num_sge:       number sges used on this TX task
+ * @mapped:        Is the task header mapped
  */
 struct iser_tx_desc {
 	struct iser_hdr              iser_header;
@@ -278,6 +279,7 @@ struct iser_tx_desc {
 	u64		             dma_addr;
 	struct ib_sge		     tx_sg[2];
 	int                          num_sge;
+	bool			     mapped;
 };
 
 #define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
-- 
cgit v0.10.2


From b9abd8d21d3a04903aefcb7742efe3390f2fac57 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:51 +0300
Subject: IB/iser: Remove a redundant always-false condition

We always call iser_initialize_task_headers() and set
the header tx_sg.lkey to the device mr lkey, so no
point in checking it in iser_create_send_desc().

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 0a47f42..9ff6dec 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -170,13 +170,7 @@ static void iser_create_send_desc(struct iser_conn	*iser_conn,
 
 	memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
 	tx_desc->iser_header.flags = ISER_VER;
-
 	tx_desc->num_sge = 1;
-
-	if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
-		tx_desc->tx_sg[0].lkey = device->mr->lkey;
-		iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
-	}
 }
 
 static void iser_free_login_buf(struct iser_conn *iser_conn)
-- 
cgit v0.10.2


From ea18f5d7777dc4fa9c18c4919281301cf4fd921a Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:52 +0300
Subject: IB/iser: Remove an unneeded print for unaligned memory

We can do it in iser_aligned_data_len instead and
it will save us an argument that is passed to
fall_to_counce_buf just for the print.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index f0cdc96..b6889f6 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -330,8 +330,11 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data,
 			break;
 	}
 	ret_len = (next_sg) ? i : i+1;
-	iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
-		 ret_len, data->dma_nents, data);
+
+	if (unlikely(ret_len != data->dma_nents))
+		iser_warn("rdma alignment violation (%d/%d aligned)\n",
+			  ret_len, data->dma_nents);
+
 	return ret_len;
 }
 
@@ -407,15 +410,12 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
 
 static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
 			      struct iser_data_buf *mem,
-			      enum iser_data_dir cmd_dir,
-			      int aligned_len)
+			      enum iser_data_dir cmd_dir)
 {
 	struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn;
 	struct iser_device *device = iser_task->iser_conn->ib_conn.device;
 
 	iscsi_conn->fmr_unalign_cnt++;
-	iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
-		  aligned_len, mem->size);
 
 	if (iser_debug_level > 0)
 		iser_data_buf_dump(mem, device->ib_device);
@@ -537,8 +537,7 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
 
 	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
 	if (aligned_len != mem->dma_nents) {
-		err = fall_to_bounce_buf(iser_task, mem,
-					 cmd_dir, aligned_len);
+		err = fall_to_bounce_buf(iser_task, mem, cmd_dir);
 		if (err) {
 			iser_err("failed to allocate bounce buffer\n");
 			return err;
@@ -800,8 +799,7 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
 
 	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
 	if (aligned_len != mem->dma_nents) {
-		err = fall_to_bounce_buf(iser_task, mem,
-					 cmd_dir, aligned_len);
+		err = fall_to_bounce_buf(iser_task, mem, cmd_dir);
 		if (err) {
 			iser_err("failed to allocate bounce buffer\n");
 			return err;
@@ -828,7 +826,7 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
 			aligned_len = iser_data_buf_aligned_len(mem, ibdev);
 			if (aligned_len != mem->dma_nents) {
 				err = fall_to_bounce_buf(iser_task, mem,
-							 cmd_dir, aligned_len);
+							 cmd_dir);
 				if (err) {
 					iser_err("failed to allocate bounce buffer\n");
 					return err;
-- 
cgit v0.10.2


From d711d81d6463ecf566b93810e16949f2d159aa50 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:53 +0300
Subject: IB/iser: Introduce struct iser_reg_resources

Have fast_reg_descriptor hold struct iser_reg_resources
(mr, frpl, valid flag). This will be useful when the
actual buffer registration routines will be passed with
the needed registration resources (i.e. iser_reg_resources)
without being aware of their nature (i.e. data or protection).

In order to achieve this, we remove reg_indicators flags container
and place specific flags (mr_valid) within iser_reg_resources struct.
We also place the sig_mr_valid and sig_protcted flags in iser_pi_context.

This patch also modifies iser_fast_reg_mr to receive the
reg_resources instead of the fast_reg_descriptor and a data/protection
indicator.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Adir Lev <adirl@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index d2b6caf..9cdfdbd 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -367,41 +367,45 @@ struct iser_device {
 #define ISER_CHECK_REFTAG	0x0f
 #define ISER_CHECK_APPTAG	0x30
 
-enum iser_reg_indicator {
-	ISER_DATA_KEY_VALID	= 1 << 0,
-	ISER_PROT_KEY_VALID	= 1 << 1,
-	ISER_SIG_KEY_VALID	= 1 << 2,
-	ISER_FASTREG_PROTECTED	= 1 << 3,
+/**
+ * struct iser_reg_resources - Fast registration recources
+ *
+ * @mr:         memory region
+ * @frpl:       fast reg page list
+ * @mr_valid:   is mr valid indicator
+ */
+struct iser_reg_resources {
+	struct ib_mr			 *mr;
+	struct ib_fast_reg_page_list     *frpl;
+	u8				  mr_valid:1;
 };
 
 /**
  * struct iser_pi_context - Protection information context
  *
- * @prot_mr:        protection memory region
- * @prot_frpl:      protection fastreg page list
- * @sig_mr:         signature feature enabled memory region
+ * @rsc:             protection buffer registration resources
+ * @sig_mr:          signature enable memory region
+ * @sig_mr_valid:    is sig_mr valid indicator
+ * @sig_protected:   is region protected indicator
  */
 struct iser_pi_context {
-	struct ib_mr                   *prot_mr;
-	struct ib_fast_reg_page_list   *prot_frpl;
+	struct iser_reg_resources	rsc;
 	struct ib_mr                   *sig_mr;
+	u8                              sig_mr_valid:1;
+	u8                              sig_protected:1;
 };
 
 /**
  * struct fast_reg_descriptor - Fast registration descriptor
  *
  * @list:           entry in connection fastreg pool
- * @data_mr:        data memory region
- * @data_frpl:      data fastreg page list
+ * @rsc:            data buffer registration resources
  * @pi_ctx:         protection information context
- * @reg_indicators: fast registration indicators
  */
 struct fast_reg_descriptor {
 	struct list_head		  list;
-	struct ib_mr			 *data_mr;
-	struct ib_fast_reg_page_list     *data_frpl;
+	struct iser_reg_resources	  rsc;
 	struct iser_pi_context		 *pi_ctx;
-	u8				  reg_indicators;
 };
 
 /**
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index b6889f6..82a3304 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -647,13 +647,12 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
 
 static int
 iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
-		struct fast_reg_descriptor *desc,
+		struct iser_pi_context *pi_ctx,
 		struct iser_mem_reg *data_reg,
 		struct iser_mem_reg *prot_reg,
 		struct iser_mem_reg *sig_reg)
 {
 	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-	struct iser_pi_context *pi_ctx = desc->pi_ctx;
 	struct ib_send_wr sig_wr, inv_wr;
 	struct ib_send_wr *bad_wr, *wr = NULL;
 	struct ib_sig_attrs sig_attrs;
@@ -666,7 +665,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 
 	iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
 
-	if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
+	if (!pi_ctx->sig_mr_valid) {
 		iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
 		wr = &inv_wr;
 	}
@@ -694,7 +693,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 		iser_err("reg_sig_mr failed, ret:%d\n", ret);
 		goto err;
 	}
-	desc->reg_indicators &= ~ISER_SIG_KEY_VALID;
+	pi_ctx->sig_mr_valid = 0;
 
 	sig_reg->sge.lkey = pi_ctx->sig_mr->lkey;
 	sig_reg->rkey = pi_ctx->sig_mr->rkey;
@@ -710,8 +709,7 @@ err:
 
 static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 			    struct iser_data_buf *mem,
-			    struct fast_reg_descriptor *desc,
-			    enum iser_reg_indicator ind,
+			    struct iser_reg_resources *rsc,
 			    struct iser_mem_reg *reg)
 {
 	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
@@ -726,13 +724,8 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 	if (mem->dma_nents == 1)
 		return iser_reg_dma(device, mem, reg);
 
-	if (ind == ISER_DATA_KEY_VALID) {
-		mr = desc->data_mr;
-		frpl = desc->data_frpl;
-	} else {
-		mr = desc->pi_ctx->prot_mr;
-		frpl = desc->pi_ctx->prot_frpl;
-	}
+	mr = rsc->mr;
+	frpl = rsc->frpl;
 
 	plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
 				   &offset, &size);
@@ -741,7 +734,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 		return -EINVAL;
 	}
 
-	if (!(desc->reg_indicators & ind)) {
+	if (!rsc->mr_valid) {
 		iser_inv_rkey(&inv_wr, mr);
 		wr = &inv_wr;
 	}
@@ -770,7 +763,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 		iser_err("fast registration failed, ret:%d\n", ret);
 		return ret;
 	}
-	desc->reg_indicators &= ~ind;
+	rsc->mr_valid = 0;
 
 	reg->sge.lkey = mr->lkey;
 	reg->rkey = mr->rkey;
@@ -812,8 +805,8 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
 		mem_reg->mem_h = desc;
 	}
 
-	err = iser_fast_reg_mr(iser_task, mem, desc,
-			       ISER_DATA_KEY_VALID, mem_reg);
+	err = iser_fast_reg_mr(iser_task, mem,
+			       desc ? &desc->rsc : NULL, mem_reg);
 	if (err)
 		goto err_reg;
 
@@ -833,19 +826,19 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
 				}
 			}
 
-			err = iser_fast_reg_mr(iser_task, mem, desc,
-					       ISER_PROT_KEY_VALID, &prot_reg);
+			err = iser_fast_reg_mr(iser_task, mem,
+					       &desc->pi_ctx->rsc, &prot_reg);
 			if (err)
 				goto err_reg;
 		}
 
-		err = iser_reg_sig_mr(iser_task, desc, mem_reg,
+		err = iser_reg_sig_mr(iser_task, desc->pi_ctx, mem_reg,
 				      &prot_reg, mem_reg);
 		if (err) {
 			iser_err("Failed to register signature mr\n");
 			return err;
 		}
-		desc->reg_indicators |= ISER_FASTREG_PROTECTED;
+		desc->pi_ctx->sig_protected = 1;
 	}
 
 	return 0;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 8f24728..1cadcd9 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -280,6 +280,45 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn)
 }
 
 static int
+iser_alloc_reg_res(struct ib_device *ib_device, struct ib_pd *pd,
+		   struct iser_reg_resources *res)
+{
+	int ret;
+
+	res->frpl = ib_alloc_fast_reg_page_list(ib_device,
+						ISCSI_ISER_SG_TABLESIZE + 1);
+	if (IS_ERR(res->frpl)) {
+		ret = PTR_ERR(res->frpl);
+		iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
+			 ret);
+		return PTR_ERR(res->frpl);
+	}
+
+	res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+			      ISCSI_ISER_SG_TABLESIZE + 1);
+	if (IS_ERR(res->mr)) {
+		ret = PTR_ERR(res->mr);
+		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
+		goto fast_reg_mr_failure;
+	}
+	res->mr_valid = 1;
+
+	return 0;
+
+fast_reg_mr_failure:
+	ib_free_fast_reg_page_list(res->frpl);
+
+	return ret;
+}
+
+static void
+iser_free_reg_res(struct iser_reg_resources *rsc)
+{
+	ib_dereg_mr(rsc->mr);
+	ib_free_fast_reg_page_list(rsc->frpl);
+}
+
+static int
 iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
 		  struct fast_reg_descriptor *desc)
 {
@@ -292,36 +331,25 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
 
 	pi_ctx = desc->pi_ctx;
 
-	pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device,
-					    ISCSI_ISER_SG_TABLESIZE);
-	if (IS_ERR(pi_ctx->prot_frpl)) {
-		ret = PTR_ERR(pi_ctx->prot_frpl);
-		goto prot_frpl_failure;
-	}
-
-	pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
-				      ISCSI_ISER_SG_TABLESIZE + 1);
-	if (IS_ERR(pi_ctx->prot_mr)) {
-		ret = PTR_ERR(pi_ctx->prot_mr);
-		goto prot_mr_failure;
+	ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc);
+	if (ret) {
+		iser_err("failed to allocate reg_resources\n");
+		goto alloc_reg_res_err;
 	}
-	desc->reg_indicators |= ISER_PROT_KEY_VALID;
 
 	pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
 	if (IS_ERR(pi_ctx->sig_mr)) {
 		ret = PTR_ERR(pi_ctx->sig_mr);
 		goto sig_mr_failure;
 	}
-	desc->reg_indicators |= ISER_SIG_KEY_VALID;
-	desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
+	pi_ctx->sig_mr_valid = 1;
+	desc->pi_ctx->sig_protected = 0;
 
 	return 0;
 
 sig_mr_failure:
-	ib_dereg_mr(desc->pi_ctx->prot_mr);
-prot_mr_failure:
-	ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl);
-prot_frpl_failure:
+	iser_free_reg_res(&pi_ctx->rsc);
+alloc_reg_res_err:
 	kfree(desc->pi_ctx);
 
 	return ret;
@@ -330,8 +358,7 @@ prot_frpl_failure:
 static void
 iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
 {
-	ib_free_fast_reg_page_list(pi_ctx->prot_frpl);
-	ib_dereg_mr(pi_ctx->prot_mr);
+	iser_free_reg_res(&pi_ctx->rsc);
 	ib_dereg_mr(pi_ctx->sig_mr);
 	kfree(pi_ctx);
 }
@@ -342,23 +369,11 @@ iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
 {
 	int ret;
 
-	desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device,
-						      ISCSI_ISER_SG_TABLESIZE + 1);
-	if (IS_ERR(desc->data_frpl)) {
-		ret = PTR_ERR(desc->data_frpl);
-		iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
-			 ret);
-		return PTR_ERR(desc->data_frpl);
-	}
-
-	desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
-				    ISCSI_ISER_SG_TABLESIZE + 1);
-	if (IS_ERR(desc->data_mr)) {
-		ret = PTR_ERR(desc->data_mr);
-		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
-		goto fast_reg_mr_failure;
+	ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc);
+	if (ret) {
+		iser_err("failed to allocate reg_resources\n");
+		return ret;
 	}
-	desc->reg_indicators |= ISER_DATA_KEY_VALID;
 
 	if (pi_enable) {
 		ret = iser_alloc_pi_ctx(ib_device, pd, desc);
@@ -367,10 +382,9 @@ iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
 	}
 
 	return 0;
+
 pi_ctx_alloc_failure:
-	ib_dereg_mr(desc->data_mr);
-fast_reg_mr_failure:
-	ib_free_fast_reg_page_list(desc->data_frpl);
+	iser_free_reg_res(&desc->rsc);
 
 	return ret;
 }
@@ -431,8 +445,7 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
 
 	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
 		list_del(&desc->list);
-		ib_free_fast_reg_page_list(desc->data_frpl);
-		ib_dereg_mr(desc->data_mr);
+		iser_free_reg_res(&desc->rsc);
 		if (desc->pi_ctx)
 			iser_free_pi_ctx(desc->pi_ctx);
 		kfree(desc);
@@ -1244,8 +1257,8 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
 	struct ib_mr_status mr_status;
 	int ret;
 
-	if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) {
-		desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
+	if (desc && desc->pi_ctx->sig_protected) {
+		desc->pi_ctx->sig_protected = 0;
 		ret = ib_check_mr_status(desc->pi_ctx->sig_mr,
 					 IB_MR_CHECK_SIG_STATUS, &mr_status);
 		if (ret) {
-- 
cgit v0.10.2


From 5190cc2664972f2c51502e928fcb7a608dddab5f Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:54 +0300
Subject: IB/iser: Rename struct fast_reg_descriptor -> iser_fr_desc

Avoid struct names without iser_ prefix.

This patch does not change any functionality.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 9cdfdbd..70bf6e7 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -396,13 +396,13 @@ struct iser_pi_context {
 };
 
 /**
- * struct fast_reg_descriptor - Fast registration descriptor
+ * struct iser_fr_desc - Fast registration descriptor
  *
  * @list:           entry in connection fastreg pool
  * @rsc:            data buffer registration resources
  * @pi_ctx:         protection information context
  */
-struct fast_reg_descriptor {
+struct iser_fr_desc {
 	struct list_head		  list;
 	struct iser_reg_resources	  rsc;
 	struct iser_pi_context		 *pi_ctx;
@@ -642,9 +642,9 @@ int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max);
 void iser_free_fastreg_pool(struct ib_conn *ib_conn);
 u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
 			     enum iser_data_dir cmd_dir, sector_t *sector);
-struct fast_reg_descriptor *
+struct iser_fr_desc *
 iser_reg_desc_get(struct ib_conn *ib_conn);
 void
 iser_reg_desc_put(struct ib_conn *ib_conn,
-		  struct fast_reg_descriptor *desc);
+		  struct iser_fr_desc *desc);
 #endif
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 82a3304..710ac7d 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -146,15 +146,15 @@ iser_copy_to_bounce(struct iser_data_buf *data)
 	iser_copy_bounce(data, true);
 }
 
-struct fast_reg_descriptor *
+struct iser_fr_desc *
 iser_reg_desc_get(struct ib_conn *ib_conn)
 {
-	struct fast_reg_descriptor *desc;
+	struct iser_fr_desc *desc;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ib_conn->lock, flags);
 	desc = list_first_entry(&ib_conn->fastreg.pool,
-				struct fast_reg_descriptor, list);
+				struct iser_fr_desc, list);
 	list_del(&desc->list);
 	spin_unlock_irqrestore(&ib_conn->lock, flags);
 
@@ -163,7 +163,7 @@ iser_reg_desc_get(struct ib_conn *ib_conn)
 
 void
 iser_reg_desc_put(struct ib_conn *ib_conn,
-		  struct fast_reg_descriptor *desc)
+		  struct iser_fr_desc *desc)
 {
 	unsigned long flags;
 
@@ -787,7 +787,7 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
 	struct ib_device *ibdev = device->ib_device;
 	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
 	struct iser_mem_reg *mem_reg = &iser_task->rdma_reg[cmd_dir];
-	struct fast_reg_descriptor *desc = NULL;
+	struct iser_fr_desc *desc = NULL;
 	int err, aligned_len;
 
 	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 1cadcd9..e4f8967 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -320,7 +320,7 @@ iser_free_reg_res(struct iser_reg_resources *rsc)
 
 static int
 iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
-		  struct fast_reg_descriptor *desc)
+		  struct iser_fr_desc *desc)
 {
 	struct iser_pi_context *pi_ctx = NULL;
 	int ret;
@@ -365,7 +365,7 @@ iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
 
 static int
 iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
-			 bool pi_enable, struct fast_reg_descriptor *desc)
+			 bool pi_enable, struct iser_fr_desc *desc)
 {
 	int ret;
 
@@ -397,7 +397,7 @@ pi_ctx_alloc_failure:
 int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 {
 	struct iser_device *device = ib_conn->device;
-	struct fast_reg_descriptor *desc;
+	struct iser_fr_desc *desc;
 	int i, ret;
 
 	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
@@ -435,7 +435,7 @@ err:
  */
 void iser_free_fastreg_pool(struct ib_conn *ib_conn)
 {
-	struct fast_reg_descriptor *desc, *tmp;
+	struct iser_fr_desc *desc, *tmp;
 	int i = 0;
 
 	if (list_empty(&ib_conn->fastreg.pool))
@@ -1252,7 +1252,7 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
 			     enum iser_data_dir cmd_dir, sector_t *sector)
 {
 	struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
-	struct fast_reg_descriptor *desc = reg->mem_h;
+	struct iser_fr_desc *desc = reg->mem_h;
 	unsigned long sector_size = iser_task->sc->device->sector_size;
 	struct ib_mr_status mr_status;
 	int ret;
-- 
cgit v0.10.2


From 8c18ed03a95cb6c3543b0a9e0df5e9366baea5df Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:55 +0300
Subject: IB/iser: Remove dead code in fmr_pool alloc/free

In the past the we always tried to allocate an fmr_pool
and if it failed on ENOSYS (not supported) then we continued
with dma mr. This is not the case anymore and if we tried to
allocate an fmr_pool then it is supported and we expect to succeed.

Also, the check if fmr_pool is allocated when free is called is
redundant as well as we are guaranteed it exists.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index e4f8967..adec4d7 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -244,22 +244,18 @@ int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 				    IB_ACCESS_REMOTE_READ);
 
 	ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, &params);
-	if (!IS_ERR(ib_conn->fmr.pool))
-		return 0;
+	if (IS_ERR(ib_conn->fmr.pool)) {
+		ret = PTR_ERR(ib_conn->fmr.pool);
+		iser_err("FMR allocation failed, err %d\n", ret);
+		goto err;
+	}
+
+	return 0;
 
-	/* no FMR => no need for page_vec */
+err:
 	kfree(ib_conn->fmr.page_vec);
 	ib_conn->fmr.page_vec = NULL;
-
-	ret = PTR_ERR(ib_conn->fmr.pool);
-	ib_conn->fmr.pool = NULL;
-	if (ret != -ENOSYS) {
-		iser_err("FMR allocation failed, err %d\n", ret);
-		return ret;
-	} else {
-		iser_warn("FMRs are not supported, using unaligned mode\n");
-		return 0;
-	}
+	return ret;
 }
 
 /**
@@ -270,9 +266,7 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn)
 	iser_info("freeing conn %p fmr pool %p\n",
 		  ib_conn, ib_conn->fmr.pool);
 
-	if (ib_conn->fmr.pool != NULL)
-		ib_destroy_fmr_pool(ib_conn->fmr.pool);
-
+	ib_destroy_fmr_pool(ib_conn->fmr.pool);
 	ib_conn->fmr.pool = NULL;
 
 	kfree(ib_conn->fmr.page_vec);
-- 
cgit v0.10.2


From 48afbff673d3d2ff6c52342574392db504dae301 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:56 +0300
Subject: IB/iser: Introduce iser_reg_ops

Move all the per-device function pointers to an easy
extensible iser_reg_ops structure that contains all
the iser registration operations.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 70bf6e7..9ce090c 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -326,6 +326,25 @@ struct iser_comp {
 };
 
 /**
+ * struct iser_device - Memory registration operations
+ *     per-device registration schemes
+ *
+ * @alloc_reg_res:     Allocate registration resources
+ * @free_reg_res:      Free registration resources
+ * @reg_rdma_mem:      Register memory buffers
+ * @unreg_rdma_mem:    Un-register memory buffers
+ */
+struct iser_reg_ops {
+	int            (*alloc_reg_res)(struct ib_conn *ib_conn,
+					unsigned cmds_max);
+	void           (*free_reg_res)(struct ib_conn *ib_conn);
+	int            (*reg_rdma_mem)(struct iscsi_iser_task *iser_task,
+				       enum iser_data_dir cmd_dir);
+	void           (*unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
+					 enum iser_data_dir cmd_dir);
+};
+
+/**
  * struct iser_device - iSER device handle
  *
  * @ib_device:     RDMA device
@@ -338,11 +357,7 @@ struct iser_comp {
  * @comps_used:    Number of completion contexts used, Min between online
  *                 cpus and device max completion vectors
  * @comps:         Dinamically allocated array of completion handlers
- * Memory registration pool Function pointers (FMR or Fastreg):
- *     @iser_alloc_rdma_reg_res: Allocation of memory regions pool
- *     @iser_free_rdma_reg_res:  Free of memory regions pool
- *     @iser_reg_rdma_mem:       Memory registration routine
- *     @iser_unreg_rdma_mem:     Memory deregistration routine
+ * @reg_ops:       Registration ops
  */
 struct iser_device {
 	struct ib_device             *ib_device;
@@ -354,13 +369,7 @@ struct iser_device {
 	int                          refcount;
 	int			     comps_used;
 	struct iser_comp	     *comps;
-	int                          (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn,
-								unsigned cmds_max);
-	void                         (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn);
-	int                          (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task,
-							  enum iser_data_dir cmd_dir);
-	void                         (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
-							    enum iser_data_dir cmd_dir);
+	struct iser_reg_ops          *reg_ops;
 };
 
 #define ISER_CHECK_GUARD	0xc0
@@ -563,6 +572,8 @@ extern int iser_debug_level;
 extern bool iser_pi_enable;
 extern int iser_pi_guard;
 
+int iser_assign_reg_ops(struct iser_device *device);
+
 int iser_send_control(struct iscsi_conn *conn,
 		      struct iscsi_task *task);
 
@@ -636,9 +647,9 @@ int  iser_initialize_task_headers(struct iscsi_task *task,
 			struct iser_tx_desc *tx_desc);
 int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 			      struct iscsi_session *session);
-int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max);
 void iser_free_fmr_pool(struct ib_conn *ib_conn);
-int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max);
 void iser_free_fastreg_pool(struct ib_conn *ib_conn);
 u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
 			     enum iser_data_dir cmd_dir, sector_t *sector);
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 9ff6dec..beacd5f 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -73,7 +73,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
 			return err;
 	}
 
-	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
+	err = device->reg_ops->reg_rdma_mem(iser_task, ISER_DIR_IN);
 	if (err) {
 		iser_err("Failed to set up Data-IN RDMA\n");
 		return err;
@@ -128,7 +128,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,
 			return err;
 	}
 
-	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
+	err = device->reg_ops->reg_rdma_mem(iser_task, ISER_DIR_OUT);
 	if (err != 0) {
 		iser_err("Failed to register write cmd RDMA mem\n");
 		return err;
@@ -260,7 +260,7 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 	iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
 	iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
 
-	if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max))
+	if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max))
 		goto create_rdma_reg_res_failed;
 
 	if (iser_alloc_login_buf(iser_conn))
@@ -301,7 +301,7 @@ rx_desc_dma_map_failed:
 rx_desc_alloc_fail:
 	iser_free_login_buf(iser_conn);
 alloc_login_buf_fail:
-	device->iser_free_rdma_reg_res(ib_conn);
+	device->reg_ops->free_reg_res(ib_conn);
 create_rdma_reg_res_failed:
 	iser_err("failed allocating rx descriptors / data buffers\n");
 	return -ENOMEM;
@@ -314,8 +314,8 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn)
 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
 	struct iser_device *device = ib_conn->device;
 
-	if (device->iser_free_rdma_reg_res)
-		device->iser_free_rdma_reg_res(ib_conn);
+	if (device->reg_ops->free_reg_res)
+		device->reg_ops->free_reg_res(ib_conn);
 
 	rx_desc = iser_conn->rx_descs;
 	for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++)
@@ -699,7 +699,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 	}
 
 	if (iser_task->dir[ISER_DIR_IN]) {
-		device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
+		device->reg_ops->unreg_rdma_mem(iser_task, ISER_DIR_IN);
 		if (is_rdma_data_aligned)
 			iser_dma_unmap_task_data(iser_task,
 						 &iser_task->data[ISER_DIR_IN],
@@ -711,7 +711,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 	}
 
 	if (iser_task->dir[ISER_DIR_OUT]) {
-		device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
+		device->reg_ops->unreg_rdma_mem(iser_task, ISER_DIR_OUT);
 		if (is_rdma_data_aligned)
 			iser_dma_unmap_task_data(iser_task,
 						 &iser_task->data[ISER_DIR_OUT],
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 710ac7d..9febfc1 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -39,6 +39,41 @@
 
 #include "iscsi_iser.h"
 
+static struct iser_reg_ops fastreg_ops = {
+	.alloc_reg_res	= iser_alloc_fastreg_pool,
+	.free_reg_res	= iser_free_fastreg_pool,
+	.reg_rdma_mem	= iser_reg_rdma_mem_fastreg,
+	.unreg_rdma_mem	= iser_unreg_mem_fastreg,
+};
+
+static struct iser_reg_ops fmr_ops = {
+	.alloc_reg_res	= iser_alloc_fmr_pool,
+	.free_reg_res	= iser_free_fmr_pool,
+	.reg_rdma_mem	= iser_reg_rdma_mem_fmr,
+	.unreg_rdma_mem	= iser_unreg_mem_fmr,
+};
+
+int iser_assign_reg_ops(struct iser_device *device)
+{
+	struct ib_device_attr *dev_attr = &device->dev_attr;
+
+	/* Assign function handles  - based on FMR support */
+	if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
+	    device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
+		iser_info("FMR supported, using FMR for registration\n");
+		device->reg_ops = &fmr_ops;
+	} else
+	if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+		iser_info("FastReg supported, using FastReg for registration\n");
+		device->reg_ops = &fastreg_ops;
+	} else {
+		iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
+		return -1;
+	}
+
+	return 0;
+}
+
 static void
 iser_free_bounce_sg(struct iser_data_buf *data)
 {
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index adec4d7..17f43cf 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -87,25 +87,9 @@ static int iser_create_device_ib_res(struct iser_device *device)
 		return ret;
 	}
 
-	/* Assign function handles  - based on FMR support */
-	if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
-	    device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
-		iser_info("FMR supported, using FMR for registration\n");
-		device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
-		device->iser_free_rdma_reg_res = iser_free_fmr_pool;
-		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
-		device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
-	} else
-	if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
-		iser_info("FastReg supported, using FastReg for registration\n");
-		device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool;
-		device->iser_free_rdma_reg_res = iser_free_fastreg_pool;
-		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg;
-		device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg;
-	} else {
-		iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
-		return -1;
-	}
+	ret = iser_assign_reg_ops(device);
+	if (ret)
+		return ret;
 
 	device->comps_used = min_t(int, num_online_cpus(),
 				 device->ib_device->num_comp_vectors);
@@ -211,11 +195,11 @@ static void iser_free_device_ib_res(struct iser_device *device)
 }
 
 /**
- * iser_create_fmr_pool - Creates FMR pool and page_vector
+ * iser_alloc_fmr_pool - Creates FMR pool and page_vector
  *
  * returns 0 on success, or errno code on failure
  */
-int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 {
 	struct iser_device *device = ib_conn->device;
 	struct ib_fmr_pool_param params;
@@ -384,11 +368,11 @@ pi_ctx_alloc_failure:
 }
 
 /**
- * iser_create_fastreg_pool - Creates pool of fast_reg descriptors
+ * iser_alloc_fastreg_pool - Creates pool of fast_reg descriptors
  * for fast registration work requests.
  * returns 0 on success, or errno code on failure
  */
-int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 {
 	struct iser_device *device = ib_conn->device;
 	struct iser_fr_desc *desc;
-- 
cgit v0.10.2


From eb6ea8c36c90f022dd9603530286e0707a9c467b Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:57 +0300
Subject: IB/iser: Move fastreg descriptor allocation to
 iser_create_fastreg_desc

Don't have the caller allocate the structure and worry about
freeing it in case the routine failed.

This patch does not change any functionality.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Adir Lev <adirl@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 17f43cf..c41bd42 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -341,17 +341,20 @@ iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
 	kfree(pi_ctx);
 }
 
-static int
+static struct iser_fr_desc *
 iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
-			 bool pi_enable, struct iser_fr_desc *desc)
+			 bool pi_enable)
 {
+	struct iser_fr_desc *desc;
 	int ret;
 
+	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
 	ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc);
-	if (ret) {
-		iser_err("failed to allocate reg_resources\n");
-		return ret;
-	}
+	if (ret)
+		goto reg_res_alloc_failure;
 
 	if (pi_enable) {
 		ret = iser_alloc_pi_ctx(ib_device, pd, desc);
@@ -359,12 +362,14 @@ iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
 			goto pi_ctx_alloc_failure;
 	}
 
-	return 0;
+	return desc;
 
 pi_ctx_alloc_failure:
 	iser_free_reg_res(&desc->rsc);
+reg_res_alloc_failure:
+	kfree(desc);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /**
@@ -381,19 +386,10 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
 	ib_conn->fastreg.pool_size = 0;
 	for (i = 0; i < cmds_max; i++) {
-		desc = kzalloc(sizeof(*desc), GFP_KERNEL);
-		if (!desc) {
-			iser_err("Failed to allocate a new fast_reg descriptor\n");
-			ret = -ENOMEM;
-			goto err;
-		}
-
-		ret = iser_create_fastreg_desc(device->ib_device, device->pd,
-					       ib_conn->pi_support, desc);
-		if (ret) {
-			iser_err("Failed to create fastreg descriptor err=%d\n",
-				 ret);
-			kfree(desc);
+		desc = iser_create_fastreg_desc(device->ib_device, device->pd,
+						ib_conn->pi_support);
+		if (IS_ERR(desc)) {
+			ret = PTR_ERR(desc);
 			goto err;
 		}
 
-- 
cgit v0.10.2


From 385ad87d4b637c1ebdb54bc93274fc2c267dfc16 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:58 +0300
Subject: IB/iser: Introduce iser registration pool struct

Instead of having it a part of the connection structure,
have it be under a dedicated (embedded) structure in the
connection. A logical separation of the registration pool
and the connection structure.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Adir Lev <adirl@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 9ce090c..1fc4c23 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -418,6 +418,33 @@ struct iser_fr_desc {
 };
 
 /**
+ * struct iser_fr_pool: connection fast registration pool
+ *
+ * @lock:                protects fmr/fastreg pool
+ * @union.fmr:
+ *     @pool:            FMR pool for fast registrations
+ *     @page_vec:        fast reg page list to hold mapped commands pages
+ *                       used for registration
+ * @union.fastreg:
+ *     @pool:            Fast registration descriptors pool for fast
+ *                       registrations
+ *     @pool_size:       Size of pool
+ */
+struct iser_fr_pool {
+	spinlock_t		     lock;
+	union {
+		struct {
+			struct ib_fmr_pool              *pool;
+			struct iser_page_vec            *page_vec;
+		} fmr;
+		struct {
+			struct list_head	 pool;
+			int			 pool_size;
+		} fastreg;
+	};
+};
+
+/**
  * struct ib_conn - Infiniband related objects
  *
  * @cma_id:              rdma_cm connection maneger handle
@@ -430,15 +457,7 @@ struct iser_fr_desc {
  * @pi_support:          Indicate device T10-PI support
  * @beacon:              beacon send wr to signal all flush errors were drained
  * @flush_comp:          completes when all connection completions consumed
- * @lock:                protects fmr/fastreg pool
- * @union.fmr:
- *     @pool:            FMR pool for fast registrations
- *     @page_vec:        page vector to hold mapped commands pages
- *                       used for registration
- * @union.fastreg:
- *     @pool:            Fast registration descriptors pool for fast
- *                       registrations
- *     @pool_size:       Size of pool
+ * @fr_pool:             connection fast registration poool
  */
 struct ib_conn {
 	struct rdma_cm_id           *cma_id;
@@ -451,17 +470,7 @@ struct ib_conn {
 	bool			     pi_support;
 	struct ib_send_wr	     beacon;
 	struct completion	     flush_comp;
-	spinlock_t		     lock;
-	union {
-		struct {
-			struct ib_fmr_pool      *pool;
-			struct iser_page_vec	*page_vec;
-		} fmr;
-		struct {
-			struct list_head	 pool;
-			int			 pool_size;
-		} fastreg;
-	};
+	struct iser_fr_pool          fr_pool;
 };
 
 /**
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 9febfc1..d731ed7 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -184,14 +184,15 @@ iser_copy_to_bounce(struct iser_data_buf *data)
 struct iser_fr_desc *
 iser_reg_desc_get(struct ib_conn *ib_conn)
 {
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_fr_desc *desc;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ib_conn->lock, flags);
-	desc = list_first_entry(&ib_conn->fastreg.pool,
+	spin_lock_irqsave(&fr_pool->lock, flags);
+	desc = list_first_entry(&fr_pool->fastreg.pool,
 				struct iser_fr_desc, list);
 	list_del(&desc->list);
-	spin_unlock_irqrestore(&ib_conn->lock, flags);
+	spin_unlock_irqrestore(&fr_pool->lock, flags);
 
 	return desc;
 }
@@ -200,11 +201,12 @@ void
 iser_reg_desc_put(struct ib_conn *ib_conn,
 		  struct iser_fr_desc *desc)
 {
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ib_conn->lock, flags);
-	list_add(&desc->list, &ib_conn->fastreg.pool);
-	spin_unlock_irqrestore(&ib_conn->lock, flags);
+	spin_lock_irqsave(&fr_pool->lock, flags);
+	list_add(&desc->list, &fr_pool->fastreg.pool);
+	spin_unlock_irqrestore(&fr_pool->lock, flags);
 }
 
 /**
@@ -480,6 +482,7 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
 		      struct iser_mem_reg *mem_reg)
 {
 	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_device *device = ib_conn->device;
 	struct ib_pool_fmr *fmr;
 	int ret, plen;
@@ -496,7 +499,7 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
 		return -EINVAL;
 	}
 
-	fmr  = ib_fmr_pool_map_phys(ib_conn->fmr.pool,
+	fmr  = ib_fmr_pool_map_phys(fr_pool->fmr.pool,
 				    page_vec->pages,
 				    page_vec->length,
 				    page_vec->pages[0]);
@@ -560,6 +563,7 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
 			  enum iser_data_dir cmd_dir)
 {
 	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_device   *device = ib_conn->device;
 	struct ib_device     *ibdev = device->ib_device;
 	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
@@ -583,20 +587,20 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
 	if (mem->dma_nents == 1) {
 		return iser_reg_dma(device, mem, mem_reg);
 	} else { /* use FMR for multiple dma entries */
-		err = iser_reg_page_vec(iser_task, mem, ib_conn->fmr.page_vec,
-					mem_reg);
+		err = iser_reg_page_vec(iser_task, mem,
+					fr_pool->fmr.page_vec, mem_reg);
 		if (err && err != -EAGAIN) {
 			iser_data_buf_dump(mem, ibdev);
 			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
 				 mem->dma_nents,
 				 ntoh24(iser_task->desc.iscsi_header.dlength));
 			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
-				 ib_conn->fmr.page_vec->data_size,
-				 ib_conn->fmr.page_vec->length,
-				 ib_conn->fmr.page_vec->offset);
-			for (i = 0; i < ib_conn->fmr.page_vec->length; i++)
+				 fr_pool->fmr.page_vec->data_size,
+				 fr_pool->fmr.page_vec->length,
+				 fr_pool->fmr.page_vec->offset);
+			for (i = 0; i < fr_pool->fmr.page_vec->length; i++)
 				iser_err("page_vec[%d] = 0x%llx\n", i,
-					 (unsigned long long)ib_conn->fmr.page_vec->pages[i]);
+					 (unsigned long long)fr_pool->fmr.page_vec->pages[i]);
 		}
 		if (err)
 			return err;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index c41bd42..1b8c9c2 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -202,16 +202,21 @@ static void iser_free_device_ib_res(struct iser_device *device)
 int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 {
 	struct iser_device *device = ib_conn->device;
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+	struct iser_page_vec *page_vec;
+	struct ib_fmr_pool *fmr_pool;
 	struct ib_fmr_pool_param params;
 	int ret = -ENOMEM;
 
-	ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) +
-					(sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)),
-					GFP_KERNEL);
-	if (!ib_conn->fmr.page_vec)
+	spin_lock_init(&fr_pool->lock);
+
+	page_vec = kmalloc(sizeof(*page_vec) +
+			   (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE + 1)),
+			   GFP_KERNEL);
+	if (!page_vec)
 		return ret;
 
-	ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1);
+	page_vec->pages = (u64 *)(page_vec + 1);
 
 	params.page_shift        = SHIFT_4K;
 	/* when the first/last SG element are not start/end *
@@ -227,18 +232,20 @@ int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 				    IB_ACCESS_REMOTE_WRITE |
 				    IB_ACCESS_REMOTE_READ);
 
-	ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, &params);
-	if (IS_ERR(ib_conn->fmr.pool)) {
-		ret = PTR_ERR(ib_conn->fmr.pool);
+	fmr_pool = ib_create_fmr_pool(device->pd, &params);
+	if (IS_ERR(fmr_pool)) {
+		ret = PTR_ERR(fmr_pool);
 		iser_err("FMR allocation failed, err %d\n", ret);
 		goto err;
 	}
 
+	fr_pool->fmr.page_vec = page_vec;
+	fr_pool->fmr.pool = fmr_pool;
+
 	return 0;
 
 err:
-	kfree(ib_conn->fmr.page_vec);
-	ib_conn->fmr.page_vec = NULL;
+	kfree(page_vec);
 	return ret;
 }
 
@@ -247,14 +254,15 @@ err:
  */
 void iser_free_fmr_pool(struct ib_conn *ib_conn)
 {
-	iser_info("freeing conn %p fmr pool %p\n",
-		  ib_conn, ib_conn->fmr.pool);
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 
-	ib_destroy_fmr_pool(ib_conn->fmr.pool);
-	ib_conn->fmr.pool = NULL;
+	iser_info("freeing conn %p fmr pool %p\n",
+		  ib_conn, fr_pool->fmr.pool);
 
-	kfree(ib_conn->fmr.page_vec);
-	ib_conn->fmr.page_vec = NULL;
+	ib_destroy_fmr_pool(fr_pool->fmr.pool);
+	fr_pool->fmr.pool = NULL;
+	kfree(fr_pool->fmr.page_vec);
+	fr_pool->fmr.page_vec = NULL;
 }
 
 static int
@@ -380,11 +388,13 @@ reg_res_alloc_failure:
 int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 {
 	struct iser_device *device = ib_conn->device;
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_fr_desc *desc;
 	int i, ret;
 
-	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
-	ib_conn->fastreg.pool_size = 0;
+	INIT_LIST_HEAD(&fr_pool->fastreg.pool);
+	spin_lock_init(&fr_pool->lock);
+	fr_pool->fastreg.pool_size = 0;
 	for (i = 0; i < cmds_max; i++) {
 		desc = iser_create_fastreg_desc(device->ib_device, device->pd,
 						ib_conn->pi_support);
@@ -393,8 +403,8 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 			goto err;
 		}
 
-		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
-		ib_conn->fastreg.pool_size++;
+		list_add_tail(&desc->list, &fr_pool->fastreg.pool);
+		fr_pool->fastreg.pool_size++;
 	}
 
 	return 0;
@@ -409,15 +419,16 @@ err:
  */
 void iser_free_fastreg_pool(struct ib_conn *ib_conn)
 {
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_fr_desc *desc, *tmp;
 	int i = 0;
 
-	if (list_empty(&ib_conn->fastreg.pool))
+	if (list_empty(&fr_pool->fastreg.pool))
 		return;
 
 	iser_info("freeing conn %p fr pool\n", ib_conn);
 
-	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
+	list_for_each_entry_safe(desc, tmp, &fr_pool->fastreg.pool, list) {
 		list_del(&desc->list);
 		iser_free_reg_res(&desc->rsc);
 		if (desc->pi_ctx)
@@ -426,9 +437,9 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
 		++i;
 	}
 
-	if (i < ib_conn->fastreg.pool_size)
+	if (i < fr_pool->fastreg.pool_size)
 		iser_warn("pool still has %d regions registered\n",
-			  ib_conn->fastreg.pool_size - i);
+			  fr_pool->fastreg.pool_size - i);
 }
 
 /**
@@ -924,7 +935,6 @@ void iser_conn_init(struct iser_conn *iser_conn)
 	init_completion(&iser_conn->ib_completion);
 	init_completion(&iser_conn->up_completion);
 	INIT_LIST_HEAD(&iser_conn->conn_list);
-	spin_lock_init(&iser_conn->ib_conn.lock);
 	mutex_init(&iser_conn->state_mutex);
 }
 
-- 
cgit v0.10.2


From 2b3bf958103899a96d230c9f2e0d87606f08a7be Mon Sep 17 00:00:00 2001
From: Adir Lev <adirl@mellanox.com>
Date: Thu, 6 Aug 2015 18:32:59 +0300
Subject: IB/iser: Maintain connection fmr_pool under a single registration
 descriptor

This will allow us to unify the memory registration code path between
the various methods which vary by the device capabilities. This change
will make it easier and less intrusive to remove fmr_pools from the
code when we'd want to.

The reason we use a single descriptor is to avoid taking a
redundant spinlock when working with FMRs.

We also change the signature of iser_reg_page_vec to make it match
iser_fast_reg_mr (and the future indirect registration method).

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Adir Lev <adirl@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 1fc4c23..a8c8177 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -380,12 +380,20 @@ struct iser_device {
  * struct iser_reg_resources - Fast registration recources
  *
  * @mr:         memory region
- * @frpl:       fast reg page list
+ * @fmr_pool:   pool of fmrs
+ * @frpl:       fast reg page list used by frwrs
+ * @page_vec:   fast reg page list used by fmr pool
  * @mr_valid:   is mr valid indicator
  */
 struct iser_reg_resources {
-	struct ib_mr			 *mr;
-	struct ib_fast_reg_page_list     *frpl;
+	union {
+		struct ib_mr             *mr;
+		struct ib_fmr_pool       *fmr_pool;
+	};
+	union {
+		struct ib_fast_reg_page_list     *frpl;
+		struct iser_page_vec             *page_vec;
+	};
 	u8				  mr_valid:1;
 };
 
@@ -420,28 +428,14 @@ struct iser_fr_desc {
 /**
  * struct iser_fr_pool: connection fast registration pool
  *
+ * @list:                list of fastreg descriptors
  * @lock:                protects fmr/fastreg pool
- * @union.fmr:
- *     @pool:            FMR pool for fast registrations
- *     @page_vec:        fast reg page list to hold mapped commands pages
- *                       used for registration
- * @union.fastreg:
- *     @pool:            Fast registration descriptors pool for fast
- *                       registrations
- *     @pool_size:       Size of pool
+ * @size:                size of the pool
  */
 struct iser_fr_pool {
-	spinlock_t		     lock;
-	union {
-		struct {
-			struct ib_fmr_pool              *pool;
-			struct iser_page_vec            *page_vec;
-		} fmr;
-		struct {
-			struct list_head	 pool;
-			int			 pool_size;
-		} fastreg;
-	};
+	struct list_head        list;
+	spinlock_t              lock;
+	int                     size;
 };
 
 /**
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index d731ed7..c792929 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -189,7 +189,7 @@ iser_reg_desc_get(struct ib_conn *ib_conn)
 	unsigned long flags;
 
 	spin_lock_irqsave(&fr_pool->lock, flags);
-	desc = list_first_entry(&fr_pool->fastreg.pool,
+	desc = list_first_entry(&fr_pool->list,
 				struct iser_fr_desc, list);
 	list_del(&desc->list);
 	spin_unlock_irqrestore(&fr_pool->lock, flags);
@@ -205,7 +205,7 @@ iser_reg_desc_put(struct ib_conn *ib_conn,
 	unsigned long flags;
 
 	spin_lock_irqsave(&fr_pool->lock, flags);
-	list_add(&desc->list, &fr_pool->fastreg.pool);
+	list_add(&desc->list, &fr_pool->list);
 	spin_unlock_irqrestore(&fr_pool->lock, flags);
 }
 
@@ -478,12 +478,13 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
 static
 int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
 		      struct iser_data_buf *mem,
-		      struct iser_page_vec *page_vec,
+		      struct iser_reg_resources *rsc,
 		      struct iser_mem_reg *mem_reg)
 {
 	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_device *device = ib_conn->device;
+	struct iser_page_vec *page_vec = rsc->page_vec;
+	struct ib_fmr_pool *fmr_pool = rsc->fmr_pool;
 	struct ib_pool_fmr *fmr;
 	int ret, plen;
 
@@ -499,7 +500,7 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
 		return -EINVAL;
 	}
 
-	fmr  = ib_fmr_pool_map_phys(fr_pool->fmr.pool,
+	fmr  = ib_fmr_pool_map_phys(fmr_pool,
 				    page_vec->pages,
 				    page_vec->length,
 				    page_vec->pages[0]);
@@ -587,20 +588,23 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
 	if (mem->dma_nents == 1) {
 		return iser_reg_dma(device, mem, mem_reg);
 	} else { /* use FMR for multiple dma entries */
-		err = iser_reg_page_vec(iser_task, mem,
-					fr_pool->fmr.page_vec, mem_reg);
+		struct iser_fr_desc *desc;
+
+		desc = list_first_entry(&fr_pool->list,
+					struct iser_fr_desc, list);
+		err = iser_reg_page_vec(iser_task, mem, &desc->rsc, mem_reg);
 		if (err && err != -EAGAIN) {
 			iser_data_buf_dump(mem, ibdev);
 			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
 				 mem->dma_nents,
 				 ntoh24(iser_task->desc.iscsi_header.dlength));
 			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
-				 fr_pool->fmr.page_vec->data_size,
-				 fr_pool->fmr.page_vec->length,
-				 fr_pool->fmr.page_vec->offset);
-			for (i = 0; i < fr_pool->fmr.page_vec->length; i++)
+				 desc->rsc.page_vec->data_size,
+				 desc->rsc.page_vec->length,
+				 desc->rsc.page_vec->offset);
+			for (i = 0; i < desc->rsc.page_vec->length; i++)
 				iser_err("page_vec[%d] = 0x%llx\n", i,
-					 (unsigned long long)fr_pool->fmr.page_vec->pages[i]);
+					 (unsigned long long)desc->rsc.page_vec->pages[i]);
 		}
 		if (err)
 			return err;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 1b8c9c2..5b5432d 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -204,17 +204,25 @@ int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 	struct iser_device *device = ib_conn->device;
 	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_page_vec *page_vec;
+	struct iser_fr_desc *desc;
 	struct ib_fmr_pool *fmr_pool;
 	struct ib_fmr_pool_param params;
-	int ret = -ENOMEM;
+	int ret;
 
+	INIT_LIST_HEAD(&fr_pool->list);
 	spin_lock_init(&fr_pool->lock);
 
+	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
 	page_vec = kmalloc(sizeof(*page_vec) +
 			   (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE + 1)),
 			   GFP_KERNEL);
-	if (!page_vec)
-		return ret;
+	if (!page_vec) {
+		ret = -ENOMEM;
+		goto err_frpl;
+	}
 
 	page_vec->pages = (u64 *)(page_vec + 1);
 
@@ -236,16 +244,20 @@ int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 	if (IS_ERR(fmr_pool)) {
 		ret = PTR_ERR(fmr_pool);
 		iser_err("FMR allocation failed, err %d\n", ret);
-		goto err;
+		goto err_fmr;
 	}
 
-	fr_pool->fmr.page_vec = page_vec;
-	fr_pool->fmr.pool = fmr_pool;
+	desc->rsc.page_vec = page_vec;
+	desc->rsc.fmr_pool = fmr_pool;
+	list_add(&desc->list, &fr_pool->list);
 
 	return 0;
 
-err:
+err_fmr:
 	kfree(page_vec);
+err_frpl:
+	kfree(desc);
+
 	return ret;
 }
 
@@ -255,14 +267,18 @@ err:
 void iser_free_fmr_pool(struct ib_conn *ib_conn)
 {
 	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+	struct iser_fr_desc *desc;
+
+	desc = list_first_entry(&fr_pool->list,
+				struct iser_fr_desc, list);
+	list_del(&desc->list);
 
 	iser_info("freeing conn %p fmr pool %p\n",
-		  ib_conn, fr_pool->fmr.pool);
+		  ib_conn, desc->rsc.fmr_pool);
 
-	ib_destroy_fmr_pool(fr_pool->fmr.pool);
-	fr_pool->fmr.pool = NULL;
-	kfree(fr_pool->fmr.page_vec);
-	fr_pool->fmr.page_vec = NULL;
+	ib_destroy_fmr_pool(desc->rsc.fmr_pool);
+	kfree(desc->rsc.page_vec);
+	kfree(desc);
 }
 
 static int
@@ -392,9 +408,9 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 	struct iser_fr_desc *desc;
 	int i, ret;
 
-	INIT_LIST_HEAD(&fr_pool->fastreg.pool);
+	INIT_LIST_HEAD(&fr_pool->list);
 	spin_lock_init(&fr_pool->lock);
-	fr_pool->fastreg.pool_size = 0;
+	fr_pool->size = 0;
 	for (i = 0; i < cmds_max; i++) {
 		desc = iser_create_fastreg_desc(device->ib_device, device->pd,
 						ib_conn->pi_support);
@@ -403,8 +419,8 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 			goto err;
 		}
 
-		list_add_tail(&desc->list, &fr_pool->fastreg.pool);
-		fr_pool->fastreg.pool_size++;
+		list_add_tail(&desc->list, &fr_pool->list);
+		fr_pool->size++;
 	}
 
 	return 0;
@@ -423,12 +439,12 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
 	struct iser_fr_desc *desc, *tmp;
 	int i = 0;
 
-	if (list_empty(&fr_pool->fastreg.pool))
+	if (list_empty(&fr_pool->list))
 		return;
 
 	iser_info("freeing conn %p fr pool\n", ib_conn);
 
-	list_for_each_entry_safe(desc, tmp, &fr_pool->fastreg.pool, list) {
+	list_for_each_entry_safe(desc, tmp, &fr_pool->list, list) {
 		list_del(&desc->list);
 		iser_free_reg_res(&desc->rsc);
 		if (desc->pi_ctx)
@@ -437,9 +453,9 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
 		++i;
 	}
 
-	if (i < fr_pool->fastreg.pool_size)
+	if (i < fr_pool->size)
 		iser_warn("pool still has %d regions registered\n",
-			  fr_pool->fastreg.pool_size - i);
+			  fr_pool->size - i);
 }
 
 /**
-- 
cgit v0.10.2


From 7d0483c927f429c7aece47e730eaa91007577d99 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:33:00 +0300
Subject: IB/iser: Rename iser_reg_page_vec to iser_fast_reg_fmr

Also, change a name of a local variable.

This patch does not change any functionality.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Adir Lev <adirl@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index c792929..300e23c 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -476,10 +476,10 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
  * returns: 0 on success, errno code on failure
  */
 static
-int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
+int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
 		      struct iser_data_buf *mem,
 		      struct iser_reg_resources *rsc,
-		      struct iser_mem_reg *mem_reg)
+		      struct iser_mem_reg *reg)
 {
 	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
 	struct iser_device *device = ib_conn->device;
@@ -510,11 +510,11 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
 		return ret;
 	}
 
-	mem_reg->sge.lkey = fmr->fmr->lkey;
-	mem_reg->rkey = fmr->fmr->rkey;
-	mem_reg->sge.addr = page_vec->pages[0] + page_vec->offset;
-	mem_reg->sge.length = page_vec->data_size;
-	mem_reg->mem_h = fmr;
+	reg->sge.lkey = fmr->fmr->lkey;
+	reg->rkey = fmr->fmr->rkey;
+	reg->sge.addr = page_vec->pages[0] + page_vec->offset;
+	reg->sge.length = page_vec->data_size;
+	reg->mem_h = fmr;
 
 	return 0;
 }
@@ -592,7 +592,7 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
 
 		desc = list_first_entry(&fr_pool->list,
 					struct iser_fr_desc, list);
-		err = iser_reg_page_vec(iser_task, mem, &desc->rsc, mem_reg);
+		err = iser_fast_reg_fmr(iser_task, mem, &desc->rsc, mem_reg);
 		if (err && err != -EAGAIN) {
 			iser_data_buf_dump(mem, ibdev);
 			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
-- 
cgit v0.10.2


From 81722909c8f78ee2db62373a74ec2ecb709c112e Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:33:01 +0300
Subject: IB/iser: Make reg_desc_get a per device routine

As for fmrs we will hold a single registration descriptor
as no need for multiple like in the frwr mode (descriptor
for each task). This change helps unifying the duplicate
registration code paths.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Adir Lev <adirl@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index a8c8177..611abaa 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -333,6 +333,8 @@ struct iser_comp {
  * @free_reg_res:      Free registration resources
  * @reg_rdma_mem:      Register memory buffers
  * @unreg_rdma_mem:    Un-register memory buffers
+ * @reg_desc_get:      Get a registration descriptor for pool
+ * @reg_desc_put:      Get a registration descriptor to pool
  */
 struct iser_reg_ops {
 	int            (*alloc_reg_res)(struct ib_conn *ib_conn,
@@ -342,6 +344,9 @@ struct iser_reg_ops {
 				       enum iser_data_dir cmd_dir);
 	void           (*unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
 					 enum iser_data_dir cmd_dir);
+	struct iser_fr_desc * (*reg_desc_get)(struct ib_conn *ib_conn);
+	void           (*reg_desc_put)(struct ib_conn *ib_conn,
+				       struct iser_fr_desc *desc);
 };
 
 /**
@@ -657,8 +662,13 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn);
 u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
 			     enum iser_data_dir cmd_dir, sector_t *sector);
 struct iser_fr_desc *
-iser_reg_desc_get(struct ib_conn *ib_conn);
+iser_reg_desc_get_fr(struct ib_conn *ib_conn);
 void
-iser_reg_desc_put(struct ib_conn *ib_conn,
-		  struct iser_fr_desc *desc);
+iser_reg_desc_put_fr(struct ib_conn *ib_conn,
+		     struct iser_fr_desc *desc);
+struct iser_fr_desc *
+iser_reg_desc_get_fmr(struct ib_conn *ib_conn);
+void
+iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
+		      struct iser_fr_desc *desc);
 #endif
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 300e23c..7ee7f44 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -44,6 +44,8 @@ static struct iser_reg_ops fastreg_ops = {
 	.free_reg_res	= iser_free_fastreg_pool,
 	.reg_rdma_mem	= iser_reg_rdma_mem_fastreg,
 	.unreg_rdma_mem	= iser_unreg_mem_fastreg,
+	.reg_desc_get	= iser_reg_desc_get_fr,
+	.reg_desc_put	= iser_reg_desc_put_fr,
 };
 
 static struct iser_reg_ops fmr_ops = {
@@ -51,6 +53,8 @@ static struct iser_reg_ops fmr_ops = {
 	.free_reg_res	= iser_free_fmr_pool,
 	.reg_rdma_mem	= iser_reg_rdma_mem_fmr,
 	.unreg_rdma_mem	= iser_unreg_mem_fmr,
+	.reg_desc_get	= iser_reg_desc_get_fmr,
+	.reg_desc_put	= iser_reg_desc_put_fmr,
 };
 
 int iser_assign_reg_ops(struct iser_device *device)
@@ -182,7 +186,7 @@ iser_copy_to_bounce(struct iser_data_buf *data)
 }
 
 struct iser_fr_desc *
-iser_reg_desc_get(struct ib_conn *ib_conn)
+iser_reg_desc_get_fr(struct ib_conn *ib_conn)
 {
 	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_fr_desc *desc;
@@ -198,8 +202,8 @@ iser_reg_desc_get(struct ib_conn *ib_conn)
 }
 
 void
-iser_reg_desc_put(struct ib_conn *ib_conn,
-		  struct iser_fr_desc *desc)
+iser_reg_desc_put_fr(struct ib_conn *ib_conn,
+		     struct iser_fr_desc *desc)
 {
 	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	unsigned long flags;
@@ -209,6 +213,21 @@ iser_reg_desc_put(struct ib_conn *ib_conn,
 	spin_unlock_irqrestore(&fr_pool->lock, flags);
 }
 
+struct iser_fr_desc *
+iser_reg_desc_get_fmr(struct ib_conn *ib_conn)
+{
+	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+
+	return list_first_entry(&fr_pool->list,
+				struct iser_fr_desc, list);
+}
+
+void
+iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
+		      struct iser_fr_desc *desc)
+{
+}
+
 /**
  * iser_start_rdma_unaligned_sg
  */
@@ -544,13 +563,14 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
 void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
 			    enum iser_data_dir cmd_dir)
 {
+	struct iser_device *device = iser_task->iser_conn->ib_conn.device;
 	struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
 
 	if (!reg->mem_h)
 		return;
 
-	iser_reg_desc_put(&iser_task->iser_conn->ib_conn,
-			  reg->mem_h);
+	device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn,
+				     reg->mem_h);
 	reg->mem_h = NULL;
 }
 
@@ -564,7 +584,6 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
 			  enum iser_data_dir cmd_dir)
 {
 	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
 	struct iser_device   *device = ib_conn->device;
 	struct ib_device     *ibdev = device->ib_device;
 	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
@@ -590,8 +609,7 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
 	} else { /* use FMR for multiple dma entries */
 		struct iser_fr_desc *desc;
 
-		desc = list_first_entry(&fr_pool->list,
-					struct iser_fr_desc, list);
+		desc = device->reg_ops->reg_desc_get(ib_conn);
 		err = iser_fast_reg_fmr(iser_task, mem, &desc->rsc, mem_reg);
 		if (err && err != -EAGAIN) {
 			iser_data_buf_dump(mem, ibdev);
@@ -844,7 +862,7 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
 
 	if (mem->dma_nents != 1 ||
 	    scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
-		desc = iser_reg_desc_get(ib_conn);
+		desc = device->reg_ops->reg_desc_get(ib_conn);
 		mem_reg->mem_h = desc;
 	}
 
@@ -887,7 +905,7 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
 	return 0;
 err_reg:
 	if (desc)
-		iser_reg_desc_put(ib_conn, desc);
+		device->reg_ops->reg_desc_put(ib_conn, desc);
 
 	return err;
 }
-- 
cgit v0.10.2


From 32467c420bb68776ebaa53ddf6712e1dba7bb5da Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:33:02 +0300
Subject: IB/iser: Unify fast memory registration flows

iser_reg_rdma_mem_[fastreg|fmr] share a lot of code, and
logically do the same thing other than the buffer registration
method itself (iser_fast_reg_mr vs. iser_fast_reg_fmr).
The DIF logic is not implemented in the FMR flow as there is no
existing device that supports FMRs and Signature feature.

This patch unifies the flow in a single routine iser_reg_rdma_mem
and just split to fmr/frwr for the buffer registration itself.

Also, for symmetry reasons, unify iser_unreg_rdma_mem (which will
call the relevant device specific unreg routine).

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Adir Lev <adirl@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 611abaa..e6105d2 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -239,6 +239,7 @@ struct iser_data_buf {
 struct iser_device;
 struct iscsi_iser_task;
 struct iscsi_endpoint;
+struct iser_reg_resources;
 
 /**
  * struct iser_mem_reg - iSER memory registration info
@@ -331,8 +332,8 @@ struct iser_comp {
  *
  * @alloc_reg_res:     Allocate registration resources
  * @free_reg_res:      Free registration resources
- * @reg_rdma_mem:      Register memory buffers
- * @unreg_rdma_mem:    Un-register memory buffers
+ * @fast_reg_mem:      Register memory buffers
+ * @unreg_mem:         Un-register memory buffers
  * @reg_desc_get:      Get a registration descriptor for pool
  * @reg_desc_put:      Get a registration descriptor to pool
  */
@@ -340,10 +341,12 @@ struct iser_reg_ops {
 	int            (*alloc_reg_res)(struct ib_conn *ib_conn,
 					unsigned cmds_max);
 	void           (*free_reg_res)(struct ib_conn *ib_conn);
-	int            (*reg_rdma_mem)(struct iscsi_iser_task *iser_task,
-				       enum iser_data_dir cmd_dir);
-	void           (*unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
-					 enum iser_data_dir cmd_dir);
+	int            (*reg_mem)(struct iscsi_iser_task *iser_task,
+				  struct iser_data_buf *mem,
+				  struct iser_reg_resources *rsc,
+				  struct iser_mem_reg *reg);
+	void           (*unreg_mem)(struct iscsi_iser_task *iser_task,
+				    enum iser_data_dir cmd_dir);
 	struct iser_fr_desc * (*reg_desc_get)(struct ib_conn *ib_conn);
 	void           (*reg_desc_put)(struct ib_conn *ib_conn,
 				       struct iser_fr_desc *desc);
@@ -622,10 +625,10 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
 				     struct iser_data_buf *mem,
 				     enum iser_data_dir cmd_dir);
 
-int  iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task,
-			   enum iser_data_dir cmd_dir);
-int  iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task,
-			       enum iser_data_dir cmd_dir);
+int iser_reg_rdma_mem(struct iscsi_iser_task *task,
+		      enum iser_data_dir dir);
+void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
+			 enum iser_data_dir dir);
 
 int  iser_connect(struct iser_conn *iser_conn,
 		  struct sockaddr *src_addr,
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index beacd5f..ae19c69 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -49,7 +49,6 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
 
 {
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_device  *device = iser_task->iser_conn->ib_conn.device;
 	struct iser_mem_reg *mem_reg;
 	int err;
 	struct iser_hdr *hdr = &iser_task->desc.iser_header;
@@ -73,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
 			return err;
 	}
 
-	err = device->reg_ops->reg_rdma_mem(iser_task, ISER_DIR_IN);
+	err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
 	if (err) {
 		iser_err("Failed to set up Data-IN RDMA\n");
 		return err;
@@ -103,7 +102,6 @@ iser_prepare_write_cmd(struct iscsi_task *task,
 		       unsigned int edtl)
 {
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_device  *device = iser_task->iser_conn->ib_conn.device;
 	struct iser_mem_reg *mem_reg;
 	int err;
 	struct iser_hdr *hdr = &iser_task->desc.iser_header;
@@ -128,7 +126,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,
 			return err;
 	}
 
-	err = device->reg_ops->reg_rdma_mem(iser_task, ISER_DIR_OUT);
+	err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
 	if (err != 0) {
 		iser_err("Failed to register write cmd RDMA mem\n");
 		return err;
@@ -662,7 +660,6 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 {
-	struct iser_device *device = iser_task->iser_conn->ib_conn.device;
 	int is_rdma_data_aligned = 1;
 	int is_rdma_prot_aligned = 1;
 	int prot_count = scsi_prot_sg_count(iser_task->sc);
@@ -699,7 +696,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 	}
 
 	if (iser_task->dir[ISER_DIR_IN]) {
-		device->reg_ops->unreg_rdma_mem(iser_task, ISER_DIR_IN);
+		iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
 		if (is_rdma_data_aligned)
 			iser_dma_unmap_task_data(iser_task,
 						 &iser_task->data[ISER_DIR_IN],
@@ -711,7 +708,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 	}
 
 	if (iser_task->dir[ISER_DIR_OUT]) {
-		device->reg_ops->unreg_rdma_mem(iser_task, ISER_DIR_OUT);
+		iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
 		if (is_rdma_data_aligned)
 			iser_dma_unmap_task_data(iser_task,
 						 &iser_task->data[ISER_DIR_OUT],
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 7ee7f44..b1261d5 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -38,12 +38,22 @@
 #include <linux/scatterlist.h>
 
 #include "iscsi_iser.h"
+static
+int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
+		      struct iser_data_buf *mem,
+		      struct iser_reg_resources *rsc,
+		      struct iser_mem_reg *mem_reg);
+static
+int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
+		     struct iser_data_buf *mem,
+		     struct iser_reg_resources *rsc,
+		     struct iser_mem_reg *mem_reg);
 
 static struct iser_reg_ops fastreg_ops = {
 	.alloc_reg_res	= iser_alloc_fastreg_pool,
 	.free_reg_res	= iser_free_fastreg_pool,
-	.reg_rdma_mem	= iser_reg_rdma_mem_fastreg,
-	.unreg_rdma_mem	= iser_unreg_mem_fastreg,
+	.reg_mem	= iser_fast_reg_mr,
+	.unreg_mem	= iser_unreg_mem_fastreg,
 	.reg_desc_get	= iser_reg_desc_get_fr,
 	.reg_desc_put	= iser_reg_desc_put_fr,
 };
@@ -51,8 +61,8 @@ static struct iser_reg_ops fastreg_ops = {
 static struct iser_reg_ops fmr_ops = {
 	.alloc_reg_res	= iser_alloc_fmr_pool,
 	.free_reg_res	= iser_free_fmr_pool,
-	.reg_rdma_mem	= iser_reg_rdma_mem_fmr,
-	.unreg_rdma_mem	= iser_unreg_mem_fmr,
+	.reg_mem	= iser_fast_reg_fmr,
+	.unreg_mem	= iser_unreg_mem_fmr,
 	.reg_desc_get	= iser_reg_desc_get_fmr,
 	.reg_desc_put	= iser_reg_desc_put_fmr,
 };
@@ -574,62 +584,6 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
 	reg->mem_h = NULL;
 }
 
-/**
- * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
- * using FMR (if possible) obtaining rkey and va
- *
- * returns 0 on success, errno code on failure
- */
-int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
-			  enum iser_data_dir cmd_dir)
-{
-	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-	struct iser_device   *device = ib_conn->device;
-	struct ib_device     *ibdev = device->ib_device;
-	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
-	struct iser_mem_reg *mem_reg;
-	int aligned_len;
-	int err;
-	int i;
-
-	mem_reg = &iser_task->rdma_reg[cmd_dir];
-
-	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
-	if (aligned_len != mem->dma_nents) {
-		err = fall_to_bounce_buf(iser_task, mem, cmd_dir);
-		if (err) {
-			iser_err("failed to allocate bounce buffer\n");
-			return err;
-		}
-	}
-
-	/* if there a single dma entry, FMR is not needed */
-	if (mem->dma_nents == 1) {
-		return iser_reg_dma(device, mem, mem_reg);
-	} else { /* use FMR for multiple dma entries */
-		struct iser_fr_desc *desc;
-
-		desc = device->reg_ops->reg_desc_get(ib_conn);
-		err = iser_fast_reg_fmr(iser_task, mem, &desc->rsc, mem_reg);
-		if (err && err != -EAGAIN) {
-			iser_data_buf_dump(mem, ibdev);
-			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
-				 mem->dma_nents,
-				 ntoh24(iser_task->desc.iscsi_header.dlength));
-			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
-				 desc->rsc.page_vec->data_size,
-				 desc->rsc.page_vec->length,
-				 desc->rsc.page_vec->offset);
-			for (i = 0; i < desc->rsc.page_vec->length; i++)
-				iser_err("page_vec[%d] = 0x%llx\n", i,
-					 (unsigned long long)desc->rsc.page_vec->pages[i]);
-		}
-		if (err)
-			return err;
-	}
-	return 0;
-}
-
 static void
 iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
 		    struct ib_sig_domain *domain)
@@ -775,19 +729,12 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 {
 	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
 	struct iser_device *device = ib_conn->device;
-	struct ib_mr *mr;
-	struct ib_fast_reg_page_list *frpl;
+	struct ib_mr *mr = rsc->mr;
+	struct ib_fast_reg_page_list *frpl = rsc->frpl;
 	struct ib_send_wr fastreg_wr, inv_wr;
 	struct ib_send_wr *bad_wr, *wr = NULL;
 	int ret, offset, size, plen;
 
-	/* if there a single dma entry, dma mr suffices */
-	if (mem->dma_nents == 1)
-		return iser_reg_dma(device, mem, reg);
-
-	mr = rsc->mr;
-	frpl = rsc->frpl;
-
 	plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
 				   &offset, &size);
 	if (plen * SIZE_4K < size) {
@@ -834,78 +781,113 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 	return ret;
 }
 
-/**
- * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA,
- * using Fast Registration WR (if possible) obtaining rkey and va
- *
- * returns 0 on success, errno code on failure
- */
-int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
-			      enum iser_data_dir cmd_dir)
+static int
+iser_handle_unaligned_buf(struct iscsi_iser_task *task,
+			  struct iser_data_buf *mem,
+			  enum iser_data_dir dir)
 {
-	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-	struct iser_device *device = ib_conn->device;
-	struct ib_device *ibdev = device->ib_device;
-	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
-	struct iser_mem_reg *mem_reg = &iser_task->rdma_reg[cmd_dir];
-	struct iser_fr_desc *desc = NULL;
+	struct iser_conn *iser_conn = task->iser_conn;
+	struct iser_device *device = iser_conn->ib_conn.device;
 	int err, aligned_len;
 
-	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+	aligned_len = iser_data_buf_aligned_len(mem, device->ib_device);
 	if (aligned_len != mem->dma_nents) {
-		err = fall_to_bounce_buf(iser_task, mem, cmd_dir);
-		if (err) {
-			iser_err("failed to allocate bounce buffer\n");
+		err = fall_to_bounce_buf(task, mem, dir);
+		if (err)
 			return err;
-		}
 	}
 
+	return 0;
+}
+
+static int
+iser_reg_prot_sg(struct iscsi_iser_task *task,
+		 struct iser_data_buf *mem,
+		 struct iser_fr_desc *desc,
+		 struct iser_mem_reg *reg)
+{
+	struct iser_device *device = task->iser_conn->ib_conn.device;
+
+	if (mem->dma_nents == 1)
+		return iser_reg_dma(device, mem, reg);
+
+	return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg);
+}
+
+static int
+iser_reg_data_sg(struct iscsi_iser_task *task,
+		 struct iser_data_buf *mem,
+		 struct iser_fr_desc *desc,
+		 struct iser_mem_reg *reg)
+{
+	struct iser_device *device = task->iser_conn->ib_conn.device;
+
+	if (mem->dma_nents == 1)
+		return iser_reg_dma(device, mem, reg);
+
+	return device->reg_ops->reg_mem(task, mem, &desc->rsc, reg);
+}
+
+int iser_reg_rdma_mem(struct iscsi_iser_task *task,
+		      enum iser_data_dir dir)
+{
+	struct ib_conn *ib_conn = &task->iser_conn->ib_conn;
+	struct iser_device *device = ib_conn->device;
+	struct iser_data_buf *mem = &task->data[dir];
+	struct iser_mem_reg *reg = &task->rdma_reg[dir];
+	struct iser_fr_desc *desc = NULL;
+	int err;
+
+	err = iser_handle_unaligned_buf(task, mem, dir);
+	if (unlikely(err))
+		return err;
+
 	if (mem->dma_nents != 1 ||
-	    scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
+	    scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
 		desc = device->reg_ops->reg_desc_get(ib_conn);
-		mem_reg->mem_h = desc;
+		reg->mem_h = desc;
 	}
 
-	err = iser_fast_reg_mr(iser_task, mem,
-			       desc ? &desc->rsc : NULL, mem_reg);
-	if (err)
+	err = iser_reg_data_sg(task, mem, desc, reg);
+	if (unlikely(err))
 		goto err_reg;
 
-	if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
+	if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
 		struct iser_mem_reg prot_reg;
 
 		memset(&prot_reg, 0, sizeof(prot_reg));
-		if (scsi_prot_sg_count(iser_task->sc)) {
-			mem = &iser_task->prot[cmd_dir];
-			aligned_len = iser_data_buf_aligned_len(mem, ibdev);
-			if (aligned_len != mem->dma_nents) {
-				err = fall_to_bounce_buf(iser_task, mem,
-							 cmd_dir);
-				if (err) {
-					iser_err("failed to allocate bounce buffer\n");
-					return err;
-				}
-			}
+		if (scsi_prot_sg_count(task->sc)) {
+			mem = &task->prot[dir];
+			err = iser_handle_unaligned_buf(task, mem, dir);
+			if (unlikely(err))
+				goto err_reg;
 
-			err = iser_fast_reg_mr(iser_task, mem,
-					       &desc->pi_ctx->rsc, &prot_reg);
-			if (err)
+			err = iser_reg_prot_sg(task, mem, desc, &prot_reg);
+			if (unlikely(err))
 				goto err_reg;
 		}
 
-		err = iser_reg_sig_mr(iser_task, desc->pi_ctx, mem_reg,
-				      &prot_reg, mem_reg);
-		if (err) {
-			iser_err("Failed to register signature mr\n");
-			return err;
-		}
+		err = iser_reg_sig_mr(task, desc->pi_ctx, reg,
+				      &prot_reg, reg);
+		if (unlikely(err))
+			goto err_reg;
+
 		desc->pi_ctx->sig_protected = 1;
 	}
 
 	return 0;
+
 err_reg:
 	if (desc)
 		device->reg_ops->reg_desc_put(ib_conn, desc);
 
 	return err;
 }
+
+void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
+			 enum iser_data_dir dir)
+{
+	struct iser_device *device = task->iser_conn->ib_conn.device;
+
+	device->reg_ops->unreg_mem(task, dir);
+}
-- 
cgit v0.10.2


From f8db651da29bcad213d43328ebf8ce8459f526a7 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:33:03 +0300
Subject: IB/iser: Pass registration pool a size parameter

Hard coded for now. This will allow to allocate different
sized MRs depending on the IO size needed (and device
capabilities).

This patch does not change any functionality.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index e6105d2..e9ebe0b 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -339,7 +339,8 @@ struct iser_comp {
  */
 struct iser_reg_ops {
 	int            (*alloc_reg_res)(struct ib_conn *ib_conn,
-					unsigned cmds_max);
+					unsigned cmds_max,
+					unsigned int size);
 	void           (*free_reg_res)(struct ib_conn *ib_conn);
 	int            (*reg_mem)(struct iscsi_iser_task *iser_task,
 				  struct iser_data_buf *mem,
@@ -658,9 +659,13 @@ int  iser_initialize_task_headers(struct iscsi_task *task,
 			struct iser_tx_desc *tx_desc);
 int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 			      struct iscsi_session *session);
-int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+int iser_alloc_fmr_pool(struct ib_conn *ib_conn,
+			unsigned cmds_max,
+			unsigned int size);
 void iser_free_fmr_pool(struct ib_conn *ib_conn);
-int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+int iser_alloc_fastreg_pool(struct ib_conn *ib_conn,
+			    unsigned cmds_max,
+			    unsigned int size);
 void iser_free_fastreg_pool(struct ib_conn *ib_conn);
 u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
 			     enum iser_data_dir cmd_dir, sector_t *sector);
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index ae19c69..f1200f2 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -258,7 +258,8 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 	iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
 	iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
 
-	if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max))
+	if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max,
+					   ISCSI_ISER_SG_TABLESIZE + 1))
 		goto create_rdma_reg_res_failed;
 
 	if (iser_alloc_login_buf(iser_conn))
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 5b5432d..0ade0e8 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -199,7 +199,9 @@ static void iser_free_device_ib_res(struct iser_device *device)
  *
  * returns 0 on success, or errno code on failure
  */
-int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+int iser_alloc_fmr_pool(struct ib_conn *ib_conn,
+			unsigned cmds_max,
+			unsigned int size)
 {
 	struct iser_device *device = ib_conn->device;
 	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
@@ -216,8 +218,7 @@ int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 	if (!desc)
 		return -ENOMEM;
 
-	page_vec = kmalloc(sizeof(*page_vec) +
-			   (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE + 1)),
+	page_vec = kmalloc(sizeof(*page_vec) + (sizeof(u64) * size),
 			   GFP_KERNEL);
 	if (!page_vec) {
 		ret = -ENOMEM;
@@ -227,9 +228,7 @@ int iser_alloc_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 	page_vec->pages = (u64 *)(page_vec + 1);
 
 	params.page_shift        = SHIFT_4K;
-	/* when the first/last SG element are not start/end *
-	 * page aligned, the map whould be of N+1 pages     */
-	params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
+	params.max_pages_per_fmr = size;
 	/* make the pool size twice the max number of SCSI commands *
 	 * the ML is expected to queue, watermark for unmap at 50%  */
 	params.pool_size	 = cmds_max * 2;
@@ -282,13 +281,14 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn)
 }
 
 static int
-iser_alloc_reg_res(struct ib_device *ib_device, struct ib_pd *pd,
-		   struct iser_reg_resources *res)
+iser_alloc_reg_res(struct ib_device *ib_device,
+		   struct ib_pd *pd,
+		   struct iser_reg_resources *res,
+		   unsigned int size)
 {
 	int ret;
 
-	res->frpl = ib_alloc_fast_reg_page_list(ib_device,
-						ISCSI_ISER_SG_TABLESIZE + 1);
+	res->frpl = ib_alloc_fast_reg_page_list(ib_device, size);
 	if (IS_ERR(res->frpl)) {
 		ret = PTR_ERR(res->frpl);
 		iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
@@ -296,8 +296,7 @@ iser_alloc_reg_res(struct ib_device *ib_device, struct ib_pd *pd,
 		return PTR_ERR(res->frpl);
 	}
 
-	res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
-			      ISCSI_ISER_SG_TABLESIZE + 1);
+	res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, size);
 	if (IS_ERR(res->mr)) {
 		ret = PTR_ERR(res->mr);
 		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
@@ -321,8 +320,10 @@ iser_free_reg_res(struct iser_reg_resources *rsc)
 }
 
 static int
-iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
-		  struct iser_fr_desc *desc)
+iser_alloc_pi_ctx(struct ib_device *ib_device,
+		  struct ib_pd *pd,
+		  struct iser_fr_desc *desc,
+		  unsigned int size)
 {
 	struct iser_pi_context *pi_ctx = NULL;
 	int ret;
@@ -333,7 +334,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
 
 	pi_ctx = desc->pi_ctx;
 
-	ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc);
+	ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc, size);
 	if (ret) {
 		iser_err("failed to allocate reg_resources\n");
 		goto alloc_reg_res_err;
@@ -366,8 +367,10 @@ iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
 }
 
 static struct iser_fr_desc *
-iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
-			 bool pi_enable)
+iser_create_fastreg_desc(struct ib_device *ib_device,
+			 struct ib_pd *pd,
+			 bool pi_enable,
+			 unsigned int size)
 {
 	struct iser_fr_desc *desc;
 	int ret;
@@ -376,12 +379,12 @@ iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
 	if (!desc)
 		return ERR_PTR(-ENOMEM);
 
-	ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc);
+	ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc, size);
 	if (ret)
 		goto reg_res_alloc_failure;
 
 	if (pi_enable) {
-		ret = iser_alloc_pi_ctx(ib_device, pd, desc);
+		ret = iser_alloc_pi_ctx(ib_device, pd, desc, size);
 		if (ret)
 			goto pi_ctx_alloc_failure;
 	}
@@ -401,7 +404,9 @@ reg_res_alloc_failure:
  * for fast registration work requests.
  * returns 0 on success, or errno code on failure
  */
-int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+int iser_alloc_fastreg_pool(struct ib_conn *ib_conn,
+			    unsigned cmds_max,
+			    unsigned int size)
 {
 	struct iser_device *device = ib_conn->device;
 	struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
@@ -413,7 +418,7 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
 	fr_pool->size = 0;
 	for (i = 0; i < cmds_max; i++) {
 		desc = iser_create_fastreg_desc(device->ib_device, device->pd,
-						ib_conn->pi_support);
+						ib_conn->pi_support, size);
 		if (IS_ERR(desc)) {
 			ret = PTR_ERR(desc);
 			goto err;
-- 
cgit v0.10.2


From df749cdc45d9f97cb0a5e6ceab80e2e00ee9bf85 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:33:04 +0300
Subject: IB/iser: Support up to 8MB data transfer in a single command

iser support up to 512KB data transfer in a single scsi command.
This means that larger IOs will split to different request. While
iser can easily saturate FDR/EDR wires, some arrays are fine tuned
for 1MB (or larger) IO sizes, hence add an option to support larger
transfers (up to 8MB) if the device allows it.

Given that a few target implementations don't support data transfers
of more than 512KB by default and the fact that larger IO sizes require
more resources, we introduce a module parameter to determine the
maximum number of 512B sectors in a single scsi command.
Users that are interested in larger transfers can change this value given
that the target supports larger transfers.

At the moment, iser works in 4K pages granularity, In a later stage
we will get it to work with system page size instead.

IO operations that consists of N pages will need a page vector
of size N+1 in case the first SG element contains an offset. Given
that some devices allocates memory regions in powers of 2, this
means that allocating a region with N+1 pages, will result in
region resources allocation of the next power of 2. Since we don't
want that to happen, in case we are in the limit of IO size supported
and the first SG element has an offset, we align the SG list using a
bounce buffer (which is OK given that this is not likely to happen a lot).

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 169cc3e..0720bb4 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -93,6 +93,10 @@ static unsigned int iscsi_max_lun = 512;
 module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
 MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session (default:512");
 
+unsigned int iser_max_sectors = ISER_DEF_MAX_SECTORS;
+module_param_named(max_sectors, iser_max_sectors, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_sectors, "Max number of sectors in a single scsi command (default:1024");
+
 bool iser_pi_enable = false;
 module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO);
 MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
@@ -625,6 +629,8 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 	if (ep) {
 		iser_conn = ep->dd_data;
 		max_cmds = iser_conn->max_cmds;
+		shost->sg_tablesize = iser_conn->scsi_sg_tablesize;
+		shost->max_sectors = iser_conn->scsi_max_sectors;
 
 		mutex_lock(&iser_conn->state_mutex);
 		if (iser_conn->state != ISER_CONN_UP) {
@@ -966,8 +972,8 @@ static struct scsi_host_template iscsi_iser_sht = {
 	.name                   = "iSCSI Initiator over iSER",
 	.queuecommand           = iscsi_queuecommand,
 	.change_queue_depth	= scsi_change_queue_depth,
-	.sg_tablesize           = ISCSI_ISER_SG_TABLESIZE,
-	.max_sectors		= 1024,
+	.sg_tablesize           = ISCSI_ISER_DEF_SG_TABLESIZE,
+	.max_sectors            = ISER_DEF_MAX_SECTORS,
 	.cmd_per_lun            = ISER_DEF_CMD_PER_LUN,
 	.eh_abort_handler       = iscsi_eh_abort,
 	.eh_device_reset_handler= iscsi_eh_device_reset,
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index e9ebe0b..0bdd7e7 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -98,8 +98,13 @@
 #define SHIFT_4K	12
 #define SIZE_4K	(1ULL << SHIFT_4K)
 #define MASK_4K	(~(SIZE_4K-1))
-					/* support up to 512KB in one RDMA */
-#define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K)
+
+/* Default support is 512KB I/O size */
+#define ISER_DEF_MAX_SECTORS		1024
+#define ISCSI_ISER_DEF_SG_TABLESIZE	((ISER_DEF_MAX_SECTORS * 512) >> SHIFT_4K)
+/* Maximum support is 8MB I/O size */
+#define ISCSI_ISER_MAX_SG_TABLESIZE	((16384 * 512) >> SHIFT_4K)
+
 #define ISER_DEF_XMIT_CMDS_DEFAULT		512
 #if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT
 	#define ISER_DEF_XMIT_CMDS_MAX		ISCSI_DEF_XMIT_CMDS_MAX
@@ -504,6 +509,8 @@ struct ib_conn {
  * @rx_desc_head:     head of rx_descs cyclic buffer
  * @rx_descs:         rx buffers array (cyclic buffer)
  * @num_rx_descs:     number of rx descriptors
+ * @scsi_sg_tablesize: scsi host sg_tablesize
+ * @scsi_max_sectors: scsi host max sectors
  */
 struct iser_conn {
 	struct ib_conn		     ib_conn;
@@ -528,6 +535,8 @@ struct iser_conn {
 	unsigned int 		     rx_desc_head;
 	struct iser_rx_desc	     *rx_descs;
 	u32                          num_rx_descs;
+	unsigned short               scsi_sg_tablesize;
+	unsigned int                 scsi_max_sectors;
 };
 
 /**
@@ -583,6 +592,7 @@ extern struct iser_global ig;
 extern int iser_debug_level;
 extern bool iser_pi_enable;
 extern int iser_pi_guard;
+extern unsigned int iser_max_sectors;
 
 int iser_assign_reg_ops(struct iser_device *device);
 
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index f1200f2..0b1f3b5 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -259,7 +259,7 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 	iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
 
 	if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max,
-					   ISCSI_ISER_SG_TABLESIZE + 1))
+					   iser_conn->scsi_sg_tablesize))
 		goto create_rdma_reg_res_failed;
 
 	if (iser_alloc_login_buf(iser_conn))
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index b1261d5..384fd0a 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -363,7 +363,8 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data,
  * consecutive SG elements are actually fragments of the same physcial page.
  */
 static int iser_data_buf_aligned_len(struct iser_data_buf *data,
-				      struct ib_device *ibdev)
+				     struct ib_device *ibdev,
+				     unsigned sg_tablesize)
 {
 	struct scatterlist *sg, *sgl, *next_sg = NULL;
 	u64 start_addr, end_addr;
@@ -375,6 +376,14 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data,
 	sgl = data->sg;
 	start_addr  = ib_sg_dma_address(ibdev, sgl);
 
+	if (unlikely(sgl[0].offset &&
+		     data->data_len >= sg_tablesize * PAGE_SIZE)) {
+		iser_dbg("can't register length %lx with offset %x "
+			 "fall to bounce buffer\n", data->data_len,
+			 sgl[0].offset);
+		return 0;
+	}
+
 	for_each_sg(sgl, sg, data->dma_nents, i) {
 		if (start_check && !IS_4K_ALIGNED(start_addr))
 			break;
@@ -790,7 +799,8 @@ iser_handle_unaligned_buf(struct iscsi_iser_task *task,
 	struct iser_device *device = iser_conn->ib_conn.device;
 	int err, aligned_len;
 
-	aligned_len = iser_data_buf_aligned_len(mem, device->ib_device);
+	aligned_len = iser_data_buf_aligned_len(mem, device->ib_device,
+						iser_conn->scsi_sg_tablesize);
 	if (aligned_len != mem->dma_nents) {
 		err = fall_to_bounce_buf(task, mem, dir);
 		if (err)
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 0ade0e8..187c712 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -756,6 +756,31 @@ static void iser_connect_error(struct rdma_cm_id *cma_id)
 	iser_conn->state = ISER_CONN_TERMINATING;
 }
 
+static void
+iser_calc_scsi_params(struct iser_conn *iser_conn,
+		      unsigned int max_sectors)
+{
+	struct iser_device *device = iser_conn->ib_conn.device;
+	unsigned short sg_tablesize, sup_sg_tablesize;
+
+	sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K);
+	sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE,
+				 device->dev_attr.max_fast_reg_page_list_len);
+
+	if (sg_tablesize > sup_sg_tablesize) {
+		sg_tablesize = sup_sg_tablesize;
+		iser_conn->scsi_max_sectors = sg_tablesize * SIZE_4K / 512;
+	} else {
+		iser_conn->scsi_max_sectors = max_sectors;
+	}
+
+	iser_conn->scsi_sg_tablesize = sg_tablesize;
+
+	iser_dbg("iser_conn %p, sg_tablesize %u, max_sectors %u\n",
+		 iser_conn, iser_conn->scsi_sg_tablesize,
+		 iser_conn->scsi_max_sectors);
+}
+
 /**
  * Called with state mutex held
  **/
@@ -794,6 +819,8 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
 		}
 	}
 
+	iser_calc_scsi_params(iser_conn, iser_max_sectors);
+
 	ret = rdma_resolve_route(cma_id, 1000);
 	if (ret) {
 		iser_err("resolve route failed: %d\n", ret);
-- 
cgit v0.10.2


From 1b16c9894b63c8dfbc578ecf1186be4508b2c49e Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:33:05 +0300
Subject: IB/iser: Add debug prints to the various memory registration methods

Easier to debug when we have the registration details.

This patch does not change any functionality.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Adir Lev <adirl@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 384fd0a..6eadf51 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -554,6 +554,10 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
 	reg->sge.length = page_vec->data_size;
 	reg->mem_h = fmr;
 
+	iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
+		 " length=0x%x\n", reg->sge.lkey, reg->rkey,
+		 reg->sge.addr, reg->sge.length);
+
 	return 0;
 }
 
@@ -724,7 +728,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 	sig_reg->sge.addr = 0;
 	sig_reg->sge.length = scsi_transfer_length(iser_task->sc);
 
-	iser_dbg("sig_sge: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n",
+	iser_dbg("sig reg: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n",
 		 sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr,
 		 sig_reg->sge.length);
 err:
@@ -787,6 +791,10 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 	reg->sge.addr = frpl->page_list[0] + offset;
 	reg->sge.length = size;
 
+	iser_dbg("fast reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
+		 " length=0x%x\n", reg->sge.lkey, reg->rkey,
+		 reg->sge.addr, reg->sge.length);
+
 	return ret;
 }
 
-- 
cgit v0.10.2


From 7332bed085c68fc76462583a1003c6dca2c31e11 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 6 Aug 2015 18:33:06 +0300
Subject: IB/iser: Chain all iser transaction send work requests

Chaning of send work requests benefits performance by
reducing the send queue lock contention (acquired in
ib_post_send) and saves us HW doorbells which is posted
only once.

Currently, in normal IO flows iser does not chain the CDB send
work request with the registration work request. Also in PI
flows, signature work requests are not chained as well.

Lets chain those and post only once.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 0720bb4..e908a1e 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -204,6 +204,7 @@ iser_initialize_task_headers(struct iscsi_task *task,
 		goto out;
 	}
 
+	tx_desc->wr_idx = 0;
 	tx_desc->mapped = true;
 	tx_desc->dma_addr = dma_addr;
 	tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 0bdd7e7..86f6583 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -265,6 +265,14 @@ enum iser_desc_type {
 	ISCSI_TX_DATAOUT
 };
 
+/* Maximum number of work requests per task:
+ * Data memory region local invalidate + fast registration
+ * Protection memory region local invalidate + fast registration
+ * Signature memory region local invalidate + fast registration
+ * PDU send
+ */
+#define ISER_MAX_WRS 7
+
 /**
  * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
  *
@@ -277,6 +285,11 @@ enum iser_desc_type {
  *                 unsolicited data-out or control
  * @num_sge:       number sges used on this TX task
  * @mapped:        Is the task header mapped
+ * @wr_idx:        Current WR index
+ * @wrs:           Array of WRs per task
+ * @data_reg:      Data buffer registration details
+ * @prot_reg:      Protection buffer registration details
+ * @sig_attrs:     Signature attributes
  */
 struct iser_tx_desc {
 	struct iser_hdr              iser_header;
@@ -286,6 +299,11 @@ struct iser_tx_desc {
 	struct ib_sge		     tx_sg[2];
 	int                          num_sge;
 	bool			     mapped;
+	u8                           wr_idx;
+	struct ib_send_wr            wrs[ISER_MAX_WRS];
+	struct iser_mem_reg          data_reg;
+	struct iser_mem_reg          prot_reg;
+	struct ib_sig_attrs          sig_attrs;
 };
 
 #define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
@@ -689,4 +707,20 @@ iser_reg_desc_get_fmr(struct ib_conn *ib_conn);
 void
 iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
 		      struct iser_fr_desc *desc);
+
+static inline struct ib_send_wr *
+iser_tx_next_wr(struct iser_tx_desc *tx_desc)
+{
+	struct ib_send_wr *cur_wr = &tx_desc->wrs[tx_desc->wr_idx];
+	struct ib_send_wr *last_wr;
+
+	if (tx_desc->wr_idx) {
+		last_wr = &tx_desc->wrs[tx_desc->wr_idx - 1];
+		last_wr->next = cur_wr;
+	}
+	tx_desc->wr_idx++;
+
+	return cur_wr;
+}
+
 #endif
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 6eadf51..09640c8 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -664,10 +664,11 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
 {
 	u32 rkey;
 
-	memset(inv_wr, 0, sizeof(*inv_wr));
 	inv_wr->opcode = IB_WR_LOCAL_INV;
 	inv_wr->wr_id = ISER_FASTREG_LI_WRID;
 	inv_wr->ex.invalidate_rkey = mr->rkey;
+	inv_wr->send_flags = 0;
+	inv_wr->num_sge = 0;
 
 	rkey = ib_inc_rkey(mr->rkey);
 	ib_update_fast_reg_key(mr, rkey);
@@ -680,47 +681,38 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 		struct iser_mem_reg *prot_reg,
 		struct iser_mem_reg *sig_reg)
 {
-	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-	struct ib_send_wr sig_wr, inv_wr;
-	struct ib_send_wr *bad_wr, *wr = NULL;
-	struct ib_sig_attrs sig_attrs;
+	struct iser_tx_desc *tx_desc = &iser_task->desc;
+	struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs;
+	struct ib_send_wr *wr;
 	int ret;
 
-	memset(&sig_attrs, 0, sizeof(sig_attrs));
-	ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
+	memset(sig_attrs, 0, sizeof(*sig_attrs));
+	ret = iser_set_sig_attrs(iser_task->sc, sig_attrs);
 	if (ret)
 		goto err;
 
-	iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
+	iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask);
 
 	if (!pi_ctx->sig_mr_valid) {
-		iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
-		wr = &inv_wr;
+		wr = iser_tx_next_wr(tx_desc);
+		iser_inv_rkey(wr, pi_ctx->sig_mr);
 	}
 
-	memset(&sig_wr, 0, sizeof(sig_wr));
-	sig_wr.opcode = IB_WR_REG_SIG_MR;
-	sig_wr.wr_id = ISER_FASTREG_LI_WRID;
-	sig_wr.sg_list = &data_reg->sge;
-	sig_wr.num_sge = 1;
-	sig_wr.wr.sig_handover.sig_attrs = &sig_attrs;
-	sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr;
+	wr = iser_tx_next_wr(tx_desc);
+	wr->opcode = IB_WR_REG_SIG_MR;
+	wr->wr_id = ISER_FASTREG_LI_WRID;
+	wr->sg_list = &data_reg->sge;
+	wr->num_sge = 1;
+	wr->send_flags = 0;
+	wr->wr.sig_handover.sig_attrs = sig_attrs;
+	wr->wr.sig_handover.sig_mr = pi_ctx->sig_mr;
 	if (scsi_prot_sg_count(iser_task->sc))
-		sig_wr.wr.sig_handover.prot = &prot_reg->sge;
-	sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
-					      IB_ACCESS_REMOTE_READ |
-					      IB_ACCESS_REMOTE_WRITE;
-
-	if (!wr)
-		wr = &sig_wr;
+		wr->wr.sig_handover.prot = &prot_reg->sge;
 	else
-		wr->next = &sig_wr;
-
-	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
-	if (ret) {
-		iser_err("reg_sig_mr failed, ret:%d\n", ret);
-		goto err;
-	}
+		wr->wr.sig_handover.prot = NULL;
+	wr->wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
+					   IB_ACCESS_REMOTE_READ |
+					   IB_ACCESS_REMOTE_WRITE;
 	pi_ctx->sig_mr_valid = 0;
 
 	sig_reg->sge.lkey = pi_ctx->sig_mr->lkey;
@@ -744,9 +736,9 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 	struct iser_device *device = ib_conn->device;
 	struct ib_mr *mr = rsc->mr;
 	struct ib_fast_reg_page_list *frpl = rsc->frpl;
-	struct ib_send_wr fastreg_wr, inv_wr;
-	struct ib_send_wr *bad_wr, *wr = NULL;
-	int ret, offset, size, plen;
+	struct iser_tx_desc *tx_desc = &iser_task->desc;
+	struct ib_send_wr *wr;
+	int offset, size, plen;
 
 	plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
 				   &offset, &size);
@@ -756,34 +748,23 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 	}
 
 	if (!rsc->mr_valid) {
-		iser_inv_rkey(&inv_wr, mr);
-		wr = &inv_wr;
+		wr = iser_tx_next_wr(tx_desc);
+		iser_inv_rkey(wr, mr);
 	}
 
-	/* Prepare FASTREG WR */
-	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
-	fastreg_wr.wr_id = ISER_FASTREG_LI_WRID;
-	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
-	fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset;
-	fastreg_wr.wr.fast_reg.page_list = frpl;
-	fastreg_wr.wr.fast_reg.page_list_len = plen;
-	fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
-	fastreg_wr.wr.fast_reg.length = size;
-	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
-	fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
-					       IB_ACCESS_REMOTE_WRITE |
-					       IB_ACCESS_REMOTE_READ);
-
-	if (!wr)
-		wr = &fastreg_wr;
-	else
-		wr->next = &fastreg_wr;
-
-	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
-	if (ret) {
-		iser_err("fast registration failed, ret:%d\n", ret);
-		return ret;
-	}
+	wr = iser_tx_next_wr(tx_desc);
+	wr->opcode = IB_WR_FAST_REG_MR;
+	wr->wr_id = ISER_FASTREG_LI_WRID;
+	wr->send_flags = 0;
+	wr->wr.fast_reg.iova_start = frpl->page_list[0] + offset;
+	wr->wr.fast_reg.page_list = frpl;
+	wr->wr.fast_reg.page_list_len = plen;
+	wr->wr.fast_reg.page_shift = SHIFT_4K;
+	wr->wr.fast_reg.length = size;
+	wr->wr.fast_reg.rkey = mr->rkey;
+	wr->wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
+					IB_ACCESS_REMOTE_WRITE |
+					IB_ACCESS_REMOTE_READ);
 	rsc->mr_valid = 0;
 
 	reg->sge.lkey = mr->lkey;
@@ -795,7 +776,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 		 " length=0x%x\n", reg->sge.lkey, reg->rkey,
 		 reg->sge.addr, reg->sge.length);
 
-	return ret;
+	return 0;
 }
 
 static int
@@ -853,6 +834,7 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task,
 	struct iser_device *device = ib_conn->device;
 	struct iser_data_buf *mem = &task->data[dir];
 	struct iser_mem_reg *reg = &task->rdma_reg[dir];
+	struct iser_mem_reg *data_reg;
 	struct iser_fr_desc *desc = NULL;
 	int err;
 
@@ -866,27 +848,31 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task,
 		reg->mem_h = desc;
 	}
 
-	err = iser_reg_data_sg(task, mem, desc, reg);
+	if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL)
+		data_reg = reg;
+	else
+		data_reg = &task->desc.data_reg;
+
+	err = iser_reg_data_sg(task, mem, desc, data_reg);
 	if (unlikely(err))
 		goto err_reg;
 
 	if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
-		struct iser_mem_reg prot_reg;
+		struct iser_mem_reg *prot_reg = &task->desc.prot_reg;
 
-		memset(&prot_reg, 0, sizeof(prot_reg));
 		if (scsi_prot_sg_count(task->sc)) {
 			mem = &task->prot[dir];
 			err = iser_handle_unaligned_buf(task, mem, dir);
 			if (unlikely(err))
 				goto err_reg;
 
-			err = iser_reg_prot_sg(task, mem, desc, &prot_reg);
+			err = iser_reg_prot_sg(task, mem, desc, prot_reg);
 			if (unlikely(err))
 				goto err_reg;
 		}
 
-		err = iser_reg_sig_mr(task, desc->pi_ctx, reg,
-				      &prot_reg, reg);
+		err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg,
+				      prot_reg, reg);
 		if (unlikely(err))
 			goto err_reg;
 
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 187c712..6f5be47 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -1116,23 +1116,24 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count)
 int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
 		   bool signal)
 {
-	int		  ib_ret;
-	struct ib_send_wr send_wr, *send_wr_failed;
+	struct ib_send_wr *bad_wr, *wr = iser_tx_next_wr(tx_desc);
+	int ib_ret;
 
 	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
 				      tx_desc->dma_addr, ISER_HEADERS_LEN,
 				      DMA_TO_DEVICE);
 
-	send_wr.next	   = NULL;
-	send_wr.wr_id	   = (uintptr_t)tx_desc;
-	send_wr.sg_list	   = tx_desc->tx_sg;
-	send_wr.num_sge	   = tx_desc->num_sge;
-	send_wr.opcode	   = IB_WR_SEND;
-	send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
+	wr->next = NULL;
+	wr->wr_id = (uintptr_t)tx_desc;
+	wr->sg_list = tx_desc->tx_sg;
+	wr->num_sge = tx_desc->num_sge;
+	wr->opcode = IB_WR_SEND;
+	wr->send_flags = signal ? IB_SEND_SIGNALED : 0;
 
-	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
+	ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0], &bad_wr);
 	if (ib_ret)
-		iser_err("ib_post_send failed, ret:%d\n", ib_ret);
+		iser_err("ib_post_send failed, ret:%d opcode:%d\n",
+			 ib_ret, bad_wr->opcode);
 
 	return ib_ret;
 }
-- 
cgit v0.10.2


From 96249d70dd70496084c7ec1465ec449cd032955a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Wed, 5 Aug 2015 14:14:45 -0600
Subject: IB/core: Guarantee that a local_dma_lkey is available

Every single ULP requires a local_dma_lkey to do anything with
a QP, so let us ensure one exists for every PD created.

If the driver can supply a global local_dma_lkey then use that, otherwise
ask the driver to create a local use all physical memory MR associated
with the new PD.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Sagi Grimberg <sagig@dev.mellanox.co.il>
Acked-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index bbb02ff..258485e 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -562,6 +562,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
 
 	pd->device  = file->device->ib_dev;
 	pd->uobject = uobj;
+	pd->local_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
 
 	uobj->object = pd;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index c80ed17..2e5fd89 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -213,24 +213,61 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
 
 /* Protection domains */
 
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
 struct ib_pd *ib_alloc_pd(struct ib_device *device)
 {
 	struct ib_pd *pd;
+	struct ib_device_attr devattr;
+	int rc;
+
+	rc = ib_query_device(device, &devattr);
+	if (rc)
+		return ERR_PTR(rc);
 
 	pd = device->alloc_pd(device, NULL, NULL);
+	if (IS_ERR(pd))
+		return pd;
+
+	pd->device = device;
+	pd->uobject = NULL;
+	pd->local_mr = NULL;
+	atomic_set(&pd->usecnt, 0);
+
+	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+		pd->local_dma_lkey = device->local_dma_lkey;
+	else {
+		struct ib_mr *mr;
+
+		mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(mr)) {
+			ib_dealloc_pd(pd);
+			return (struct ib_pd *)mr;
+		}
 
-	if (!IS_ERR(pd)) {
-		pd->device  = device;
-		pd->uobject = NULL;
-		atomic_set(&pd->usecnt, 0);
+		pd->local_mr = mr;
+		pd->local_dma_lkey = pd->local_mr->lkey;
 	}
-
 	return pd;
 }
 EXPORT_SYMBOL(ib_alloc_pd);
 
 int ib_dealloc_pd(struct ib_pd *pd)
 {
+	if (pd->local_mr) {
+		if (ib_dereg_mr(pd->local_mr))
+			return -EBUSY;
+		pd->local_mr = NULL;
+	}
+
 	if (atomic_read(&pd->usecnt))
 		return -EBUSY;
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 5eff55c..0940051 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1257,9 +1257,11 @@ struct ib_udata {
 };
 
 struct ib_pd {
+	u32			local_dma_lkey;
 	struct ib_device       *device;
 	struct ib_uobject      *uobject;
 	atomic_t          	usecnt; /* count all resources */
+	struct ib_mr	       *local_mr;
 };
 
 struct ib_xrcd {
@@ -2192,13 +2194,6 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
 int ib_find_pkey(struct ib_device *device,
 		 u8 port_num, u16 pkey, u16 *index);
 
-/**
- * ib_alloc_pd - Allocates an unused protection domain.
- * @device: The device on which to allocate the protection domain.
- *
- * A protection domain object provides an association between QPs, shared
- * receive queues, address handles, memory regions, and memory windows.
- */
 struct ib_pd *ib_alloc_pd(struct ib_device *device);
 
 /**
-- 
cgit v0.10.2


From 4be90bc60df47f6268b594c4fb6c90f0ff2f519f Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:16 -0600
Subject: IB/mad: Remove ib_get_dma_mr calls

The pd now has a local_dma_lkey member which completely replaces
ib_get_dma_mr, use it instead.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 66b4b3e..4b5c723 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 		goto error1;
 	}
 
-	mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
-						 IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(mad_agent_priv->agent.mr)) {
-		ret = ERR_PTR(-ENOMEM);
-		goto error2;
-	}
-
 	if (mad_reg_req) {
 		reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
 		if (!reg_req) {
@@ -429,8 +422,6 @@ error4:
 	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
 	kfree(reg_req);
 error3:
-	ib_dereg_mr(mad_agent_priv->agent.mr);
-error2:
 	kfree(mad_agent_priv);
 error1:
 	return ret;
@@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
 	wait_for_completion(&mad_agent_priv->comp);
 
 	kfree(mad_agent_priv->reg_req);
-	ib_dereg_mr(mad_agent_priv->agent.mr);
 	kfree(mad_agent_priv);
 }
 
@@ -1038,7 +1028,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 
 	mad_send_wr->mad_agent_priv = mad_agent_priv;
 	mad_send_wr->sg_list[0].length = hdr_len;
-	mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
 
 	/* OPA MADs don't have to be the full 2048 bytes */
 	if (opa && base_version == OPA_MGMT_BASE_VERSION &&
@@ -1047,7 +1037,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 	else
 		mad_send_wr->sg_list[1].length = mad_size - hdr_len;
 
-	mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
 
 	mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
 	mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
@@ -2885,7 +2875,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
 	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
 
 	/* Initialize common scatter list fields */
-	sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+	sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
 
 	/* Initialize common receive WR fields */
 	recv_wr.next = NULL;
@@ -3201,13 +3191,6 @@ static int ib_mad_port_open(struct ib_device *device,
 		goto error4;
 	}
 
-	port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(port_priv->mr)) {
-		dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
-		ret = PTR_ERR(port_priv->mr);
-		goto error5;
-	}
-
 	if (has_smi) {
 		ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
 		if (ret)
@@ -3248,8 +3231,6 @@ error8:
 error7:
 	destroy_mad_qp(&port_priv->qp_info[0]);
 error6:
-	ib_dereg_mr(port_priv->mr);
-error5:
 	ib_dealloc_pd(port_priv->pd);
 error4:
 	ib_destroy_cq(port_priv->cq);
@@ -3284,7 +3265,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
 	destroy_workqueue(port_priv->wq);
 	destroy_mad_qp(&port_priv->qp_info[1]);
 	destroy_mad_qp(&port_priv->qp_info[0]);
-	ib_dereg_mr(port_priv->mr);
 	ib_dealloc_pd(port_priv->pd);
 	ib_destroy_cq(port_priv->cq);
 	cleanup_recv_queue(&port_priv->qp_info[1]);
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 5be89f9..4a4f7aa 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -199,7 +199,6 @@ struct ib_mad_port_private {
 	int port_num;
 	struct ib_cq *cq;
 	struct ib_pd *pd;
-	struct ib_mr *mr;
 
 	spinlock_t reg_lock;
 	struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index d5ac022..c206205 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -526,7 +526,6 @@ enum {
 struct ib_mad_agent {
 	struct ib_device	*device;
 	struct ib_qp		*qp;
-	struct ib_mr		*mr;
 	ib_mad_recv_handler	recv_handler;
 	ib_mad_send_handler	send_handler;
 	ib_mad_snoop_handler	snoop_handler;
-- 
cgit v0.10.2


From 77b1f99660ed60694e1d0e3a63096c1f252debfd Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:17 -0600
Subject: IB/ipoib: Remove ib_get_dma_mr calls

The pd now has a local_dma_lkey member which completely replaces
ib_get_dma_mr, use it instead.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 79859c4..ca28736 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -342,7 +342,6 @@ struct ipoib_dev_priv {
 	u16		  pkey;
 	u16		  pkey_index;
 	struct ib_pd	 *pd;
-	struct ib_mr	 *mr;
 	struct ib_cq	 *recv_cq;
 	struct ib_cq	 *send_cq;
 	struct ib_qp	 *qp;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 9d32157..c78dc16 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -332,7 +332,7 @@ static void ipoib_cm_init_rx_wr(struct net_device *dev,
 	int i;
 
 	for (i = 0; i < priv->cm.num_frags; ++i)
-		sge[i].lkey = priv->mr->lkey;
+		sge[i].lkey = priv->pd->local_dma_lkey;
 
 	sge[0].length = IPOIB_CM_HEAD_SIZE;
 	for (i = 1; i < priv->cm.num_frags; ++i)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 851c821..8c45198 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -152,12 +152,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 		return -ENODEV;
 	}
 
-	priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(priv->mr)) {
-		printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
-		goto out_free_pd;
-	}
-
 	/*
 	 * the various IPoIB tasks assume they will never race against
 	 * themselves, so always use a single thread workqueue
@@ -165,7 +159,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->wq = create_singlethread_workqueue("ipoib_wq");
 	if (!priv->wq) {
 		printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
-		goto out_free_mr;
+		goto out_free_pd;
 	}
 
 	size = ipoib_recvq_size + 1;
@@ -225,13 +219,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
 
 	for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
-		priv->tx_sge[i].lkey = priv->mr->lkey;
+		priv->tx_sge[i].lkey = priv->pd->local_dma_lkey;
 
 	priv->tx_wr.opcode	= IB_WR_SEND;
 	priv->tx_wr.sg_list	= priv->tx_sge;
 	priv->tx_wr.send_flags	= IB_SEND_SIGNALED;
 
-	priv->rx_sge[0].lkey = priv->mr->lkey;
+	priv->rx_sge[0].lkey = priv->pd->local_dma_lkey;
 
 	priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 	priv->rx_wr.num_sge = 1;
@@ -254,9 +248,6 @@ out_free_wq:
 	destroy_workqueue(priv->wq);
 	priv->wq = NULL;
 
-out_free_mr:
-	ib_dereg_mr(priv->mr);
-
 out_free_pd:
 	ib_dealloc_pd(priv->pd);
 
@@ -289,9 +280,6 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
 		priv->wq = NULL;
 	}
 
-	if (ib_dereg_mr(priv->mr))
-		ipoib_warn(priv, "ib_dereg_mr failed\n");
-
 	if (ib_dealloc_pd(priv->pd))
 		ipoib_warn(priv, "ib_dealloc_pd failed\n");
 
-- 
cgit v0.10.2


From 7dd975762825e4c2b95df5e24759a9d6ad915c79 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:18 -0600
Subject: IB/mlx4: Remove ib_get_dma_mr calls

The pd now has a local_dma_lkey member which completely replaces
ib_get_dma_mr, use it instead.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 68b3dfa..1cd75ff 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -580,7 +580,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
 
 	list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map;
 	list.length = sizeof (struct mlx4_rcv_tunnel_mad);
-	list.lkey = tun_ctx->mr->lkey;
+	list.lkey = tun_ctx->pd->local_dma_lkey;
 
 	wr.wr.ud.ah = ah;
 	wr.wr.ud.port_num = port;
@@ -1133,7 +1133,7 @@ static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
 
 	sg_list.addr = tun_qp->ring[index].map;
 	sg_list.length = size;
-	sg_list.lkey = ctx->mr->lkey;
+	sg_list.lkey = ctx->pd->local_dma_lkey;
 
 	recv_wr.next = NULL;
 	recv_wr.sg_list = &sg_list;
@@ -1244,7 +1244,7 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
 
 	list.addr = sqp->tx_ring[wire_tx_ix].buf.map;
 	list.length = sizeof (struct mlx4_mad_snd_buf);
-	list.lkey = sqp_ctx->mr->lkey;
+	list.lkey = sqp_ctx->pd->local_dma_lkey;
 
 	wr.wr.ud.ah = ah;
 	wr.wr.ud.port_num = port;
@@ -1827,19 +1827,12 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
 		goto err_cq;
 	}
 
-	ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(ctx->mr)) {
-		ret = PTR_ERR(ctx->mr);
-		pr_err("Couldn't get tunnel DMA MR (%d)\n", ret);
-		goto err_pd;
-	}
-
 	if (ctx->has_smi) {
 		ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun);
 		if (ret) {
 			pr_err("Couldn't create %s QP0 (%d)\n",
 			       create_tun ? "tunnel for" : "",  ret);
-			goto err_mr;
+			goto err_pd;
 		}
 	}
 
@@ -1876,10 +1869,6 @@ err_qp0:
 		ib_destroy_qp(ctx->qp[0].qp);
 	ctx->qp[0].qp = NULL;
 
-err_mr:
-	ib_dereg_mr(ctx->mr);
-	ctx->mr = NULL;
-
 err_pd:
 	ib_dealloc_pd(ctx->pd);
 	ctx->pd = NULL;
@@ -1916,8 +1905,6 @@ static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port,
 		ib_destroy_qp(ctx->qp[1].qp);
 		ctx->qp[1].qp = NULL;
 		mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1);
-		ib_dereg_mr(ctx->mr);
-		ctx->mr = NULL;
 		ib_dealloc_pd(ctx->pd);
 		ctx->pd = NULL;
 		ib_destroy_cq(ctx->cq);
@@ -2050,8 +2037,6 @@ static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx)
 		ib_destroy_qp(sqp_ctx->qp[1].qp);
 		sqp_ctx->qp[1].qp = NULL;
 		mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0);
-		ib_dereg_mr(sqp_ctx->mr);
-		sqp_ctx->mr = NULL;
 		ib_dealloc_pd(sqp_ctx->pd);
 		sqp_ctx->pd = NULL;
 		ib_destroy_cq(sqp_ctx->cq);
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index cb47c2c..fe52ead 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -415,7 +415,6 @@ struct mlx4_ib_demux_pv_ctx {
 	struct ib_device *ib_dev;
 	struct ib_cq *cq;
 	struct ib_pd *pd;
-	struct ib_mr *mr;
 	struct work_struct work;
 	struct workqueue_struct *wq;
 	struct mlx4_ib_demux_pv_qp qp[2];
-- 
cgit v0.10.2


From b37c788f595cd578524fb8f50d3bd2fff8b62bc3 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:19 -0600
Subject: IB/mlx5: Remove ib_get_dma_mr calls

The pd now has a local_dma_lkey member which completely replaces
ib_get_dma_mr, use it instead.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index ce75875..41d6911 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1123,7 +1123,6 @@ static void destroy_umrc_res(struct mlx5_ib_dev *dev)
 
 	mlx5_ib_destroy_qp(dev->umrc.qp);
 	ib_destroy_cq(dev->umrc.cq);
-	ib_dereg_mr(dev->umrc.mr);
 	ib_dealloc_pd(dev->umrc.pd);
 }
 
@@ -1138,7 +1137,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
 	struct ib_pd *pd;
 	struct ib_cq *cq;
 	struct ib_qp *qp;
-	struct ib_mr *mr;
 	struct ib_cq_init_attr cq_attr = {};
 	int ret;
 
@@ -1156,13 +1154,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
 		goto error_0;
 	}
 
-	mr = ib_get_dma_mr(pd,  IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(mr)) {
-		mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
-		ret = PTR_ERR(mr);
-		goto error_1;
-	}
-
 	cq_attr.cqe = 128;
 	cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL,
 			  &cq_attr);
@@ -1220,7 +1211,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
 
 	dev->umrc.qp = qp;
 	dev->umrc.cq = cq;
-	dev->umrc.mr = mr;
 	dev->umrc.pd = pd;
 
 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
@@ -1242,9 +1232,6 @@ error_3:
 	ib_destroy_cq(cq);
 
 error_2:
-	ib_dereg_mr(mr);
-
-error_1:
 	ib_dealloc_pd(pd);
 
 error_0:
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 62b06ae..a5fa0b9 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -349,7 +349,6 @@ struct umr_common {
 	struct ib_pd	*pd;
 	struct ib_cq	*cq;
 	struct ib_qp	*qp;
-	struct ib_mr	*mr;
 	/* control access to UMR QP
 	 */
 	struct semaphore	sem;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index dad82db..0dfd379 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -687,12 +687,11 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
 			     int access_flags)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct ib_mr *mr = dev->umrc.mr;
 	struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
 
 	sg->addr = dma;
 	sg->length = ALIGN(sizeof(u64) * n, 64);
-	sg->lkey = mr->lkey;
+	sg->lkey = dev->umrc.pd->local_dma_lkey;
 
 	wr->next = NULL;
 	wr->send_flags = 0;
@@ -923,7 +922,7 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 		sg.addr = dma;
 		sg.length = ALIGN(npages * sizeof(u64),
 				MLX5_UMR_MTT_ALIGNMENT);
-		sg.lkey = dev->umrc.mr->lkey;
+		sg.lkey = dev->umrc.pd->local_dma_lkey;
 
 		wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
 				MLX5_IB_SEND_UMR_UPDATE_MTT;
-- 
cgit v0.10.2


From 256b7ad27316525711dfa98b67e7c40cb28b1711 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:20 -0600
Subject: IB/iser: Use pd->local_dma_lkey

Replace all leys with  pd->local_dma_lkey. This driver does not support
iWarp, so this is safe.

The insecure use of ib_get_dma_mr is thus isolated to an rkey, and this
looks trivially fixed by forcing the use of registration in a future
patch.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index e908a1e..1ace5d8 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -209,7 +209,7 @@ iser_initialize_task_headers(struct iscsi_task *task,
 	tx_desc->dma_addr = dma_addr;
 	tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
 	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
-	tx_desc->tx_sg[0].lkey   = device->mr->lkey;
+	tx_desc->tx_sg[0].lkey   = device->pd->local_dma_lkey;
 
 	iser_task->iser_conn = iser_conn;
 out:
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 0b1f3b5..d511879 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -284,7 +284,7 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 		rx_sg = &rx_desc->rx_sg;
 		rx_sg->addr   = rx_desc->dma_addr;
 		rx_sg->length = ISER_RX_PAYLOAD_SIZE;
-		rx_sg->lkey   = device->mr->lkey;
+		rx_sg->lkey   = device->pd->local_dma_lkey;
 	}
 
 	iser_conn->rx_desc_head = 0;
@@ -538,7 +538,7 @@ int iser_send_control(struct iscsi_conn *conn,
 
 		tx_dsg->addr    = iser_conn->login_req_dma;
 		tx_dsg->length  = task->data_count;
-		tx_dsg->lkey    = device->mr->lkey;
+		tx_dsg->lkey    = device->pd->local_dma_lkey;
 		mdesc->num_sge = 2;
 	}
 
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 09640c8..2493cc7 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -471,7 +471,7 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
 {
 	struct scatterlist *sg = mem->sg;
 
-	reg->sge.lkey = device->mr->lkey;
+	reg->sge.lkey = device->pd->local_dma_lkey;
 	reg->rkey = device->mr->rkey;
 	reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
 	reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 6f5be47..ad2d2b5 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -1061,7 +1061,7 @@ int iser_post_recvl(struct iser_conn *iser_conn)
 
 	sge.addr   = iser_conn->login_resp_dma;
 	sge.length = ISER_RX_LOGIN_SIZE;
-	sge.lkey   = ib_conn->device->mr->lkey;
+	sge.lkey   = ib_conn->device->pd->local_dma_lkey;
 
 	rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
 	rx_wr.sg_list = &sge;
-- 
cgit v0.10.2


From 34efc7dfbd88e4a232b1efa31c5e900606c84fca Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:21 -0600
Subject: iser-target: Remove ib_get_dma_mr calls

The pd now has a local_dma_lkey member which completely replaces
ib_get_dma_mr, use it instead.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 0ebbaa5..dc439a4 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -235,7 +235,7 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
 		rx_sg = &rx_desc->rx_sg;
 		rx_sg->addr = rx_desc->dma_addr;
 		rx_sg->length = ISER_RX_PAYLOAD_SIZE;
-		rx_sg->lkey = device->mr->lkey;
+		rx_sg->lkey = device->pd->local_dma_lkey;
 	}
 
 	isert_conn->rx_desc_head = 0;
@@ -385,22 +385,12 @@ isert_create_device_ib_res(struct isert_device *device)
 		goto out_cq;
 	}
 
-	device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(device->mr)) {
-		ret = PTR_ERR(device->mr);
-		isert_err("failed to create dma mr, device %p, ret=%d\n",
-			  device, ret);
-		goto out_mr;
-	}
-
 	/* Check signature cap */
 	device->pi_capable = dev_attr->device_cap_flags &
 			     IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
 
 	return 0;
 
-out_mr:
-	ib_dealloc_pd(device->pd);
 out_cq:
 	isert_free_comps(device);
 	return ret;
@@ -411,7 +401,6 @@ isert_free_device_ib_res(struct isert_device *device)
 {
 	isert_info("device %p\n", device);
 
-	ib_dereg_mr(device->mr);
 	ib_dealloc_pd(device->pd);
 	isert_free_comps(device);
 }
@@ -1090,8 +1079,8 @@ isert_create_send_desc(struct isert_conn *isert_conn,
 	tx_desc->num_sge = 1;
 	tx_desc->isert_cmd = isert_cmd;
 
-	if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
-		tx_desc->tx_sg[0].lkey = device->mr->lkey;
+	if (tx_desc->tx_sg[0].lkey != device->pd->local_dma_lkey) {
+		tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey;
 		isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc);
 	}
 }
@@ -1114,7 +1103,7 @@ isert_init_tx_hdrs(struct isert_conn *isert_conn,
 	tx_desc->dma_addr = dma_addr;
 	tx_desc->tx_sg[0].addr	= tx_desc->dma_addr;
 	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
-	tx_desc->tx_sg[0].lkey = device->mr->lkey;
+	tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey;
 
 	isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n",
 		  tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length,
@@ -1147,7 +1136,7 @@ isert_rdma_post_recvl(struct isert_conn *isert_conn)
 	memset(&sge, 0, sizeof(struct ib_sge));
 	sge.addr = isert_conn->login_req_dma;
 	sge.length = ISER_RX_LOGIN_SIZE;
-	sge.lkey = isert_conn->device->mr->lkey;
+	sge.lkey = isert_conn->device->pd->local_dma_lkey;
 
 	isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n",
 		sge.addr, sge.length, sge.lkey);
@@ -1197,7 +1186,7 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
 
 		tx_dsg->addr	= isert_conn->login_rsp_dma;
 		tx_dsg->length	= length;
-		tx_dsg->lkey	= isert_conn->device->mr->lkey;
+		tx_dsg->lkey	= isert_conn->device->pd->local_dma_lkey;
 		tx_desc->num_sge = 2;
 	}
 	if (!login->login_failed) {
@@ -2214,7 +2203,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
 		isert_cmd->pdu_buf_len = pdu_len;
 		tx_dsg->addr	= isert_cmd->pdu_buf_dma;
 		tx_dsg->length	= pdu_len;
-		tx_dsg->lkey	= device->mr->lkey;
+		tx_dsg->lkey	= device->pd->local_dma_lkey;
 		isert_cmd->tx_desc.num_sge = 2;
 	}
 
@@ -2342,7 +2331,7 @@ isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 	isert_cmd->pdu_buf_len = ISCSI_HDR_LEN;
 	tx_dsg->addr	= isert_cmd->pdu_buf_dma;
 	tx_dsg->length	= ISCSI_HDR_LEN;
-	tx_dsg->lkey	= device->mr->lkey;
+	tx_dsg->lkey	= device->pd->local_dma_lkey;
 	isert_cmd->tx_desc.num_sge = 2;
 
 	isert_init_send_wr(isert_conn, isert_cmd, send_wr);
@@ -2383,7 +2372,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 		isert_cmd->pdu_buf_len = txt_rsp_len;
 		tx_dsg->addr	= isert_cmd->pdu_buf_dma;
 		tx_dsg->length	= txt_rsp_len;
-		tx_dsg->lkey	= device->mr->lkey;
+		tx_dsg->lkey	= device->pd->local_dma_lkey;
 		isert_cmd->tx_desc.num_sge = 2;
 	}
 	isert_init_send_wr(isert_conn, isert_cmd, send_wr);
@@ -2424,7 +2413,7 @@ isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
 		ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off;
 		ib_sge->length = min_t(u32, data_left,
 				ib_sg_dma_len(ib_dev, tmp_sg) - page_off);
-		ib_sge->lkey = device->mr->lkey;
+		ib_sge->lkey = device->pd->local_dma_lkey;
 
 		isert_dbg("RDMA ib_sge: addr: 0x%llx  length: %u lkey: %x\n",
 			  ib_sge->addr, ib_sge->length, ib_sge->lkey);
@@ -2598,7 +2587,7 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
 	u32 page_off;
 
 	if (mem->dma_nents == 1) {
-		sge->lkey = device->mr->lkey;
+		sge->lkey = device->pd->local_dma_lkey;
 		sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]);
 		sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]);
 		isert_dbg("sge: addr: 0x%llx  length: %u lkey: %x\n",
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index 9ec23a78..6a04ba3 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -209,7 +209,6 @@ struct isert_device {
 	int			refcount;
 	struct ib_device	*ib_device;
 	struct ib_pd		*pd;
-	struct ib_mr		*mr;
 	struct isert_comp	*comps;
 	int                     comps_used;
 	struct list_head	dev_node;
-- 
cgit v0.10.2


From e6bf5f48d2adc443689c5e627bcd559b00201e5c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:22 -0600
Subject: IB/srp: Use pd->local_dma_lkey

Replace all leys with  pd->local_dma_lkey. This driver does not support
iWarp, so this is safe.

The insecure use of ib_get_dma_mr is thus isolated to an rkey, and will
have to be fixed separately.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index c9c4f64..b890367 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3154,7 +3154,7 @@ static ssize_t srp_create_target(struct device *dev,
 	target->io_class	= SRP_REV16A_IB_IO_CLASS;
 	target->scsi_host	= target_host;
 	target->srp_host	= host;
-	target->lkey		= host->srp_dev->mr->lkey;
+	target->lkey		= host->srp_dev->pd->local_dma_lkey;
 	target->rkey		= host->srp_dev->mr->rkey;
 	target->cmd_sg_cnt	= cmd_sg_entries;
 	target->sg_tablesize	= indirect_sg_entries ? : cmd_sg_entries;
-- 
cgit v0.10.2


From 5a783956c2b90179b852dd58a2ee668f16dfe980 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:24 -0600
Subject: ib_srpt: Remove ib_get_dma_mr calls

The pd now has a local_dma_lkey member which completely replaces
ib_get_dma_mr, use it instead.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 3ab015b..f6fe041 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -783,7 +783,7 @@ static int srpt_post_recv(struct srpt_device *sdev,
 
 	list.addr = ioctx->ioctx.dma;
 	list.length = srp_max_req_size;
-	list.lkey = sdev->mr->lkey;
+	list.lkey = sdev->pd->local_dma_lkey;
 
 	wr.next = NULL;
 	wr.sg_list = &list;
@@ -818,7 +818,7 @@ static int srpt_post_send(struct srpt_rdma_ch *ch,
 
 	list.addr = ioctx->ioctx.dma;
 	list.length = len;
-	list.lkey = sdev->mr->lkey;
+	list.lkey = sdev->pd->local_dma_lkey;
 
 	wr.next = NULL;
 	wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
@@ -1206,7 +1206,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 
 		while (rsize > 0 && tsize > 0) {
 			sge->addr = dma_addr;
-			sge->lkey = ch->sport->sdev->mr->lkey;
+			sge->lkey = ch->sport->sdev->pd->local_dma_lkey;
 
 			if (rsize >= dma_len) {
 				sge->length =
@@ -3211,10 +3211,6 @@ static void srpt_add_one(struct ib_device *device)
 	if (IS_ERR(sdev->pd))
 		goto free_dev;
 
-	sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(sdev->mr))
-		goto err_pd;
-
 	sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr);
 
 	srq_attr.event_handler = srpt_srq_event;
@@ -3226,7 +3222,7 @@ static void srpt_add_one(struct ib_device *device)
 
 	sdev->srq = ib_create_srq(sdev->pd, &srq_attr);
 	if (IS_ERR(sdev->srq))
-		goto err_mr;
+		goto err_pd;
 
 	pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n",
 		 __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr,
@@ -3311,8 +3307,6 @@ err_cm:
 	ib_destroy_cm_id(sdev->cm_id);
 err_srq:
 	ib_destroy_srq(sdev->srq);
-err_mr:
-	ib_dereg_mr(sdev->mr);
 err_pd:
 	ib_dealloc_pd(sdev->pd);
 free_dev:
@@ -3357,7 +3351,6 @@ static void srpt_remove_one(struct ib_device *device, void *client_data)
 	srpt_release_sdev(sdev);
 
 	ib_destroy_srq(sdev->srq);
-	ib_dereg_mr(sdev->mr);
 	ib_dealloc_pd(sdev->pd);
 
 	srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index 21f8df6..5faad8ac 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -393,7 +393,6 @@ struct srpt_port {
 struct srpt_device {
 	struct ib_device	*device;
 	struct ib_pd		*pd;
-	struct ib_mr		*mr;
 	struct ib_srq		*srq;
 	struct ib_cm_id		*cm_id;
 	struct ib_device_attr	dev_attr;
-- 
cgit v0.10.2


From 2f31fa881fbe70808b945a6d23cae1ca8eadf1b3 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:25 -0600
Subject: net/9p: Remove ib_get_dma_mr calls

The pd now has a local_dma_lkey member which completely replaces
ib_get_dma_mr, use it instead.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Tested-by: Dominique Martinet <dominique.martinet@cea.fr>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 37a78d2..ba12102 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -94,8 +94,6 @@ struct p9_trans_rdma {
 	struct ib_pd *pd;
 	struct ib_qp *qp;
 	struct ib_cq *cq;
-	struct ib_mr *dma_mr;
-	u32 lkey;
 	long timeout;
 	int sq_depth;
 	struct semaphore sq_sem;
@@ -382,9 +380,6 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
 	if (!rdma)
 		return;
 
-	if (rdma->dma_mr && !IS_ERR(rdma->dma_mr))
-		ib_dereg_mr(rdma->dma_mr);
-
 	if (rdma->qp && !IS_ERR(rdma->qp))
 		ib_destroy_qp(rdma->qp);
 
@@ -415,7 +410,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
 
 	sge.addr = c->busa;
 	sge.length = client->msize;
-	sge.lkey = rdma->lkey;
+	sge.lkey = rdma->pd->local_dma_lkey;
 
 	wr.next = NULL;
 	c->wc_op = IB_WC_RECV;
@@ -506,7 +501,7 @@ dont_need_post_recv:
 
 	sge.addr = c->busa;
 	sge.length = c->req->tc->size;
-	sge.lkey = rdma->lkey;
+	sge.lkey = rdma->pd->local_dma_lkey;
 
 	wr.next = NULL;
 	c->wc_op = IB_WC_SEND;
@@ -647,7 +642,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 	struct p9_trans_rdma *rdma;
 	struct rdma_conn_param conn_param;
 	struct ib_qp_init_attr qp_attr;
-	struct ib_device_attr devattr;
 	struct ib_cq_init_attr cq_attr = {};
 
 	/* Parse the transport specific mount options */
@@ -700,11 +694,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 	if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
 		goto error;
 
-	/* Query the device attributes */
-	err = ib_query_device(rdma->cm_id->device, &devattr);
-	if (err)
-		goto error;
-
 	/* Create the Completion Queue */
 	cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1;
 	rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
@@ -719,17 +708,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 	if (IS_ERR(rdma->pd))
 		goto error;
 
-	/* Cache the DMA lkey in the transport */
-	rdma->dma_mr = NULL;
-	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
-		rdma->lkey = rdma->cm_id->device->local_dma_lkey;
-	else {
-		rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
-		if (IS_ERR(rdma->dma_mr))
-			goto error;
-		rdma->lkey = rdma->dma_mr->lkey;
-	}
-
 	/* Create the Queue Pair */
 	memset(&qp_attr, 0, sizeof qp_attr);
 	qp_attr.event_handler = qp_event_handler;
-- 
cgit v0.10.2


From e5580242aa8fab292579a1661463f7479275f7ff Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Thu, 30 Jul 2015 17:22:26 -0600
Subject: rds/ib: Remove ib_get_dma_mr calls

The pd now has a local_dma_lkey member which completely replaces
ib_get_dma_mr, use it instead.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 348ac37..c6f95b9 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -99,8 +99,6 @@ static void rds_ib_dev_free(struct work_struct *work)
 
 	if (rds_ibdev->mr_pool)
 		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
-	if (rds_ibdev->mr)
-		ib_dereg_mr(rds_ibdev->mr);
 	if (rds_ibdev->pd)
 		ib_dealloc_pd(rds_ibdev->pd);
 
@@ -164,12 +162,6 @@ static void rds_ib_add_one(struct ib_device *device)
 		goto put_dev;
 	}
 
-	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(rds_ibdev->mr)) {
-		rds_ibdev->mr = NULL;
-		goto put_dev;
-	}
-
 	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
 	if (IS_ERR(rds_ibdev->mr_pool)) {
 		rds_ibdev->mr_pool = NULL;
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 86d88ec..36f7d80 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -100,7 +100,6 @@ struct rds_ib_connection {
 	/* alphabet soup, IBTA style */
 	struct rdma_cm_id	*i_cm_id;
 	struct ib_pd		*i_pd;
-	struct ib_mr		*i_mr;
 	struct ib_cq		*i_send_cq;
 	struct ib_cq		*i_recv_cq;
 
@@ -173,7 +172,6 @@ struct rds_ib_device {
 	struct list_head	conn_list;
 	struct ib_device	*dev;
 	struct ib_pd		*pd;
-	struct ib_mr		*mr;
 	struct rds_ib_mr_pool	*mr_pool;
 	unsigned int		fmr_max_remaps;
 	unsigned int		max_fmrs;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..a75e883 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -269,7 +269,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
 	/* Protection domain and memory range */
 	ic->i_pd = rds_ibdev->pd;
-	ic->i_mr = rds_ibdev->mr;
 
 	cq_attr.cqe = ic->i_send_ring.w_nr + 1;
 	ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
@@ -375,7 +374,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
 	rds_ib_recv_init_ack(ic);
 
-	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+	rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
 		 ic->i_send_cq, ic->i_recv_cq);
 
 out:
@@ -678,7 +677,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 
 		ic->i_cm_id = NULL;
 		ic->i_pd = NULL;
-		ic->i_mr = NULL;
 		ic->i_send_cq = NULL;
 		ic->i_recv_cq = NULL;
 		ic->i_send_hdrs = NULL;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index cac5b45..0ceb4c6 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -62,12 +62,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 		sge = &recv->r_sge[0];
 		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = ic->i_pd->local_dma_lkey;
 
 		sge = &recv->r_sge[1];
 		sge->addr = 0;
 		sge->length = RDS_FRAG_SIZE;
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = ic->i_pd->local_dma_lkey;
 	}
 }
 
@@ -520,7 +520,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
 
 	sge->addr = ic->i_ack_dma;
 	sge->length = sizeof(struct rds_header);
-	sge->lkey = ic->i_mr->lkey;
+	sge->lkey = ic->i_pd->local_dma_lkey;
 
 	wr->sg_list = sge;
 	wr->num_sge = 1;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 5d0a704..f6c829d 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -202,9 +202,9 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
 		sge = &send->s_sge[0];
 		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = ic->i_pd->local_dma_lkey;
 
-		send->s_sge[1].lkey = ic->i_mr->lkey;
+		send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
 	}
 }
 
@@ -813,7 +813,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 	/* Convert our struct scatterlist to struct ib_sge */
 	send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
 	send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
-	send->s_sge[0].lkey = ic->i_mr->lkey;
+	send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
 
 	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
 		 send->s_sge[0].addr, send->s_sge[0].length);
@@ -927,7 +927,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 			send->s_sge[j].addr =
 				 ib_sg_dma_address(ic->i_cm_id->device, scat);
 			send->s_sge[j].length = len;
-			send->s_sge[j].lkey = ic->i_mr->lkey;
+			send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
 
 			sent += len;
 			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
-- 
cgit v0.10.2


From 186fbc6689a368364b7c9eb9d42d6f84b3079f65 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Mon, 10 Aug 2015 17:06:29 -0700
Subject: IB/srp: Re-enable FMR for non-page aligned buffers

During a discussion in 2011 nobody recalled why FMR was not used for
non-page aligned buffers (see also
http://thread.gmane.org/gmane.linux.drivers.rdma/7149). Re-enable FMR
for such buffers. For the reason why the srp_map_fmr() function needs
to be modified, see also patch "IB/srp: rework mapping engine to use
multiple FMR entries" (commit ID 8f26c9ff9cd0; January 2011).

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index b890367..6ed976d 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1272,6 +1272,8 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
 static int srp_map_finish_fmr(struct srp_map_state *state,
 			      struct srp_rdma_ch *ch)
 {
+	struct srp_target_port *target = ch->target;
+	struct srp_device *dev = target->srp_host->srp_dev;
 	struct ib_pool_fmr *fmr;
 	u64 io_addr = 0;
 
@@ -1283,7 +1285,8 @@ static int srp_map_finish_fmr(struct srp_map_state *state,
 	*state->next_fmr++ = fmr;
 	state->nmdesc++;
 
-	srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey);
+	srp_map_desc(state, state->base_dma_addr & ~dev->mr_page_mask,
+		     state->dma_len, fmr->fmr->rkey);
 
 	return 0;
 }
@@ -1390,14 +1393,7 @@ static int srp_map_sg_entry(struct srp_map_state *state,
 		return 0;
 	}
 
-	/*
-	 * Since not all RDMA HW drivers support non-zero page offsets for
-	 * FMR, if we start at an offset into a page, don't merge into the
-	 * current FMR mapping. Finish it out, and use the kernel's MR for
-	 * this sg entry.
-	 */
-	if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) ||
-	    dma_len > dev->mr_max_size) {
+	if (dma_len > dev->mr_max_size) {
 		ret = srp_finish_mapping(state, ch);
 		if (ret)
 			return ret;
-- 
cgit v0.10.2


From 7e85c91970125cb16399c7d1cfedc943266eee49 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Mon, 10 Aug 2015 17:06:57 -0700
Subject: IB/srp: Use multiple registrations for large memory regions

Instead of using the global rkey for large memory regions, use
multiple registrations. See also the while (dma_len) loop further
down in srp_map_sg_entry().

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 6ed976d..03ae72e 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1393,16 +1393,6 @@ static int srp_map_sg_entry(struct srp_map_state *state,
 		return 0;
 	}
 
-	if (dma_len > dev->mr_max_size) {
-		ret = srp_finish_mapping(state, ch);
-		if (ret)
-			return ret;
-
-		srp_map_desc(state, dma_addr, dma_len, target->rkey);
-		srp_map_update_start(state, NULL, 0, 0);
-		return 0;
-	}
-
 	/*
 	 * If this is the first sg that will be mapped via FMR or via FR, save
 	 * our position. We need to know the first unmapped entry, its index,
-- 
cgit v0.10.2


From f731ed62934ace0d3f5aa9ec557349171711be05 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Mon, 10 Aug 2015 17:07:27 -0700
Subject: IB/srp: Add memory descriptor array pointer range checking

Although most paths through which a request is submitted check
block layer parameters like the max_segments limit, these are
not checked when an SG_IO or direct I/O request is submitted.
Hence add a range check for the memory descriptor array pointer.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 03ae72e..b7feb74 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1277,12 +1277,15 @@ static int srp_map_finish_fmr(struct srp_map_state *state,
 	struct ib_pool_fmr *fmr;
 	u64 io_addr = 0;
 
+	if (state->fmr.next >= state->fmr.end)
+		return -ENOMEM;
+
 	fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages,
 				   state->npages, io_addr);
 	if (IS_ERR(fmr))
 		return PTR_ERR(fmr);
 
-	*state->next_fmr++ = fmr;
+	*state->fmr.next++ = fmr;
 	state->nmdesc++;
 
 	srp_map_desc(state, state->base_dma_addr & ~dev->mr_page_mask,
@@ -1301,6 +1304,9 @@ static int srp_map_finish_fr(struct srp_map_state *state,
 	struct srp_fr_desc *desc;
 	u32 rkey;
 
+	if (state->fr.next >= state->fr.end)
+		return -ENOMEM;
+
 	desc = srp_fr_pool_get(ch->fr_pool);
 	if (!desc)
 		return -ENOMEM;
@@ -1324,7 +1330,7 @@ static int srp_map_finish_fr(struct srp_map_state *state,
 				       IB_ACCESS_REMOTE_WRITE);
 	wr.wr.fast_reg.rkey = desc->mr->lkey;
 
-	*state->next_fr++ = desc;
+	*state->fr.next++ = desc;
 	state->nmdesc++;
 
 	srp_map_desc(state, state->base_dma_addr, state->dma_len,
@@ -1450,10 +1456,12 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
 	state->desc	= req->indirect_desc;
 	state->pages	= req->map_page;
 	if (dev->use_fast_reg) {
-		state->next_fr = req->fr_list;
+		state->fr.next = req->fr_list;
+		state->fr.end = req->fr_list + target->cmd_sg_cnt;
 		use_mr = !!ch->fr_pool;
 	} else {
-		state->next_fmr = req->fmr_list;
+		state->fmr.next = req->fmr_list;
+		state->fmr.end = req->fmr_list + target->cmd_sg_cnt;
 		use_mr = !!ch->fmr_pool;
 	}
 
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 17ee3f8..2ab73bc 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -282,8 +282,14 @@ struct srp_fr_pool {
  */
 struct srp_map_state {
 	union {
-		struct ib_pool_fmr **next_fmr;
-		struct srp_fr_desc **next_fr;
+		struct {
+			struct ib_pool_fmr **next;
+			struct ib_pool_fmr **end;
+		} fmr;
+		struct {
+			struct srp_fr_desc **next;
+			struct srp_fr_desc **end;
+		} fr;
 	};
 	struct srp_direct_buf  *desc;
 	u64		       *pages;
-- 
cgit v0.10.2


From 0e0d3a480090d03f29e58dfd717960776f3416d9 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Mon, 10 Aug 2015 17:07:46 -0700
Subject: IB/srp: Remove the memory registration backtracking code

Mapping a discontiguous sg-list requires multiple memory regions
and hence can exhaust the memory region pool. The SRP initiator
already handles this by temporarily reducing the queue depth. This
means that it is safe to remove the memory registration backtracking
code. This patch has been tested with direct I/O sizes up to 256 MB.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index b7feb74..acda659 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1364,15 +1364,6 @@ static int srp_finish_mapping(struct srp_map_state *state,
 	return ret;
 }
 
-static void srp_map_update_start(struct srp_map_state *state,
-				 struct scatterlist *sg, int sg_index,
-				 dma_addr_t dma_addr)
-{
-	state->unmapped_sg = sg;
-	state->unmapped_index = sg_index;
-	state->unmapped_addr = dma_addr;
-}
-
 static int srp_map_sg_entry(struct srp_map_state *state,
 			    struct srp_rdma_ch *ch,
 			    struct scatterlist *sg, int sg_index,
@@ -1399,23 +1390,12 @@ static int srp_map_sg_entry(struct srp_map_state *state,
 		return 0;
 	}
 
-	/*
-	 * If this is the first sg that will be mapped via FMR or via FR, save
-	 * our position. We need to know the first unmapped entry, its index,
-	 * and the first unmapped address within that entry to be able to
-	 * restart mapping after an error.
-	 */
-	if (!state->unmapped_sg)
-		srp_map_update_start(state, sg, sg_index, dma_addr);
-
 	while (dma_len) {
 		unsigned offset = dma_addr & ~dev->mr_page_mask;
 		if (state->npages == dev->max_pages_per_mr || offset != 0) {
 			ret = srp_finish_mapping(state, ch);
 			if (ret)
 				return ret;
-
-			srp_map_update_start(state, sg, sg_index, dma_addr);
 		}
 
 		len = min_t(unsigned int, dma_len, dev->mr_page_size - offset);
@@ -1434,11 +1414,8 @@ static int srp_map_sg_entry(struct srp_map_state *state,
 	 * boundries.
 	 */
 	ret = 0;
-	if (len != dev->mr_page_size) {
+	if (len != dev->mr_page_size)
 		ret = srp_finish_mapping(state, ch);
-		if (!ret)
-			srp_map_update_start(state, NULL, 0, 0);
-	}
 	return ret;
 }
 
@@ -1448,9 +1425,8 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
 {
 	struct srp_target_port *target = ch->target;
 	struct srp_device *dev = target->srp_host->srp_dev;
-	struct ib_device *ibdev = dev->dev;
 	struct scatterlist *sg;
-	int i;
+	int i, ret;
 	bool use_mr;
 
 	state->desc	= req->indirect_desc;
@@ -1466,34 +1442,22 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
 	}
 
 	for_each_sg(scat, sg, count, i) {
-		if (srp_map_sg_entry(state, ch, sg, i, use_mr)) {
-			/*
-			 * Memory registration failed, so backtrack to the
-			 * first unmapped entry and continue on without using
-			 * memory registration.
-			 */
-			dma_addr_t dma_addr;
-			unsigned int dma_len;
-
-backtrack:
-			sg = state->unmapped_sg;
-			i = state->unmapped_index;
-
-			dma_addr = ib_sg_dma_address(ibdev, sg);
-			dma_len = ib_sg_dma_len(ibdev, sg);
-			dma_len -= (state->unmapped_addr - dma_addr);
-			dma_addr = state->unmapped_addr;
-			use_mr = false;
-			srp_map_desc(state, dma_addr, dma_len, target->rkey);
-		}
+		ret = srp_map_sg_entry(state, ch, sg, i, use_mr);
+		if (ret)
+			goto out;
 	}
 
-	if (use_mr && srp_finish_mapping(state, ch))
-		goto backtrack;
+	if (use_mr) {
+		ret = srp_finish_mapping(state, ch);
+		if (ret)
+			goto out;
+	}
 
 	req->nmdesc = state->nmdesc;
+	ret = 0;
 
-	return 0;
+out:
+	return ret;
 }
 
 static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 2ab73bc..1e42418 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -276,9 +276,6 @@ struct srp_fr_pool {
  * @npages:	    Number of page addresses in the pages[] array.
  * @nmdesc:	    Number of FMR or FR memory descriptors used for mapping.
  * @ndesc:	    Number of SRP buffer descriptors that have been filled in.
- * @unmapped_sg:    First element of the sg-list that is mapped via FMR or FR.
- * @unmapped_index: Index of the first element mapped via FMR or FR.
- * @unmapped_addr:  DMA address of the first element mapped via FMR or FR.
  */
 struct srp_map_state {
 	union {
@@ -299,9 +296,6 @@ struct srp_map_state {
 	unsigned int		npages;
 	unsigned int		nmdesc;
 	unsigned int		ndesc;
-	struct scatterlist     *unmapped_sg;
-	int			unmapped_index;
-	dma_addr_t		unmapped_addr;
 };
 
 #endif /* IB_SRP_H */
-- 
cgit v0.10.2


From 3ae95da8831e7bcf7b6da2b31cb4b50b3058f14e Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Mon, 10 Aug 2015 17:08:18 -0700
Subject: IB/srp: Remove use_mr argument from srp_map_sg_entry()

Move the srp_map_desc() call from inside srp_map_sg_entry() to
srp_map_sg() such that the use_mr argument can be removed from
srp_map_sg_entry().

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index acda659..9f57fa00 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1260,6 +1260,8 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
 {
 	struct srp_direct_buf *desc = state->desc;
 
+	WARN_ON_ONCE(!dma_len);
+
 	desc->va = cpu_to_be64(dma_addr);
 	desc->key = cpu_to_be32(rkey);
 	desc->len = cpu_to_be32(dma_len);
@@ -1366,29 +1368,17 @@ static int srp_finish_mapping(struct srp_map_state *state,
 
 static int srp_map_sg_entry(struct srp_map_state *state,
 			    struct srp_rdma_ch *ch,
-			    struct scatterlist *sg, int sg_index,
-			    bool use_mr)
+			    struct scatterlist *sg, int sg_index)
 {
 	struct srp_target_port *target = ch->target;
 	struct srp_device *dev = target->srp_host->srp_dev;
 	struct ib_device *ibdev = dev->dev;
 	dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
 	unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
-	unsigned int len;
+	unsigned int len = 0;
 	int ret;
 
-	if (!dma_len)
-		return 0;
-
-	if (!use_mr) {
-		/*
-		 * Once we're in direct map mode for a request, we don't
-		 * go back to FMR or FR mode, so no need to update anything
-		 * other than the descriptor.
-		 */
-		srp_map_desc(state, dma_addr, dma_len, target->rkey);
-		return 0;
-	}
+	WARN_ON_ONCE(!dma_len);
 
 	while (dma_len) {
 		unsigned offset = dma_addr & ~dev->mr_page_mask;
@@ -1441,16 +1431,20 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
 		use_mr = !!ch->fmr_pool;
 	}
 
-	for_each_sg(scat, sg, count, i) {
-		ret = srp_map_sg_entry(state, ch, sg, i, use_mr);
-		if (ret)
-			goto out;
-	}
-
 	if (use_mr) {
+		for_each_sg(scat, sg, count, i) {
+			ret = srp_map_sg_entry(state, ch, sg, i);
+			if (ret)
+				goto out;
+		}
 		ret = srp_finish_mapping(state, ch);
 		if (ret)
 			goto out;
+	} else {
+		for_each_sg(scat, sg, count, i) {
+			srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
+				     ib_sg_dma_len(dev->dev, sg), target->rkey);
+		}
 	}
 
 	req->nmdesc = state->nmdesc;
-- 
cgit v0.10.2


From 002f15674c84fb4c38ed5237c1e7235e09c033f0 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Mon, 10 Aug 2015 17:08:44 -0700
Subject: IB/srp: Introduce srp_device.use_fmr

Introduce the variable srp_device.use_fmr. Leave out the dev->has_fr /
dev->has_fmr and ch->fr_pool / ch->fmr_pool checks since these are
redundant. This patch does not change any functionality but makes the
source code easier to read.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 9f57fa00..8f385af 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -546,7 +546,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 	if (ret)
 		goto err_qp;
 
-	if (dev->use_fast_reg && dev->has_fr) {
+	if (dev->use_fast_reg) {
 		fr_pool = srp_alloc_fr_pool(target);
 		if (IS_ERR(fr_pool)) {
 			ret = PTR_ERR(fr_pool);
@@ -557,7 +557,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 		if (ch->fr_pool)
 			srp_destroy_fr_pool(ch->fr_pool);
 		ch->fr_pool = fr_pool;
-	} else if (!dev->use_fast_reg && dev->has_fmr) {
+	} else if (dev->use_fmr) {
 		fmr_pool = srp_alloc_fmr_pool(target);
 		if (IS_ERR(fmr_pool)) {
 			ret = PTR_ERR(fmr_pool);
@@ -623,7 +623,7 @@ static void srp_free_ch_ib(struct srp_target_port *target,
 	if (dev->use_fast_reg) {
 		if (ch->fr_pool)
 			srp_destroy_fr_pool(ch->fr_pool);
-	} else {
+	} else if (dev->use_fmr) {
 		if (ch->fmr_pool)
 			ib_destroy_fmr_pool(ch->fmr_pool);
 	}
@@ -1085,7 +1085,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
 		if (req->nmdesc)
 			srp_fr_pool_put(ch->fr_pool, req->fr_list,
 					req->nmdesc);
-	} else {
+	} else if (dev->use_fmr) {
 		struct ib_pool_fmr **pfmr;
 
 		for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++)
@@ -1345,8 +1345,11 @@ static int srp_finish_mapping(struct srp_map_state *state,
 			      struct srp_rdma_ch *ch)
 {
 	struct srp_target_port *target = ch->target;
+	struct srp_device *dev = target->srp_host->srp_dev;
 	int ret = 0;
 
+	WARN_ON_ONCE(!dev->use_fast_reg && !dev->use_fmr);
+
 	if (state->npages == 0)
 		return 0;
 
@@ -1354,8 +1357,7 @@ static int srp_finish_mapping(struct srp_map_state *state,
 		srp_map_desc(state, state->base_dma_addr, state->dma_len,
 			     target->rkey);
 	else
-		ret = target->srp_host->srp_dev->use_fast_reg ?
-			srp_map_finish_fr(state, ch) :
+		ret = dev->use_fast_reg ? srp_map_finish_fr(state, ch) :
 			srp_map_finish_fmr(state, ch);
 
 	if (ret == 0) {
@@ -1417,21 +1419,18 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
 	struct srp_device *dev = target->srp_host->srp_dev;
 	struct scatterlist *sg;
 	int i, ret;
-	bool use_mr;
 
 	state->desc	= req->indirect_desc;
 	state->pages	= req->map_page;
 	if (dev->use_fast_reg) {
 		state->fr.next = req->fr_list;
 		state->fr.end = req->fr_list + target->cmd_sg_cnt;
-		use_mr = !!ch->fr_pool;
-	} else {
+	} else if (dev->use_fmr) {
 		state->fmr.next = req->fmr_list;
 		state->fmr.end = req->fmr_list + target->cmd_sg_cnt;
-		use_mr = !!ch->fmr_pool;
 	}
 
-	if (use_mr) {
+	if (dev->use_fast_reg || dev->use_fmr) {
 		for_each_sg(scat, sg, count, i) {
 			ret = srp_map_sg_entry(state, ch, sg, i);
 			if (ret)
@@ -3364,6 +3363,7 @@ static void srp_add_one(struct ib_device *device)
 
 	srp_dev->use_fast_reg = (srp_dev->has_fr &&
 				 (!srp_dev->has_fmr || prefer_fr));
+	srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
 
 	/*
 	 * Use the smallest page size supported by the HCA, down to a
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 1e42418..60a33c1 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -102,6 +102,7 @@ struct srp_device {
 	int			max_pages_per_mr;
 	bool			has_fmr;
 	bool			has_fr;
+	bool			use_fmr;
 	bool			use_fast_reg;
 };
 
-- 
cgit v0.10.2


From 330179f2fa93c1c6c41a90c7deabc98e363018e5 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Mon, 10 Aug 2015 17:09:05 -0700
Subject: IB/srp: Register the indirect data buffer descriptor

Instead of always using the global rkey for the indirect data
buffer descriptor, register that descriptor with the HCA if
the kernel module parameter register_always has been set to Y.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 8f385af..45b81f1 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1453,18 +1453,58 @@ out:
 	return ret;
 }
 
+/*
+ * Register the indirect data buffer descriptor with the HCA.
+ *
+ * Note: since the indirect data buffer descriptor has been allocated with
+ * kmalloc() it is guaranteed that this buffer is a physically contiguous
+ * memory buffer.
+ */
+static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
+		       void **next_mr, void **end_mr, u32 idb_len,
+		       __be32 *idb_rkey)
+{
+	struct srp_target_port *target = ch->target;
+	struct srp_device *dev = target->srp_host->srp_dev;
+	struct srp_map_state state;
+	struct srp_direct_buf idb_desc;
+	u64 idb_pages[1];
+	int ret;
+
+	memset(&state, 0, sizeof(state));
+	memset(&idb_desc, 0, sizeof(idb_desc));
+	state.gen.next = next_mr;
+	state.gen.end = end_mr;
+	state.desc = &idb_desc;
+	state.pages = idb_pages;
+	state.pages[0] = (req->indirect_dma_addr &
+			  dev->mr_page_mask);
+	state.npages = 1;
+	state.base_dma_addr = req->indirect_dma_addr;
+	state.dma_len = idb_len;
+	ret = srp_finish_mapping(&state, ch);
+	if (ret < 0)
+		goto out;
+
+	*idb_rkey = idb_desc.key;
+
+out:
+	return ret;
+}
+
 static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 			struct srp_request *req)
 {
 	struct srp_target_port *target = ch->target;
 	struct scatterlist *scat;
 	struct srp_cmd *cmd = req->cmd->buf;
-	int len, nents, count;
+	int len, nents, count, ret;
 	struct srp_device *dev;
 	struct ib_device *ibdev;
 	struct srp_map_state state;
 	struct srp_indirect_buf *indirect_hdr;
-	u32 table_len;
+	u32 idb_len, table_len;
+	__be32 idb_rkey;
 	u8 fmt;
 
 	if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
@@ -1546,6 +1586,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 
 	count = min(state.ndesc, target->cmd_sg_cnt);
 	table_len = state.ndesc * sizeof (struct srp_direct_buf);
+	idb_len = sizeof(struct srp_indirect_buf) + table_len;
 
 	fmt = SRP_DATA_DESC_INDIRECT;
 	len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf);
@@ -1554,8 +1595,18 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 	memcpy(indirect_hdr->desc_list, req->indirect_desc,
 	       count * sizeof (struct srp_direct_buf));
 
+	if (register_always && (dev->use_fast_reg || dev->use_fmr)) {
+		ret = srp_map_idb(ch, req, state.gen.next, state.gen.end,
+				  idb_len, &idb_rkey);
+		if (ret < 0)
+			return ret;
+		req->nmdesc++;
+	} else {
+		idb_rkey = target->rkey;
+	}
+
 	indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr);
-	indirect_hdr->table_desc.key = cpu_to_be32(target->rkey);
+	indirect_hdr->table_desc.key = idb_rkey;
 	indirect_hdr->table_desc.len = cpu_to_be32(table_len);
 	indirect_hdr->len = cpu_to_be32(state.total_len);
 
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 60a33c1..255b0e5 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -288,6 +288,10 @@ struct srp_map_state {
 			struct srp_fr_desc **next;
 			struct srp_fr_desc **end;
 		} fr;
+		struct {
+			void		   **next;
+			void		   **end;
+		} gen;
 	};
 	struct srp_direct_buf  *desc;
 	u64		       *pages;
-- 
cgit v0.10.2


From 03f6fb93fde24f01a940283bdf55024e576ee87d Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Mon, 10 Aug 2015 17:09:36 -0700
Subject: IB/srp: Create an insecure all physical rkey only if needed

The SRP initiator only needs this if the insecure register_always=N
performance optimization is enabled, or if FRWR/FMR is not supported
in the driver.

Do not create an all physical MR unless it is needed to support
either of those modes. Default register_always to true so the out of
the box configuration does not create an insecure all physical MR.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
[bvanassche: reworked and rebased this patch]
Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 45b81f1..ca98d3b 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -68,8 +68,8 @@ static unsigned int srp_sg_tablesize;
 static unsigned int cmd_sg_entries;
 static unsigned int indirect_sg_entries;
 static bool allow_ext_sg;
-static bool prefer_fr;
-static bool register_always;
+static bool prefer_fr = true;
+static bool register_always = true;
 static int topspin_workarounds = 1;
 
 module_param(srp_sg_tablesize, uint, 0444);
@@ -1353,9 +1353,9 @@ static int srp_finish_mapping(struct srp_map_state *state,
 	if (state->npages == 0)
 		return 0;
 
-	if (state->npages == 1 && !register_always)
+	if (state->npages == 1 && target->global_mr)
 		srp_map_desc(state, state->base_dma_addr, state->dma_len,
-			     target->rkey);
+			     target->global_mr->rkey);
 	else
 		ret = dev->use_fast_reg ? srp_map_finish_fr(state, ch) :
 			srp_map_finish_fmr(state, ch);
@@ -1442,7 +1442,8 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
 	} else {
 		for_each_sg(scat, sg, count, i) {
 			srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
-				     ib_sg_dma_len(dev->dev, sg), target->rkey);
+				     ib_sg_dma_len(dev->dev, sg),
+				     target->global_mr->rkey);
 		}
 	}
 
@@ -1531,7 +1532,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 	fmt = SRP_DATA_DESC_DIRECT;
 	len = sizeof (struct srp_cmd) +	sizeof (struct srp_direct_buf);
 
-	if (count == 1 && !register_always) {
+	if (count == 1 && target->global_mr) {
 		/*
 		 * The midlayer only generated a single gather/scatter
 		 * entry, or DMA mapping coalesced everything to a
@@ -1541,7 +1542,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 		struct srp_direct_buf *buf = (void *) cmd->add_data;
 
 		buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
-		buf->key = cpu_to_be32(target->rkey);
+		buf->key = cpu_to_be32(target->global_mr->rkey);
 		buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
 
 		req->nmdesc = 0;
@@ -1595,14 +1596,14 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 	memcpy(indirect_hdr->desc_list, req->indirect_desc,
 	       count * sizeof (struct srp_direct_buf));
 
-	if (register_always && (dev->use_fast_reg || dev->use_fmr)) {
+	if (!target->global_mr) {
 		ret = srp_map_idb(ch, req, state.gen.next, state.gen.end,
 				  idb_len, &idb_rkey);
 		if (ret < 0)
 			return ret;
 		req->nmdesc++;
 	} else {
-		idb_rkey = target->rkey;
+		idb_rkey = target->global_mr->rkey;
 	}
 
 	indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr);
@@ -3157,7 +3158,7 @@ static ssize_t srp_create_target(struct device *dev,
 	target->scsi_host	= target_host;
 	target->srp_host	= host;
 	target->lkey		= host->srp_dev->pd->local_dma_lkey;
-	target->rkey		= host->srp_dev->mr->rkey;
+	target->global_mr	= host->srp_dev->global_mr;
 	target->cmd_sg_cnt	= cmd_sg_entries;
 	target->sg_tablesize	= indirect_sg_entries ? : cmd_sg_entries;
 	target->allow_ext_sg	= allow_ext_sg;
@@ -3447,12 +3448,16 @@ static void srp_add_one(struct ib_device *device)
 	if (IS_ERR(srp_dev->pd))
 		goto free_dev;
 
-	srp_dev->mr = ib_get_dma_mr(srp_dev->pd,
-				    IB_ACCESS_LOCAL_WRITE |
-				    IB_ACCESS_REMOTE_READ |
-				    IB_ACCESS_REMOTE_WRITE);
-	if (IS_ERR(srp_dev->mr))
-		goto err_pd;
+	if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) {
+		srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd,
+						   IB_ACCESS_LOCAL_WRITE |
+						   IB_ACCESS_REMOTE_READ |
+						   IB_ACCESS_REMOTE_WRITE);
+		if (IS_ERR(srp_dev->global_mr))
+			goto err_pd;
+	} else {
+		srp_dev->global_mr = NULL;
+	}
 
 	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
 		host = srp_add_port(srp_dev, p);
@@ -3509,7 +3514,8 @@ static void srp_remove_one(struct ib_device *device, void *client_data)
 		kfree(host);
 	}
 
-	ib_dereg_mr(srp_dev->mr);
+	if (srp_dev->global_mr)
+		ib_dereg_mr(srp_dev->global_mr);
 	ib_dealloc_pd(srp_dev->pd);
 
 	kfree(srp_dev);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 255b0e5..3608f2e 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -95,7 +95,7 @@ struct srp_device {
 	struct list_head	dev_list;
 	struct ib_device       *dev;
 	struct ib_pd	       *pd;
-	struct ib_mr	       *mr;
+	struct ib_mr	       *global_mr;
 	u64			mr_page_mask;
 	int			mr_page_size;
 	int			mr_max_size;
@@ -183,10 +183,10 @@ struct srp_target_port {
 	spinlock_t		lock;
 
 	/* read only in the hot path */
+	struct ib_mr		*global_mr;
 	struct srp_rdma_ch	*ch;
 	u32			ch_count;
 	u32			lkey;
-	u32			rkey;
 	enum srp_target_state	state;
 	unsigned int		max_iu_len;
 	unsigned int		cmd_sg_cnt;
-- 
cgit v0.10.2


From 7dd78647a2c2c224e376fc72797d411a3a0bb047 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Wed, 5 Aug 2015 14:34:31 -0600
Subject: IB/core: Make ib_dealloc_pd return void

The majority of callers never check the return value, and even if they
did, they can't do anything about a failure.

All possible failure cases represent a bug in the caller, so just
WARN_ON inside the function instead.

This fixes a few random errors:
 net/rd/iw.c infinite loops while it fails. (racing with EBUSY?)

This also lays the ground work to get rid of error return from the
drivers. Most drivers do not error, the few that do are broken since
it cannot be handled.

Since uverbs can legitimately make use of EBUSY, open code the
check.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 258485e..4c98696 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -606,6 +606,7 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 {
 	struct ib_uverbs_dealloc_pd cmd;
 	struct ib_uobject          *uobj;
+	struct ib_pd		   *pd;
 	int                         ret;
 
 	if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -614,15 +615,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 	uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
 	if (!uobj)
 		return -EINVAL;
+	pd = uobj->object;
 
-	ret = ib_dealloc_pd(uobj->object);
-	if (!ret)
-		uobj->live = 0;
-
-	put_uobj_write(uobj);
+	if (atomic_read(&pd->usecnt)) {
+		ret = -EBUSY;
+		goto err_put;
+	}
 
+	ret = pd->device->dealloc_pd(uobj->object);
+	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
 	if (ret)
-		return ret;
+		goto err_put;
+
+	uobj->live = 0;
+	put_uobj_write(uobj);
 
 	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
 
@@ -633,6 +639,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 	put_uobj(uobj);
 
 	return in_len;
+
+err_put:
+	put_uobj_write(uobj);
+	return ret;
 }
 
 struct xrcd_table_entry {
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 2e5fd89..aad8b3c 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -260,18 +260,32 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device)
 }
 EXPORT_SYMBOL(ib_alloc_pd);
 
-int ib_dealloc_pd(struct ib_pd *pd)
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist.  The caller is responsible to synchronously destroy them and
+ * guarantee no new allocations will happen.
+ */
+void ib_dealloc_pd(struct ib_pd *pd)
 {
+	int ret;
+
 	if (pd->local_mr) {
-		if (ib_dereg_mr(pd->local_mr))
-			return -EBUSY;
+		ret = ib_dereg_mr(pd->local_mr);
+		WARN_ON(ret);
 		pd->local_mr = NULL;
 	}
 
-	if (atomic_read(&pd->usecnt))
-		return -EBUSY;
+	/* uverbs manipulates usecnt with proper locking, while the kabi
+	   requires the caller to guarantee we can't race here. */
+	WARN_ON(atomic_read(&pd->usecnt));
 
-	return pd->device->dealloc_pd(pd);
+	/* Making delalloc_pd a void return is a WIP, no driver should return
+	   an error here. */
+	ret = pd->device->dealloc_pd(pd);
+	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
 }
 EXPORT_SYMBOL(ib_dealloc_pd);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 8c45198..78845b6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -280,9 +280,7 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
 		priv->wq = NULL;
 	}
 
-	if (ib_dealloc_pd(priv->pd))
-		ipoib_warn(priv, "ib_dealloc_pd failed\n");
-
+	ib_dealloc_pd(priv->pd);
 }
 
 void ipoib_event(struct ib_event_handler *handler,
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index ad2d2b5..ae70cc1 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -185,7 +185,7 @@ static void iser_free_device_ib_res(struct iser_device *device)
 
 	(void)ib_unregister_event_handler(&device->event_handler);
 	(void)ib_dereg_mr(device->mr);
-	(void)ib_dealloc_pd(device->pd);
+	ib_dealloc_pd(device->pd);
 
 	kfree(device->comps);
 	device->comps = NULL;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0940051..128abf2 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2196,11 +2196,7 @@ int ib_find_pkey(struct ib_device *device,
 
 struct ib_pd *ib_alloc_pd(struct ib_device *device);
 
-/**
- * ib_dealloc_pd - Deallocates a protection domain.
- * @pd: The protection domain to deallocate.
- */
-int ib_dealloc_pd(struct ib_pd *pd);
+void ib_dealloc_pd(struct ib_pd *pd);
 
 /**
  * ib_create_ah - Creates an address handle for the given address vector.
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 7cc2f32..c7dcddb 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -148,10 +148,7 @@ static void rds_iw_remove_one(struct ib_device *device, void *client_data)
 	if (rds_iwdev->mr)
 		ib_dereg_mr(rds_iwdev->mr);
 
-	while (ib_dealloc_pd(rds_iwdev->pd)) {
-		rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
-		msleep(1);
-	}
+	ib_dealloc_pd(rds_iwdev->pd);
 
 	list_del(&rds_iwdev->list);
 	kfree(rds_iwdev);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 891c4ed..afd5043 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -624,7 +624,7 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
 
 	/* If the pd is still busy, xprtrdma missed freeing a resource */
 	if (ia->ri_pd && !IS_ERR(ia->ri_pd))
-		WARN_ON(ib_dealloc_pd(ia->ri_pd));
+		ib_dealloc_pd(ia->ri_pd);
 }
 
 /*
-- 
cgit v0.10.2


From 03c40442a0e66fa52aec6733ea88804fe7d12c77 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 13 Aug 2015 18:32:02 +0300
Subject: IB/uverbs: Fix reference counting usage of event files

Fix the reference counting usage to be handled in the event file
creation/destruction function, instead of being done by the caller.
This is done for both async/non-async event files.

Based on Jason Gunthorpe report at https://www.mail-archive.com/
linux-rdma@vger.kernel.org/msg24680.html:
"The existing code for this is broken, in ib_uverbs_get_context all
the error paths between ib_uverbs_alloc_event_file and the
kref_get(file->ref) are wrong - this will result in fput() which will
call ib_uverbs_event_close, which will try to do kref_put and
ib_unregister_event_handler - which are no longer paired."

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index ba365b6..60e6e3d 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -178,6 +178,7 @@ void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
 
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 					int is_async);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
 struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
 
 void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 4c98696..ae2d5972 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -367,16 +367,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 		goto err_file;
 	}
 
-	file->async_file = filp->private_data;
-
-	INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
-			      ib_uverbs_event_handler);
-	ret = ib_register_event_handler(&file->event_handler);
-	if (ret)
-		goto err_file;
-
-	kref_get(&file->async_file->ref);
-	kref_get(&file->ref);
 	file->ucontext = ucontext;
 
 	fd_install(resp.async_fd, filp);
@@ -386,6 +376,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	return in_len;
 
 err_file:
+	ib_uverbs_free_async_event_file(file);
 	fput(filp);
 
 err_fd:
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 46c9229..7536a4c8 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -406,10 +406,9 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 	}
 	spin_unlock_irq(&file->lock);
 
-	if (file->is_async) {
+	if (file->is_async)
 		ib_unregister_event_handler(&file->uverbs_file->event_handler);
-		kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
-	}
+	kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
 	kref_put(&file->ref, ib_uverbs_release_event_file);
 
 	return 0;
@@ -541,13 +540,20 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
 				NULL, NULL);
 }
 
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+	kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+	file->async_file = NULL;
+}
+
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 					int is_async)
 {
 	struct ib_uverbs_event_file *ev_file;
 	struct file *filp;
+	int ret;
 
-	ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+	ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
 	if (!ev_file)
 		return ERR_PTR(-ENOMEM);
 
@@ -556,15 +562,41 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 	INIT_LIST_HEAD(&ev_file->event_list);
 	init_waitqueue_head(&ev_file->poll_wait);
 	ev_file->uverbs_file = uverbs_file;
+	kref_get(&ev_file->uverbs_file->ref);
 	ev_file->async_queue = NULL;
-	ev_file->is_async    = is_async;
 	ev_file->is_closed   = 0;
 
 	filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
 				  ev_file, O_RDONLY);
 	if (IS_ERR(filp))
-		kfree(ev_file);
+		goto err_put_refs;
+
+	if (is_async) {
+		WARN_ON(uverbs_file->async_file);
+		uverbs_file->async_file = ev_file;
+		kref_get(&uverbs_file->async_file->ref);
+		INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
+				      uverbs_file->device->ib_dev,
+				      ib_uverbs_event_handler);
+		ret = ib_register_event_handler(&uverbs_file->event_handler);
+		if (ret)
+			goto err_put_file;
+
+		/* At that point async file stuff was fully set */
+		ev_file->is_async = 1;
+	}
+
+	return filp;
+
+err_put_file:
+	fput(filp);
+	kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file);
+	uverbs_file->async_file = NULL;
+	return ERR_PTR(ret);
 
+err_put_refs:
+	kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+	kref_put(&ev_file->ref, ib_uverbs_release_event_file);
 	return filp;
 }
 
-- 
cgit v0.10.2


From 35d4a0b63dc0c6d1177d4f532a9deae958f0662c Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 13 Aug 2015 18:32:03 +0300
Subject: IB/uverbs: Fix race between ib_uverbs_open and remove_one

Fixes: 2a72f212263701b927559f6850446421d5906c41 ("IB/uverbs: Remove dev_table")

Before this commit there was a device look-up table that was protected
by a spin_lock used by ib_uverbs_open and by ib_uverbs_remove_one. When
it was dropped and container_of was used instead, it enabled the race
with remove_one as dev might be freed just after:
dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev) but
before the kref_get.

In addition, this buggy patch added some dead code as
container_of(x,y,z) can never be NULL and so dev can never be NULL.
As a result the comment above ib_uverbs_open saying "the open method
will either immediately run -ENXIO" is wrong as it can never happen.

The solution follows Jason Gunthorpe suggestion from below URL:
https://www.mail-archive.com/linux-rdma@vger.kernel.org/msg25692.html

cdev will hold a kref on the parent (the containing structure,
ib_uverbs_device) and only when that kref is released it is
guaranteed that open will never be called again.

In addition, fixes the active count scheme to use an atomic
not a kref to prevent WARN_ON as pointed by above comment
from Jason.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 60e6e3d..92ec765 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -85,7 +85,7 @@
  */
 
 struct ib_uverbs_device {
-	struct kref				ref;
+	atomic_t				refcount;
 	int					num_comp_vectors;
 	struct completion			comp;
 	struct device			       *dev;
@@ -94,6 +94,7 @@ struct ib_uverbs_device {
 	struct cdev			        cdev;
 	struct rb_root				xrcd_tree;
 	struct mutex				xrcd_tree_mutex;
+	struct kobject				kobj;
 };
 
 struct ib_uverbs_event_file {
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 7536a4c8..e05bbb2 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -130,14 +130,18 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
 static void ib_uverbs_add_one(struct ib_device *device);
 static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
 
-static void ib_uverbs_release_dev(struct kref *ref)
+static void ib_uverbs_release_dev(struct kobject *kobj)
 {
 	struct ib_uverbs_device *dev =
-		container_of(ref, struct ib_uverbs_device, ref);
+		container_of(kobj, struct ib_uverbs_device, kobj);
 
-	complete(&dev->comp);
+	kfree(dev);
 }
 
+static struct kobj_type ib_uverbs_dev_ktype = {
+	.release = ib_uverbs_release_dev,
+};
+
 static void ib_uverbs_release_event_file(struct kref *ref)
 {
 	struct ib_uverbs_event_file *file =
@@ -303,13 +307,19 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 	return context->device->dealloc_ucontext(context);
 }
 
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+	complete(&dev->comp);
+}
+
 static void ib_uverbs_release_file(struct kref *ref)
 {
 	struct ib_uverbs_file *file =
 		container_of(ref, struct ib_uverbs_file, ref);
 
 	module_put(file->device->ib_dev->owner);
-	kref_put(&file->device->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&file->device->refcount))
+		ib_uverbs_comp_dev(file->device);
 
 	kfree(file);
 }
@@ -775,9 +785,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	int ret;
 
 	dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
-	if (dev)
-		kref_get(&dev->ref);
-	else
+	if (!atomic_inc_not_zero(&dev->refcount))
 		return -ENXIO;
 
 	if (!try_module_get(dev->ib_dev->owner)) {
@@ -798,6 +806,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	mutex_init(&file->mutex);
 
 	filp->private_data = file;
+	kobject_get(&dev->kobj);
 
 	return nonseekable_open(inode, filp);
 
@@ -805,13 +814,16 @@ err_module:
 	module_put(dev->ib_dev->owner);
 
 err:
-	kref_put(&dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&dev->refcount))
+		ib_uverbs_comp_dev(dev);
+
 	return ret;
 }
 
 static int ib_uverbs_close(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_uverbs_device *dev = file->device;
 
 	ib_uverbs_cleanup_ucontext(file, file->ucontext);
 
@@ -819,6 +831,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
 		kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
 
 	kref_put(&file->ref, ib_uverbs_release_file);
+	kobject_put(&dev->kobj);
 
 	return 0;
 }
@@ -914,10 +927,11 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	if (!uverbs_dev)
 		return;
 
-	kref_init(&uverbs_dev->ref);
+	atomic_set(&uverbs_dev->refcount, 1);
 	init_completion(&uverbs_dev->comp);
 	uverbs_dev->xrcd_tree = RB_ROOT;
 	mutex_init(&uverbs_dev->xrcd_tree_mutex);
+	kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
 
 	spin_lock(&map_lock);
 	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -944,6 +958,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	cdev_init(&uverbs_dev->cdev, NULL);
 	uverbs_dev->cdev.owner = THIS_MODULE;
 	uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+	uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
 	kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
 	if (cdev_add(&uverbs_dev->cdev, base, 1))
 		goto err_cdev;
@@ -974,9 +989,10 @@ err_cdev:
 		clear_bit(devnum, overflow_map);
 
 err:
-	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&uverbs_dev->refcount))
+		ib_uverbs_comp_dev(uverbs_dev);
 	wait_for_completion(&uverbs_dev->comp);
-	kfree(uverbs_dev);
+	kobject_put(&uverbs_dev->kobj);
 	return;
 }
 
@@ -996,9 +1012,10 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
 	else
 		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
 
-	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&uverbs_dev->refcount))
+		ib_uverbs_comp_dev(uverbs_dev);
 	wait_for_completion(&uverbs_dev->comp);
-	kfree(uverbs_dev);
+	kobject_put(&uverbs_dev->kobj);
 }
 
 static char *uverbs_devnode(struct device *dev, umode_t *mode)
-- 
cgit v0.10.2


From 057aec0d23f750b27f0bb92d2606871f60417e0a Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 13 Aug 2015 18:32:04 +0300
Subject: IB/uverbs: Explicitly pass ib_dev to uverbs commands

Done in preparation for deploying RCU for the device removal
flow. Allows isolating the RCU handling to the uverb_main layer and
keeping the uverbs_cmd code as is.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 92ec765..ea52db1 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -178,6 +178,7 @@ extern struct idr ib_uverbs_rule_idr;
 void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
 
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					struct ib_device *ib_dev,
 					int is_async);
 void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
 struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
@@ -214,6 +215,7 @@ struct ib_uverbs_flow_spec {
 
 #define IB_UVERBS_DECLARE_CMD(name)					\
 	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
+				 struct ib_device *ib_dev,              \
 				 const char __user *buf, int in_len,	\
 				 int out_len)
 
@@ -255,6 +257,7 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
 
 #define IB_UVERBS_DECLARE_EX_CMD(name)				\
 	int ib_uverbs_ex_##name(struct ib_uverbs_file *file,	\
+				struct ib_device *ib_dev,		\
 				struct ib_udata *ucore,		\
 				struct ib_udata *uhw)
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index ae2d5972..a15318a 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -282,13 +282,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
 }
 
 ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      const char __user *buf,
 			      int in_len, int out_len)
 {
 	struct ib_uverbs_get_context      cmd;
 	struct ib_uverbs_get_context_resp resp;
 	struct ib_udata                   udata;
-	struct ib_device                 *ibdev = file->device->ib_dev;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct ib_device_attr		  dev_attr;
 #endif
@@ -313,13 +313,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
-	ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+	ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
 	if (IS_ERR(ucontext)) {
 		ret = PTR_ERR(ucontext);
 		goto err;
 	}
 
-	ucontext->device = ibdev;
+	ucontext->device = ib_dev;
 	INIT_LIST_HEAD(&ucontext->pd_list);
 	INIT_LIST_HEAD(&ucontext->mr_list);
 	INIT_LIST_HEAD(&ucontext->mw_list);
@@ -340,7 +340,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	ucontext->odp_mrs_count = 0;
 	INIT_LIST_HEAD(&ucontext->no_private_counters);
 
-	ret = ib_query_device(ibdev, &dev_attr);
+	ret = ib_query_device(ib_dev, &dev_attr);
 	if (ret)
 		goto err_free;
 	if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
@@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 		goto err_free;
 	resp.async_fd = ret;
 
-	filp = ib_uverbs_alloc_event_file(file, 1);
+	filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
 	if (IS_ERR(filp)) {
 		ret = PTR_ERR(filp);
 		goto err_fd;
@@ -384,7 +384,7 @@ err_fd:
 
 err_free:
 	put_pid(ucontext->tgid);
-	ibdev->dealloc_ucontext(ucontext);
+	ib_dev->dealloc_ucontext(ucontext);
 
 err:
 	mutex_unlock(&file->mutex);
@@ -392,11 +392,12 @@ err:
 }
 
 static void copy_query_dev_fields(struct ib_uverbs_file *file,
+				  struct ib_device *ib_dev,
 				  struct ib_uverbs_query_device_resp *resp,
 				  struct ib_device_attr *attr)
 {
 	resp->fw_ver		= attr->fw_ver;
-	resp->node_guid		= file->device->ib_dev->node_guid;
+	resp->node_guid		= ib_dev->node_guid;
 	resp->sys_image_guid	= attr->sys_image_guid;
 	resp->max_mr_size	= attr->max_mr_size;
 	resp->page_size_cap	= attr->page_size_cap;
@@ -434,10 +435,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file,
 	resp->max_srq_sge		= attr->max_srq_sge;
 	resp->max_pkeys			= attr->max_pkeys;
 	resp->local_ca_ack_delay	= attr->local_ca_ack_delay;
-	resp->phys_port_cnt		= file->device->ib_dev->phys_port_cnt;
+	resp->phys_port_cnt		= ib_dev->phys_port_cnt;
 }
 
 ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+			       struct ib_device *ib_dev,
 			       const char __user *buf,
 			       int in_len, int out_len)
 {
@@ -452,12 +454,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	ret = ib_query_device(file->device->ib_dev, &attr);
+	ret = ib_query_device(ib_dev, &attr);
 	if (ret)
 		return ret;
 
 	memset(&resp, 0, sizeof resp);
-	copy_query_dev_fields(file, &resp, &attr);
+	copy_query_dev_fields(file, ib_dev, &resp, &attr);
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
 			 &resp, sizeof resp))
@@ -467,6 +469,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf,
 			     int in_len, int out_len)
 {
@@ -481,7 +484,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+	ret = ib_query_port(ib_dev, cmd.port_num, &attr);
 	if (ret)
 		return ret;
 
@@ -506,7 +509,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
 	resp.active_width    = attr.active_width;
 	resp.active_speed    = attr.active_speed;
 	resp.phys_state      = attr.phys_state;
-	resp.link_layer      = rdma_port_get_link_layer(file->device->ib_dev,
+	resp.link_layer      = rdma_port_get_link_layer(ib_dev,
 							cmd.port_num);
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -517,6 +520,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   const char __user *buf,
 			   int in_len, int out_len)
 {
@@ -544,14 +548,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
 	init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
 	down_write(&uobj->mutex);
 
-	pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
-					    file->ucontext, &udata);
+	pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
 	if (IS_ERR(pd)) {
 		ret = PTR_ERR(pd);
 		goto err;
 	}
 
-	pd->device  = file->device->ib_dev;
+	pd->device  = ib_dev;
 	pd->uobject = uobj;
 	pd->local_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
@@ -592,6 +595,7 @@ err:
 }
 
 ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf,
 			     int in_len, int out_len)
 {
@@ -722,6 +726,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
 }
 
 ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -780,15 +785,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
 	down_write(&obj->uobject.mutex);
 
 	if (!xrcd) {
-		xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
-							file->ucontext, &udata);
+		xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
 		if (IS_ERR(xrcd)) {
 			ret = PTR_ERR(xrcd);
 			goto err;
 		}
 
 		xrcd->inode   = inode;
-		xrcd->device  = file->device->ib_dev;
+		xrcd->device  = ib_dev;
 		atomic_set(&xrcd->usecnt, 0);
 		mutex_init(&xrcd->tgt_qp_mutex);
 		INIT_LIST_HEAD(&xrcd->tgt_qp_list);
@@ -859,6 +863,7 @@ err_tree_mutex_unlock:
 }
 
 ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -936,6 +941,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
 }
 
 ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+			 struct ib_device *ib_dev,
 			 const char __user *buf, int in_len,
 			 int out_len)
 {
@@ -1045,6 +1051,7 @@ err_free:
 }
 
 ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   const char __user *buf, int in_len,
 			   int out_len)
 {
@@ -1138,6 +1145,7 @@ put_uobjs:
 }
 
 ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   const char __user *buf, int in_len,
 			   int out_len)
 {
@@ -1176,8 +1184,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
-			 const char __user *buf, int in_len,
-			 int out_len)
+			   struct ib_device *ib_dev,
+			   const char __user *buf, int in_len,
+			   int out_len)
 {
 	struct ib_uverbs_alloc_mw      cmd;
 	struct ib_uverbs_alloc_mw_resp resp;
@@ -1258,8 +1267,9 @@ err_free:
 }
 
 ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
-			   const char __user *buf, int in_len,
-			   int out_len)
+			     struct ib_device *ib_dev,
+			     const char __user *buf, int in_len,
+			     int out_len)
 {
 	struct ib_uverbs_dealloc_mw cmd;
 	struct ib_mw               *mw;
@@ -1296,6 +1306,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+				      struct ib_device *ib_dev,
 				      const char __user *buf, int in_len,
 				      int out_len)
 {
@@ -1315,7 +1326,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
 		return ret;
 	resp.fd = ret;
 
-	filp = ib_uverbs_alloc_event_file(file, 0);
+	filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
 	if (IS_ERR(filp)) {
 		put_unused_fd(resp.fd);
 		return PTR_ERR(filp);
@@ -1333,6 +1344,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
 }
 
 static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+					struct ib_device *ib_dev,
 				       struct ib_udata *ucore,
 				       struct ib_udata *uhw,
 				       struct ib_uverbs_ex_create_cq *cmd,
@@ -1381,14 +1393,14 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
 	if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
 		attr.flags = cmd->flags;
 
-	cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
+	cq = ib_dev->create_cq(ib_dev, &attr,
 					     file->ucontext, uhw);
 	if (IS_ERR(cq)) {
 		ret = PTR_ERR(cq);
 		goto err_file;
 	}
 
-	cq->device        = file->device->ib_dev;
+	cq->device        = ib_dev;
 	cq->uobject       = &obj->uobject;
 	cq->comp_handler  = ib_uverbs_comp_handler;
 	cq->event_handler = ib_uverbs_cq_event_handler;
@@ -1449,6 +1461,7 @@ static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -1477,7 +1490,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
 	cmd_ex.comp_vector = cmd.comp_vector;
 	cmd_ex.comp_channel = cmd.comp_channel;
 
-	obj = create_cq(file, &ucore, &uhw, &cmd_ex,
+	obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
 			offsetof(typeof(cmd_ex), comp_channel) +
 			sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
 			NULL);
@@ -1500,6 +1513,7 @@ static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
 }
 
 int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
+			 struct ib_device *ib_dev,
 			   struct ib_udata *ucore,
 			   struct ib_udata *uhw)
 {
@@ -1525,7 +1539,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
 			     sizeof(resp.response_length)))
 		return -ENOSPC;
 
-	obj = create_cq(file, ucore, uhw, &cmd,
+	obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
 			min(ucore->inlen, sizeof(cmd)),
 			ib_uverbs_ex_create_cq_cb, NULL);
 
@@ -1536,6 +1550,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -1599,6 +1614,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
 }
 
 ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+			  struct ib_device *ib_dev,
 			  const char __user *buf, int in_len,
 			  int out_len)
 {
@@ -1650,6 +1666,7 @@ out_put:
 }
 
 ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+				struct ib_device *ib_dev,
 				const char __user *buf, int in_len,
 				int out_len)
 {
@@ -1672,6 +1689,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -1724,6 +1742,7 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -1919,6 +1938,7 @@ err_put:
 }
 
 ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+			  struct ib_device *ib_dev,
 			  const char __user *buf, int in_len, int out_len)
 {
 	struct ib_uverbs_open_qp        cmd;
@@ -2013,6 +2033,7 @@ err_put:
 }
 
 ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   const char __user *buf, int in_len,
 			   int out_len)
 {
@@ -2127,6 +2148,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
 }
 
 ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -2223,6 +2245,7 @@ out:
 }
 
 ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -2281,6 +2304,7 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -2525,6 +2549,7 @@ err:
 }
 
 ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -2574,6 +2599,7 @@ out:
 }
 
 ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+				struct ib_device *ib_dev,
 				const char __user *buf, int in_len,
 				int out_len)
 {
@@ -2623,6 +2649,7 @@ out:
 }
 
 ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -2715,6 +2742,7 @@ err:
 }
 
 ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len, int out_len)
 {
 	struct ib_uverbs_destroy_ah cmd;
@@ -2751,6 +2779,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+			       struct ib_device *ib_dev,
 			       const char __user *buf, int in_len,
 			       int out_len)
 {
@@ -2798,6 +2827,7 @@ out_put:
 }
 
 ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+			       struct ib_device *ib_dev,
 			       const char __user *buf, int in_len,
 			       int out_len)
 {
@@ -2878,6 +2908,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
 }
 
 int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     struct ib_udata *ucore,
 			     struct ib_udata *uhw)
 {
@@ -3038,6 +3069,7 @@ err_free_attr:
 }
 
 int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      struct ib_udata *ucore,
 			      struct ib_udata *uhw)
 {
@@ -3080,6 +3112,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
 }
 
 static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+				struct ib_device *ib_dev,
 				struct ib_uverbs_create_xsrq *cmd,
 				struct ib_udata *udata)
 {
@@ -3213,6 +3246,7 @@ err:
 }
 
 ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -3240,7 +3274,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
-	ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+	ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
 	if (ret)
 		return ret;
 
@@ -3248,6 +3282,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      const char __user *buf, int in_len, int out_len)
 {
 	struct ib_uverbs_create_xsrq     cmd;
@@ -3265,7 +3300,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
-	ret = __uverbs_create_xsrq(file, &cmd, &udata);
+	ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
 	if (ret)
 		return ret;
 
@@ -3273,6 +3308,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -3303,6 +3339,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf,
 			    int in_len, int out_len)
 {
@@ -3343,6 +3380,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      const char __user *buf, int in_len,
 			      int out_len)
 {
@@ -3400,16 +3438,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 }
 
 int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      struct ib_udata *ucore,
 			      struct ib_udata *uhw)
 {
 	struct ib_uverbs_ex_query_device_resp resp;
 	struct ib_uverbs_ex_query_device  cmd;
 	struct ib_device_attr attr;
-	struct ib_device *device;
 	int err;
 
-	device = file->device->ib_dev;
 	if (ucore->inlen < sizeof(cmd))
 		return -EINVAL;
 
@@ -3430,11 +3467,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
 
 	memset(&attr, 0, sizeof(attr));
 
-	err = device->query_device(device, &attr, uhw);
+	err = ib_dev->query_device(ib_dev, &attr, uhw);
 	if (err)
 		return err;
 
-	copy_query_dev_fields(file, &resp.base, &attr);
+	copy_query_dev_fields(file, ib_dev, &resp.base, &attr);
 	resp.comp_mask = 0;
 
 	if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index e05bbb2..082d9c7 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock);
 static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
 
 static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+				     struct ib_device *ib_dev,
 				     const char __user *buf, int in_len,
 				     int out_len) = {
 	[IB_USER_VERBS_CMD_GET_CONTEXT]		= ib_uverbs_get_context,
@@ -119,6 +120,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 };
 
 static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+				    struct ib_device *ib_dev,
 				    struct ib_udata *ucore,
 				    struct ib_udata *uhw) = {
 	[IB_USER_VERBS_EX_CMD_CREATE_FLOW]	= ib_uverbs_ex_create_flow,
@@ -557,6 +559,7 @@ void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
 }
 
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					struct ib_device	*ib_dev,
 					int is_async)
 {
 	struct ib_uverbs_event_file *ev_file;
@@ -586,7 +589,7 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 		uverbs_file->async_file = ev_file;
 		kref_get(&uverbs_file->async_file->ref);
 		INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
-				      uverbs_file->device->ib_dev,
+				      ib_dev,
 				      ib_uverbs_event_handler);
 		ret = ib_register_event_handler(&uverbs_file->event_handler);
 		if (ret)
@@ -643,9 +646,13 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 			     size_t count, loff_t *pos)
 {
 	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_device *ib_dev = file->device->ib_dev;
 	struct ib_uverbs_cmd_hdr hdr;
 	__u32 flags;
 
+	if (!ib_dev)
+		return -ENODEV;
+
 	if (count < sizeof hdr)
 		return -EINVAL;
 
@@ -672,13 +679,13 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		    command != IB_USER_VERBS_CMD_GET_CONTEXT)
 			return -EINVAL;
 
-		if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
+		if (!(ib_dev->uverbs_cmd_mask & (1ull << command)))
 			return -ENOSYS;
 
 		if (hdr.in_words * 4 != count)
 			return -EINVAL;
 
-		return uverbs_cmd_table[command](file,
+		return uverbs_cmd_table[command](file, ib_dev,
 						 buf + sizeof(hdr),
 						 hdr.in_words * 4,
 						 hdr.out_words * 4);
@@ -705,7 +712,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		if (!file->ucontext)
 			return -EINVAL;
 
-		if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
+		if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
 			return -ENOSYS;
 
 		if (count < (sizeof(hdr) + sizeof(ex_hdr)))
@@ -746,6 +753,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 				       ex_hdr.provider_out_words * 8);
 
 		err = uverbs_ex_cmd_table[command](file,
+						   ib_dev,
 						   &ucore,
 						   &uhw);
 
@@ -761,11 +769,12 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_device *ib_dev = file->device->ib_dev;
 
-	if (!file->ucontext)
+	if (!ib_dev || !file->ucontext)
 		return -ENODEV;
 	else
-		return file->device->ib_dev->mmap(file->ucontext, vma);
+		return ib_dev->mmap(file->ucontext, vma);
 }
 
 /*
-- 
cgit v0.10.2


From 036b10635739ffd030246eedde3d67f724800177 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 13 Aug 2015 18:32:05 +0300
Subject: IB/uverbs: Enable device removal when there are active user space
 applications

Enables the uverbs_remove_one to succeed despite the fact that there are
running IB applications working with the given ib device.  This
functionality enables a HW device to be unbind/reset despite the fact that
there are running user space applications using it.

It exposes a new IB kernel API named 'disassociate_ucontext' which lets
a driver detaching its HW resources from a given user context without
crashing/terminating the application. In case a driver implemented the
above API and registered with ib_uverb there will be no dependency between its
device to its uverbs_device. Upon calling remove_one of ib_uverbs the call
should return after disassociating the open HW resources without waiting to
clients disconnecting. In case driver didn't implement this API there will be no
change to current behaviour and uverbs_remove_one will return only when last
client has disconnected and reference count on uverbs device became 0.

In case the lower driver device was removed any application will
continue working over some zombie HCA, further calls will ended with an
immediate error.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index ea52db1..3863d33 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -89,12 +89,16 @@ struct ib_uverbs_device {
 	int					num_comp_vectors;
 	struct completion			comp;
 	struct device			       *dev;
-	struct ib_device		       *ib_dev;
+	struct ib_device	__rcu	       *ib_dev;
 	int					devnum;
 	struct cdev			        cdev;
 	struct rb_root				xrcd_tree;
 	struct mutex				xrcd_tree_mutex;
 	struct kobject				kobj;
+	struct srcu_struct			disassociate_srcu;
+	struct mutex				lists_mutex; /* protect lists */
+	struct list_head			uverbs_file_list;
+	struct list_head			uverbs_events_file_list;
 };
 
 struct ib_uverbs_event_file {
@@ -106,6 +110,7 @@ struct ib_uverbs_event_file {
 	wait_queue_head_t			poll_wait;
 	struct fasync_struct		       *async_queue;
 	struct list_head			event_list;
+	struct list_head			list;
 };
 
 struct ib_uverbs_file {
@@ -115,6 +120,8 @@ struct ib_uverbs_file {
 	struct ib_ucontext		       *ucontext;
 	struct ib_event_handler			event_handler;
 	struct ib_uverbs_event_file	       *async_file;
+	struct list_head			list;
+	int					is_closed;
 };
 
 struct ib_uverbs_event {
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 082d9c7..c29a660 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -137,6 +137,7 @@ static void ib_uverbs_release_dev(struct kobject *kobj)
 	struct ib_uverbs_device *dev =
 		container_of(kobj, struct ib_uverbs_device, kobj);
 
+	cleanup_srcu_struct(&dev->disassociate_srcu);
 	kfree(dev);
 }
 
@@ -207,9 +208,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 {
 	struct ib_uobject *uobj, *tmp;
 
-	if (!context)
-		return 0;
-
 	context->closing = 1;
 
 	list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
@@ -318,8 +316,16 @@ static void ib_uverbs_release_file(struct kref *ref)
 {
 	struct ib_uverbs_file *file =
 		container_of(ref, struct ib_uverbs_file, ref);
+	struct ib_device *ib_dev;
+	int srcu_key;
+
+	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+	ib_dev = srcu_dereference(file->device->ib_dev,
+				  &file->device->disassociate_srcu);
+	if (ib_dev && !ib_dev->disassociate_ucontext)
+		module_put(ib_dev->owner);
+	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 
-	module_put(file->device->ib_dev->owner);
 	if (atomic_dec_and_test(&file->device->refcount))
 		ib_uverbs_comp_dev(file->device);
 
@@ -343,9 +349,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
 			return -EAGAIN;
 
 		if (wait_event_interruptible(file->poll_wait,
-					     !list_empty(&file->event_list)))
+					     (!list_empty(&file->event_list) ||
+			/* The barriers built into wait_event_interruptible()
+			 * and wake_up() guarentee this will see the null set
+			 * without using RCU
+			 */
+					     !file->uverbs_file->device->ib_dev)))
 			return -ERESTARTSYS;
 
+		/* If device was disassociated and no event exists set an error */
+		if (list_empty(&file->event_list) &&
+		    !file->uverbs_file->device->ib_dev)
+			return -EIO;
+
 		spin_lock_irq(&file->lock);
 	}
 
@@ -408,8 +424,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_event_file *file = filp->private_data;
 	struct ib_uverbs_event *entry, *tmp;
+	int closed_already = 0;
 
+	mutex_lock(&file->uverbs_file->device->lists_mutex);
 	spin_lock_irq(&file->lock);
+	closed_already = file->is_closed;
 	file->is_closed = 1;
 	list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
 		if (entry->counter)
@@ -417,9 +436,14 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 		kfree(entry);
 	}
 	spin_unlock_irq(&file->lock);
+	if (!closed_already) {
+		list_del(&file->list);
+		if (file->is_async)
+			ib_unregister_event_handler(&file->uverbs_file->
+				event_handler);
+	}
+	mutex_unlock(&file->uverbs_file->device->lists_mutex);
 
-	if (file->is_async)
-		ib_unregister_event_handler(&file->uverbs_file->event_handler);
 	kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
 	kref_put(&file->ref, ib_uverbs_release_event_file);
 
@@ -584,6 +608,11 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 	if (IS_ERR(filp))
 		goto err_put_refs;
 
+	mutex_lock(&uverbs_file->device->lists_mutex);
+	list_add_tail(&ev_file->list,
+		      &uverbs_file->device->uverbs_events_file_list);
+	mutex_unlock(&uverbs_file->device->lists_mutex);
+
 	if (is_async) {
 		WARN_ON(uverbs_file->async_file);
 		uverbs_file->async_file = ev_file;
@@ -646,12 +675,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 			     size_t count, loff_t *pos)
 {
 	struct ib_uverbs_file *file = filp->private_data;
-	struct ib_device *ib_dev = file->device->ib_dev;
+	struct ib_device *ib_dev;
 	struct ib_uverbs_cmd_hdr hdr;
 	__u32 flags;
-
-	if (!ib_dev)
-		return -ENODEV;
+	int srcu_key;
+	ssize_t ret;
 
 	if (count < sizeof hdr)
 		return -EINVAL;
@@ -659,6 +687,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 	if (copy_from_user(&hdr, buf, sizeof hdr))
 		return -EFAULT;
 
+	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+	ib_dev = srcu_dereference(file->device->ib_dev,
+				  &file->device->disassociate_srcu);
+	if (!ib_dev) {
+		ret = -EIO;
+		goto out;
+	}
+
 	flags = (hdr.command &
 		 IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
 
@@ -666,26 +702,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		__u32 command;
 
 		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
-					   IB_USER_VERBS_CMD_COMMAND_MASK))
-			return -EINVAL;
+					   IB_USER_VERBS_CMD_COMMAND_MASK)) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
 
 		if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
-		    !uverbs_cmd_table[command])
-			return -EINVAL;
+		    !uverbs_cmd_table[command]) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		if (!file->ucontext &&
-		    command != IB_USER_VERBS_CMD_GET_CONTEXT)
-			return -EINVAL;
+		    command != IB_USER_VERBS_CMD_GET_CONTEXT) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		if (!(ib_dev->uverbs_cmd_mask & (1ull << command)))
-			return -ENOSYS;
+		if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) {
+			ret = -ENOSYS;
+			goto out;
+		}
 
-		if (hdr.in_words * 4 != count)
-			return -EINVAL;
+		if (hdr.in_words * 4 != count) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		return uverbs_cmd_table[command](file, ib_dev,
+		ret = uverbs_cmd_table[command](file, ib_dev,
 						 buf + sizeof(hdr),
 						 hdr.in_words * 4,
 						 hdr.out_words * 4);
@@ -696,51 +742,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		struct ib_uverbs_ex_cmd_hdr ex_hdr;
 		struct ib_udata ucore;
 		struct ib_udata uhw;
-		int err;
 		size_t written_count = count;
 
 		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
-					   IB_USER_VERBS_CMD_COMMAND_MASK))
-			return -EINVAL;
+					   IB_USER_VERBS_CMD_COMMAND_MASK)) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
 
 		if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
-		    !uverbs_ex_cmd_table[command])
-			return -ENOSYS;
+		    !uverbs_ex_cmd_table[command]) {
+			ret = -ENOSYS;
+			goto out;
+		}
 
-		if (!file->ucontext)
-			return -EINVAL;
+		if (!file->ucontext) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
-			return -ENOSYS;
+		if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) {
+			ret = -ENOSYS;
+			goto out;
+		}
 
-		if (count < (sizeof(hdr) + sizeof(ex_hdr)))
-			return -EINVAL;
+		if (count < (sizeof(hdr) + sizeof(ex_hdr))) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
-			return -EFAULT;
+		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) {
+			ret = -EFAULT;
+			goto out;
+		}
 
 		count -= sizeof(hdr) + sizeof(ex_hdr);
 		buf += sizeof(hdr) + sizeof(ex_hdr);
 
-		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
-			return -EINVAL;
+		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		if (ex_hdr.cmd_hdr_reserved)
-			return -EINVAL;
+		if (ex_hdr.cmd_hdr_reserved) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		if (ex_hdr.response) {
-			if (!hdr.out_words && !ex_hdr.provider_out_words)
-				return -EINVAL;
+			if (!hdr.out_words && !ex_hdr.provider_out_words) {
+				ret = -EINVAL;
+				goto out;
+			}
 
 			if (!access_ok(VERIFY_WRITE,
 				       (void __user *) (unsigned long) ex_hdr.response,
-				       (hdr.out_words + ex_hdr.provider_out_words) * 8))
-				return -EFAULT;
+				       (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else {
-			if (hdr.out_words || ex_hdr.provider_out_words)
-				return -EINVAL;
+			if (hdr.out_words || ex_hdr.provider_out_words) {
+				ret = -EINVAL;
+				goto out;
+			}
 		}
 
 		INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
@@ -752,29 +819,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 				       ex_hdr.provider_in_words * 8,
 				       ex_hdr.provider_out_words * 8);
 
-		err = uverbs_ex_cmd_table[command](file,
+		ret = uverbs_ex_cmd_table[command](file,
 						   ib_dev,
 						   &ucore,
 						   &uhw);
-
-		if (err)
-			return err;
-
-		return written_count;
+		if (!ret)
+			ret = written_count;
+	} else {
+		ret = -ENOSYS;
 	}
 
-	return -ENOSYS;
+out:
+	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+	return ret;
 }
 
 static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct ib_uverbs_file *file = filp->private_data;
-	struct ib_device *ib_dev = file->device->ib_dev;
+	struct ib_device *ib_dev;
+	int ret = 0;
+	int srcu_key;
 
-	if (!ib_dev || !file->ucontext)
-		return -ENODEV;
+	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+	ib_dev = srcu_dereference(file->device->ib_dev,
+				  &file->device->disassociate_srcu);
+	if (!ib_dev) {
+		ret = -EIO;
+		goto out;
+	}
+
+	if (!file->ucontext)
+		ret = -ENODEV;
 	else
-		return ib_dev->mmap(file->ucontext, vma);
+		ret = ib_dev->mmap(file->ucontext, vma);
+out:
+	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+	return ret;
 }
 
 /*
@@ -791,21 +872,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_device *dev;
 	struct ib_uverbs_file *file;
+	struct ib_device *ib_dev;
 	int ret;
+	int module_dependent;
+	int srcu_key;
 
 	dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
 	if (!atomic_inc_not_zero(&dev->refcount))
 		return -ENXIO;
 
-	if (!try_module_get(dev->ib_dev->owner)) {
-		ret = -ENODEV;
+	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+	mutex_lock(&dev->lists_mutex);
+	ib_dev = srcu_dereference(dev->ib_dev,
+				  &dev->disassociate_srcu);
+	if (!ib_dev) {
+		ret = -EIO;
 		goto err;
 	}
 
-	file = kmalloc(sizeof *file, GFP_KERNEL);
+	/* In case IB device supports disassociate ucontext, there is no hard
+	 * dependency between uverbs device and its low level device.
+	 */
+	module_dependent = !(ib_dev->disassociate_ucontext);
+
+	if (module_dependent) {
+		if (!try_module_get(ib_dev->owner)) {
+			ret = -ENODEV;
+			goto err;
+		}
+	}
+
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
 	if (!file) {
 		ret = -ENOMEM;
-		goto err_module;
+		if (module_dependent)
+			goto err_module;
+
+		goto err;
 	}
 
 	file->device	 = dev;
@@ -816,13 +919,18 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = file;
 	kobject_get(&dev->kobj);
+	list_add_tail(&file->list, &dev->uverbs_file_list);
+	mutex_unlock(&dev->lists_mutex);
+	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
 	return nonseekable_open(inode, filp);
 
 err_module:
-	module_put(dev->ib_dev->owner);
+	module_put(ib_dev->owner);
 
 err:
+	mutex_unlock(&dev->lists_mutex);
+	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 	if (atomic_dec_and_test(&dev->refcount))
 		ib_uverbs_comp_dev(dev);
 
@@ -833,8 +941,18 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_file *file = filp->private_data;
 	struct ib_uverbs_device *dev = file->device;
-
-	ib_uverbs_cleanup_ucontext(file, file->ucontext);
+	struct ib_ucontext *ucontext = NULL;
+
+	mutex_lock(&file->device->lists_mutex);
+	ucontext = file->ucontext;
+	file->ucontext = NULL;
+	if (!file->is_closed) {
+		list_del(&file->list);
+		file->is_closed = 1;
+	}
+	mutex_unlock(&file->device->lists_mutex);
+	if (ucontext)
+		ib_uverbs_cleanup_ucontext(file, ucontext);
 
 	if (file->async_file)
 		kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
@@ -871,12 +989,21 @@ static struct ib_client uverbs_client = {
 static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
 			  char *buf)
 {
+	int ret = -ENODEV;
+	int srcu_key;
 	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+	struct ib_device *ib_dev;
 
 	if (!dev)
 		return -ENODEV;
 
-	return sprintf(buf, "%s\n", dev->ib_dev->name);
+	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+	if (ib_dev)
+		ret = sprintf(buf, "%s\n", ib_dev->name);
+	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+	return ret;
 }
 static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
 
@@ -884,11 +1011,19 @@ static ssize_t show_dev_abi_version(struct device *device,
 				    struct device_attribute *attr, char *buf)
 {
 	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+	int ret = -ENODEV;
+	int srcu_key;
+	struct ib_device *ib_dev;
 
 	if (!dev)
 		return -ENODEV;
+	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+	if (ib_dev)
+		ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
-	return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+	return ret;
 }
 static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
 
@@ -928,6 +1063,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	int devnum;
 	dev_t base;
 	struct ib_uverbs_device *uverbs_dev;
+	int ret;
 
 	if (!device->alloc_ucontext)
 		return;
@@ -936,11 +1072,20 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	if (!uverbs_dev)
 		return;
 
+	ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+	if (ret) {
+		kfree(uverbs_dev);
+		return;
+	}
+
 	atomic_set(&uverbs_dev->refcount, 1);
 	init_completion(&uverbs_dev->comp);
 	uverbs_dev->xrcd_tree = RB_ROOT;
 	mutex_init(&uverbs_dev->xrcd_tree_mutex);
 	kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+	mutex_init(&uverbs_dev->lists_mutex);
+	INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+	INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
 
 	spin_lock(&map_lock);
 	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -961,7 +1106,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	}
 	spin_unlock(&map_lock);
 
-	uverbs_dev->ib_dev           = device;
+	rcu_assign_pointer(uverbs_dev->ib_dev, device);
 	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
 
 	cdev_init(&uverbs_dev->cdev, NULL);
@@ -1005,9 +1150,72 @@ err:
 	return;
 }
 
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+					struct ib_device *ib_dev)
+{
+	struct ib_uverbs_file *file;
+	struct ib_uverbs_event_file *event_file;
+	struct ib_event event;
+
+	/* Pending running commands to terminate */
+	synchronize_srcu(&uverbs_dev->disassociate_srcu);
+	event.event = IB_EVENT_DEVICE_FATAL;
+	event.element.port_num = 0;
+	event.device = ib_dev;
+
+	mutex_lock(&uverbs_dev->lists_mutex);
+	while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+		struct ib_ucontext *ucontext;
+
+		file = list_first_entry(&uverbs_dev->uverbs_file_list,
+					struct ib_uverbs_file, list);
+		file->is_closed = 1;
+		ucontext = file->ucontext;
+		list_del(&file->list);
+		file->ucontext = NULL;
+		kref_get(&file->ref);
+		mutex_unlock(&uverbs_dev->lists_mutex);
+		/* We must release the mutex before going ahead and calling
+		 * disassociate_ucontext. disassociate_ucontext might end up
+		 * indirectly calling uverbs_close, for example due to freeing
+		 * the resources (e.g mmput).
+		 */
+		ib_uverbs_event_handler(&file->event_handler, &event);
+		if (ucontext) {
+			ib_dev->disassociate_ucontext(ucontext);
+			ib_uverbs_cleanup_ucontext(file, ucontext);
+		}
+
+		mutex_lock(&uverbs_dev->lists_mutex);
+		kref_put(&file->ref, ib_uverbs_release_file);
+	}
+
+	while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
+		event_file = list_first_entry(&uverbs_dev->
+					      uverbs_events_file_list,
+					      struct ib_uverbs_event_file,
+					      list);
+		spin_lock_irq(&event_file->lock);
+		event_file->is_closed = 1;
+		spin_unlock_irq(&event_file->lock);
+
+		list_del(&event_file->list);
+		if (event_file->is_async) {
+			ib_unregister_event_handler(&event_file->uverbs_file->
+						    event_handler);
+			event_file->uverbs_file->event_handler.device = NULL;
+		}
+
+		wake_up_interruptible(&event_file->poll_wait);
+		kill_fasync(&event_file->async_queue, SIGIO, POLL_IN);
+	}
+	mutex_unlock(&uverbs_dev->lists_mutex);
+}
+
 static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
 {
 	struct ib_uverbs_device *uverbs_dev = client_data;
+	int wait_clients = 1;
 
 	if (!uverbs_dev)
 		return;
@@ -1021,9 +1229,27 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
 	else
 		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
 
+	if (device->disassociate_ucontext) {
+		/* We disassociate HW resources and immediately return.
+		 * Userspace will see a EIO errno for all future access.
+		 * Upon returning, ib_device may be freed internally and is not
+		 * valid any more.
+		 * uverbs_device is still available until all clients close
+		 * their files, then the uverbs device ref count will be zero
+		 * and its resources will be freed.
+		 * Note: At this point no more files can be opened since the
+		 * cdev was deleted, however active clients can still issue
+		 * commands and close their open files.
+		 */
+		rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+		ib_uverbs_free_hw_resources(uverbs_dev, device);
+		wait_clients = 0;
+	}
+
 	if (atomic_dec_and_test(&uverbs_dev->refcount))
 		ib_uverbs_comp_dev(uverbs_dev);
-	wait_for_completion(&uverbs_dev->comp);
+	if (wait_clients)
+		wait_for_completion(&uverbs_dev->comp);
 	kobject_put(&uverbs_dev->kobj);
 }
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 128abf2..40b83f4 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1769,6 +1769,7 @@ struct ib_device {
 	int			   (*destroy_flow)(struct ib_flow *flow_id);
 	int			   (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
 						      struct ib_mr_status *mr_status);
+	void			   (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
 
 	struct ib_dma_mapping_ops   *dma_ops;
 
-- 
cgit v0.10.2


From ae184ddeca5db6d60ba9067ba1c9e940fa01d400 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 13 Aug 2015 18:32:06 +0300
Subject: IB/mlx4_ib: Disassociate support

Implements the IB core disassociate_ucontext API. The driver detaches the HW
resources for a given user context to prevent a dependency between application
termination and device disconnecting. This is done by managing the VMAs that
were mapped to the HW bars such as door bell and blueflame. When need to detach
remap them to an arbitrary kernel page returned by the zap API.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Jack Morgenstein <jackm@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 1437ed5..efecdf0 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -921,7 +921,7 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
 		resp.cqe_size	      = dev->dev->caps.cqe_size;
 	}
 
-	context = kmalloc(sizeof *context, GFP_KERNEL);
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
 	if (!context)
 		return ERR_PTR(-ENOMEM);
 
@@ -958,21 +958,143 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 	return 0;
 }
 
+static void  mlx4_ib_vma_open(struct vm_area_struct *area)
+{
+	/* vma_open is called when a new VMA is created on top of our VMA.
+	 * This is done through either mremap flow or split_vma (usually due
+	 * to mlock, madvise, munmap, etc.). We do not support a clone of the
+	 * vma, as this VMA is strongly hardware related. Therefore we set the
+	 * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
+	 * calling us again and trying to do incorrect actions. We assume that
+	 * the original vma size is exactly a single page that there will be no
+	 * "splitting" operations on.
+	 */
+	area->vm_ops = NULL;
+}
+
+static void  mlx4_ib_vma_close(struct vm_area_struct *area)
+{
+	struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data;
+
+	/* It's guaranteed that all VMAs opened on a FD are closed before the
+	 * file itself is closed, therefore no sync is needed with the regular
+	 * closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync
+	 * with accessing the vma as part of mlx4_ib_disassociate_ucontext.
+	 * The close operation is usually called under mm->mmap_sem except when
+	 * process is exiting.  The exiting case is handled explicitly as part
+	 * of mlx4_ib_disassociate_ucontext.
+	 */
+	mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *)
+				area->vm_private_data;
+
+	/* set the vma context pointer to null in the mlx4_ib driver's private
+	 * data to protect against a race condition in mlx4_ib_dissassociate_ucontext().
+	 */
+	mlx4_ib_vma_priv_data->vma = NULL;
+}
+
+static const struct vm_operations_struct mlx4_ib_vm_ops = {
+	.open = mlx4_ib_vma_open,
+	.close = mlx4_ib_vma_close
+};
+
+static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+	int i;
+	int ret = 0;
+	struct vm_area_struct *vma;
+	struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+	struct task_struct *owning_process  = NULL;
+	struct mm_struct   *owning_mm       = NULL;
+
+	owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
+	if (!owning_process)
+		return;
+
+	owning_mm = get_task_mm(owning_process);
+	if (!owning_mm) {
+		pr_info("no mm, disassociate ucontext is pending task termination\n");
+		while (1) {
+			/* make sure that task is dead before returning, it may
+			 * prevent a rare case of module down in parallel to a
+			 * call to mlx4_ib_vma_close.
+			 */
+			put_task_struct(owning_process);
+			msleep(1);
+			owning_process = get_pid_task(ibcontext->tgid,
+						      PIDTYPE_PID);
+			if (!owning_process ||
+			    owning_process->state == TASK_DEAD) {
+				pr_info("disassociate ucontext done, task was terminated\n");
+				/* in case task was dead need to release the task struct */
+				if (owning_process)
+					put_task_struct(owning_process);
+				return;
+			}
+		}
+	}
+
+	/* need to protect from a race on closing the vma as part of
+	 * mlx4_ib_vma_close().
+	 */
+	down_read(&owning_mm->mmap_sem);
+	for (i = 0; i < HW_BAR_COUNT; i++) {
+		vma = context->hw_bar_info[i].vma;
+		if (!vma)
+			continue;
+
+		ret = zap_vma_ptes(context->hw_bar_info[i].vma,
+				   context->hw_bar_info[i].vma->vm_start,
+				   PAGE_SIZE);
+		if (ret) {
+			pr_err("Error: zap_vma_ptes failed for index=%d, ret=%d\n", i, ret);
+			BUG_ON(1);
+		}
+
+		/* context going to be destroyed, should not access ops any more */
+		context->hw_bar_info[i].vma->vm_ops = NULL;
+	}
+
+	up_read(&owning_mm->mmap_sem);
+	mmput(owning_mm);
+	put_task_struct(owning_process);
+}
+
+static void mlx4_ib_set_vma_data(struct vm_area_struct *vma,
+				 struct mlx4_ib_vma_private_data *vma_private_data)
+{
+	vma_private_data->vma = vma;
+	vma->vm_private_data = vma_private_data;
+	vma->vm_ops =  &mlx4_ib_vm_ops;
+}
+
 static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 {
 	struct mlx4_ib_dev *dev = to_mdev(context->device);
+	struct mlx4_ib_ucontext *mucontext = to_mucontext(context);
 
 	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
 		return -EINVAL;
 
 	if (vma->vm_pgoff == 0) {
+		/* We prevent double mmaping on same context */
+		if (mucontext->hw_bar_info[HW_BAR_DB].vma)
+			return -EINVAL;
+
 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 		if (io_remap_pfn_range(vma, vma->vm_start,
 				       to_mucontext(context)->uar.pfn,
 				       PAGE_SIZE, vma->vm_page_prot))
 			return -EAGAIN;
+
+		mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]);
+
 	} else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
+		/* We prevent double mmaping on same context */
+		if (mucontext->hw_bar_info[HW_BAR_BF].vma)
+			return -EINVAL;
+
 		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 
 		if (io_remap_pfn_range(vma, vma->vm_start,
@@ -980,9 +1102,18 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 				       dev->dev->caps.num_uars,
 				       PAGE_SIZE, vma->vm_page_prot))
 			return -EAGAIN;
+
+		mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]);
+
 	} else if (vma->vm_pgoff == 3) {
 		struct mlx4_clock_params params;
-		int ret = mlx4_get_internal_clock_params(dev->dev, &params);
+		int ret;
+
+		/* We prevent double mmaping on same context */
+		if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma)
+			return -EINVAL;
+
+		ret = mlx4_get_internal_clock_params(dev->dev, &params);
 
 		if (ret)
 			return ret;
@@ -995,6 +1126,9 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 				       >> PAGE_SHIFT,
 				       PAGE_SIZE, vma->vm_page_prot))
 			return -EAGAIN;
+
+		mlx4_ib_set_vma_data(vma,
+				     &mucontext->hw_bar_info[HW_BAR_CLOCK]);
 	} else {
 		return -EINVAL;
 	}
@@ -2119,6 +2253,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
 	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
 	ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
+	ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
 
 	if (!mlx4_is_slave(ibdev->dev)) {
 		ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index fe52ead..1e7b23b 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -70,11 +70,24 @@ extern int mlx4_ib_sm_guid_assign;
 
 #define MLX4_IB_UC_STEER_QPN_ALIGN 1
 #define MLX4_IB_UC_MAX_NUM_QPS     256
+
+enum hw_bar_type {
+	HW_BAR_BF,
+	HW_BAR_DB,
+	HW_BAR_CLOCK,
+	HW_BAR_COUNT
+};
+
+struct mlx4_ib_vma_private_data {
+	struct vm_area_struct *vma;
+};
+
 struct mlx4_ib_ucontext {
 	struct ib_ucontext	ibucontext;
 	struct mlx4_uar		uar;
 	struct list_head	db_page_list;
 	struct mutex		db_page_mutex;
+	struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
 };
 
 struct mlx4_ib_pd {
-- 
cgit v0.10.2


From e1c30298ccab87151a0c4241fc5985c591598361 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 13 Aug 2015 18:32:07 +0300
Subject: IB/ucma: HW Device hot-removal support

Currently, IB/cma remove_one flow blocks until all user descriptor managed by
IB/ucma are released. This prevents hot-removal of IB devices. This patch
allows IB/cma to remove devices regardless of user space activity. Upon getting
the RDMA_CM_EVENT_DEVICE_REMOVAL event we close all the underlying HW resources
for the given ucontext. The ucontext itself is still alive till its explicit
destroying by its creator.

Running applications at that time will have some zombie device, further
operations may fail.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Reviewed-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index acac9ea..a53fc9b 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -74,6 +74,7 @@ struct ucma_file {
 	struct list_head	ctx_list;
 	struct list_head	event_list;
 	wait_queue_head_t	poll_wait;
+	struct workqueue_struct	*close_wq;
 };
 
 struct ucma_context {
@@ -89,6 +90,13 @@ struct ucma_context {
 
 	struct list_head	list;
 	struct list_head	mc_list;
+	/* mark that device is in process of destroying the internal HW
+	 * resources, protected by the global mut
+	 */
+	int			closing;
+	/* sync between removal event and id destroy, protected by file mut */
+	int			destroying;
+	struct work_struct	close_work;
 };
 
 struct ucma_multicast {
@@ -107,6 +115,7 @@ struct ucma_event {
 	struct list_head	list;
 	struct rdma_cm_id	*cm_id;
 	struct rdma_ucm_event_resp resp;
+	struct work_struct	close_work;
 };
 
 static DEFINE_MUTEX(mut);
@@ -132,8 +141,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
 
 	mutex_lock(&mut);
 	ctx = _ucma_find_context(id, file);
-	if (!IS_ERR(ctx))
-		atomic_inc(&ctx->ref);
+	if (!IS_ERR(ctx)) {
+		if (ctx->closing)
+			ctx = ERR_PTR(-EIO);
+		else
+			atomic_inc(&ctx->ref);
+	}
 	mutex_unlock(&mut);
 	return ctx;
 }
@@ -144,6 +157,28 @@ static void ucma_put_ctx(struct ucma_context *ctx)
 		complete(&ctx->comp);
 }
 
+static void ucma_close_event_id(struct work_struct *work)
+{
+	struct ucma_event *uevent_close =  container_of(work, struct ucma_event, close_work);
+
+	rdma_destroy_id(uevent_close->cm_id);
+	kfree(uevent_close);
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+	struct ucma_context *ctx =  container_of(work, struct ucma_context, close_work);
+
+	/* once all inflight tasks are finished, we close all underlying
+	 * resources. The context is still alive till its explicit destryoing
+	 * by its creator.
+	 */
+	ucma_put_ctx(ctx);
+	wait_for_completion(&ctx->comp);
+	/* No new events will be generated after destroying the id. */
+	rdma_destroy_id(ctx->cm_id);
+}
+
 static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
 {
 	struct ucma_context *ctx;
@@ -152,6 +187,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
 	if (!ctx)
 		return NULL;
 
+	INIT_WORK(&ctx->close_work, ucma_close_id);
 	atomic_set(&ctx->ref, 1);
 	init_completion(&ctx->comp);
 	INIT_LIST_HEAD(&ctx->mc_list);
@@ -242,6 +278,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,
 	}
 }
 
+/* Called with file->mut locked for the relevant context. */
+static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
+{
+	struct ucma_context *ctx = cm_id->context;
+	struct ucma_event *con_req_eve;
+	int event_found = 0;
+
+	if (ctx->destroying)
+		return;
+
+	/* only if context is pointing to cm_id that it owns it and can be
+	 * queued to be closed, otherwise that cm_id is an inflight one that
+	 * is part of that context event list pending to be detached and
+	 * reattached to its new context as part of ucma_get_event,
+	 * handled separately below.
+	 */
+	if (ctx->cm_id == cm_id) {
+		mutex_lock(&mut);
+		ctx->closing = 1;
+		mutex_unlock(&mut);
+		queue_work(ctx->file->close_wq, &ctx->close_work);
+		return;
+	}
+
+	list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
+		if (con_req_eve->cm_id == cm_id &&
+		    con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+			list_del(&con_req_eve->list);
+			INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
+			queue_work(ctx->file->close_wq, &con_req_eve->close_work);
+			event_found = 1;
+			break;
+		}
+	}
+	if (!event_found)
+		printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n");
+}
+
 static int ucma_event_handler(struct rdma_cm_id *cm_id,
 			      struct rdma_cm_event *event)
 {
@@ -276,14 +350,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
 		 * We ignore events for new connections until userspace has set
 		 * their context.  This can only happen if an error occurs on a
 		 * new connection before the user accepts it.  This is okay,
-		 * since the accept will just fail later.
+		 * since the accept will just fail later. However, we do need
+		 * to release the underlying HW resources in case of a device
+		 * removal event.
 		 */
+		if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+			ucma_removal_event_handler(cm_id);
+
 		kfree(uevent);
 		goto out;
 	}
 
 	list_add_tail(&uevent->list, &ctx->file->event_list);
 	wake_up_interruptible(&ctx->file->poll_wait);
+	if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+		ucma_removal_event_handler(cm_id);
 out:
 	mutex_unlock(&ctx->file->mut);
 	return ret;
@@ -442,9 +523,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
 }
 
 /*
- * We cannot hold file->mut when calling rdma_destroy_id() or we can
- * deadlock.  We also acquire file->mut in ucma_event_handler(), and
- * rdma_destroy_id() will wait until all callbacks have completed.
+ * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
+ * this point, no new events will be reported from the hardware. However, we
+ * still need to cleanup the UCMA context for this ID. Specifically, there
+ * might be events that have not yet been consumed by the user space software.
+ * These might include pending connect requests which we have not completed
+ * processing.  We cannot call rdma_destroy_id while holding the lock of the
+ * context (file->mut), as it might cause a deadlock. We therefore extract all
+ * relevant events from the context pending events list while holding the
+ * mutex. After that we release them as needed.
  */
 static int ucma_free_ctx(struct ucma_context *ctx)
 {
@@ -452,8 +539,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)
 	struct ucma_event *uevent, *tmp;
 	LIST_HEAD(list);
 
-	/* No new events will be generated after destroying the id. */
-	rdma_destroy_id(ctx->cm_id);
 
 	ucma_cleanup_multicast(ctx);
 
@@ -501,10 +586,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
-	ucma_put_ctx(ctx);
-	wait_for_completion(&ctx->comp);
-	resp.events_reported = ucma_free_ctx(ctx);
+	mutex_lock(&ctx->file->mut);
+	ctx->destroying = 1;
+	mutex_unlock(&ctx->file->mut);
+
+	flush_workqueue(ctx->file->close_wq);
+	/* At this point it's guaranteed that there is no inflight
+	 * closing task */
+	mutex_lock(&mut);
+	if (!ctx->closing) {
+		mutex_unlock(&mut);
+		ucma_put_ctx(ctx);
+		wait_for_completion(&ctx->comp);
+		rdma_destroy_id(ctx->cm_id);
+	} else {
+		mutex_unlock(&mut);
+	}
 
+	resp.events_reported = ucma_free_ctx(ctx);
 	if (copy_to_user((void __user *)(unsigned long)cmd.response,
 			 &resp, sizeof(resp)))
 		ret = -EFAULT;
@@ -1529,6 +1628,7 @@ static int ucma_open(struct inode *inode, struct file *filp)
 	INIT_LIST_HEAD(&file->ctx_list);
 	init_waitqueue_head(&file->poll_wait);
 	mutex_init(&file->mut);
+	file->close_wq = create_singlethread_workqueue("ucma_close_id");
 
 	filp->private_data = file;
 	file->filp = filp;
@@ -1543,16 +1643,34 @@ static int ucma_close(struct inode *inode, struct file *filp)
 
 	mutex_lock(&file->mut);
 	list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+		ctx->destroying = 1;
 		mutex_unlock(&file->mut);
 
 		mutex_lock(&mut);
 		idr_remove(&ctx_idr, ctx->id);
 		mutex_unlock(&mut);
 
+		flush_workqueue(file->close_wq);
+		/* At that step once ctx was marked as destroying and workqueue
+		 * was flushed we are safe from any inflights handlers that
+		 * might put other closing task.
+		 */
+		mutex_lock(&mut);
+		if (!ctx->closing) {
+			mutex_unlock(&mut);
+			/* rdma_destroy_id ensures that no event handlers are
+			 * inflight for that id before releasing it.
+			 */
+			rdma_destroy_id(ctx->cm_id);
+		} else {
+			mutex_unlock(&mut);
+		}
+
 		ucma_free_ctx(ctx);
 		mutex_lock(&file->mut);
 	}
 	mutex_unlock(&file->mut);
+	destroy_workqueue(file->close_wq);
 	kfree(file);
 	return 0;
 }
-- 
cgit v0.10.2


From ba13b5f8f86efa78bc0aaea297b0001b6cbf6c21 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Thu, 20 Aug 2015 14:20:42 -0400
Subject: IB/sa: Fix rdma netlink message flags

The flags to ibnl_put_msg should be NLM_F_REQUEST instead of GFP_KERNEL.

Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: John Fleck <john.fleck@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index edcf568..8c014b3 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -532,7 +532,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query)
 
 	/* Put nlmsg header only for now */
 	data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
-			    RDMA_NL_LS_OP_RESOLVE, (int) GFP_KERNEL);
+			    RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
 	if (!data) {
 		kfree_skb(skb);
 		return -EMSGSIZE;
-- 
cgit v0.10.2


From 54b9a96f10d9acb7b1ffd40e2e1736443eb7656d Mon Sep 17 00:00:00 2001
From: Nicholas Krause <xerofoify@gmail.com>
Date: Wed, 26 Aug 2015 23:00:59 -0400
Subject: IB/cxgb4: Fix if statement in pick_local_ip6adddrs

This fixes an if statement checking the return value of the function
get_lladdr for success in the function pick_local_ip6addrs to instead
of directly checking the return value of this call check the opposite
as get_lladdr returns zero for success which would incorrectly make
this if statement block not execute with the current if statement
check.

Signed-off-by: Nicholas Krause <xerofoify@gmail.com>
Acked-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index f0c1512..debc39d 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3031,7 +3031,7 @@ static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
 	struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr;
 	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr;
 
-	if (get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) {
+	if (!get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) {
 		memcpy(la6->sin6_addr.s6_addr, &addr, 16);
 		memcpy(ra6->sin6_addr.s6_addr, &addr, 16);
 		return 0;
-- 
cgit v0.10.2


From b632ffa7cee439ba5dce3b3bc4a5cbe2b3e20133 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Aug 2015 11:00:37 +0200
Subject: IB/uverbs: reject invalid or unknown opcodes

We have many WR opcodes that are only supported in kernel space
and/or require optional information to be copied into the WR
structure.  Reject all those not explicitly handled so that we
can't pass invalid information to drivers.

Cc: stable@vger.kernel.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a15318a..be4cb9f 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2372,6 +2372,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
 		next->send_flags = user_wr->send_flags;
 
 		if (is_ud) {
+			if (next->opcode != IB_WR_SEND &&
+			    next->opcode != IB_WR_SEND_WITH_IMM) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+
 			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
 						     file->ucontext);
 			if (!next->wr.ud.ah) {
@@ -2411,9 +2417,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
 					user_wr->wr.atomic.compare_add;
 				next->wr.atomic.swap = user_wr->wr.atomic.swap;
 				next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+			case IB_WR_SEND:
 				break;
 			default:
-				break;
+				ret = -EINVAL;
+				goto out_put;
 			}
 		}
 
-- 
cgit v0.10.2


From 11d748045c6dadb279d1acdb6d2ea8f3f2ede85b Mon Sep 17 00:00:00 2001
From: Haggai Eran <haggaie@mellanox.com>
Date: Tue, 1 Sep 2015 09:56:56 +0300
Subject: IB/mlx5: avoid destroying a NULL mr in reg_user_mr error flow

The mlx5_ib_reg_user_mr() function will attempt to call clean_mr() in
its error flow even though there is never a case where the error flow
occurs with a valid MR pointer to destroy.

Remove the clean_mr() call and the incorrect comment above it.

Fixes: b4cfe447d47b ("IB/mlx5: Implement on demand paging by adding
support for MMU notifiers")
Cc: Eli Cohen <eli@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 0dfd379..54a15b5 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1114,19 +1114,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	return &mr->ibmr;
 
 error:
-	/*
-	 * Destroy the umem *before* destroying the MR, to ensure we
-	 * will not have any in-flight notifiers when destroying the
-	 * MR.
-	 *
-	 * As the MR is completely invalid to begin with, and this
-	 * error path is only taken if we can't push the mr entry into
-	 * the pagefault tree, this is safe.
-	 */
-
 	ib_umem_release(umem);
-	/* Kill the MR, and return an error code. */
-	clean_mr(mr);
 	return ERR_PTR(err);
 }
 
-- 
cgit v0.10.2


From b636401f0ec9bbf7931774e00f3adf7ee9214cce Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Wed, 2 Sep 2015 22:23:04 +0300
Subject: mlx5: Fix incorrect wc pkey_index assignment for GSI messages

Since patch series "Demux IB CM requests in the rdma_cm module" the
P_Key index is taken from the work completion rather than the message
itself.

The HCA provides us with the message P_Key. In order to provide the
P_Key index, we need to look it up. Given that this is relevant only
for GSI messages (session establishments) which is less performance critical,
micro-optimize against the GSI (is_qp1) branch.

Fixes: 4c21b5bcef73 ("IB/cma: Add net_dev and private data checks to
RDMA CM")
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 5c9eeea..2d0dbbf 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -33,6 +33,7 @@
 #include <linux/kref.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/ib_cache.h>
 #include "mlx5_ib.h"
 #include "user.h"
 
@@ -227,7 +228,14 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
 	wc->dlid_path_bits = cqe->ml_path;
 	g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
 	wc->wc_flags |= g ? IB_WC_GRH : 0;
-	wc->pkey_index     = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
+	if (unlikely(is_qp1(qp->ibqp.qp_type))) {
+		u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
+
+		ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey,
+				    &wc->pkey_index);
+	} else {
+		wc->pkey_index = 0;
+	}
 }
 
 static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index a5fa0b9..bb8cda7 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -680,6 +680,11 @@ static inline u8 convert_access(int acc)
 	       MLX5_PERM_LOCAL_READ;
 }
 
+static inline int is_qp1(enum ib_qp_type qp_type)
+{
+	return qp_type == IB_QPT_GSI;
+}
+
 #define MLX5_MAX_UMR_SHIFT 16
 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
 
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 203c8a4..c745c6c 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -76,11 +76,6 @@ static int is_qp0(enum ib_qp_type qp_type)
 	return qp_type == IB_QPT_SMI;
 }
 
-static int is_qp1(enum ib_qp_type qp_type)
-{
-	return qp_type == IB_QPT_GSI;
-}
-
 static int is_sqp(enum ib_qp_type qp_type)
 {
 	return is_qp0(qp_type) || is_qp1(qp_type);
-- 
cgit v0.10.2


From 6fd8edabc2b03203e6bc44e77d1dfff415e706cc Mon Sep 17 00:00:00 2001
From: Jubin John <jubin.john@intel.com>
Date: Wed, 2 Sep 2015 10:43:24 -0400
Subject: IB/hfi1: Add CSRs for CONFIG_SDMA_VERBOSITY

3 CSRs needed by the CONFIG_SDMA_VERBOSITY code were removed during
the CSR clean up. Adding these CSRs back to resolve 0-day build failure:
https://lists.01.org/pipermail/kbuild-all/2015-August/011919.html

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Jubin John <jubin.john@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h
index 6521030..bf45de2 100644
--- a/drivers/staging/rdma/hfi1/chip_registers.h
+++ b/drivers/staging/rdma/hfi1/chip_registers.h
@@ -1285,5 +1285,8 @@
 #define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
 #define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
 #define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
+#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
+#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
+#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
 
 #endif          /* DEF_CHIP_REG */
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
index 37bd767..a8c903c 100644
--- a/drivers/staging/rdma/hfi1/sdma.c
+++ b/drivers/staging/rdma/hfi1/sdma.c
@@ -1769,7 +1769,7 @@ void sdma_dumpstate(struct sdma_engine *sde)
 	sdma_dumpstate_helper(SD(ENG_ERR_MASK));
 
 	for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
-		sdma_dumpstate_helper2(CCE_INT_STATUS));
+		sdma_dumpstate_helper2(CCE_INT_STATUS);
 		sdma_dumpstate_helper2(CCE_INT_MASK);
 		sdma_dumpstate_helper2(CCE_INT_BLOCKED);
 	}
@@ -1777,7 +1777,7 @@ void sdma_dumpstate(struct sdma_engine *sde)
 	sdma_dumpstate_helper(SD(TAIL));
 	sdma_dumpstate_helper(SD(HEAD));
 	sdma_dumpstate_helper(SD(PRIORITY_THLD));
-	sdma_dumpstate_helper(SD(IDLE_CNT);
+	sdma_dumpstate_helper(SD(IDLE_CNT));
 	sdma_dumpstate_helper(SD(RELOAD_CNT));
 	sdma_dumpstate_helper(SD(DESC_CNT));
 	sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
-- 
cgit v0.10.2


From 6f876ce4b53f7c748e07cedab661aa4bcc8a09d2 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 2 Sep 2015 18:46:21 -0400
Subject: IB/hfi1: Add PSM2 user space header to header_install

When the hfi1 driver was added a user space header file (hfi1_user.h) was added
to be shared between PSM2 and the driver.  However, the file was not added to
the header install.  Add it now.

Fixes: d4ab347005fb ("IB/core: Add core header changes needed for OPA")
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild
index 687ae33..231901b 100644
--- a/include/uapi/rdma/Kbuild
+++ b/include/uapi/rdma/Kbuild
@@ -5,3 +5,4 @@ header-y += ib_user_sa.h
 header-y += ib_user_verbs.h
 header-y += rdma_netlink.h
 header-y += rdma_user_cm.h
+header-y += hfi/
diff --git a/include/uapi/rdma/hfi/Kbuild b/include/uapi/rdma/hfi/Kbuild
new file mode 100644
index 0000000..ef23c29
--- /dev/null
+++ b/include/uapi/rdma/hfi/Kbuild
@@ -0,0 +1,2 @@
+# UAPI Header export list
+header-y += hfi1_user.h
-- 
cgit v0.10.2


From ce755c9b01e09ee4907cf79bd0f57fa5cf65c4c3 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 2 Sep 2015 18:45:54 -0400
Subject: IB/core: Remove unnecessary defines from ib_mad.h

Remove the unused IB_NOTICE_REPRESS_* defines.

When the hfi1 driver was added these definitions were moved from the qib driver
to ib_mad.h.  They should have been removed instead.

Fixes: d4ab347005fb ("IB/core: Add core header changes needed for OPA")
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Hal Rosenstock <hal@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index c206205..7f2cf85 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -157,18 +157,6 @@
 #define IB_NOTICE_TRAP_BAD_QKEY		cpu_to_be16(258)
 
 /*
- * Repress trap/notice flags
- */
-#define IB_NOTICE_REPRESS_LLI_THRESH	(1 << 0)
-#define IB_NOTICE_REPRESS_EBO_THRESH	(1 << 1)
-#define IB_NOTICE_REPRESS_FLOW_UPDATE	(1 << 2)
-#define IB_NOTICE_REPRESS_CAP_MASK_CHG	(1 << 3)
-#define IB_NOTICE_REPRESS_SYS_GUID_CHG	(1 << 4)
-#define IB_NOTICE_REPRESS_BAD_MKEY	(1 << 5)
-#define IB_NOTICE_REPRESS_BAD_PKEY	(1 << 6)
-#define IB_NOTICE_REPRESS_BAD_QKEY	(1 << 7)
-
-/*
  * Generic trap/notice other local changes flags (trap 144).
  */
 #define IB_NOTICE_TRAP_LSE_CHG		0x04	/* Link Speed Enable changed */
-- 
cgit v0.10.2


From 0629cb06cdf8f1a403ce71bce5b83380ae898e1a Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 3 Sep 2015 14:16:30 -0400
Subject: IB/core: Move SM class defines from ib_mad.h to ib_smi.h

When the hfi1 driver was added these definitions were moved from the qib driver
to ib_mad.h to be used by both qib and hfi1.  They should have been moved to
ib_smi.h instead.

Fixes: d4ab347005fb ("IB/core: Add core header changes needed for OPA")
Reviewed-by: Hal Rosenstock <hal@mellanox.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c
index f42bd0f..22e356c 100644
--- a/drivers/infiniband/hw/qib/qib_ruc.c
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/spinlock.h>
+#include <rdma/ib_smi.h>
 
 #include "qib.h"
 #include "qib_mad.h"
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 7f2cf85..188df91 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -144,31 +144,6 @@
 #define IB_NOTICE_PROD_ROUTER		cpu_to_be16(3)
 #define IB_NOTICE_PROD_CLASS_MGR	cpu_to_be16(4)
 
-/*
- * Generic trap/notice numbers
- */
-#define IB_NOTICE_TRAP_LLI_THRESH	cpu_to_be16(129)
-#define IB_NOTICE_TRAP_EBO_THRESH	cpu_to_be16(130)
-#define IB_NOTICE_TRAP_FLOW_UPDATE	cpu_to_be16(131)
-#define IB_NOTICE_TRAP_CAP_MASK_CHG	cpu_to_be16(144)
-#define IB_NOTICE_TRAP_SYS_GUID_CHG	cpu_to_be16(145)
-#define IB_NOTICE_TRAP_BAD_MKEY		cpu_to_be16(256)
-#define IB_NOTICE_TRAP_BAD_PKEY		cpu_to_be16(257)
-#define IB_NOTICE_TRAP_BAD_QKEY		cpu_to_be16(258)
-
-/*
- * Generic trap/notice other local changes flags (trap 144).
- */
-#define IB_NOTICE_TRAP_LSE_CHG		0x04	/* Link Speed Enable changed */
-#define IB_NOTICE_TRAP_LWE_CHG		0x02	/* Link Width Enable changed */
-#define IB_NOTICE_TRAP_NODE_DESC_CHG	0x01
-
-/*
- * Generic trap/notice M_Key volation flags in dr_trunc_hop (trap 256).
- */
-#define IB_NOTICE_TRAP_DR_NOTICE	0x80
-#define IB_NOTICE_TRAP_DR_TRUNC		0x40
-
 enum {
 	IB_MGMT_MAD_HDR = 24,
 	IB_MGMT_MAD_DATA = 232,
@@ -282,21 +257,6 @@ struct ib_class_port_info {
 	__be32			trap_qkey;
 };
 
-struct ib_node_info {
-	u8 base_version;
-	u8 class_version;
-	u8 node_type;
-	u8 num_ports;
-	__be64 sys_guid;
-	__be64 node_guid;
-	__be64 port_guid;
-	__be16 partition_cap;
-	__be16 device_id;
-	__be32 revision;
-	u8 local_port_num;
-	u8 vendor_id[3];
-} __packed;
-
 struct ib_mad_notice_attr {
 	u8 generic_type;
 	u8 prod_type_msb;
@@ -361,11 +321,6 @@ struct ib_mad_notice_attr {
 	} details;
 };
 
-struct ib_vl_weight_elem {
-	u8      vl;     /* VL is low 5 bits, upper 3 bits reserved */
-	u8      weight;
-};
-
 /**
  * ib_mad_send_buf - MAD data buffer and work request for sends.
  * @next: A pointer used to chain together MADs for posting.
diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h
index 98b9086..b439e98 100644
--- a/include/rdma/ib_smi.h
+++ b/include/rdma/ib_smi.h
@@ -119,10 +119,57 @@ struct ib_port_info {
 	u8 link_roundtrip_latency[3];
 };
 
+struct ib_node_info {
+	u8 base_version;
+	u8 class_version;
+	u8 node_type;
+	u8 num_ports;
+	__be64 sys_guid;
+	__be64 node_guid;
+	__be64 port_guid;
+	__be16 partition_cap;
+	__be16 device_id;
+	__be32 revision;
+	u8 local_port_num;
+	u8 vendor_id[3];
+} __packed;
+
+struct ib_vl_weight_elem {
+	u8      vl;     /* IB: VL is low 4 bits, upper 4 bits reserved */
+                        /* OPA: VL is low 5 bits, upper 3 bits reserved */
+	u8      weight;
+};
+
 static inline u8
 ib_get_smp_direction(struct ib_smp *smp)
 {
 	return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
 }
 
+/*
+ * SM Trap/Notice numbers
+ */
+#define IB_NOTICE_TRAP_LLI_THRESH	cpu_to_be16(129)
+#define IB_NOTICE_TRAP_EBO_THRESH	cpu_to_be16(130)
+#define IB_NOTICE_TRAP_FLOW_UPDATE	cpu_to_be16(131)
+#define IB_NOTICE_TRAP_CAP_MASK_CHG	cpu_to_be16(144)
+#define IB_NOTICE_TRAP_SYS_GUID_CHG	cpu_to_be16(145)
+#define IB_NOTICE_TRAP_BAD_MKEY		cpu_to_be16(256)
+#define IB_NOTICE_TRAP_BAD_PKEY		cpu_to_be16(257)
+#define IB_NOTICE_TRAP_BAD_QKEY		cpu_to_be16(258)
+
+/*
+ * Other local changes flags (trap 144).
+ */
+#define IB_NOTICE_TRAP_LSE_CHG		0x04	/* Link Speed Enable changed */
+#define IB_NOTICE_TRAP_LWE_CHG		0x02	/* Link Width Enable changed */
+#define IB_NOTICE_TRAP_NODE_DESC_CHG	0x01
+
+/*
+ * M_Key volation flags in dr_trunc_hop (trap 256).
+ */
+#define IB_NOTICE_TRAP_DR_NOTICE	0x80
+#define IB_NOTICE_TRAP_DR_TRUNC		0x40
+
+
 #endif /* IB_SMI_H */
-- 
cgit v0.10.2


From 7fbc67df2cd6d0b72fd5d6d3acaa79ab6f5b0224 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Mon, 24 Aug 2015 19:04:51 +0300
Subject: IB/srp: Fix possible protection fault

srp_destroy_qp is designed to indicate we are safe to continue with
freeing the channel resources by modifying the qp error state,
posting a dummy wr on the queue-pair and waiting for it to flush.
This also holds for the channel registration pool as we are unmapping
the memory region when handling a scsi response. Destroying the
channel registration pool before we make sure we processed all the
inflight IO might introduce a use-after-free of the registration pool.

This use-after-free is demonstrated in the stack trace below where
srp is trying to unmap a used FMR after the fmr_pool was already destroyed.

general protection fault: 0000 [#1] SMP
RIP: 0010:[<ffffffff8151121b>]  [<ffffffff8151121b>] _raw_spin_lock_irqsave+0x1b/0x50
Call Trace:
 [<ffffffffa055d88a>] ib_fmr_pool_unmap+0x1a/0xb0 [ib_core]
 [<ffffffffa06c00ed>] srp_unmap_data.isra.28+0x17d/0x250 [ib_srp]
 [<ffffffffa06c01eb>] srp_free_req+0x2b/0x60 [ib_srp]
 [<ffffffffa06c0c94>] srp_recv_completion+0x174/0x580 [ib_srp]
 [<ffffffffa04580fe>] mlx4_eq_int+0x4de/0xe50 [mlx4_core]
 [<ffffffffa0458b00>] mlx4_msi_x_interrupt+0x10/0x20 [mlx4_core]
 [<ffffffff810abc45>] handle_irq_event_percpu+0x35/0x1b0
 [<ffffffff810abdf2>] handle_irq_event+0x32/0x50
 [<ffffffff810ae5cf>] handle_edge_irq+0x6f/0x120
 [<ffffffff8100455a>] handle_irq+0x1a/0x30
 [<ffffffff8151b475>] do_IRQ+0x45/0xb0
 [<ffffffff8151162d>] common_interrupt+0x6d/0x6d
 [<ffffffff813e4d2f>] cpuidle_enter_state+0x4f/0xc0
 [<ffffffff813e4e6c>] cpuidle_idle_call+0xcc/0x210
 [<ffffffff8100b9ea>] arch_cpu_idle+0xa/0x30
 [<ffffffff810ab1e1>] cpu_startup_entry+0xe1/0x270
 [<ffffffff81030b3a>] start_secondary+0x21a/0x2c0

Reported-by: Eliott Kespi <eliottk@mellanox.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index ca98d3b..b481490 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -554,9 +554,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 				     "FR pool allocation failed (%d)\n", ret);
 			goto err_qp;
 		}
-		if (ch->fr_pool)
-			srp_destroy_fr_pool(ch->fr_pool);
-		ch->fr_pool = fr_pool;
 	} else if (dev->use_fmr) {
 		fmr_pool = srp_alloc_fmr_pool(target);
 		if (IS_ERR(fmr_pool)) {
@@ -565,9 +562,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 				     "FMR pool allocation failed (%d)\n", ret);
 			goto err_qp;
 		}
-		if (ch->fmr_pool)
-			ib_destroy_fmr_pool(ch->fmr_pool);
-		ch->fmr_pool = fmr_pool;
 	}
 
 	if (ch->qp)
@@ -581,6 +575,16 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 	ch->recv_cq = recv_cq;
 	ch->send_cq = send_cq;
 
+	if (dev->use_fast_reg) {
+		if (ch->fr_pool)
+			srp_destroy_fr_pool(ch->fr_pool);
+		ch->fr_pool = fr_pool;
+	} else if (dev->use_fmr) {
+		if (ch->fmr_pool)
+			ib_destroy_fmr_pool(ch->fmr_pool);
+		ch->fmr_pool = fmr_pool;
+	}
+
 	kfree(init_attr);
 	return 0;
 
-- 
cgit v0.10.2


From c3acdc06a95ff20d920220ecb931186b0bb22c42 Mon Sep 17 00:00:00 2001
From: Doug Ledford <dledford@redhat.com>
Date: Thu, 3 Sep 2015 17:05:58 -0400
Subject: IB/ipoib: Clean up send-only multicast joins

Even though we don't expect the group to be created by the SM we
sill need to provide all the parameters to force the SM to validate
they are correct.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 0d23e05..5e2db3b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -448,8 +448,7 @@ out_locked:
 	return status;
 }
 
-static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
-			     int create)
+static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_sa_multicast *multicast;
@@ -471,7 +470,14 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 		IB_SA_MCMEMBER_REC_PKEY		|
 		IB_SA_MCMEMBER_REC_JOIN_STATE;
 
-	if (create) {
+	if (mcast != priv->broadcast) {
+		/*
+		 * RFC 4391:
+		 *  The MGID MUST use the same P_Key, Q_Key, SL, MTU,
+		 *  and HopLimit as those used in the broadcast-GID.  The rest
+		 *  of attributes SHOULD follow the values used in the
+		 *  broadcast-GID as well.
+		 */
 		comp_mask |=
 			IB_SA_MCMEMBER_REC_QKEY			|
 			IB_SA_MCMEMBER_REC_MTU_SELECTOR		|
@@ -492,6 +498,22 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 		rec.sl		  = priv->broadcast->mcmember.sl;
 		rec.flow_label	  = priv->broadcast->mcmember.flow_label;
 		rec.hop_limit	  = priv->broadcast->mcmember.hop_limit;
+
+		/*
+		 * Historically Linux IPoIB has never properly supported SEND
+		 * ONLY join. It emulated it by not providing all the required
+		 * attributes, which is enough to prevent group creation and
+		 * detect if there are full members or not. A major problem
+		 * with supporting SEND ONLY is detecting when the group is
+		 * auto-destroyed as IPoIB will cache the MLID..
+		 */
+#if 1
+		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+			comp_mask &= ~IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+#else
+		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+			rec.join_state = 4;
+#endif
 	}
 
 	multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
@@ -517,7 +539,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
 	struct ib_port_attr port_attr;
 	unsigned long delay_until = 0;
 	struct ipoib_mcast *mcast = NULL;
-	int create = 1;
 
 	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
 		return;
@@ -566,7 +587,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
 		if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
 		    !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) {
 			mcast = priv->broadcast;
-			create = 0;
 			if (mcast->backoff > 1 &&
 			    time_before(jiffies, mcast->delay_until)) {
 				delay_until = mcast->delay_until;
@@ -590,12 +610,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
 				/* Found the next unjoined group */
 				init_completion(&mcast->done);
 				set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-				if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
-					create = 0;
-				else
-					create = 1;
 				spin_unlock_irq(&priv->lock);
-				ipoib_mcast_join(dev, mcast, create);
+				ipoib_mcast_join(dev, mcast);
 				spin_lock_irq(&priv->lock);
 			} else if (!delay_until ||
 				 time_before(mcast->delay_until, delay_until))
@@ -618,7 +634,7 @@ out:
 	}
 	spin_unlock_irq(&priv->lock);
 	if (mcast)
-		ipoib_mcast_join(dev, mcast, create);
+		ipoib_mcast_join(dev, mcast);
 }
 
 int ipoib_mcast_start_thread(struct net_device *dev)
-- 
cgit v0.10.2


From d1178cbcdcf91900ccf10a177350d7945703c151 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Fri, 21 Aug 2015 17:34:13 -0600
Subject: IB/ipoib: Suppress warning for send only join failures

We expect send only joins to fail, it just means there are no listeners
for the group. The correct thing to do is silently drop the packet
at source.

Eg avahi will full join 224.0.0.251 which causes a send only IGMP packet
to 224.0.0.22, and then a warning level kmessage like this:

 ib0: sendonly multicast join failed for ff12:401b:ffff:0000:0000:0000:0000:0016, status -22

If there is no IP router listening to IGMP.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 5e2db3b..09a1748 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -393,8 +393,13 @@ static int ipoib_mcast_join_complete(int status,
 			goto out_locked;
 		}
 	} else {
-		if (mcast->logcount++ < 20) {
-			if (status == -ETIMEDOUT || status == -EAGAIN) {
+		bool silent_fail =
+		    test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+		    status == -EINVAL;
+
+		if (mcast->logcount < 20) {
+			if (status == -ETIMEDOUT || status == -EAGAIN ||
+			    silent_fail) {
 				ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n",
 						test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
 						mcast->mcmember.mgid.raw, status);
@@ -403,6 +408,9 @@ static int ipoib_mcast_join_complete(int status,
 						test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
 					   mcast->mcmember.mgid.raw, status);
 			}
+
+			if (!silent_fail)
+				mcast->logcount++;
 		}
 
 		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
-- 
cgit v0.10.2