From bc1db9af731a74c7eca04df5936214c800774113 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Fri, 9 Apr 2010 17:13:50 -0700
Subject: IB: Explicitly rule out llseek to avoid BKL in default_llseek()

Several RDMA user-access drivers have file_operations structures with
no .llseek method set.  None of the drivers actually do anything with
f_pos, so this means llseek is essentially a NOP, instead of returning
an error as leaving other file_operations methods unimplemented would
do.  This is mostly harmless, except that a NULL .llseek means that
default_llseek() is used, and this function grabs the BKL, which we
would like to avoid.

Since llseek does nothing useful on these files, we would like it to
return an error to userspace instead of silently grabbing the BKL and
succeeding.  For nearly all of the file types, we take the
belt-and-suspenders approach of setting the .llseek method to
no_llseek and also calling nonseekable_open(); the exception is the
uverbs_event files, which are created with anon_inode_getfile(), which
already sets f_mode the same way as nonseekable_open() would.

This work is motivated by Arnd Bergmann's bkl-removal tree.

Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 512b1c4..4647484 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -1181,7 +1181,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp)
 	file->filp = filp;
 	file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
 
-	return 0;
+	return nonseekable_open(inode, filp);
 }
 
 static int ib_ucm_close(struct inode *inode, struct file *filp)
@@ -1229,6 +1229,7 @@ static const struct file_operations ucm_fops = {
 	.release = ib_ucm_close,
 	.write	 = ib_ucm_write,
 	.poll    = ib_ucm_poll,
+	.llseek	 = no_llseek,
 };
 
 static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 4618508..ac7edc2 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1220,7 +1220,8 @@ static int ucma_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = file;
 	file->filp = filp;
-	return 0;
+
+	return nonseekable_open(inode, filp);
 }
 
 static int ucma_close(struct inode *inode, struct file *filp)
@@ -1250,6 +1251,7 @@ static const struct file_operations ucma_fops = {
 	.release = ucma_close,
 	.write	 = ucma_write,
 	.poll    = ucma_poll,
+	.llseek	 = no_llseek,
 };
 
 static struct miscdevice ucma_misc = {
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index e7db054..6babb72 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -781,7 +781,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 {
 	struct ib_umad_port *port;
 	struct ib_umad_file *file;
-	int ret = 0;
+	int ret;
 
 	port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
 	if (port)
@@ -814,6 +814,8 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 
 	list_add_tail(&file->port_list, &port->file_list);
 
+	ret = nonseekable_open(inode, filp);
+
 out:
 	mutex_unlock(&port->file_mutex);
 	return ret;
@@ -866,7 +868,8 @@ static const struct file_operations umad_fops = {
 	.compat_ioctl	= ib_umad_compat_ioctl,
 #endif
 	.open		= ib_umad_open,
-	.release	= ib_umad_close
+	.release	= ib_umad_close,
+	.llseek		= no_llseek,
 };
 
 static int ib_umad_sm_open(struct inode *inode, struct file *filp)
@@ -903,7 +906,7 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = port;
 
-	return 0;
+	return nonseekable_open(inode, filp);
 
 fail:
 	kref_put(&port->umad_dev->ref, ib_umad_release_dev);
@@ -933,7 +936,8 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
 static const struct file_operations umad_sm_fops = {
 	.owner	 = THIS_MODULE,
 	.open	 = ib_umad_sm_open,
-	.release = ib_umad_sm_close
+	.release = ib_umad_sm_close,
+	.llseek	 = no_llseek,
 };
 
 static struct ib_client umad_client = {
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index fb35262..ec83e9f 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -369,7 +369,8 @@ static const struct file_operations uverbs_event_fops = {
 	.read	 = ib_uverbs_event_read,
 	.poll    = ib_uverbs_event_poll,
 	.release = ib_uverbs_event_close,
-	.fasync  = ib_uverbs_event_fasync
+	.fasync  = ib_uverbs_event_fasync,
+	.llseek	 = no_llseek,
 };
 
 void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
@@ -623,7 +624,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = file;
 
-	return 0;
+	return nonseekable_open(inode, filp);
 
 err_module:
 	module_put(dev->ib_dev->owner);
@@ -651,7 +652,8 @@ static const struct file_operations uverbs_fops = {
 	.owner	 = THIS_MODULE,
 	.write	 = ib_uverbs_write,
 	.open	 = ib_uverbs_open,
-	.release = ib_uverbs_close
+	.release = ib_uverbs_close,
+	.llseek	 = no_llseek,
 };
 
 static const struct file_operations uverbs_mmap_fops = {
@@ -659,7 +661,8 @@ static const struct file_operations uverbs_mmap_fops = {
 	.write	 = ib_uverbs_write,
 	.mmap    = ib_uverbs_mmap,
 	.open	 = ib_uverbs_open,
-	.release = ib_uverbs_close
+	.release = ib_uverbs_close,
+	.llseek	 = no_llseek,
 };
 
 static struct ib_client uverbs_client = {
-- 
cgit v0.10.2


From 7960d6b9de7716e9080b47f6dc4d415d967e032d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 2 Apr 2010 04:29:38 +0000
Subject: RDMA/cxgb3: Use the dma state API instead of pci equivalents

The DMA API is preferred; no functional change.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index 35f286f..005b7b5 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -174,7 +174,7 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
 		kfree(cq->sw_queue);
 		return -ENOMEM;
 	}
-	pci_unmap_addr_set(cq, mapping, cq->dma_addr);
+	dma_unmap_addr_set(cq, mapping, cq->dma_addr);
 	memset(cq->queue, 0, size);
 	setup.id = cq->cqid;
 	setup.base_addr = (u64) (cq->dma_addr);
@@ -297,7 +297,7 @@ int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
 		goto err4;
 
 	memset(wq->queue, 0, depth * sizeof(union t3_wr));
-	pci_unmap_addr_set(wq, mapping, wq->dma_addr);
+	dma_unmap_addr_set(wq, mapping, wq->dma_addr);
 	wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
 	if (!kernel_domain)
 		wq->udb = (u64)rdev_p->rnic_info.udbell_physbase +
@@ -325,7 +325,7 @@ int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
 	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
 			  (1UL << (cq->size_log2))
 			  * sizeof(struct t3_cqe), cq->queue,
-			  pci_unmap_addr(cq, mapping));
+			  dma_unmap_addr(cq, mapping));
 	cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
 	return err;
 }
@@ -336,7 +336,7 @@ int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq,
 	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
 			  (1UL << (wq->size_log2))
 			  * sizeof(union t3_wr), wq->queue,
-			  pci_unmap_addr(wq, mapping));
+			  dma_unmap_addr(wq, mapping));
 	kfree(wq->sq);
 	cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2));
 	kfree(wq->rq);
@@ -537,7 +537,7 @@ static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
 		err = -ENOMEM;
 		goto err;
 	}
-	pci_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
+	dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
 			   rdev_p->ctrl_qp.dma_addr);
 	rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
 	memset(rdev_p->ctrl_qp.workq, 0,
@@ -583,7 +583,7 @@ static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p)
 	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
 			  (1UL << T3_CTRL_QP_SIZE_LOG2)
 			  * sizeof(union t3_wr), rdev_p->ctrl_qp.workq,
-			  pci_unmap_addr(&rdev_p->ctrl_qp, mapping));
+			  dma_unmap_addr(&rdev_p->ctrl_qp, mapping));
 	return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID);
 }
 
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
index 073373c..8f0caf7 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -71,7 +71,7 @@ struct cxio_hal_ctrl_qp {
 	wait_queue_head_t waitq;/* wait for RspQ/CQE msg */
 	union t3_wr *workq;	/* the work request queue */
 	dma_addr_t dma_addr;	/* pci bus address of the workq */
-	DECLARE_PCI_UNMAP_ADDR(mapping)
+	DEFINE_DMA_UNMAP_ADDR(mapping);
 	void __iomem *doorbell;
 };
 
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
index 15073b2..e5ddb63 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_wr.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h
@@ -691,7 +691,7 @@ struct t3_swrq {
 struct t3_wq {
 	union t3_wr *queue;		/* DMA accessable memory */
 	dma_addr_t dma_addr;		/* DMA address for HW */
-	DECLARE_PCI_UNMAP_ADDR(mapping)	/* unmap kruft */
+	DEFINE_DMA_UNMAP_ADDR(mapping); /* unmap kruft */
 	u32 error;			/* 1 once we go to ERROR */
 	u32 qpid;
 	u32 wptr;			/* idx to next available WR slot */
@@ -718,7 +718,7 @@ struct t3_cq {
 	u32 wptr;
 	u32 size_log2;
 	dma_addr_t dma_addr;
-	DECLARE_PCI_UNMAP_ADDR(mapping)
+	DEFINE_DMA_UNMAP_ADDR(mapping);
 	struct t3_cqe *queue;
 	struct t3_cqe *sw_queue;
 	u32 sw_rptr;
-- 
cgit v0.10.2


From 73a203d2014f50d874b9e40083ad481ca70408e8 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Mon, 5 Apr 2010 19:59:57 +0000
Subject: RDMA/cxgb3: Don't free skbs on NET_XMIT_* indications from LLD

The low level cxgb3 driver can return NET_XMIT_CN and friends.
The iw_cxgb3 driver should _not_ treat these as errors.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index 4fef032..cfd6db0 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -151,7 +151,7 @@ int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_entry *l2
 		return -EIO;
 	}
 	error = l2t_send(tdev, skb, l2e);
-	if (error)
+	if (error < 0)
 		kfree_skb(skb);
 	return error;
 }
@@ -167,7 +167,7 @@ int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb)
 		return -EIO;
 	}
 	error = cxgb3_ofld_send(tdev, skb);
-	if (error)
+	if (error < 0)
 		kfree_skb(skb);
 	return error;
 }
-- 
cgit v0.10.2


From 3a2baff783497321e8322ce29f3a33a21c0d88f5 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 2 Apr 2010 04:29:39 +0000
Subject: IB/mthca: Use the dma state API instead of pci equivalents

The DMA API is preferred; no functional change.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c
index c5ccc2d..b4e0cf4 100644
--- a/drivers/infiniband/hw/mthca/mthca_allocator.c
+++ b/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -211,7 +211,7 @@ int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
 		if (!buf->direct.buf)
 			return -ENOMEM;
 
-		pci_unmap_addr_set(&buf->direct, mapping, t);
+		dma_unmap_addr_set(&buf->direct, mapping, t);
 
 		memset(buf->direct.buf, 0, size);
 
@@ -251,7 +251,7 @@ int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
 				goto err_free;
 
 			dma_list[i] = t;
-			pci_unmap_addr_set(&buf->page_list[i], mapping, t);
+			dma_unmap_addr_set(&buf->page_list[i], mapping, t);
 
 			clear_page(buf->page_list[i].buf);
 		}
@@ -289,12 +289,12 @@ void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
 
 	if (is_direct)
 		dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
-				  pci_unmap_addr(&buf->direct, mapping));
+				  dma_unmap_addr(&buf->direct, mapping));
 	else {
 		for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
 			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
 					  buf->page_list[i].buf,
-					  pci_unmap_addr(&buf->page_list[i],
+					  dma_unmap_addr(&buf->page_list[i],
 							 mapping));
 		kfree(buf->page_list);
 	}
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
index 9388164..8e8c728 100644
--- a/drivers/infiniband/hw/mthca/mthca_eq.c
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -504,7 +504,7 @@ static int mthca_create_eq(struct mthca_dev *dev,
 			goto err_out_free_pages;
 
 		dma_list[i] = t;
-		pci_unmap_addr_set(&eq->page_list[i], mapping, t);
+		dma_unmap_addr_set(&eq->page_list[i], mapping, t);
 
 		clear_page(eq->page_list[i].buf);
 	}
@@ -579,7 +579,7 @@ static int mthca_create_eq(struct mthca_dev *dev,
 		if (eq->page_list[i].buf)
 			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
 					  eq->page_list[i].buf,
-					  pci_unmap_addr(&eq->page_list[i],
+					  dma_unmap_addr(&eq->page_list[i],
 							 mapping));
 
 	mthca_free_mailbox(dev, mailbox);
@@ -629,7 +629,7 @@ static void mthca_free_eq(struct mthca_dev *dev,
 	for (i = 0; i < npages; ++i)
 		pci_free_consistent(dev->pdev, PAGE_SIZE,
 				    eq->page_list[i].buf,
-				    pci_unmap_addr(&eq->page_list[i], mapping));
+				    dma_unmap_addr(&eq->page_list[i], mapping));
 
 	kfree(eq->page_list);
 	mthca_free_mailbox(dev, mailbox);
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
index 90f4c4d..596acc4 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.h
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -46,7 +46,7 @@
 
 struct mthca_buf_list {
 	void *buf;
-	DECLARE_PCI_UNMAP_ADDR(mapping)
+	DEFINE_DMA_UNMAP_ADDR(mapping);
 };
 
 union mthca_buf {
-- 
cgit v0.10.2


From cfdda9d764362ab77b11a410bb928400e6520d57 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Wed, 21 Apr 2010 15:30:06 -0700
Subject: RDMA/cxgb4: Add driver for Chelsio T4 RNIC

Add an RDMA/iWARP driver for Chelsio T4 Ethernet adapters.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 975adce..330d2a4 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -46,6 +46,7 @@ source "drivers/infiniband/hw/ipath/Kconfig"
 source "drivers/infiniband/hw/ehca/Kconfig"
 source "drivers/infiniband/hw/amso1100/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
+source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/nes/Kconfig"
 
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
index ed35e44..0c4e589 100644
--- a/drivers/infiniband/Makefile
+++ b/drivers/infiniband/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_IPATH)		+= hw/ipath/
 obj-$(CONFIG_INFINIBAND_EHCA)		+= hw/ehca/
 obj-$(CONFIG_INFINIBAND_AMSO1100)	+= hw/amso1100/
 obj-$(CONFIG_INFINIBAND_CXGB3)		+= hw/cxgb3/
+obj-$(CONFIG_INFINIBAND_CXGB4)		+= hw/cxgb4/
 obj-$(CONFIG_MLX4_INFINIBAND)		+= hw/mlx4/
 obj-$(CONFIG_INFINIBAND_NES)		+= hw/nes/
 obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig
new file mode 100644
index 0000000..ccb85ea
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/Kconfig
@@ -0,0 +1,18 @@
+config INFINIBAND_CXGB4
+	tristate "Chelsio T4 RDMA Driver"
+	depends on CHELSIO_T4 && INET
+	select GENERIC_ALLOCATOR
+	---help---
+	  This is an iWARP/RDMA driver for the Chelsio T4 1GbE and
+	  10GbE adapters.
+
+	  For general information about Chelsio and our products, visit
+	  our website at <http://www.chelsio.com>.
+
+	  For customer support, please visit our customer support page at
+	  <http://www.chelsio.com/support.htm>.
+
+	  Please send feedback to <linux-bugs@chelsio.com>.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called iw_cxgb4.
diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile
new file mode 100644
index 0000000..e31a499
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/Makefile
@@ -0,0 +1,5 @@
+EXTRA_CFLAGS += -Idrivers/net/cxgb4
+
+obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o
+
+iw_cxgb4-y :=  device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
new file mode 100644
index 0000000..07b068b
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -0,0 +1,2330 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+#include <net/route.h>
+
+#include "iw_cxgb4.h"
+
+static char *states[] = {
+	"idle",
+	"listen",
+	"connecting",
+	"mpa_wait_req",
+	"mpa_req_sent",
+	"mpa_req_rcvd",
+	"mpa_rep_sent",
+	"fpdu_mode",
+	"aborting",
+	"closing",
+	"moribund",
+	"dead",
+	NULL,
+};
+
+static int enable_tcp_timestamps;
+module_param(enable_tcp_timestamps, int, 0644);
+MODULE_PARM_DESC(enable_tcp_timestamps, "Enable tcp timestamps (default=0)");
+
+static int enable_tcp_sack;
+module_param(enable_tcp_sack, int, 0644);
+MODULE_PARM_DESC(enable_tcp_sack, "Enable tcp SACK (default=0)");
+
+static int enable_tcp_window_scaling = 1;
+module_param(enable_tcp_window_scaling, int, 0644);
+MODULE_PARM_DESC(enable_tcp_window_scaling,
+		 "Enable tcp window scaling (default=1)");
+
+int c4iw_debug;
+module_param(c4iw_debug, int, 0644);
+MODULE_PARM_DESC(c4iw_debug, "Enable debug logging (default=0)");
+
+static int peer2peer;
+module_param(peer2peer, int, 0644);
+MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)");
+
+static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ;
+module_param(p2p_type, int, 0644);
+MODULE_PARM_DESC(p2p_type, "RDMAP opcode to use for the RTR message: "
+			   "1=RDMA_READ 0=RDMA_WRITE (default 1)");
+
+static int ep_timeout_secs = 60;
+module_param(ep_timeout_secs, int, 0644);
+MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
+				   "in seconds (default=60)");
+
+static int mpa_rev = 1;
+module_param(mpa_rev, int, 0644);
+MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
+		 "1 is spec compliant. (default=1)");
+
+static int markers_enabled;
+module_param(markers_enabled, int, 0644);
+MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)");
+
+static int crc_enabled = 1;
+module_param(crc_enabled, int, 0644);
+MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)");
+
+static int rcv_win = 256 * 1024;
+module_param(rcv_win, int, 0644);
+MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)");
+
+static int snd_win = 32 * 1024;
+module_param(snd_win, int, 0644);
+MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
+
+static void process_work(struct work_struct *work);
+static struct workqueue_struct *workq;
+static DECLARE_WORK(skb_work, process_work);
+
+static struct sk_buff_head rxq;
+static c4iw_handler_func work_handlers[NUM_CPL_CMDS];
+c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS];
+
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
+static void ep_timeout(unsigned long arg);
+static void connect_reply_upcall(struct c4iw_ep *ep, int status);
+
+static void start_ep_timer(struct c4iw_ep *ep)
+{
+	PDBG("%s ep %p\n", __func__, ep);
+	if (timer_pending(&ep->timer)) {
+		PDBG("%s stopped / restarted timer ep %p\n", __func__, ep);
+		del_timer_sync(&ep->timer);
+	} else
+		c4iw_get_ep(&ep->com);
+	ep->timer.expires = jiffies + ep_timeout_secs * HZ;
+	ep->timer.data = (unsigned long)ep;
+	ep->timer.function = ep_timeout;
+	add_timer(&ep->timer);
+}
+
+static void stop_ep_timer(struct c4iw_ep *ep)
+{
+	PDBG("%s ep %p\n", __func__, ep);
+	if (!timer_pending(&ep->timer)) {
+		printk(KERN_ERR "%s timer stopped when its not running! "
+		       "ep %p state %u\n", __func__, ep, ep->com.state);
+		WARN_ON(1);
+		return;
+	}
+	del_timer_sync(&ep->timer);
+	c4iw_put_ep(&ep->com);
+}
+
+static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
+		  struct l2t_entry *l2e)
+{
+	int	error = 0;
+
+	if (c4iw_fatal_error(rdev)) {
+		kfree_skb(skb);
+		PDBG("%s - device in error state - dropping\n", __func__);
+		return -EIO;
+	}
+	error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e);
+	if (error < 0)
+		kfree_skb(skb);
+	return error;
+}
+
+int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb)
+{
+	int	error = 0;
+
+	if (c4iw_fatal_error(rdev)) {
+		kfree_skb(skb);
+		PDBG("%s - device in error state - dropping\n", __func__);
+		return -EIO;
+	}
+	error = cxgb4_ofld_send(rdev->lldi.ports[0], skb);
+	if (error < 0)
+		kfree_skb(skb);
+	return error;
+}
+
+static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb)
+{
+	struct cpl_tid_release *req;
+
+	skb = get_skb(skb, sizeof *req, GFP_KERNEL);
+	if (!skb)
+		return;
+	req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid));
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, 0);
+	c4iw_ofld_send(rdev, skb);
+	return;
+}
+
+static void set_emss(struct c4iw_ep *ep, u16 opt)
+{
+	ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - 40;
+	ep->mss = ep->emss;
+	if (GET_TCPOPT_TSTAMP(opt))
+		ep->emss -= 12;
+	if (ep->emss < 128)
+		ep->emss = 128;
+	PDBG("%s mss_idx %u mss %u emss=%u\n", __func__, GET_TCPOPT_MSS(opt),
+	     ep->mss, ep->emss);
+}
+
+static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc)
+{
+	unsigned long flags;
+	enum c4iw_ep_state state;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	state = epc->state;
+	spin_unlock_irqrestore(&epc->lock, flags);
+	return state;
+}
+
+static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
+{
+	epc->state = new;
+}
+
+static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]);
+	__state_set(epc, new);
+	spin_unlock_irqrestore(&epc->lock, flags);
+	return;
+}
+
+static void *alloc_ep(int size, gfp_t gfp)
+{
+	struct c4iw_ep_common *epc;
+
+	epc = kzalloc(size, gfp);
+	if (epc) {
+		kref_init(&epc->kref);
+		spin_lock_init(&epc->lock);
+		init_waitqueue_head(&epc->waitq);
+	}
+	PDBG("%s alloc ep %p\n", __func__, epc);
+	return epc;
+}
+
+void _c4iw_free_ep(struct kref *kref)
+{
+	struct c4iw_ep *ep;
+
+	ep = container_of(kref, struct c4iw_ep, com.kref);
+	PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+	if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
+		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
+		dst_release(ep->dst);
+		cxgb4_l2t_release(ep->l2t);
+	}
+	kfree(ep);
+}
+
+static void release_ep_resources(struct c4iw_ep *ep)
+{
+	set_bit(RELEASE_RESOURCES, &ep->com.flags);
+	c4iw_put_ep(&ep->com);
+}
+
+static void process_work(struct work_struct *work)
+{
+	struct sk_buff *skb = NULL;
+	struct c4iw_dev *dev;
+	struct cpl_act_establish *rpl = cplhdr(skb);
+	unsigned int opcode;
+	int ret;
+
+	while ((skb = skb_dequeue(&rxq))) {
+		rpl = cplhdr(skb);
+		dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
+		opcode = rpl->ot.opcode;
+
+		BUG_ON(!work_handlers[opcode]);
+		ret = work_handlers[opcode](dev, skb);
+		if (!ret)
+			kfree_skb(skb);
+	}
+}
+
+static int status2errno(int status)
+{
+	switch (status) {
+	case CPL_ERR_NONE:
+		return 0;
+	case CPL_ERR_CONN_RESET:
+		return -ECONNRESET;
+	case CPL_ERR_ARP_MISS:
+		return -EHOSTUNREACH;
+	case CPL_ERR_CONN_TIMEDOUT:
+		return -ETIMEDOUT;
+	case CPL_ERR_TCAM_FULL:
+		return -ENOMEM;
+	case CPL_ERR_CONN_EXIST:
+		return -EADDRINUSE;
+	default:
+		return -EIO;
+	}
+}
+
+/*
+ * Try and reuse skbs already allocated...
+ */
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)
+{
+	if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) {
+		skb_trim(skb, 0);
+		skb_get(skb);
+		skb_reset_transport_header(skb);
+	} else {
+		skb = alloc_skb(len, gfp);
+	}
+	return skb;
+}
+
+static struct rtable *find_route(struct c4iw_dev *dev, __be32 local_ip,
+				 __be32 peer_ip, __be16 local_port,
+				 __be16 peer_port, u8 tos)
+{
+	struct rtable *rt;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			 .ip4_u = {
+				   .daddr = peer_ip,
+				   .saddr = local_ip,
+				   .tos = tos}
+			 },
+		.proto = IPPROTO_TCP,
+		.uli_u = {
+			  .ports = {
+				    .sport = local_port,
+				    .dport = peer_port}
+			  }
+	};
+
+	if (ip_route_output_flow(&init_net, &rt, &fl, NULL, 0))
+		return NULL;
+	return rt;
+}
+
+static void arp_failure_discard(void *handle, struct sk_buff *skb)
+{
+	PDBG("%s c4iw_dev %p\n", __func__, handle);
+	kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for an active open.
+ */
+static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
+{
+	printk(KERN_ERR MOD "ARP failure duing connect\n");
+	kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
+ * and send it along.
+ */
+static void abort_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct c4iw_rdev *rdev = handle;
+	struct cpl_abort_req *req = cplhdr(skb);
+
+	PDBG("%s rdev %p\n", __func__, rdev);
+	req->cmd = CPL_ABORT_NO_RST;
+	c4iw_ofld_send(rdev, skb);
+}
+
+static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+	unsigned int flowclen = 80;
+	struct fw_flowc_wr *flowc;
+	int i;
+
+	skb = get_skb(skb, flowclen, GFP_KERNEL);
+	flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen);
+
+	flowc->op_to_nparams = cpu_to_be32(FW_WR_OP(FW_FLOWC_WR) |
+					   FW_FLOWC_WR_NPARAMS(8));
+	flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16(DIV_ROUND_UP(flowclen,
+					  16)) | FW_WR_FLOWID(ep->hwtid));
+
+	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+	flowc->mnemval[0].val = cpu_to_be32(0);
+	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+	flowc->mnemval[1].val = cpu_to_be32(ep->tx_chan);
+	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+	flowc->mnemval[2].val = cpu_to_be32(ep->tx_chan);
+	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+	flowc->mnemval[3].val = cpu_to_be32(ep->rss_qid);
+	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
+	flowc->mnemval[4].val = cpu_to_be32(ep->snd_seq);
+	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
+	flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq);
+	flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
+	flowc->mnemval[6].val = cpu_to_be32(snd_win);
+	flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
+	flowc->mnemval[7].val = cpu_to_be32(ep->emss);
+	/* Pad WR to 16 byte boundary */
+	flowc->mnemval[8].mnemonic = 0;
+	flowc->mnemval[8].val = 0;
+	for (i = 0; i < 9; i++) {
+		flowc->mnemval[i].r4[0] = 0;
+		flowc->mnemval[i].r4[1] = 0;
+		flowc->mnemval[i].r4[2] = 0;
+	}
+
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	c4iw_ofld_send(&ep->com.dev->rdev, skb);
+}
+
+static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
+{
+	struct cpl_close_con_req *req;
+	struct sk_buff *skb;
+	int wrlen = roundup(sizeof *req, 16);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	skb = get_skb(NULL, wrlen, gfp);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	req = (struct cpl_close_con_req *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	INIT_TP_WR(req, ep->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ,
+						    ep->hwtid));
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+	struct cpl_abort_req *req;
+	int wrlen = roundup(sizeof *req, 16);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	skb = get_skb(skb, wrlen, gfp);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+		       __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure);
+	req = (struct cpl_abort_req *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	INIT_TP_WR(req, ep->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
+	req->cmd = CPL_ABORT_SEND_RST;
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int send_connect(struct c4iw_ep *ep)
+{
+	struct cpl_act_open_req *req;
+	struct sk_buff *skb;
+	u64 opt0;
+	u32 opt2;
+	unsigned int mtu_idx;
+	int wscale;
+	int wrlen = roundup(sizeof *req, 16);
+
+	PDBG("%s ep %p atid %u\n", __func__, ep, ep->atid);
+
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+		       __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->txq_idx);
+
+	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
+	wscale = compute_wscale(rcv_win);
+	opt0 = KEEP_ALIVE(1) |
+	       WND_SCALE(wscale) |
+	       MSS_IDX(mtu_idx) |
+	       L2T_IDX(ep->l2t->idx) |
+	       TX_CHAN(ep->tx_chan) |
+	       SMAC_SEL(ep->smac_idx) |
+	       DSCP(ep->tos) |
+	       RCV_BUFSIZ(rcv_win>>10);
+	opt2 = RX_CHANNEL(0) |
+	       RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid);
+	if (enable_tcp_timestamps)
+		opt2 |= TSTAMPS_EN(1);
+	if (enable_tcp_sack)
+		opt2 |= SACK_EN(1);
+	if (wscale && enable_tcp_window_scaling)
+		opt2 |= WND_SCALE_EN(1);
+	t4_set_arp_err_handler(skb, NULL, act_open_req_arp_failure);
+
+	req = (struct cpl_act_open_req *) skb_put(skb, wrlen);
+	INIT_TP_WR(req, 0);
+	OPCODE_TID(req) = cpu_to_be32(
+		MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ((ep->rss_qid<<14)|ep->atid)));
+	req->local_port = ep->com.local_addr.sin_port;
+	req->peer_port = ep->com.remote_addr.sin_port;
+	req->local_ip = ep->com.local_addr.sin_addr.s_addr;
+	req->peer_ip = ep->com.remote_addr.sin_addr.s_addr;
+	req->opt0 = cpu_to_be64(opt0);
+	req->params = 0;
+	req->opt2 = cpu_to_be32(opt2);
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+	int mpalen, wrlen;
+	struct fw_ofld_tx_data_wr *req;
+	struct mpa_message *mpa;
+
+	PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+	BUG_ON(skb_cloned(skb));
+
+	mpalen = sizeof(*mpa) + ep->plen;
+	wrlen = roundup(mpalen + sizeof *req, 16);
+	skb = get_skb(skb, wrlen, GFP_KERNEL);
+	if (!skb) {
+		connect_reply_upcall(ep, -ENOMEM);
+		return;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+	req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	req->op_to_immdlen = cpu_to_be32(
+		FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+		FW_WR_COMPL(1) |
+		FW_WR_IMMDLEN(mpalen));
+	req->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(ep->hwtid) |
+		FW_WR_LEN16(wrlen >> 4));
+	req->plen = cpu_to_be32(mpalen);
+	req->tunnel_to_proxy = cpu_to_be32(
+		FW_OFLD_TX_DATA_WR_FLUSH(1) |
+		FW_OFLD_TX_DATA_WR_SHOVE(1));
+
+	mpa = (struct mpa_message *)(req + 1);
+	memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
+	mpa->flags = (crc_enabled ? MPA_CRC : 0) |
+		     (markers_enabled ? MPA_MARKERS : 0);
+	mpa->private_data_size = htons(ep->plen);
+	mpa->revision = mpa_rev;
+
+	if (ep->plen)
+		memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen);
+
+	/*
+	 * Reference the mpa skb.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function fw4_ack() will deref it.
+	 */
+	skb_get(skb);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	BUG_ON(ep->mpa_skb);
+	ep->mpa_skb = skb;
+	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+	start_ep_timer(ep);
+	state_set(&ep->com, MPA_REQ_SENT);
+	ep->mpa_attr.initiator = 1;
+	return;
+}
+
+static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
+{
+	int mpalen, wrlen;
+	struct fw_ofld_tx_data_wr *req;
+	struct mpa_message *mpa;
+	struct sk_buff *skb;
+
+	PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+	mpalen = sizeof(*mpa) + plen;
+	wrlen = roundup(mpalen + sizeof *req, 16);
+
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+	req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	req->op_to_immdlen = cpu_to_be32(
+		FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+		FW_WR_COMPL(1) |
+		FW_WR_IMMDLEN(mpalen));
+	req->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(ep->hwtid) |
+		FW_WR_LEN16(wrlen >> 4));
+	req->plen = cpu_to_be32(mpalen);
+	req->tunnel_to_proxy = cpu_to_be32(
+		FW_OFLD_TX_DATA_WR_FLUSH(1) |
+		FW_OFLD_TX_DATA_WR_SHOVE(1));
+
+	mpa = (struct mpa_message *)(req + 1);
+	memset(mpa, 0, sizeof(*mpa));
+	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+	mpa->flags = MPA_REJECT;
+	mpa->revision = mpa_rev;
+	mpa->private_data_size = htons(plen);
+	if (plen)
+		memcpy(mpa->private_data, pdata, plen);
+
+	/*
+	 * Reference the mpa skb again.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function fw4_ack() will deref it.
+	 */
+	skb_get(skb);
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	BUG_ON(ep->mpa_skb);
+	ep->mpa_skb = skb;
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
+{
+	int mpalen, wrlen;
+	struct fw_ofld_tx_data_wr *req;
+	struct mpa_message *mpa;
+	struct sk_buff *skb;
+
+	PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+	mpalen = sizeof(*mpa) + plen;
+	wrlen = roundup(mpalen + sizeof *req, 16);
+
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+	req = (struct fw_ofld_tx_data_wr *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	req->op_to_immdlen = cpu_to_be32(
+		FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+		FW_WR_COMPL(1) |
+		FW_WR_IMMDLEN(mpalen));
+	req->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(ep->hwtid) |
+		FW_WR_LEN16(wrlen >> 4));
+	req->plen = cpu_to_be32(mpalen);
+	req->tunnel_to_proxy = cpu_to_be32(
+		FW_OFLD_TX_DATA_WR_FLUSH(1) |
+		FW_OFLD_TX_DATA_WR_SHOVE(1));
+
+	mpa = (struct mpa_message *)(req + 1);
+	memset(mpa, 0, sizeof(*mpa));
+	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+	mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
+		     (markers_enabled ? MPA_MARKERS : 0);
+	mpa->revision = mpa_rev;
+	mpa->private_data_size = htons(plen);
+	if (plen)
+		memcpy(mpa->private_data, pdata, plen);
+
+	/*
+	 * Reference the mpa skb.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function fw4_ack() will deref it.
+	 */
+	skb_get(skb);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	ep->mpa_skb = skb;
+	state_set(&ep->com, MPA_REP_SENT);
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_act_establish *req = cplhdr(skb);
+	unsigned int tid = GET_TID(req);
+	unsigned int atid = GET_TID_TID(ntohl(req->tos_atid));
+	struct tid_info *t = dev->rdev.lldi.tids;
+
+	ep = lookup_atid(t, atid);
+
+	PDBG("%s ep %p tid %u snd_isn %u rcv_isn %u\n", __func__, ep, tid,
+	     be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn));
+
+	dst_confirm(ep->dst);
+
+	/* setup the hwtid for this connection */
+	ep->hwtid = tid;
+	cxgb4_insert_tid(t, ep, tid);
+
+	ep->snd_seq = be32_to_cpu(req->snd_isn);
+	ep->rcv_seq = be32_to_cpu(req->rcv_isn);
+
+	set_emss(ep, ntohs(req->tcp_opt));
+
+	/* dealloc the atid */
+	cxgb4_free_atid(t, atid);
+
+	/* start MPA negotiation */
+	send_flowc(ep, NULL);
+	send_mpa_req(ep, skb);
+
+	return 0;
+}
+
+static void close_complete_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CLOSE;
+	if (ep->com.cm_id) {
+		PDBG("close complete delivered ep %p cm_id %p tid %u\n",
+		     ep, ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+		ep->com.qp = NULL;
+	}
+}
+
+static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	close_complete_upcall(ep);
+	state_set(&ep->com, ABORTING);
+	return send_abort(ep, skb, gfp);
+}
+
+static void peer_close_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_DISCONNECT;
+	if (ep->com.cm_id) {
+		PDBG("peer close delivered ep %p cm_id %p tid %u\n",
+		     ep, ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+	}
+}
+
+static void peer_abort_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CLOSE;
+	event.status = -ECONNRESET;
+	if (ep->com.cm_id) {
+		PDBG("abort delivered ep %p cm_id %p tid %u\n", ep,
+		     ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+		ep->com.qp = NULL;
+	}
+}
+
+static void connect_reply_upcall(struct c4iw_ep *ep, int status)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, status);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CONNECT_REPLY;
+	event.status = status;
+	event.local_addr = ep->com.local_addr;
+	event.remote_addr = ep->com.remote_addr;
+
+	if ((status == 0) || (status == -ECONNREFUSED)) {
+		event.private_data_len = ep->plen;
+		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+	}
+	if (ep->com.cm_id) {
+		PDBG("%s ep %p tid %u status %d\n", __func__, ep,
+		     ep->hwtid, status);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+	}
+	if (status < 0) {
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+		ep->com.qp = NULL;
+	}
+}
+
+static void connect_request_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CONNECT_REQUEST;
+	event.local_addr = ep->com.local_addr;
+	event.remote_addr = ep->com.remote_addr;
+	event.private_data_len = ep->plen;
+	event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+	event.provider_data = ep;
+	if (state_read(&ep->parent_ep->com) != DEAD) {
+		c4iw_get_ep(&ep->com);
+		ep->parent_ep->com.cm_id->event_handler(
+						ep->parent_ep->com.cm_id,
+						&event);
+	}
+	c4iw_put_ep(&ep->parent_ep->com);
+	ep->parent_ep = NULL;
+}
+
+static void established_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_ESTABLISHED;
+	if (ep->com.cm_id) {
+		PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+	}
+}
+
+static int update_rx_credits(struct c4iw_ep *ep, u32 credits)
+{
+	struct cpl_rx_data_ack *req;
+	struct sk_buff *skb;
+	int wrlen = roundup(sizeof *req, 16);
+
+	PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits);
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n");
+		return 0;
+	}
+
+	req = (struct cpl_rx_data_ack *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	INIT_TP_WR(req, ep->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK,
+						    ep->hwtid));
+	req->credit_dack = cpu_to_be32(credits);
+	set_wr_txq(skb, CPL_PRIORITY_ACK, ep->txq_idx);
+	c4iw_ofld_send(&ep->com.dev->rdev, skb);
+	return credits;
+}
+
+static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+	struct mpa_message *mpa;
+	u16 plen;
+	struct c4iw_qp_attributes attrs;
+	enum c4iw_qp_attr_mask mask;
+	int err;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+	/*
+	 * Stop mpa timer.  If it expired, then the state has
+	 * changed and we bail since ep_timeout already aborted
+	 * the connection.
+	 */
+	stop_ep_timer(ep);
+	if (state_read(&ep->com) != MPA_REQ_SENT)
+		return;
+
+	/*
+	 * If we get more than the supported amount of private data
+	 * then we must fail this connection.
+	 */
+	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+		err = -EINVAL;
+		goto err;
+	}
+
+	/*
+	 * copy the new data into our accumulation buffer.
+	 */
+	skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+				  skb->len);
+	ep->mpa_pkt_len += skb->len;
+
+	/*
+	 * if we don't even have the mpa message, then bail.
+	 */
+	if (ep->mpa_pkt_len < sizeof(*mpa))
+		return;
+	mpa = (struct mpa_message *) ep->mpa_pkt;
+
+	/* Validate MPA header. */
+	if (mpa->revision != mpa_rev) {
+		err = -EPROTO;
+		goto err;
+	}
+	if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	plen = ntohs(mpa->private_data_size);
+
+	/*
+	 * Fail if there's too much private data.
+	 */
+	if (plen > MPA_MAX_PRIVATE_DATA) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	/*
+	 * If plen does not account for pkt size
+	 */
+	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	ep->plen = (u8) plen;
+
+	/*
+	 * If we don't have all the pdata yet, then bail.
+	 * We'll continue process when more data arrives.
+	 */
+	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+		return;
+
+	if (mpa->flags & MPA_REJECT) {
+		err = -ECONNREFUSED;
+		goto err;
+	}
+
+	/*
+	 * If we get here we have accumulated the entire mpa
+	 * start reply message including private data. And
+	 * the MPA header is valid.
+	 */
+	state_set(&ep->com, FPDU_MODE);
+	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+	ep->mpa_attr.recv_marker_enabled = markers_enabled;
+	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+	ep->mpa_attr.version = mpa_rev;
+	ep->mpa_attr.p2p_type = peer2peer ? p2p_type :
+					    FW_RI_INIT_P2PTYPE_DISABLED;
+	PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+	     "xmit_marker_enabled=%d, version=%d\n", __func__,
+	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
+
+	attrs.mpa_attr = ep->mpa_attr;
+	attrs.max_ird = ep->ird;
+	attrs.max_ord = ep->ord;
+	attrs.llp_stream_handle = ep;
+	attrs.next_state = C4IW_QP_STATE_RTS;
+
+	mask = C4IW_QP_ATTR_NEXT_STATE |
+	    C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR |
+	    C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD;
+
+	/* bind QP and TID with INIT_WR */
+	err = c4iw_modify_qp(ep->com.qp->rhp,
+			     ep->com.qp, mask, &attrs, 1);
+	if (err)
+		goto err;
+	goto out;
+err:
+	abort_connection(ep, skb, GFP_KERNEL);
+out:
+	connect_reply_upcall(ep, err);
+	return;
+}
+
+static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+	struct mpa_message *mpa;
+	u16 plen;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+	if (state_read(&ep->com) != MPA_REQ_WAIT)
+		return;
+
+	/*
+	 * If we get more than the supported amount of private data
+	 * then we must fail this connection.
+	 */
+	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+		stop_ep_timer(ep);
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+
+	/*
+	 * Copy the new data into our accumulation buffer.
+	 */
+	skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+				  skb->len);
+	ep->mpa_pkt_len += skb->len;
+
+	/*
+	 * If we don't even have the mpa message, then bail.
+	 * We'll continue process when more data arrives.
+	 */
+	if (ep->mpa_pkt_len < sizeof(*mpa))
+		return;
+
+	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+	stop_ep_timer(ep);
+	mpa = (struct mpa_message *) ep->mpa_pkt;
+
+	/*
+	 * Validate MPA Header.
+	 */
+	if (mpa->revision != mpa_rev) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	plen = ntohs(mpa->private_data_size);
+
+	/*
+	 * Fail if there's too much private data.
+	 */
+	if (plen > MPA_MAX_PRIVATE_DATA) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	/*
+	 * If plen does not account for pkt size
+	 */
+	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+	ep->plen = (u8) plen;
+
+	/*
+	 * If we don't have all the pdata yet, then bail.
+	 */
+	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+		return;
+
+	/*
+	 * If we get here we have accumulated the entire mpa
+	 * start reply message including private data.
+	 */
+	ep->mpa_attr.initiator = 0;
+	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+	ep->mpa_attr.recv_marker_enabled = markers_enabled;
+	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+	ep->mpa_attr.version = mpa_rev;
+	ep->mpa_attr.p2p_type = peer2peer ? p2p_type :
+					    FW_RI_INIT_P2PTYPE_DISABLED;
+	PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+	     "xmit_marker_enabled=%d, version=%d p2p_type=%d\n", __func__,
+	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
+	     ep->mpa_attr.p2p_type);
+
+	state_set(&ep->com, MPA_REQ_RCVD);
+
+	/* drive upcall */
+	connect_request_upcall(ep);
+	return;
+}
+
+static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_rx_data *hdr = cplhdr(skb);
+	unsigned int dlen = ntohs(hdr->len);
+	unsigned int tid = GET_TID(hdr);
+	struct tid_info *t = dev->rdev.lldi.tids;
+
+	ep = lookup_tid(t, tid);
+	PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen);
+	skb_pull(skb, sizeof(*hdr));
+	skb_trim(skb, dlen);
+
+	ep->rcv_seq += dlen;
+	BUG_ON(ep->rcv_seq != (ntohl(hdr->seq) + dlen));
+
+	/* update RX credits */
+	update_rx_credits(ep, dlen);
+
+	switch (state_read(&ep->com)) {
+	case MPA_REQ_SENT:
+		process_mpa_reply(ep, skb);
+		break;
+	case MPA_REQ_WAIT:
+		process_mpa_request(ep, skb);
+		break;
+	case MPA_REP_SENT:
+		break;
+	default:
+		printk(KERN_ERR MOD "%s Unexpected streaming data."
+		       " ep %p state %d tid %u\n",
+		       __func__, ep, state_read(&ep->com), ep->hwtid);
+
+		/*
+		 * The ep will timeout and inform the ULP of the failure.
+		 * See ep_timeout().
+		 */
+		break;
+	}
+	return 0;
+}
+
+static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
+	unsigned long flags;
+	int release = 0;
+	unsigned int tid = GET_TID(rpl);
+	struct tid_info *t = dev->rdev.lldi.tids;
+
+	ep = lookup_tid(t, tid);
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	BUG_ON(!ep);
+	spin_lock_irqsave(&ep->com.lock, flags);
+	switch (ep->com.state) {
+	case ABORTING:
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		break;
+	default:
+		printk(KERN_ERR "%s ep %p state %d\n",
+		     __func__, ep, ep->com.state);
+		break;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+
+	if (release)
+		release_ep_resources(ep);
+	return 0;
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int act_open_has_tid(int status)
+{
+	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+	       status != CPL_ERR_ARP_MISS;
+}
+
+static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_act_open_rpl *rpl = cplhdr(skb);
+	unsigned int atid = GET_TID_TID(GET_AOPEN_ATID(
+					ntohl(rpl->atid_status)));
+	struct tid_info *t = dev->rdev.lldi.tids;
+	int status = GET_AOPEN_STATUS(ntohl(rpl->atid_status));
+
+	ep = lookup_atid(t, atid);
+
+	PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid,
+	     status, status2errno(status));
+
+	if (status == CPL_ERR_RTX_NEG_ADVICE) {
+		printk(KERN_WARNING MOD "Connection problems for atid %u\n",
+			atid);
+		return 0;
+	}
+
+	connect_reply_upcall(ep, status2errno(status));
+	state_set(&ep->com, DEAD);
+
+	if (status && act_open_has_tid(status))
+		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl));
+
+	cxgb4_free_atid(t, atid);
+	dst_release(ep->dst);
+	cxgb4_l2t_release(ep->l2t);
+	c4iw_put_ep(&ep->com);
+
+	return 0;
+}
+
+static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_pass_open_rpl *rpl = cplhdr(skb);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int stid = GET_TID(rpl);
+	struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+
+	if (!ep) {
+		printk(KERN_ERR MOD "stid %d lookup failure!\n", stid);
+		return 0;
+	}
+	PDBG("%s ep %p status %d error %d\n", __func__, ep,
+	     rpl->status, status2errno(rpl->status));
+	ep->com.rpl_err = status2errno(rpl->status);
+	ep->com.rpl_done = 1;
+	wake_up(&ep->com.waitq);
+
+	return 0;
+}
+
+static int listen_stop(struct c4iw_listen_ep *ep)
+{
+	struct sk_buff *skb;
+	struct cpl_close_listsvr_req *req;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+		return -ENOMEM;
+	}
+	req = (struct cpl_close_listsvr_req *) skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, 0);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
+						    ep->stid));
+	req->reply_ctrl = cpu_to_be16(
+			  QUEUENO(ep->com.dev->rdev.lldi.rxq_ids[0]));
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, 0);
+	return c4iw_ofld_send(&ep->com.dev->rdev, skb);
+}
+
+static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_close_listsvr_rpl *rpl = cplhdr(skb);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int stid = GET_TID(rpl);
+	struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+
+	PDBG("%s ep %p\n", __func__, ep);
+	ep->com.rpl_err = status2errno(rpl->status);
+	ep->com.rpl_done = 1;
+	wake_up(&ep->com.waitq);
+	return 0;
+}
+
+static void accept_cr(struct c4iw_ep *ep, __be32 peer_ip, struct sk_buff *skb,
+		      struct cpl_pass_accept_req *req)
+{
+	struct cpl_pass_accept_rpl *rpl;
+	unsigned int mtu_idx;
+	u64 opt0;
+	u32 opt2;
+	int wscale;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	BUG_ON(skb_cloned(skb));
+	skb_trim(skb, sizeof(*rpl));
+	skb_get(skb);
+	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
+	wscale = compute_wscale(rcv_win);
+	opt0 = KEEP_ALIVE(1) |
+	       WND_SCALE(wscale) |
+	       MSS_IDX(mtu_idx) |
+	       L2T_IDX(ep->l2t->idx) |
+	       TX_CHAN(ep->tx_chan) |
+	       SMAC_SEL(ep->smac_idx) |
+	       DSCP(ep->tos) |
+	       RCV_BUFSIZ(rcv_win>>10);
+	opt2 = RX_CHANNEL(0) |
+	       RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid);
+
+	if (enable_tcp_timestamps && req->tcpopt.tstamp)
+		opt2 |= TSTAMPS_EN(1);
+	if (enable_tcp_sack && req->tcpopt.sack)
+		opt2 |= SACK_EN(1);
+	if (wscale && enable_tcp_window_scaling)
+		opt2 |= WND_SCALE_EN(1);
+
+	rpl = cplhdr(skb);
+	INIT_TP_WR(rpl, ep->hwtid);
+	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+				      ep->hwtid));
+	rpl->opt0 = cpu_to_be64(opt0);
+	rpl->opt2 = cpu_to_be32(opt2);
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->txq_idx);
+	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+
+	return;
+}
+
+static void reject_cr(struct c4iw_dev *dev, u32 hwtid, __be32 peer_ip,
+		      struct sk_buff *skb)
+{
+	PDBG("%s c4iw_dev %p tid %u peer_ip %x\n", __func__, dev, hwtid,
+	     peer_ip);
+	BUG_ON(skb_cloned(skb));
+	skb_trim(skb, sizeof(struct cpl_tid_release));
+	skb_get(skb);
+	release_tid(&dev->rdev, hwtid, skb);
+	return;
+}
+
+static void get_4tuple(struct cpl_pass_accept_req *req,
+		       __be32 *local_ip, __be32 *peer_ip,
+		       __be16 *local_port, __be16 *peer_port)
+{
+	int eth_len = G_ETH_HDR_LEN(be32_to_cpu(req->hdr_len));
+	int ip_len = G_IP_HDR_LEN(be32_to_cpu(req->hdr_len));
+	struct iphdr *ip = (struct iphdr *)((u8 *)(req + 1) + eth_len);
+	struct tcphdr *tcp = (struct tcphdr *)
+			     ((u8 *)(req + 1) + eth_len + ip_len);
+
+	PDBG("%s saddr 0x%x daddr 0x%x sport %u dport %u\n", __func__,
+	     ntohl(ip->saddr), ntohl(ip->daddr), ntohs(tcp->source),
+	     ntohs(tcp->dest));
+
+	*peer_ip = ip->saddr;
+	*local_ip = ip->daddr;
+	*peer_port = tcp->source;
+	*local_port = tcp->dest;
+
+	return;
+}
+
+static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *child_ep, *parent_ep;
+	struct cpl_pass_accept_req *req = cplhdr(skb);
+	unsigned int stid = GET_POPEN_TID(ntohl(req->tos_stid));
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int hwtid = GET_TID(req);
+	struct dst_entry *dst;
+	struct l2t_entry *l2t;
+	struct rtable *rt;
+	__be32 local_ip, peer_ip;
+	__be16 local_port, peer_port;
+	struct net_device *pdev;
+	u32 tx_chan, smac_idx;
+	u16 rss_qid;
+	u32 mtu;
+	int step;
+	int txq_idx;
+
+	parent_ep = lookup_stid(t, stid);
+	PDBG("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid);
+
+	get_4tuple(req, &local_ip, &peer_ip, &local_port, &peer_port);
+
+	if (state_read(&parent_ep->com) != LISTEN) {
+		printk(KERN_ERR "%s - listening ep not in LISTEN\n",
+		       __func__);
+		goto reject;
+	}
+
+	/* Find output route */
+	rt = find_route(dev, local_ip, peer_ip, local_port, peer_port,
+			GET_POPEN_TOS(ntohl(req->tos_stid)));
+	if (!rt) {
+		printk(KERN_ERR MOD "%s - failed to find dst entry!\n",
+		       __func__);
+		goto reject;
+	}
+	dst = &rt->u.dst;
+	if (dst->neighbour->dev->flags & IFF_LOOPBACK) {
+		pdev = ip_dev_find(&init_net, peer_ip);
+		BUG_ON(!pdev);
+		l2t = cxgb4_l2t_get(dev->rdev.lldi.l2t, dst->neighbour,
+				    pdev, 0);
+		mtu = pdev->mtu;
+		tx_chan = cxgb4_port_chan(pdev);
+		smac_idx = tx_chan << 1;
+		step = dev->rdev.lldi.ntxq / dev->rdev.lldi.nchan;
+		txq_idx = cxgb4_port_idx(pdev) * step;
+		step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan;
+		rss_qid = dev->rdev.lldi.rxq_ids[cxgb4_port_idx(pdev) * step];
+		dev_put(pdev);
+	} else {
+		l2t = cxgb4_l2t_get(dev->rdev.lldi.l2t, dst->neighbour,
+					dst->neighbour->dev, 0);
+		mtu = dst_mtu(dst);
+		tx_chan = cxgb4_port_chan(dst->neighbour->dev);
+		smac_idx = tx_chan << 1;
+		step = dev->rdev.lldi.ntxq / dev->rdev.lldi.nchan;
+		txq_idx = cxgb4_port_idx(dst->neighbour->dev) * step;
+		step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan;
+		rss_qid = dev->rdev.lldi.rxq_ids[
+			  cxgb4_port_idx(dst->neighbour->dev) * step];
+	}
+	if (!l2t) {
+		printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
+		       __func__);
+		dst_release(dst);
+		goto reject;
+	}
+
+	child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
+	if (!child_ep) {
+		printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n",
+		       __func__);
+		cxgb4_l2t_release(l2t);
+		dst_release(dst);
+		goto reject;
+	}
+	state_set(&child_ep->com, CONNECTING);
+	child_ep->com.dev = dev;
+	child_ep->com.cm_id = NULL;
+	child_ep->com.local_addr.sin_family = PF_INET;
+	child_ep->com.local_addr.sin_port = local_port;
+	child_ep->com.local_addr.sin_addr.s_addr = local_ip;
+	child_ep->com.remote_addr.sin_family = PF_INET;
+	child_ep->com.remote_addr.sin_port = peer_port;
+	child_ep->com.remote_addr.sin_addr.s_addr = peer_ip;
+	c4iw_get_ep(&parent_ep->com);
+	child_ep->parent_ep = parent_ep;
+	child_ep->tos = GET_POPEN_TOS(ntohl(req->tos_stid));
+	child_ep->l2t = l2t;
+	child_ep->dst = dst;
+	child_ep->hwtid = hwtid;
+	child_ep->tx_chan = tx_chan;
+	child_ep->smac_idx = smac_idx;
+	child_ep->rss_qid = rss_qid;
+	child_ep->mtu = mtu;
+	child_ep->txq_idx = txq_idx;
+
+	PDBG("%s tx_chan %u smac_idx %u rss_qid %u\n", __func__,
+	     tx_chan, smac_idx, rss_qid);
+
+	init_timer(&child_ep->timer);
+	cxgb4_insert_tid(t, child_ep, hwtid);
+	accept_cr(child_ep, peer_ip, skb, req);
+	goto out;
+reject:
+	reject_cr(dev, hwtid, peer_ip, skb);
+out:
+	return 0;
+}
+
+static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_pass_establish *req = cplhdr(skb);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(req);
+
+	ep = lookup_tid(t, tid);
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	ep->snd_seq = be32_to_cpu(req->snd_isn);
+	ep->rcv_seq = be32_to_cpu(req->rcv_isn);
+
+	set_emss(ep, ntohs(req->tcp_opt));
+
+	dst_confirm(ep->dst);
+	state_set(&ep->com, MPA_REQ_WAIT);
+	start_ep_timer(ep);
+	send_flowc(ep, skb);
+
+	return 0;
+}
+
+static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_peer_close *hdr = cplhdr(skb);
+	struct c4iw_ep *ep;
+	struct c4iw_qp_attributes attrs;
+	unsigned long flags;
+	int disconnect = 1;
+	int release = 0;
+	int closing = 0;
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(hdr);
+	int start_timer = 0;
+	int stop_timer = 0;
+
+	ep = lookup_tid(t, tid);
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	dst_confirm(ep->dst);
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+	switch (ep->com.state) {
+	case MPA_REQ_WAIT:
+		__state_set(&ep->com, CLOSING);
+		break;
+	case MPA_REQ_SENT:
+		__state_set(&ep->com, CLOSING);
+		connect_reply_upcall(ep, -ECONNRESET);
+		break;
+	case MPA_REQ_RCVD:
+
+		/*
+		 * We're gonna mark this puppy DEAD, but keep
+		 * the reference on it until the ULP accepts or
+		 * rejects the CR. Also wake up anyone waiting
+		 * in rdma connection migration (see c4iw_accept_cr()).
+		 */
+		__state_set(&ep->com, CLOSING);
+		ep->com.rpl_done = 1;
+		ep->com.rpl_err = -ECONNRESET;
+		PDBG("waking up ep %p tid %u\n", ep, ep->hwtid);
+		wake_up(&ep->com.waitq);
+		break;
+	case MPA_REP_SENT:
+		__state_set(&ep->com, CLOSING);
+		ep->com.rpl_done = 1;
+		ep->com.rpl_err = -ECONNRESET;
+		PDBG("waking up ep %p tid %u\n", ep, ep->hwtid);
+		wake_up(&ep->com.waitq);
+		break;
+	case FPDU_MODE:
+		start_timer = 1;
+		__state_set(&ep->com, CLOSING);
+		closing = 1;
+		peer_close_upcall(ep);
+		break;
+	case ABORTING:
+		disconnect = 0;
+		break;
+	case CLOSING:
+		__state_set(&ep->com, MORIBUND);
+		disconnect = 0;
+		break;
+	case MORIBUND:
+		stop_timer = 1;
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = C4IW_QP_STATE_IDLE;
+			c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+				       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+		}
+		close_complete_upcall(ep);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		disconnect = 0;
+		break;
+	case DEAD:
+		disconnect = 0;
+		break;
+	default:
+		BUG_ON(1);
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (closing) {
+		attrs.next_state = C4IW_QP_STATE_CLOSING;
+		c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+			       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+	}
+	if (start_timer)
+		start_ep_timer(ep);
+	if (stop_timer)
+		stop_ep_timer(ep);
+	if (disconnect)
+		c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+	if (release)
+		release_ep_resources(ep);
+	return 0;
+}
+
+/*
+ * Returns whether an ABORT_REQ_RSS message is a negative advice.
+ */
+static int is_neg_adv_abort(unsigned int status)
+{
+	return status == CPL_ERR_RTX_NEG_ADVICE ||
+	       status == CPL_ERR_PERSIST_NEG_ADVICE;
+}
+
+static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+	struct c4iw_ep *ep;
+	struct cpl_abort_rpl *rpl;
+	struct sk_buff *rpl_skb;
+	struct c4iw_qp_attributes attrs;
+	int ret;
+	int release = 0;
+	unsigned long flags;
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(req);
+	int stop_timer = 0;
+
+	ep = lookup_tid(t, tid);
+	if (is_neg_adv_abort(req->status)) {
+		PDBG("%s neg_adv_abort ep %p tid %u\n", __func__, ep,
+		     ep->hwtid);
+		return 0;
+	}
+	spin_lock_irqsave(&ep->com.lock, flags);
+	PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
+	     ep->com.state);
+	switch (ep->com.state) {
+	case CONNECTING:
+		break;
+	case MPA_REQ_WAIT:
+		stop_timer = 1;
+		break;
+	case MPA_REQ_SENT:
+		stop_timer = 1;
+		connect_reply_upcall(ep, -ECONNRESET);
+		break;
+	case MPA_REP_SENT:
+		ep->com.rpl_done = 1;
+		ep->com.rpl_err = -ECONNRESET;
+		PDBG("waking up ep %p\n", ep);
+		wake_up(&ep->com.waitq);
+		break;
+	case MPA_REQ_RCVD:
+
+		/*
+		 * We're gonna mark this puppy DEAD, but keep
+		 * the reference on it until the ULP accepts or
+		 * rejects the CR. Also wake up anyone waiting
+		 * in rdma connection migration (see c4iw_accept_cr()).
+		 */
+		ep->com.rpl_done = 1;
+		ep->com.rpl_err = -ECONNRESET;
+		PDBG("waking up ep %p tid %u\n", ep, ep->hwtid);
+		wake_up(&ep->com.waitq);
+		break;
+	case MORIBUND:
+	case CLOSING:
+		stop_timer = 1;
+		/*FALLTHROUGH*/
+	case FPDU_MODE:
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = C4IW_QP_STATE_ERROR;
+			ret = c4iw_modify_qp(ep->com.qp->rhp,
+				     ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
+				     &attrs, 1);
+			if (ret)
+				printk(KERN_ERR MOD
+				       "%s - qp <- error failed!\n",
+				       __func__);
+		}
+		peer_abort_upcall(ep);
+		break;
+	case ABORTING:
+		break;
+	case DEAD:
+		PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
+		spin_unlock_irqrestore(&ep->com.lock, flags);
+		return 0;
+	default:
+		BUG_ON(1);
+		break;
+	}
+	dst_confirm(ep->dst);
+	if (ep->com.state != ABORTING) {
+		__state_set(&ep->com, DEAD);
+		release = 1;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+
+	rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
+	if (!rpl_skb) {
+		printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
+		       __func__);
+		release = 1;
+		goto out;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl));
+	INIT_TP_WR(rpl, ep->hwtid);
+	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid));
+	rpl->cmd = CPL_ABORT_NO_RST;
+	c4iw_ofld_send(&ep->com.dev->rdev, rpl_skb);
+out:
+	if (stop_timer)
+		stop_ep_timer(ep);
+	if (release)
+		release_ep_resources(ep);
+	return 0;
+}
+
+static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct c4iw_qp_attributes attrs;
+	struct cpl_close_con_rpl *rpl = cplhdr(skb);
+	unsigned long flags;
+	int release = 0;
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(rpl);
+	int stop_timer = 0;
+
+	ep = lookup_tid(t, tid);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	BUG_ON(!ep);
+
+	/* The cm_id may be null if we failed to connect */
+	spin_lock_irqsave(&ep->com.lock, flags);
+	switch (ep->com.state) {
+	case CLOSING:
+		__state_set(&ep->com, MORIBUND);
+		break;
+	case MORIBUND:
+		stop_timer = 1;
+		if ((ep->com.cm_id) && (ep->com.qp)) {
+			attrs.next_state = C4IW_QP_STATE_IDLE;
+			c4iw_modify_qp(ep->com.qp->rhp,
+					     ep->com.qp,
+					     C4IW_QP_ATTR_NEXT_STATE,
+					     &attrs, 1);
+		}
+		close_complete_upcall(ep);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		break;
+	case ABORTING:
+	case DEAD:
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (stop_timer)
+		stop_ep_timer(ep);
+	if (release)
+		release_ep_resources(ep);
+	return 0;
+}
+
+static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_rdma_terminate *term = cplhdr(skb);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(term);
+
+	ep = lookup_tid(t, tid);
+
+	if (state_read(&ep->com) != FPDU_MODE)
+		return 0;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	skb_pull(skb, sizeof *term);
+	PDBG("%s saving %d bytes of term msg\n", __func__, skb->len);
+	skb_copy_from_linear_data(skb, ep->com.qp->attr.terminate_buffer,
+				  skb->len);
+	ep->com.qp->attr.terminate_msg_len = skb->len;
+	ep->com.qp->attr.is_terminate_local = 0;
+	return 0;
+}
+
+/*
+ * Upcall from the adapter indicating data has been transmitted.
+ * For us its just the single MPA request or reply.  We can now free
+ * the skb holding the mpa message.
+ */
+static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_fw4_ack *hdr = cplhdr(skb);
+	u8 credits = hdr->credits;
+	unsigned int tid = GET_TID(hdr);
+	struct tid_info *t = dev->rdev.lldi.tids;
+
+
+	ep = lookup_tid(t, tid);
+	PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits);
+	if (credits == 0) {
+		PDBG(KERN_ERR "%s 0 credit ack ep %p tid %u state %u\n",
+			__func__, ep, ep->hwtid, state_read(&ep->com));
+		return 0;
+	}
+
+	dst_confirm(ep->dst);
+	if (ep->mpa_skb) {
+		PDBG("%s last streaming msg ack ep %p tid %u state %u "
+		     "initiator %u freeing skb\n", __func__, ep, ep->hwtid,
+		     state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0);
+		kfree_skb(ep->mpa_skb);
+		ep->mpa_skb = NULL;
+	}
+	return 0;
+}
+
+static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_fw6_msg *rpl = cplhdr(skb);
+	struct c4iw_wr_wait *wr_waitp;
+	int ret;
+
+	PDBG("%s type %u\n", __func__, rpl->type);
+
+	switch (rpl->type) {
+	case 1:
+		ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff);
+		wr_waitp = (__force struct c4iw_wr_wait *)rpl->data[1];
+		PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret);
+		if (wr_waitp) {
+			wr_waitp->ret = ret;
+			wr_waitp->done = 1;
+			wake_up(&wr_waitp->wait);
+		}
+		break;
+	case 2:
+		c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]);
+		break;
+	default:
+		printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__,
+		       rpl->type);
+		break;
+	}
+	return 0;
+}
+
+static void ep_timeout(unsigned long arg)
+{
+	struct c4iw_ep *ep = (struct c4iw_ep *)arg;
+	struct c4iw_qp_attributes attrs;
+	unsigned long flags;
+	int abort = 1;
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+	PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
+	     ep->com.state);
+	switch (ep->com.state) {
+	case MPA_REQ_SENT:
+		__state_set(&ep->com, ABORTING);
+		connect_reply_upcall(ep, -ETIMEDOUT);
+		break;
+	case MPA_REQ_WAIT:
+		__state_set(&ep->com, ABORTING);
+		break;
+	case CLOSING:
+	case MORIBUND:
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = C4IW_QP_STATE_ERROR;
+			c4iw_modify_qp(ep->com.qp->rhp,
+				     ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
+				     &attrs, 1);
+		}
+		__state_set(&ep->com, ABORTING);
+		break;
+	default:
+		printk(KERN_ERR "%s unexpected state ep %p tid %u state %u\n",
+			__func__, ep, ep->hwtid, ep->com.state);
+		WARN_ON(1);
+		abort = 0;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (abort)
+		abort_connection(ep, NULL, GFP_ATOMIC);
+	c4iw_put_ep(&ep->com);
+}
+
+int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	int err;
+	struct c4iw_ep *ep = to_ep(cm_id);
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+	if (state_read(&ep->com) == DEAD) {
+		c4iw_put_ep(&ep->com);
+		return -ECONNRESET;
+	}
+	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
+	if (mpa_rev == 0)
+		abort_connection(ep, NULL, GFP_KERNEL);
+	else {
+		err = send_mpa_reject(ep, pdata, pdata_len);
+		err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+	}
+	c4iw_put_ep(&ep->com);
+	return 0;
+}
+
+int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	int err;
+	struct c4iw_qp_attributes attrs;
+	enum c4iw_qp_attr_mask mask;
+	struct c4iw_ep *ep = to_ep(cm_id);
+	struct c4iw_dev *h = to_c4iw_dev(cm_id->device);
+	struct c4iw_qp *qp = get_qhp(h, conn_param->qpn);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	if (state_read(&ep->com) == DEAD) {
+		err = -ECONNRESET;
+		goto err;
+	}
+
+	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
+	BUG_ON(!qp);
+
+	if ((conn_param->ord > T4_MAX_READ_DEPTH) ||
+	    (conn_param->ird > T4_MAX_READ_DEPTH)) {
+		abort_connection(ep, NULL, GFP_KERNEL);
+		err = -EINVAL;
+		goto err;
+	}
+
+	cm_id->add_ref(cm_id);
+	ep->com.cm_id = cm_id;
+	ep->com.qp = qp;
+
+	ep->ird = conn_param->ird;
+	ep->ord = conn_param->ord;
+
+	if (peer2peer && ep->ird == 0)
+		ep->ird = 1;
+
+	PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord);
+
+	/* bind QP to EP and move to RTS */
+	attrs.mpa_attr = ep->mpa_attr;
+	attrs.max_ird = ep->ird;
+	attrs.max_ord = ep->ord;
+	attrs.llp_stream_handle = ep;
+	attrs.next_state = C4IW_QP_STATE_RTS;
+
+	/* bind QP and TID with INIT_WR */
+	mask = C4IW_QP_ATTR_NEXT_STATE |
+			     C4IW_QP_ATTR_LLP_STREAM_HANDLE |
+			     C4IW_QP_ATTR_MPA_ATTR |
+			     C4IW_QP_ATTR_MAX_IRD |
+			     C4IW_QP_ATTR_MAX_ORD;
+
+	err = c4iw_modify_qp(ep->com.qp->rhp,
+			     ep->com.qp, mask, &attrs, 1);
+	if (err)
+		goto err1;
+	err = send_mpa_reply(ep, conn_param->private_data,
+			     conn_param->private_data_len);
+	if (err)
+		goto err1;
+
+	state_set(&ep->com, FPDU_MODE);
+	established_upcall(ep);
+	c4iw_put_ep(&ep->com);
+	return 0;
+err1:
+	ep->com.cm_id = NULL;
+	ep->com.qp = NULL;
+	cm_id->rem_ref(cm_id);
+err:
+	c4iw_put_ep(&ep->com);
+	return err;
+}
+
+int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	int err = 0;
+	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
+	struct c4iw_ep *ep;
+	struct rtable *rt;
+	struct net_device *pdev;
+	int step;
+
+	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+	if (!ep) {
+		printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+		err = -ENOMEM;
+		goto out;
+	}
+	init_timer(&ep->timer);
+	ep->plen = conn_param->private_data_len;
+	if (ep->plen)
+		memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
+		       conn_param->private_data, ep->plen);
+	ep->ird = conn_param->ird;
+	ep->ord = conn_param->ord;
+
+	if (peer2peer && ep->ord == 0)
+		ep->ord = 1;
+
+	cm_id->add_ref(cm_id);
+	ep->com.dev = dev;
+	ep->com.cm_id = cm_id;
+	ep->com.qp = get_qhp(dev, conn_param->qpn);
+	BUG_ON(!ep->com.qp);
+	PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
+	     ep->com.qp, cm_id);
+
+	/*
+	 * Allocate an active TID to initiate a TCP connection.
+	 */
+	ep->atid = cxgb4_alloc_atid(dev->rdev.lldi.tids, ep);
+	if (ep->atid == -1) {
+		printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
+		err = -ENOMEM;
+		goto fail2;
+	}
+
+	PDBG("%s saddr 0x%x sport 0x%x raddr 0x%x rport 0x%x\n", __func__,
+	     ntohl(cm_id->local_addr.sin_addr.s_addr),
+	     ntohs(cm_id->local_addr.sin_port),
+	     ntohl(cm_id->remote_addr.sin_addr.s_addr),
+	     ntohs(cm_id->remote_addr.sin_port));
+
+	/* find a route */
+	rt = find_route(dev,
+			cm_id->local_addr.sin_addr.s_addr,
+			cm_id->remote_addr.sin_addr.s_addr,
+			cm_id->local_addr.sin_port,
+			cm_id->remote_addr.sin_port, 0);
+	if (!rt) {
+		printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
+		err = -EHOSTUNREACH;
+		goto fail3;
+	}
+	ep->dst = &rt->u.dst;
+
+	/* get a l2t entry */
+	if (ep->dst->neighbour->dev->flags & IFF_LOOPBACK) {
+		PDBG("%s LOOPBACK\n", __func__);
+		pdev = ip_dev_find(&init_net,
+				   cm_id->remote_addr.sin_addr.s_addr);
+		ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t,
+					ep->dst->neighbour,
+					pdev, 0);
+		ep->mtu = pdev->mtu;
+		ep->tx_chan = cxgb4_port_chan(pdev);
+		ep->smac_idx = ep->tx_chan << 1;
+		step = ep->com.dev->rdev.lldi.ntxq /
+		       ep->com.dev->rdev.lldi.nchan;
+		ep->txq_idx = cxgb4_port_idx(pdev) * step;
+		step = ep->com.dev->rdev.lldi.nrxq /
+		       ep->com.dev->rdev.lldi.nchan;
+		ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[
+			      cxgb4_port_idx(pdev) * step];
+		dev_put(pdev);
+	} else {
+		ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t,
+					ep->dst->neighbour,
+					ep->dst->neighbour->dev, 0);
+		ep->mtu = dst_mtu(ep->dst);
+		ep->tx_chan = cxgb4_port_chan(ep->dst->neighbour->dev);
+		ep->smac_idx = ep->tx_chan << 1;
+		step = ep->com.dev->rdev.lldi.ntxq /
+		       ep->com.dev->rdev.lldi.nchan;
+		ep->txq_idx = cxgb4_port_idx(ep->dst->neighbour->dev) * step;
+		step = ep->com.dev->rdev.lldi.nrxq /
+		       ep->com.dev->rdev.lldi.nchan;
+		ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[
+			      cxgb4_port_idx(ep->dst->neighbour->dev) * step];
+	}
+	if (!ep->l2t) {
+		printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
+		err = -ENOMEM;
+		goto fail4;
+	}
+
+	PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
+		__func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid,
+		ep->l2t->idx);
+
+	state_set(&ep->com, CONNECTING);
+	ep->tos = 0;
+	ep->com.local_addr = cm_id->local_addr;
+	ep->com.remote_addr = cm_id->remote_addr;
+
+	/* send connect request to rnic */
+	err = send_connect(ep);
+	if (!err)
+		goto out;
+
+	cxgb4_l2t_release(ep->l2t);
+fail4:
+	dst_release(ep->dst);
+fail3:
+	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+fail2:
+	cm_id->rem_ref(cm_id);
+	c4iw_put_ep(&ep->com);
+out:
+	return err;
+}
+
+int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	int err = 0;
+	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
+	struct c4iw_listen_ep *ep;
+
+
+	might_sleep();
+
+	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+	if (!ep) {
+		printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+		err = -ENOMEM;
+		goto fail1;
+	}
+	PDBG("%s ep %p\n", __func__, ep);
+	cm_id->add_ref(cm_id);
+	ep->com.cm_id = cm_id;
+	ep->com.dev = dev;
+	ep->backlog = backlog;
+	ep->com.local_addr = cm_id->local_addr;
+
+	/*
+	 * Allocate a server TID.
+	 */
+	ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, PF_INET, ep);
+	if (ep->stid == -1) {
+		printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
+		err = -ENOMEM;
+		goto fail2;
+	}
+
+	state_set(&ep->com, LISTEN);
+	err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0], ep->stid,
+				  ep->com.local_addr.sin_addr.s_addr,
+				  ep->com.local_addr.sin_port,
+				  ep->com.dev->rdev.lldi.rxq_ids[0]);
+	if (err)
+		goto fail3;
+
+	/* wait for pass_open_rpl */
+	wait_event(ep->com.waitq, ep->com.rpl_done);
+	err = ep->com.rpl_err;
+	if (!err) {
+		cm_id->provider_data = ep;
+		goto out;
+	}
+fail3:
+	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, PF_INET);
+fail2:
+	cm_id->rem_ref(cm_id);
+	c4iw_put_ep(&ep->com);
+fail1:
+out:
+	return err;
+}
+
+int c4iw_destroy_listen(struct iw_cm_id *cm_id)
+{
+	int err;
+	struct c4iw_listen_ep *ep = to_listen_ep(cm_id);
+
+	PDBG("%s ep %p\n", __func__, ep);
+
+	might_sleep();
+	state_set(&ep->com, DEAD);
+	ep->com.rpl_done = 0;
+	ep->com.rpl_err = 0;
+	err = listen_stop(ep);
+	if (err)
+		goto done;
+	wait_event(ep->com.waitq, ep->com.rpl_done);
+	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, PF_INET);
+done:
+	err = ep->com.rpl_err;
+	cm_id->rem_ref(cm_id);
+	c4iw_put_ep(&ep->com);
+	return err;
+}
+
+int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
+{
+	int ret = 0;
+	unsigned long flags;
+	int close = 0;
+	int fatal = 0;
+	struct c4iw_rdev *rdev;
+	int start_timer = 0;
+	int stop_timer = 0;
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+
+	PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
+	     states[ep->com.state], abrupt);
+
+	rdev = &ep->com.dev->rdev;
+	if (c4iw_fatal_error(rdev)) {
+		fatal = 1;
+		close_complete_upcall(ep);
+		ep->com.state = DEAD;
+	}
+	switch (ep->com.state) {
+	case MPA_REQ_WAIT:
+	case MPA_REQ_SENT:
+	case MPA_REQ_RCVD:
+	case MPA_REP_SENT:
+	case FPDU_MODE:
+		close = 1;
+		if (abrupt)
+			ep->com.state = ABORTING;
+		else {
+			ep->com.state = CLOSING;
+			start_timer = 1;
+		}
+		set_bit(CLOSE_SENT, &ep->com.flags);
+		break;
+	case CLOSING:
+		if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
+			close = 1;
+			if (abrupt) {
+				stop_timer = 1;
+				ep->com.state = ABORTING;
+			} else
+				ep->com.state = MORIBUND;
+		}
+		break;
+	case MORIBUND:
+	case ABORTING:
+	case DEAD:
+		PDBG("%s ignoring disconnect ep %p state %u\n",
+		     __func__, ep, ep->com.state);
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (start_timer)
+		start_ep_timer(ep);
+	if (stop_timer)
+		stop_ep_timer(ep);
+	if (close) {
+		if (abrupt)
+			ret = abort_connection(ep, NULL, gfp);
+		else
+			ret = send_halfclose(ep, gfp);
+		if (ret)
+			fatal = 1;
+	}
+	if (fatal)
+		release_ep_resources(ep);
+	return ret;
+}
+
+/*
+ * All the CM events are handled on a work queue to have a safe context.
+ */
+static int sched(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+
+	/*
+	 * Save dev in the skb->cb area.
+	 */
+	*((struct c4iw_dev **) (skb->cb + sizeof(void *))) = dev;
+
+	/*
+	 * Queue the skb and schedule the worker thread.
+	 */
+	skb_queue_tail(&rxq, skb);
+	queue_work(workq, &skb_work);
+	return 0;
+}
+
+static int set_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_set_tcb_rpl *rpl = cplhdr(skb);
+
+	if (rpl->status != CPL_ERR_NONE) {
+		printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u "
+		       "for tid %u\n", rpl->status, GET_TID(rpl));
+	}
+	return 0;
+}
+
+int __init c4iw_cm_init(void)
+{
+	skb_queue_head_init(&rxq);
+
+	workq = create_singlethread_workqueue("iw_cxgb4");
+	if (!workq)
+		return -ENOMEM;
+
+	/*
+	 * Most upcalls from the T4 Core go to sched() to
+	 * schedule the processing on a work queue.
+	 */
+	c4iw_handlers[CPL_ACT_ESTABLISH] = sched;
+	c4iw_handlers[CPL_ACT_OPEN_RPL] = sched;
+	c4iw_handlers[CPL_RX_DATA] = sched;
+	c4iw_handlers[CPL_ABORT_RPL_RSS] = sched;
+	c4iw_handlers[CPL_ABORT_RPL] = sched;
+	c4iw_handlers[CPL_PASS_OPEN_RPL] = sched;
+	c4iw_handlers[CPL_CLOSE_LISTSRV_RPL] = sched;
+	c4iw_handlers[CPL_PASS_ACCEPT_REQ] = sched;
+	c4iw_handlers[CPL_PASS_ESTABLISH] = sched;
+	c4iw_handlers[CPL_PEER_CLOSE] = sched;
+	c4iw_handlers[CPL_CLOSE_CON_RPL] = sched;
+	c4iw_handlers[CPL_ABORT_REQ_RSS] = sched;
+	c4iw_handlers[CPL_RDMA_TERMINATE] = sched;
+	c4iw_handlers[CPL_FW4_ACK] = sched;
+	c4iw_handlers[CPL_SET_TCB_RPL] = set_tcb_rpl;
+	c4iw_handlers[CPL_FW6_MSG] = fw6_msg;
+
+	/*
+	 * These are the real handlers that are called from a
+	 * work queue.
+	 */
+	work_handlers[CPL_ACT_ESTABLISH] = act_establish;
+	work_handlers[CPL_ACT_OPEN_RPL] = act_open_rpl;
+	work_handlers[CPL_RX_DATA] = rx_data;
+	work_handlers[CPL_ABORT_RPL_RSS] = abort_rpl;
+	work_handlers[CPL_ABORT_RPL] = abort_rpl;
+	work_handlers[CPL_PASS_OPEN_RPL] = pass_open_rpl;
+	work_handlers[CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl;
+	work_handlers[CPL_PASS_ACCEPT_REQ] = pass_accept_req;
+	work_handlers[CPL_PASS_ESTABLISH] = pass_establish;
+	work_handlers[CPL_PEER_CLOSE] = peer_close;
+	work_handlers[CPL_ABORT_REQ_RSS] = peer_abort;
+	work_handlers[CPL_CLOSE_CON_RPL] = close_con_rpl;
+	work_handlers[CPL_RDMA_TERMINATE] = terminate;
+	work_handlers[CPL_FW4_ACK] = fw4_ack;
+	return 0;
+}
+
+void __exit c4iw_cm_term(void)
+{
+	flush_workqueue(workq);
+	destroy_workqueue(workq);
+}
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
new file mode 100644
index 0000000..fb1aafc
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -0,0 +1,882 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iw_cxgb4.h"
+
+static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+		      struct c4iw_dev_ucontext *uctx)
+{
+	struct fw_ri_res_wr *res_wr;
+	struct fw_ri_res *res;
+	int wr_len;
+	struct c4iw_wr_wait wr_wait;
+	struct sk_buff *skb;
+	int ret;
+
+	wr_len = sizeof *res_wr + sizeof *res;
+	skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+	if (!skb)
+		return -ENOMEM;
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+	res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+	memset(res_wr, 0, wr_len);
+	res_wr->op_nres = cpu_to_be32(
+			FW_WR_OP(FW_RI_RES_WR) |
+			V_FW_RI_RES_WR_NRES(1) |
+			FW_WR_COMPL(1));
+	res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+	res_wr->cookie = (u64)&wr_wait;
+	res = res_wr->res;
+	res->u.cq.restype = FW_RI_RES_TYPE_CQ;
+	res->u.cq.op = FW_RI_RES_OP_RESET;
+	res->u.cq.iqid = cpu_to_be32(cq->cqid);
+
+	c4iw_init_wr_wait(&wr_wait);
+	ret = c4iw_ofld_send(rdev, skb);
+	if (!ret) {
+		wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO);
+		if (!wr_wait.done) {
+			printk(KERN_ERR MOD "Device %s not responding!\n",
+			       pci_name(rdev->lldi.pdev));
+			rdev->flags = T4_FATAL_ERROR;
+			ret = -EIO;
+		} else
+			ret = wr_wait.ret;
+	}
+
+	kfree(cq->sw_queue);
+	dma_free_coherent(&(rdev->lldi.pdev->dev),
+			  cq->memsize, cq->queue,
+			  pci_unmap_addr(cq, mapping));
+	c4iw_put_cqid(rdev, cq->cqid, uctx);
+	return ret;
+}
+
+static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+		     struct c4iw_dev_ucontext *uctx)
+{
+	struct fw_ri_res_wr *res_wr;
+	struct fw_ri_res *res;
+	int wr_len;
+	int user = (uctx != &rdev->uctx);
+	struct c4iw_wr_wait wr_wait;
+	int ret;
+	struct sk_buff *skb;
+
+	cq->cqid = c4iw_get_cqid(rdev, uctx);
+	if (!cq->cqid) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	if (!user) {
+		cq->sw_queue = kzalloc(cq->memsize, GFP_KERNEL);
+		if (!cq->sw_queue) {
+			ret = -ENOMEM;
+			goto err2;
+		}
+	}
+	cq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, cq->memsize,
+				       &cq->dma_addr, GFP_KERNEL);
+	if (!cq->queue) {
+		ret = -ENOMEM;
+		goto err3;
+	}
+	pci_unmap_addr_set(cq, mapping, cq->dma_addr);
+	memset(cq->queue, 0, cq->memsize);
+
+	/* build fw_ri_res_wr */
+	wr_len = sizeof *res_wr + sizeof *res;
+
+	skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto err4;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+	res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+	memset(res_wr, 0, wr_len);
+	res_wr->op_nres = cpu_to_be32(
+			FW_WR_OP(FW_RI_RES_WR) |
+			V_FW_RI_RES_WR_NRES(1) |
+			FW_WR_COMPL(1));
+	res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+	res_wr->cookie = (u64)&wr_wait;
+	res = res_wr->res;
+	res->u.cq.restype = FW_RI_RES_TYPE_CQ;
+	res->u.cq.op = FW_RI_RES_OP_WRITE;
+	res->u.cq.iqid = cpu_to_be32(cq->cqid);
+	res->u.cq.iqandst_to_iqandstindex = cpu_to_be32(
+			V_FW_RI_RES_WR_IQANUS(0) |
+			V_FW_RI_RES_WR_IQANUD(1) |
+			F_FW_RI_RES_WR_IQANDST |
+			V_FW_RI_RES_WR_IQANDSTINDEX(*rdev->lldi.rxq_ids));
+	res->u.cq.iqdroprss_to_iqesize = cpu_to_be16(
+			F_FW_RI_RES_WR_IQDROPRSS |
+			V_FW_RI_RES_WR_IQPCIECH(2) |
+			V_FW_RI_RES_WR_IQINTCNTTHRESH(0) |
+			F_FW_RI_RES_WR_IQO |
+			V_FW_RI_RES_WR_IQESIZE(1));
+	res->u.cq.iqsize = cpu_to_be16(cq->size);
+	res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr);
+
+	c4iw_init_wr_wait(&wr_wait);
+
+	ret = c4iw_ofld_send(rdev, skb);
+	if (ret)
+		goto err4;
+	PDBG("%s wait_event wr_wait %p\n", __func__, &wr_wait);
+	wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO);
+	if (!wr_wait.done) {
+		printk(KERN_ERR MOD "Device %s not responding!\n",
+		       pci_name(rdev->lldi.pdev));
+		rdev->flags = T4_FATAL_ERROR;
+		ret = -EIO;
+	} else
+		ret = wr_wait.ret;
+	if (ret)
+		goto err4;
+
+	cq->gen = 1;
+	cq->gts = rdev->lldi.gts_reg;
+	cq->rdev = rdev;
+	if (user) {
+		cq->ugts = (u64)pci_resource_start(rdev->lldi.pdev, 2) +
+					(cq->cqid << rdev->cqshift);
+		cq->ugts &= PAGE_MASK;
+	}
+	return 0;
+err4:
+	dma_free_coherent(&rdev->lldi.pdev->dev, cq->memsize, cq->queue,
+			  pci_unmap_addr(cq, mapping));
+err3:
+	kfree(cq->sw_queue);
+err2:
+	c4iw_put_cqid(rdev, cq->cqid, uctx);
+err1:
+	return ret;
+}
+
+static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
+{
+	struct t4_cqe cqe;
+
+	PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__,
+	     wq, cq, cq->sw_cidx, cq->sw_pidx);
+	memset(&cqe, 0, sizeof(cqe));
+	cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) |
+				 V_CQE_OPCODE(FW_RI_SEND) |
+				 V_CQE_TYPE(0) |
+				 V_CQE_SWCQE(1) |
+				 V_CQE_QPID(wq->rq.qid));
+	cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen));
+	cq->sw_queue[cq->sw_pidx] = cqe;
+	t4_swcq_produce(cq);
+}
+
+int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count)
+{
+	int flushed = 0;
+	int in_use = wq->rq.in_use - count;
+
+	BUG_ON(in_use < 0);
+	PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__,
+	     wq, cq, wq->rq.in_use, count);
+	while (in_use--) {
+		insert_recv_cqe(wq, cq);
+		flushed++;
+	}
+	return flushed;
+}
+
+static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq,
+			  struct t4_swsqe *swcqe)
+{
+	struct t4_cqe cqe;
+
+	PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__,
+	     wq, cq, cq->sw_cidx, cq->sw_pidx);
+	memset(&cqe, 0, sizeof(cqe));
+	cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) |
+				 V_CQE_OPCODE(swcqe->opcode) |
+				 V_CQE_TYPE(1) |
+				 V_CQE_SWCQE(1) |
+				 V_CQE_QPID(wq->sq.qid));
+	CQE_WRID_SQ_IDX(&cqe) = swcqe->idx;
+	cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen));
+	cq->sw_queue[cq->sw_pidx] = cqe;
+	t4_swcq_produce(cq);
+}
+
+int c4iw_flush_sq(struct t4_wq *wq, struct t4_cq *cq, int count)
+{
+	int flushed = 0;
+	struct t4_swsqe *swsqe = &wq->sq.sw_sq[wq->sq.cidx + count];
+	int in_use = wq->sq.in_use - count;
+
+	BUG_ON(in_use < 0);
+	while (in_use--) {
+		swsqe->signaled = 0;
+		insert_sq_cqe(wq, cq, swsqe);
+		swsqe++;
+		if (swsqe == (wq->sq.sw_sq + wq->sq.size))
+			swsqe = wq->sq.sw_sq;
+		flushed++;
+	}
+	return flushed;
+}
+
+/*
+ * Move all CQEs from the HWCQ into the SWCQ.
+ */
+void c4iw_flush_hw_cq(struct t4_cq *cq)
+{
+	struct t4_cqe *cqe = NULL, *swcqe;
+	int ret;
+
+	PDBG("%s cq %p cqid 0x%x\n", __func__, cq, cq->cqid);
+	ret = t4_next_hw_cqe(cq, &cqe);
+	while (!ret) {
+		PDBG("%s flushing hwcq cidx 0x%x swcq pidx 0x%x\n",
+		     __func__, cq->cidx, cq->sw_pidx);
+		swcqe = &cq->sw_queue[cq->sw_pidx];
+		*swcqe = *cqe;
+		swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1));
+		t4_swcq_produce(cq);
+		t4_hwcq_consume(cq);
+		ret = t4_next_hw_cqe(cq, &cqe);
+	}
+}
+
+static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
+{
+	if (CQE_OPCODE(cqe) == FW_RI_TERMINATE)
+		return 0;
+
+	if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe))
+		return 0;
+
+	if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe))
+		return 0;
+
+	if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq))
+		return 0;
+	return 1;
+}
+
+void c4iw_count_scqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
+{
+	struct t4_cqe *cqe;
+	u32 ptr;
+
+	*count = 0;
+	ptr = cq->sw_cidx;
+	while (ptr != cq->sw_pidx) {
+		cqe = &cq->sw_queue[ptr];
+		if ((SQ_TYPE(cqe) || ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) &&
+				      wq->sq.oldest_read)) &&
+		    (CQE_QPID(cqe) == wq->sq.qid))
+			(*count)++;
+		if (++ptr == cq->size)
+			ptr = 0;
+	}
+	PDBG("%s cq %p count %d\n", __func__, cq, *count);
+}
+
+void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
+{
+	struct t4_cqe *cqe;
+	u32 ptr;
+
+	*count = 0;
+	PDBG("%s count zero %d\n", __func__, *count);
+	ptr = cq->sw_cidx;
+	while (ptr != cq->sw_pidx) {
+		cqe = &cq->sw_queue[ptr];
+		if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) &&
+		    (CQE_QPID(cqe) == wq->rq.qid) && cqe_completes_wr(cqe, wq))
+			(*count)++;
+		if (++ptr == cq->size)
+			ptr = 0;
+	}
+	PDBG("%s cq %p count %d\n", __func__, cq, *count);
+}
+
+static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq)
+{
+	struct t4_swsqe *swsqe;
+	u16 ptr = wq->sq.cidx;
+	int count = wq->sq.in_use;
+	int unsignaled = 0;
+
+	swsqe = &wq->sq.sw_sq[ptr];
+	while (count--)
+		if (!swsqe->signaled) {
+			if (++ptr == wq->sq.size)
+				ptr = 0;
+			swsqe = &wq->sq.sw_sq[ptr];
+			unsignaled++;
+		} else if (swsqe->complete) {
+
+			/*
+			 * Insert this completed cqe into the swcq.
+			 */
+			PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n",
+			     __func__, ptr, cq->sw_pidx);
+			swsqe->cqe.header |= htonl(V_CQE_SWCQE(1));
+			cq->sw_queue[cq->sw_pidx] = swsqe->cqe;
+			t4_swcq_produce(cq);
+			swsqe->signaled = 0;
+			wq->sq.in_use -= unsignaled;
+			break;
+		} else
+			break;
+}
+
+static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe,
+				struct t4_cqe *read_cqe)
+{
+	read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx;
+	read_cqe->len = cpu_to_be32(wq->sq.oldest_read->read_len);
+	read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(hw_cqe)) |
+				 V_CQE_SWCQE(SW_CQE(hw_cqe)) |
+				 V_CQE_OPCODE(FW_RI_READ_REQ) |
+				 V_CQE_TYPE(1));
+}
+
+/*
+ * Return a ptr to the next read wr in the SWSQ or NULL.
+ */
+static void advance_oldest_read(struct t4_wq *wq)
+{
+
+	u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1;
+
+	if (rptr == wq->sq.size)
+		rptr = 0;
+	while (rptr != wq->sq.pidx) {
+		wq->sq.oldest_read = &wq->sq.sw_sq[rptr];
+
+		if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ)
+			return;
+		if (++rptr == wq->sq.size)
+			rptr = 0;
+	}
+	wq->sq.oldest_read = NULL;
+}
+
+/*
+ * poll_cq
+ *
+ * Caller must:
+ *     check the validity of the first CQE,
+ *     supply the wq assicated with the qpid.
+ *
+ * credit: cq credit to return to sge.
+ * cqe_flushed: 1 iff the CQE is flushed.
+ * cqe: copy of the polled CQE.
+ *
+ * return value:
+ *    0		    CQE returned ok.
+ *    -EAGAIN       CQE skipped, try again.
+ *    -EOVERFLOW    CQ overflow detected.
+ */
+static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
+		   u8 *cqe_flushed, u64 *cookie, u32 *credit)
+{
+	int ret = 0;
+	struct t4_cqe *hw_cqe, read_cqe;
+
+	*cqe_flushed = 0;
+	*credit = 0;
+	ret = t4_next_cqe(cq, &hw_cqe);
+	if (ret)
+		return ret;
+
+	PDBG("%s CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x"
+	     " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+	     __func__, CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe),
+	     CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe),
+	     CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe),
+	     CQE_WRID_LOW(hw_cqe));
+
+	/*
+	 * skip cqe's not affiliated with a QP.
+	 */
+	if (wq == NULL) {
+		ret = -EAGAIN;
+		goto skip_cqe;
+	}
+
+	/*
+	 * Gotta tweak READ completions:
+	 *	1) the cqe doesn't contain the sq_wptr from the wr.
+	 *	2) opcode not reflected from the wr.
+	 *	3) read_len not reflected from the wr.
+	 *	4) cq_type is RQ_TYPE not SQ_TYPE.
+	 */
+	if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) {
+
+		/*
+		 * If this is an unsolicited read response, then the read
+		 * was generated by the kernel driver as part of peer-2-peer
+		 * connection setup.  So ignore the completion.
+		 */
+		if (!wq->sq.oldest_read) {
+			if (CQE_STATUS(hw_cqe))
+				t4_set_wq_in_error(wq);
+			ret = -EAGAIN;
+			goto skip_cqe;
+		}
+
+		/*
+		 * Don't write to the HWCQ, so create a new read req CQE
+		 * in local memory.
+		 */
+		create_read_req_cqe(wq, hw_cqe, &read_cqe);
+		hw_cqe = &read_cqe;
+		advance_oldest_read(wq);
+	}
+
+	if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) {
+		*cqe_flushed = t4_wq_in_error(wq);
+		t4_set_wq_in_error(wq);
+		goto proc_cqe;
+	}
+
+	/*
+	 * RECV completion.
+	 */
+	if (RQ_TYPE(hw_cqe)) {
+
+		/*
+		 * HW only validates 4 bits of MSN.  So we must validate that
+		 * the MSN in the SEND is the next expected MSN.  If its not,
+		 * then we complete this with T4_ERR_MSN and mark the wq in
+		 * error.
+		 */
+
+		if (t4_rq_empty(wq)) {
+			t4_set_wq_in_error(wq);
+			ret = -EAGAIN;
+			goto skip_cqe;
+		}
+		if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
+			t4_set_wq_in_error(wq);
+			hw_cqe->header |= htonl(V_CQE_STATUS(T4_ERR_MSN));
+			goto proc_cqe;
+		}
+		goto proc_cqe;
+	}
+
+	/*
+	 * If we get here its a send completion.
+	 *
+	 * Handle out of order completion. These get stuffed
+	 * in the SW SQ. Then the SW SQ is walked to move any
+	 * now in-order completions into the SW CQ.  This handles
+	 * 2 cases:
+	 *	1) reaping unsignaled WRs when the first subsequent
+	 *	   signaled WR is completed.
+	 *	2) out of order read completions.
+	 */
+	if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) {
+		struct t4_swsqe *swsqe;
+
+		PDBG("%s out of order completion going in sw_sq at idx %u\n",
+		     __func__, CQE_WRID_SQ_IDX(hw_cqe));
+		swsqe = &wq->sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
+		swsqe->cqe = *hw_cqe;
+		swsqe->complete = 1;
+		ret = -EAGAIN;
+		goto flush_wq;
+	}
+
+proc_cqe:
+	*cqe = *hw_cqe;
+
+	/*
+	 * Reap the associated WR(s) that are freed up with this
+	 * completion.
+	 */
+	if (SQ_TYPE(hw_cqe)) {
+		wq->sq.cidx = CQE_WRID_SQ_IDX(hw_cqe);
+		PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx);
+		*cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id;
+		t4_sq_consume(wq);
+	} else {
+		PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx);
+		*cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
+		BUG_ON(t4_rq_empty(wq));
+		t4_rq_consume(wq);
+	}
+
+flush_wq:
+	/*
+	 * Flush any completed cqes that are now in-order.
+	 */
+	flush_completed_wrs(wq, cq);
+
+skip_cqe:
+	if (SW_CQE(hw_cqe)) {
+		PDBG("%s cq %p cqid 0x%x skip sw cqe cidx %u\n",
+		     __func__, cq, cq->cqid, cq->sw_cidx);
+		t4_swcq_consume(cq);
+	} else {
+		PDBG("%s cq %p cqid 0x%x skip hw cqe cidx %u\n",
+		     __func__, cq, cq->cqid, cq->cidx);
+		t4_hwcq_consume(cq);
+	}
+	return ret;
+}
+
+/*
+ * Get one cq entry from c4iw and map it to openib.
+ *
+ * Returns:
+ *	0			cqe returned
+ *	-ENODATA		EMPTY;
+ *	-EAGAIN			caller must try again
+ *	any other -errno	fatal error
+ */
+static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
+{
+	struct c4iw_qp *qhp = NULL;
+	struct t4_cqe cqe = {0, 0}, *rd_cqe;
+	struct t4_wq *wq;
+	u32 credit = 0;
+	u8 cqe_flushed;
+	u64 cookie = 0;
+	int ret;
+
+	ret = t4_next_cqe(&chp->cq, &rd_cqe);
+
+	if (ret)
+		return ret;
+
+	qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe));
+	if (!qhp)
+		wq = NULL;
+	else {
+		spin_lock(&qhp->lock);
+		wq = &(qhp->wq);
+	}
+	ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit);
+	if (ret)
+		goto out;
+
+	wc->wr_id = cookie;
+	wc->qp = &qhp->ibqp;
+	wc->vendor_err = CQE_STATUS(&cqe);
+	wc->wc_flags = 0;
+
+	PDBG("%s qpid 0x%x type %d opcode %d status 0x%x len %u wrid hi 0x%x "
+	     "lo 0x%x cookie 0x%llx\n", __func__, CQE_QPID(&cqe),
+	     CQE_TYPE(&cqe), CQE_OPCODE(&cqe), CQE_STATUS(&cqe), CQE_LEN(&cqe),
+	     CQE_WRID_HI(&cqe), CQE_WRID_LOW(&cqe), (unsigned long long)cookie);
+
+	if (CQE_TYPE(&cqe) == 0) {
+		if (!CQE_STATUS(&cqe))
+			wc->byte_len = CQE_LEN(&cqe);
+		else
+			wc->byte_len = 0;
+		wc->opcode = IB_WC_RECV;
+		if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV ||
+		    CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) {
+			wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe);
+			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+		}
+	} else {
+		switch (CQE_OPCODE(&cqe)) {
+		case FW_RI_RDMA_WRITE:
+			wc->opcode = IB_WC_RDMA_WRITE;
+			break;
+		case FW_RI_READ_REQ:
+			wc->opcode = IB_WC_RDMA_READ;
+			wc->byte_len = CQE_LEN(&cqe);
+			break;
+		case FW_RI_SEND_WITH_INV:
+		case FW_RI_SEND_WITH_SE_INV:
+			wc->opcode = IB_WC_SEND;
+			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+			break;
+		case FW_RI_SEND:
+		case FW_RI_SEND_WITH_SE:
+			wc->opcode = IB_WC_SEND;
+			break;
+		case FW_RI_BIND_MW:
+			wc->opcode = IB_WC_BIND_MW;
+			break;
+
+		case FW_RI_LOCAL_INV:
+			wc->opcode = IB_WC_LOCAL_INV;
+			break;
+		case FW_RI_FAST_REGISTER:
+			wc->opcode = IB_WC_FAST_REG_MR;
+			break;
+		default:
+			printk(KERN_ERR MOD "Unexpected opcode %d "
+			       "in the CQE received for QPID=0x%0x\n",
+			       CQE_OPCODE(&cqe), CQE_QPID(&cqe));
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (cqe_flushed)
+		wc->status = IB_WC_WR_FLUSH_ERR;
+	else {
+
+		switch (CQE_STATUS(&cqe)) {
+		case T4_ERR_SUCCESS:
+			wc->status = IB_WC_SUCCESS;
+			break;
+		case T4_ERR_STAG:
+			wc->status = IB_WC_LOC_ACCESS_ERR;
+			break;
+		case T4_ERR_PDID:
+			wc->status = IB_WC_LOC_PROT_ERR;
+			break;
+		case T4_ERR_QPID:
+		case T4_ERR_ACCESS:
+			wc->status = IB_WC_LOC_ACCESS_ERR;
+			break;
+		case T4_ERR_WRAP:
+			wc->status = IB_WC_GENERAL_ERR;
+			break;
+		case T4_ERR_BOUND:
+			wc->status = IB_WC_LOC_LEN_ERR;
+			break;
+		case T4_ERR_INVALIDATE_SHARED_MR:
+		case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+			wc->status = IB_WC_MW_BIND_ERR;
+			break;
+		case T4_ERR_CRC:
+		case T4_ERR_MARKER:
+		case T4_ERR_PDU_LEN_ERR:
+		case T4_ERR_OUT_OF_RQE:
+		case T4_ERR_DDP_VERSION:
+		case T4_ERR_RDMA_VERSION:
+		case T4_ERR_DDP_QUEUE_NUM:
+		case T4_ERR_MSN:
+		case T4_ERR_TBIT:
+		case T4_ERR_MO:
+		case T4_ERR_MSN_RANGE:
+		case T4_ERR_IRD_OVERFLOW:
+		case T4_ERR_OPCODE:
+			wc->status = IB_WC_FATAL_ERR;
+			break;
+		case T4_ERR_SWFLUSH:
+			wc->status = IB_WC_WR_FLUSH_ERR;
+			break;
+		default:
+			printk(KERN_ERR MOD
+			       "Unexpected cqe_status 0x%x for QPID=0x%0x\n",
+			       CQE_STATUS(&cqe), CQE_QPID(&cqe));
+			ret = -EINVAL;
+		}
+	}
+out:
+	if (wq)
+		spin_unlock(&qhp->lock);
+	return ret;
+}
+
+int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct c4iw_cq *chp;
+	unsigned long flags;
+	int npolled;
+	int err = 0;
+
+	chp = to_c4iw_cq(ibcq);
+
+	spin_lock_irqsave(&chp->lock, flags);
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+		do {
+			err = c4iw_poll_cq_one(chp, wc + npolled);
+		} while (err == -EAGAIN);
+		if (err)
+			break;
+	}
+	spin_unlock_irqrestore(&chp->lock, flags);
+	return !err || err == -ENODATA ? npolled : err;
+}
+
+int c4iw_destroy_cq(struct ib_cq *ib_cq)
+{
+	struct c4iw_cq *chp;
+	struct c4iw_ucontext *ucontext;
+
+	PDBG("%s ib_cq %p\n", __func__, ib_cq);
+	chp = to_c4iw_cq(ib_cq);
+
+	remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+	atomic_dec(&chp->refcnt);
+	wait_event(chp->wait, !atomic_read(&chp->refcnt));
+
+	ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
+				  : NULL;
+	destroy_cq(&chp->rhp->rdev, &chp->cq,
+		   ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx);
+	kfree(chp);
+	return 0;
+}
+
+struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,
+			     int vector, struct ib_ucontext *ib_context,
+			     struct ib_udata *udata)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_cq *chp;
+	struct c4iw_create_cq_resp uresp;
+	struct c4iw_ucontext *ucontext = NULL;
+	int ret;
+	size_t memsize;
+	struct c4iw_mm_entry *mm, *mm2;
+
+	PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
+
+	rhp = to_c4iw_dev(ibdev);
+
+	chp = kzalloc(sizeof(*chp), GFP_KERNEL);
+	if (!chp)
+		return ERR_PTR(-ENOMEM);
+
+	if (ib_context)
+		ucontext = to_c4iw_ucontext(ib_context);
+
+	/* account for the status page. */
+	entries++;
+
+	/*
+	 * entries must be multiple of 16 for HW.
+	 */
+	entries = roundup(entries, 16);
+	memsize = entries * sizeof *chp->cq.queue;
+
+	/*
+	 * memsize must be a multiple of the page size if its a user cq.
+	 */
+	if (ucontext)
+		memsize = roundup(memsize, PAGE_SIZE);
+	chp->cq.size = entries;
+	chp->cq.memsize = memsize;
+
+	ret = create_cq(&rhp->rdev, &chp->cq,
+			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+	if (ret)
+		goto err1;
+
+	chp->rhp = rhp;
+	chp->cq.size--;				/* status page */
+	chp->ibcq.cqe = chp->cq.size;
+	spin_lock_init(&chp->lock);
+	atomic_set(&chp->refcnt, 1);
+	init_waitqueue_head(&chp->wait);
+	ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
+	if (ret)
+		goto err2;
+
+	if (ucontext) {
+		mm = kmalloc(sizeof *mm, GFP_KERNEL);
+		if (!mm)
+			goto err3;
+		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+		if (!mm2)
+			goto err4;
+
+		uresp.qid_mask = rhp->rdev.cqmask;
+		uresp.cqid = chp->cq.cqid;
+		uresp.size = chp->cq.size;
+		uresp.memsize = chp->cq.memsize;
+		spin_lock(&ucontext->mmap_lock);
+		uresp.key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.gts_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		spin_unlock(&ucontext->mmap_lock);
+		ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+		if (ret)
+			goto err5;
+
+		mm->key = uresp.key;
+		mm->addr = virt_to_phys(chp->cq.queue);
+		mm->len = chp->cq.memsize;
+		insert_mmap(ucontext, mm);
+
+		mm2->key = uresp.gts_key;
+		mm2->addr = chp->cq.ugts;
+		mm2->len = PAGE_SIZE;
+		insert_mmap(ucontext, mm2);
+	}
+	PDBG("%s cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n",
+	     __func__, chp->cq.cqid, chp, chp->cq.size,
+	     chp->cq.memsize,
+	     (unsigned long long) chp->cq.dma_addr);
+	return &chp->ibcq;
+err5:
+	kfree(mm2);
+err4:
+	kfree(mm);
+err3:
+	remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
+err2:
+	destroy_cq(&chp->rhp->rdev, &chp->cq,
+		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+err1:
+	kfree(chp);
+	return ERR_PTR(ret);
+}
+
+int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+	return -ENOSYS;
+}
+
+int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	struct c4iw_cq *chp;
+	int ret;
+	unsigned long flag;
+
+	chp = to_c4iw_cq(ibcq);
+	spin_lock_irqsave(&chp->lock, flag);
+	ret = t4_arm_cq(&chp->cq,
+			(flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
+	spin_unlock_irqrestore(&chp->lock, flag);
+	if (ret && !(flags & IB_CQ_REPORT_MISSED_EVENTS))
+		ret = 0;
+	return ret;
+}
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
new file mode 100644
index 0000000..be23b5e
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "iw_cxgb4.h"
+
+#define DRV_VERSION "0.1"
+
+MODULE_AUTHOR("Steve Wise");
+MODULE_DESCRIPTION("Chelsio T4 RDMA Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(dev_mutex);
+
+static struct dentry *c4iw_debugfs_root;
+
+struct debugfs_qp_data {
+	struct c4iw_dev *devp;
+	char *buf;
+	int bufsize;
+	int pos;
+};
+
+static int count_qps(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+	int *countp = data;
+
+	if (id != qp->wq.sq.qid)
+		return 0;
+
+	*countp = *countp + 1;
+	return 0;
+}
+
+static int dump_qps(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+	struct debugfs_qp_data *qpd = data;
+	int space;
+	int cc;
+
+	if (id != qp->wq.sq.qid)
+		return 0;
+
+	space = qpd->bufsize - qpd->pos - 1;
+	if (space == 0)
+		return 1;
+
+	if (qp->ep)
+		cc = snprintf(qpd->buf + qpd->pos, space, "qp id %u state %u "
+			     "ep tid %u state %u %pI4:%u->%pI4:%u\n",
+			     qp->wq.sq.qid, (int)qp->attr.state,
+			     qp->ep->hwtid, (int)qp->ep->com.state,
+			     &qp->ep->com.local_addr.sin_addr.s_addr,
+			     ntohs(qp->ep->com.local_addr.sin_port),
+			     &qp->ep->com.remote_addr.sin_addr.s_addr,
+			     ntohs(qp->ep->com.remote_addr.sin_port));
+	else
+		cc = snprintf(qpd->buf + qpd->pos, space, "qp id %u state %u\n",
+			      qp->wq.sq.qid, (int)qp->attr.state);
+	if (cc < space)
+		qpd->pos += cc;
+	return 0;
+}
+
+static int qp_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_qp_data *qpd = file->private_data;
+	if (!qpd) {
+		printk(KERN_INFO "%s null qpd?\n", __func__);
+		return 0;
+	}
+	kfree(qpd->buf);
+	kfree(qpd);
+	return 0;
+}
+
+static int qp_open(struct inode *inode, struct file *file)
+{
+	struct debugfs_qp_data *qpd;
+	int ret = 0;
+	int count = 1;
+
+	qpd = kmalloc(sizeof *qpd, GFP_KERNEL);
+	if (!qpd) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	qpd->devp = inode->i_private;
+	qpd->pos = 0;
+
+	spin_lock_irq(&qpd->devp->lock);
+	idr_for_each(&qpd->devp->qpidr, count_qps, &count);
+	spin_unlock_irq(&qpd->devp->lock);
+
+	qpd->bufsize = count * 128;
+	qpd->buf = kmalloc(qpd->bufsize, GFP_KERNEL);
+	if (!qpd->buf) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	spin_lock_irq(&qpd->devp->lock);
+	idr_for_each(&qpd->devp->qpidr, dump_qps, qpd);
+	spin_unlock_irq(&qpd->devp->lock);
+
+	qpd->buf[qpd->pos++] = 0;
+	file->private_data = qpd;
+	goto out;
+err1:
+	kfree(qpd);
+out:
+	return ret;
+}
+
+static ssize_t qp_read(struct file *file, char __user *buf, size_t count,
+			loff_t *ppos)
+{
+	struct debugfs_qp_data *qpd = file->private_data;
+	loff_t pos = *ppos;
+	loff_t avail = qpd->pos;
+
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= avail)
+		return 0;
+	if (count > avail - pos)
+		count = avail - pos;
+
+	while (count) {
+		size_t len = 0;
+
+		len = min((int)count, (int)qpd->pos - (int)pos);
+		if (copy_to_user(buf, qpd->buf + pos, len))
+			return -EFAULT;
+		if (len == 0)
+			return -EINVAL;
+
+		buf += len;
+		pos += len;
+		count -= len;
+	}
+	count = pos - *ppos;
+	*ppos = pos;
+	return count;
+}
+
+static const struct file_operations qp_debugfs_fops = {
+	.owner   = THIS_MODULE,
+	.open    = qp_open,
+	.release = qp_release,
+	.read    = qp_read,
+};
+
+static int setup_debugfs(struct c4iw_dev *devp)
+{
+	struct dentry *de;
+
+	if (!devp->debugfs_root)
+		return -1;
+
+	de = debugfs_create_file("qps", S_IWUSR, devp->debugfs_root,
+				 (void *)devp, &qp_debugfs_fops);
+	if (de && de->d_inode)
+		de->d_inode->i_size = 4096;
+	return 0;
+}
+
+void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
+			       struct c4iw_dev_ucontext *uctx)
+{
+	struct list_head *pos, *nxt;
+	struct c4iw_qid_list *entry;
+
+	mutex_lock(&uctx->lock);
+	list_for_each_safe(pos, nxt, &uctx->qpids) {
+		entry = list_entry(pos, struct c4iw_qid_list, entry);
+		list_del_init(&entry->entry);
+		if (!(entry->qid & rdev->qpmask))
+			c4iw_put_resource(&rdev->resource.qid_fifo, entry->qid,
+					  &rdev->resource.qid_fifo_lock);
+		kfree(entry);
+	}
+
+	list_for_each_safe(pos, nxt, &uctx->qpids) {
+		entry = list_entry(pos, struct c4iw_qid_list, entry);
+		list_del_init(&entry->entry);
+		kfree(entry);
+	}
+	mutex_unlock(&uctx->lock);
+}
+
+void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
+			    struct c4iw_dev_ucontext *uctx)
+{
+	INIT_LIST_HEAD(&uctx->qpids);
+	INIT_LIST_HEAD(&uctx->cqids);
+	mutex_init(&uctx->lock);
+}
+
+/* Caller takes care of locking if needed */
+static int c4iw_rdev_open(struct c4iw_rdev *rdev)
+{
+	int err;
+
+	c4iw_init_dev_ucontext(rdev, &rdev->uctx);
+
+	/*
+	 * qpshift is the number of bits to shift the qpid left in order
+	 * to get the correct address of the doorbell for that qp.
+	 */
+	rdev->qpshift = PAGE_SHIFT - ilog2(rdev->lldi.udb_density);
+	rdev->qpmask = rdev->lldi.udb_density - 1;
+	rdev->cqshift = PAGE_SHIFT - ilog2(rdev->lldi.ucq_density);
+	rdev->cqmask = rdev->lldi.ucq_density - 1;
+	PDBG("%s dev %s stag start 0x%0x size 0x%0x num stags %d "
+	     "pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x\n",
+	     __func__, pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start,
+	     rdev->lldi.vr->stag.size, c4iw_num_stags(rdev),
+	     rdev->lldi.vr->pbl.start,
+	     rdev->lldi.vr->pbl.size, rdev->lldi.vr->rq.start,
+	     rdev->lldi.vr->rq.size);
+	PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu "
+	     "qpmask 0x%x cqshift %lu cqmask 0x%x\n",
+	     (unsigned)pci_resource_len(rdev->lldi.pdev, 2),
+	     (void *)pci_resource_start(rdev->lldi.pdev, 2),
+	     rdev->lldi.db_reg,
+	     rdev->lldi.gts_reg,
+	     rdev->qpshift, rdev->qpmask,
+	     rdev->cqshift, rdev->cqmask);
+
+	if (c4iw_num_stags(rdev) == 0) {
+		err = -EINVAL;
+		goto err1;
+	}
+
+	err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing resources\n", err);
+		goto err1;
+	}
+	err = c4iw_pblpool_create(rdev);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing pbl pool\n", err);
+		goto err2;
+	}
+	err = c4iw_rqtpool_create(rdev);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing rqt pool\n", err);
+		goto err3;
+	}
+	return 0;
+err3:
+	c4iw_pblpool_destroy(rdev);
+err2:
+	c4iw_destroy_resource(&rdev->resource);
+err1:
+	return err;
+}
+
+static void c4iw_rdev_close(struct c4iw_rdev *rdev)
+{
+	c4iw_pblpool_destroy(rdev);
+	c4iw_rqtpool_destroy(rdev);
+	c4iw_destroy_resource(&rdev->resource);
+}
+
+static void c4iw_remove(struct c4iw_dev *dev)
+{
+	PDBG("%s c4iw_dev %p\n", __func__,  dev);
+	cancel_delayed_work_sync(&dev->db_drop_task);
+	list_del(&dev->entry);
+	c4iw_unregister_device(dev);
+	c4iw_rdev_close(&dev->rdev);
+	idr_destroy(&dev->cqidr);
+	idr_destroy(&dev->qpidr);
+	idr_destroy(&dev->mmidr);
+	ib_dealloc_device(&dev->ibdev);
+}
+
+static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
+{
+	struct c4iw_dev *devp;
+	int ret;
+
+	devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));
+	if (!devp) {
+		printk(KERN_ERR MOD "Cannot allocate ib device\n");
+		return NULL;
+	}
+	devp->rdev.lldi = *infop;
+
+	mutex_lock(&dev_mutex);
+
+	ret = c4iw_rdev_open(&devp->rdev);
+	if (ret) {
+		mutex_unlock(&dev_mutex);
+		printk(KERN_ERR MOD "Unable to open CXIO rdev err %d\n", ret);
+		ib_dealloc_device(&devp->ibdev);
+		return NULL;
+	}
+
+	idr_init(&devp->cqidr);
+	idr_init(&devp->qpidr);
+	idr_init(&devp->mmidr);
+	spin_lock_init(&devp->lock);
+	list_add_tail(&devp->entry, &dev_list);
+	mutex_unlock(&dev_mutex);
+
+	if (c4iw_register_device(devp)) {
+		printk(KERN_ERR MOD "Unable to register device\n");
+		mutex_lock(&dev_mutex);
+		c4iw_remove(devp);
+		mutex_unlock(&dev_mutex);
+	}
+	if (c4iw_debugfs_root) {
+		devp->debugfs_root = debugfs_create_dir(
+					pci_name(devp->rdev.lldi.pdev),
+					c4iw_debugfs_root);
+		setup_debugfs(devp);
+	}
+	return devp;
+}
+
+static void *c4iw_uld_add(const struct cxgb4_lld_info *infop)
+{
+	struct c4iw_dev *dev;
+	static int vers_printed;
+	int i;
+
+	if (!vers_printed++)
+		printk(KERN_INFO MOD "Chelsio T4 RDMA Driver - version %s\n",
+		       DRV_VERSION);
+
+	dev = c4iw_alloc(infop);
+	if (!dev)
+		goto out;
+
+	PDBG("%s found device %s nchan %u nrxq %u ntxq %u nports %u\n",
+	     __func__, pci_name(dev->rdev.lldi.pdev),
+	     dev->rdev.lldi.nchan, dev->rdev.lldi.nrxq,
+	     dev->rdev.lldi.ntxq, dev->rdev.lldi.nports);
+
+	for (i = 0; i < dev->rdev.lldi.nrxq; i++)
+		PDBG("rxqid[%u] %u\n", i, dev->rdev.lldi.rxq_ids[i]);
+
+	printk(KERN_INFO MOD "Initialized device %s\n",
+	       pci_name(dev->rdev.lldi.pdev));
+out:
+	return dev;
+}
+
+static struct sk_buff *t4_pktgl_to_skb(const struct pkt_gl *gl,
+				       unsigned int skb_len,
+				       unsigned int pull_len)
+{
+	struct sk_buff *skb;
+	struct skb_shared_info *ssi;
+
+	if (gl->tot_len <= 512) {
+		skb = alloc_skb(gl->tot_len, GFP_ATOMIC);
+		if (unlikely(!skb))
+			goto out;
+		__skb_put(skb, gl->tot_len);
+		skb_copy_to_linear_data(skb, gl->va, gl->tot_len);
+	} else {
+		skb = alloc_skb(skb_len, GFP_ATOMIC);
+		if (unlikely(!skb))
+			goto out;
+		__skb_put(skb, pull_len);
+		skb_copy_to_linear_data(skb, gl->va, pull_len);
+
+		ssi = skb_shinfo(skb);
+		ssi->frags[0].page = gl->frags[0].page;
+		ssi->frags[0].page_offset = gl->frags[0].page_offset + pull_len;
+		ssi->frags[0].size = gl->frags[0].size - pull_len;
+		if (gl->nfrags > 1)
+			memcpy(&ssi->frags[1], &gl->frags[1],
+			       (gl->nfrags - 1) * sizeof(skb_frag_t));
+		ssi->nr_frags = gl->nfrags;
+
+		skb->len = gl->tot_len;
+		skb->data_len = skb->len - pull_len;
+		skb->truesize += skb->data_len;
+
+		/* Get a reference for the last page, we don't own it */
+		get_page(gl->frags[gl->nfrags - 1].page);
+	}
+out:
+	return skb;
+}
+
+static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,
+			const struct pkt_gl *gl)
+{
+	struct c4iw_dev *dev = handle;
+	struct sk_buff *skb;
+	const struct cpl_act_establish *rpl;
+	unsigned int opcode;
+
+	if (gl == NULL) {
+		/* omit RSS and rsp_ctrl at end of descriptor */
+		unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8;
+
+		skb = alloc_skb(256, GFP_ATOMIC);
+		if (!skb)
+			goto nomem;
+		__skb_put(skb, len);
+		skb_copy_to_linear_data(skb, &rsp[1], len);
+	} else if (gl == CXGB4_MSG_AN) {
+		const struct rsp_ctrl *rc = (void *)rsp;
+
+		u32 qid = be32_to_cpu(rc->pldbuflen_qid);
+		c4iw_ev_handler(dev, qid);
+		return 0;
+	} else {
+		skb = t4_pktgl_to_skb(gl, 128, 128);
+		if (unlikely(!skb))
+			goto nomem;
+	}
+
+	rpl = cplhdr(skb);
+	opcode = rpl->ot.opcode;
+
+	if (c4iw_handlers[opcode])
+		c4iw_handlers[opcode](dev, skb);
+	else
+		printk(KERN_INFO "%s no handler opcode 0x%x...\n", __func__,
+		       opcode);
+
+	return 0;
+nomem:
+	return -1;
+}
+
+static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state)
+{
+	PDBG("%s new_state %u\n", __func__, new_state);
+	return 0;
+}
+
+static struct cxgb4_uld_info c4iw_uld_info = {
+	.name = DRV_NAME,
+	.add = c4iw_uld_add,
+	.rx_handler = c4iw_uld_rx_handler,
+	.state_change = c4iw_uld_state_change,
+};
+
+static int __init c4iw_init_module(void)
+{
+	int err;
+
+	err = c4iw_cm_init();
+	if (err)
+		return err;
+
+	c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
+	if (!c4iw_debugfs_root)
+		printk(KERN_WARNING MOD
+		       "could not create debugfs entry, continuing\n");
+
+	cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info);
+
+	return 0;
+}
+
+static void __exit c4iw_exit_module(void)
+{
+	struct c4iw_dev *dev, *tmp;
+
+	cxgb4_unregister_uld(CXGB4_ULD_RDMA);
+
+	mutex_lock(&dev_mutex);
+	list_for_each_entry_safe(dev, tmp, &dev_list, entry) {
+		c4iw_remove(dev);
+	}
+	mutex_unlock(&dev_mutex);
+
+	c4iw_cm_term();
+	debugfs_remove_recursive(c4iw_debugfs_root);
+}
+
+module_init(c4iw_init_module);
+module_exit(c4iw_exit_module);
diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c
new file mode 100644
index 0000000..1bd6a3e
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/ev.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/mman.h>
+#include <net/sock.h>
+
+#include "iw_cxgb4.h"
+
+static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp,
+			  struct c4iw_qp *qhp,
+			  struct t4_cqe *err_cqe,
+			  enum ib_event_type ib_event)
+{
+	struct ib_event event;
+	struct c4iw_qp_attributes attrs;
+
+	if ((qhp->attr.state == C4IW_QP_STATE_ERROR) ||
+	    (qhp->attr.state == C4IW_QP_STATE_TERMINATE)) {
+		PDBG("%s AE received after RTS - "
+		     "qp state %d qpid 0x%x status 0x%x\n", __func__,
+		     qhp->attr.state, qhp->wq.sq.qid, CQE_STATUS(err_cqe));
+		return;
+	}
+
+	printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x "
+	       "type %d wrid.hi 0x%x wrid.lo 0x%x\n", __func__,
+	       CQE_QPID(err_cqe), CQE_OPCODE(err_cqe),
+	       CQE_STATUS(err_cqe), CQE_TYPE(err_cqe),
+	       CQE_WRID_HI(err_cqe), CQE_WRID_LOW(err_cqe));
+
+	if (qhp->attr.state == C4IW_QP_STATE_RTS) {
+		attrs.next_state = C4IW_QP_STATE_TERMINATE;
+		c4iw_modify_qp(qhp->rhp, qhp, C4IW_QP_ATTR_NEXT_STATE,
+			       &attrs, 0);
+	}
+
+	event.event = ib_event;
+	event.device = chp->ibcq.device;
+	if (ib_event == IB_EVENT_CQ_ERR)
+		event.element.cq = &chp->ibcq;
+	else
+		event.element.qp = &qhp->ibqp;
+	if (qhp->ibqp.event_handler)
+		(*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context);
+
+	(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+}
+
+void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe)
+{
+	struct c4iw_cq *chp;
+	struct c4iw_qp *qhp;
+	u32 cqid;
+
+	spin_lock(&dev->lock);
+	qhp = get_qhp(dev, CQE_QPID(err_cqe));
+	if (!qhp) {
+		printk(KERN_ERR MOD "BAD AE qpid 0x%x opcode %d "
+		       "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
+		       CQE_QPID(err_cqe),
+		       CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
+		       CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
+		       CQE_WRID_LOW(err_cqe));
+		spin_unlock(&dev->lock);
+		goto out;
+	}
+
+	if (SQ_TYPE(err_cqe))
+		cqid = qhp->attr.scq;
+	else
+		cqid = qhp->attr.rcq;
+	chp = get_chp(dev, cqid);
+	if (!chp) {
+		printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d "
+		       "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
+		       cqid, CQE_QPID(err_cqe),
+		       CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
+		       CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
+		       CQE_WRID_LOW(err_cqe));
+		spin_unlock(&dev->lock);
+		goto out;
+	}
+
+	c4iw_qp_add_ref(&qhp->ibqp);
+	atomic_inc(&chp->refcnt);
+	spin_unlock(&dev->lock);
+
+	/* Bad incoming write */
+	if (RQ_TYPE(err_cqe) &&
+	    (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE)) {
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_REQ_ERR);
+		goto done;
+	}
+
+	switch (CQE_STATUS(err_cqe)) {
+
+	/* Completion Events */
+	case T4_ERR_SUCCESS:
+		printk(KERN_ERR MOD "AE with status 0!\n");
+		break;
+
+	case T4_ERR_STAG:
+	case T4_ERR_PDID:
+	case T4_ERR_QPID:
+	case T4_ERR_ACCESS:
+	case T4_ERR_WRAP:
+	case T4_ERR_BOUND:
+	case T4_ERR_INVALIDATE_SHARED_MR:
+	case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_ACCESS_ERR);
+		break;
+
+	/* Device Fatal Errors */
+	case T4_ERR_ECC:
+	case T4_ERR_ECC_PSTAG:
+	case T4_ERR_INTERNAL_ERR:
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_DEVICE_FATAL);
+		break;
+
+	/* QP Fatal Errors */
+	case T4_ERR_OUT_OF_RQE:
+	case T4_ERR_PBL_ADDR_BOUND:
+	case T4_ERR_CRC:
+	case T4_ERR_MARKER:
+	case T4_ERR_PDU_LEN_ERR:
+	case T4_ERR_DDP_VERSION:
+	case T4_ERR_RDMA_VERSION:
+	case T4_ERR_OPCODE:
+	case T4_ERR_DDP_QUEUE_NUM:
+	case T4_ERR_MSN:
+	case T4_ERR_TBIT:
+	case T4_ERR_MO:
+	case T4_ERR_MSN_GAP:
+	case T4_ERR_MSN_RANGE:
+	case T4_ERR_RQE_ADDR_BOUND:
+	case T4_ERR_IRD_OVERFLOW:
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL);
+		break;
+
+	default:
+		printk(KERN_ERR MOD "Unknown T4 status 0x%x QPID 0x%x\n",
+		       CQE_STATUS(err_cqe), qhp->wq.sq.qid);
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL);
+		break;
+	}
+done:
+	if (atomic_dec_and_test(&chp->refcnt))
+		wake_up(&chp->wait);
+	c4iw_qp_rem_ref(&qhp->ibqp);
+out:
+	return;
+}
+
+int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid)
+{
+	struct c4iw_cq *chp;
+
+	chp = get_chp(dev, qid);
+	if (chp)
+		(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+	else
+		PDBG("%s unknown cqid 0x%x\n", __func__, qid);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
new file mode 100644
index 0000000..ccce6fe
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IW_CXGB4_H__
+#define __IW_CXGB4_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/inet.h>
+#include <linux/wait.h>
+#include <linux/kref.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/kfifo.h>
+
+#include <asm/byteorder.h>
+
+#include <net/net_namespace.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+
+#include "cxgb4.h"
+#include "cxgb4_uld.h"
+#include "l2t.h"
+#include "user.h"
+
+#define DRV_NAME "iw_cxgb4"
+#define MOD DRV_NAME ":"
+
+extern int c4iw_debug;
+#define PDBG(fmt, args...) \
+do { \
+	if (c4iw_debug) \
+		printk(MOD fmt, ## args); \
+} while (0)
+
+#include "t4.h"
+
+#define PBL_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->pbl.start)
+#define RQT_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->rq.start)
+
+static inline void *cplhdr(struct sk_buff *skb)
+{
+	return skb->data;
+}
+
+#define C4IW_WR_TO (10*HZ)
+
+struct c4iw_wr_wait {
+	wait_queue_head_t wait;
+	int done;
+	int ret;
+};
+
+static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp)
+{
+	wr_waitp->ret = 0;
+	wr_waitp->done = 0;
+	init_waitqueue_head(&wr_waitp->wait);
+}
+
+struct c4iw_resource {
+	struct kfifo tpt_fifo;
+	spinlock_t tpt_fifo_lock;
+	struct kfifo qid_fifo;
+	spinlock_t qid_fifo_lock;
+	struct kfifo pdid_fifo;
+	spinlock_t pdid_fifo_lock;
+};
+
+struct c4iw_qid_list {
+	struct list_head entry;
+	u32 qid;
+};
+
+struct c4iw_dev_ucontext {
+	struct list_head qpids;
+	struct list_head cqids;
+	struct mutex lock;
+};
+
+enum c4iw_rdev_flags {
+	T4_FATAL_ERROR = (1<<0),
+};
+
+struct c4iw_rdev {
+	struct c4iw_resource resource;
+	unsigned long qpshift;
+	u32 qpmask;
+	unsigned long cqshift;
+	u32 cqmask;
+	struct c4iw_dev_ucontext uctx;
+	struct gen_pool *pbl_pool;
+	struct gen_pool *rqt_pool;
+	u32 flags;
+	struct cxgb4_lld_info lldi;
+};
+
+static inline int c4iw_fatal_error(struct c4iw_rdev *rdev)
+{
+	return rdev->flags & T4_FATAL_ERROR;
+}
+
+static inline int c4iw_num_stags(struct c4iw_rdev *rdev)
+{
+	return min((int)T4_MAX_NUM_STAG, (int)(rdev->lldi.vr->stag.size >> 5));
+}
+
+struct c4iw_dev {
+	struct ib_device ibdev;
+	struct c4iw_rdev rdev;
+	u32 device_cap_flags;
+	struct idr cqidr;
+	struct idr qpidr;
+	struct idr mmidr;
+	spinlock_t lock;
+	struct list_head entry;
+	struct delayed_work db_drop_task;
+	struct dentry *debugfs_root;
+};
+
+static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct c4iw_dev, ibdev);
+}
+
+static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev)
+{
+	return container_of(rdev, struct c4iw_dev, rdev);
+}
+
+static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid)
+{
+	return idr_find(&rhp->cqidr, cqid);
+}
+
+static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid)
+{
+	return idr_find(&rhp->qpidr, qpid);
+}
+
+static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid)
+{
+	return idr_find(&rhp->mmidr, mmid);
+}
+
+static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr,
+				void *handle, u32 id)
+{
+	int ret;
+	int newid;
+
+	do {
+		if (!idr_pre_get(idr, GFP_KERNEL))
+			return -ENOMEM;
+		spin_lock_irq(&rhp->lock);
+		ret = idr_get_new_above(idr, handle, id, &newid);
+		BUG_ON(newid != id);
+		spin_unlock_irq(&rhp->lock);
+	} while (ret == -EAGAIN);
+
+	return ret;
+}
+
+static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id)
+{
+	spin_lock_irq(&rhp->lock);
+	idr_remove(idr, id);
+	spin_unlock_irq(&rhp->lock);
+}
+
+struct c4iw_pd {
+	struct ib_pd ibpd;
+	u32 pdid;
+	struct c4iw_dev *rhp;
+};
+
+static inline struct c4iw_pd *to_c4iw_pd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct c4iw_pd, ibpd);
+}
+
+struct tpt_attributes {
+	u64 len;
+	u64 va_fbo;
+	enum fw_ri_mem_perms perms;
+	u32 stag;
+	u32 pdid;
+	u32 qpid;
+	u32 pbl_addr;
+	u32 pbl_size;
+	u32 state:1;
+	u32 type:2;
+	u32 rsvd:1;
+	u32 remote_invaliate_disable:1;
+	u32 zbva:1;
+	u32 mw_bind_enable:1;
+	u32 page_size:5;
+};
+
+struct c4iw_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	struct c4iw_dev *rhp;
+	u64 kva;
+	struct tpt_attributes attr;
+};
+
+static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct c4iw_mr, ibmr);
+}
+
+struct c4iw_mw {
+	struct ib_mw ibmw;
+	struct c4iw_dev *rhp;
+	u64 kva;
+	struct tpt_attributes attr;
+};
+
+static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct c4iw_mw, ibmw);
+}
+
+struct c4iw_fr_page_list {
+	struct ib_fast_reg_page_list ibpl;
+	DECLARE_PCI_UNMAP_ADDR(mapping);
+	dma_addr_t dma_addr;
+	struct c4iw_dev *dev;
+	int size;
+};
+
+static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list(
+					struct ib_fast_reg_page_list *ibpl)
+{
+	return container_of(ibpl, struct c4iw_fr_page_list, ibpl);
+}
+
+struct c4iw_cq {
+	struct ib_cq ibcq;
+	struct c4iw_dev *rhp;
+	struct t4_cq cq;
+	spinlock_t lock;
+	atomic_t refcnt;
+	wait_queue_head_t wait;
+};
+
+static inline struct c4iw_cq *to_c4iw_cq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct c4iw_cq, ibcq);
+}
+
+struct c4iw_mpa_attributes {
+	u8 initiator;
+	u8 recv_marker_enabled;
+	u8 xmit_marker_enabled;
+	u8 crc_enabled;
+	u8 version;
+	u8 p2p_type;
+};
+
+struct c4iw_qp_attributes {
+	u32 scq;
+	u32 rcq;
+	u32 sq_num_entries;
+	u32 rq_num_entries;
+	u32 sq_max_sges;
+	u32 sq_max_sges_rdma_write;
+	u32 rq_max_sges;
+	u32 state;
+	u8 enable_rdma_read;
+	u8 enable_rdma_write;
+	u8 enable_bind;
+	u8 enable_mmid0_fastreg;
+	u32 max_ord;
+	u32 max_ird;
+	u32 pd;
+	u32 next_state;
+	char terminate_buffer[52];
+	u32 terminate_msg_len;
+	u8 is_terminate_local;
+	struct c4iw_mpa_attributes mpa_attr;
+	struct c4iw_ep *llp_stream_handle;
+};
+
+struct c4iw_qp {
+	struct ib_qp ibqp;
+	struct c4iw_dev *rhp;
+	struct c4iw_ep *ep;
+	struct c4iw_qp_attributes attr;
+	struct t4_wq wq;
+	spinlock_t lock;
+	atomic_t refcnt;
+	wait_queue_head_t wait;
+	struct timer_list timer;
+};
+
+static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct c4iw_qp, ibqp);
+}
+
+struct c4iw_ucontext {
+	struct ib_ucontext ibucontext;
+	struct c4iw_dev_ucontext uctx;
+	u32 key;
+	spinlock_t mmap_lock;
+	struct list_head mmaps;
+};
+
+static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c)
+{
+	return container_of(c, struct c4iw_ucontext, ibucontext);
+}
+
+struct c4iw_mm_entry {
+	struct list_head entry;
+	u64 addr;
+	u32 key;
+	unsigned len;
+};
+
+static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext,
+						u32 key, unsigned len)
+{
+	struct list_head *pos, *nxt;
+	struct c4iw_mm_entry *mm;
+
+	spin_lock(&ucontext->mmap_lock);
+	list_for_each_safe(pos, nxt, &ucontext->mmaps) {
+
+		mm = list_entry(pos, struct c4iw_mm_entry, entry);
+		if (mm->key == key && mm->len == len) {
+			list_del_init(&mm->entry);
+			spin_unlock(&ucontext->mmap_lock);
+			PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+			     key, (unsigned long long) mm->addr, mm->len);
+			return mm;
+		}
+	}
+	spin_unlock(&ucontext->mmap_lock);
+	return NULL;
+}
+
+static inline void insert_mmap(struct c4iw_ucontext *ucontext,
+			       struct c4iw_mm_entry *mm)
+{
+	spin_lock(&ucontext->mmap_lock);
+	PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+	     mm->key, (unsigned long long) mm->addr, mm->len);
+	list_add_tail(&mm->entry, &ucontext->mmaps);
+	spin_unlock(&ucontext->mmap_lock);
+}
+
+enum c4iw_qp_attr_mask {
+	C4IW_QP_ATTR_NEXT_STATE = 1 << 0,
+	C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7,
+	C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8,
+	C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9,
+	C4IW_QP_ATTR_MAX_ORD = 1 << 11,
+	C4IW_QP_ATTR_MAX_IRD = 1 << 12,
+	C4IW_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22,
+	C4IW_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23,
+	C4IW_QP_ATTR_MPA_ATTR = 1 << 24,
+	C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25,
+	C4IW_QP_ATTR_VALID_MODIFY = (C4IW_QP_ATTR_ENABLE_RDMA_READ |
+				     C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
+				     C4IW_QP_ATTR_MAX_ORD |
+				     C4IW_QP_ATTR_MAX_IRD |
+				     C4IW_QP_ATTR_LLP_STREAM_HANDLE |
+				     C4IW_QP_ATTR_STREAM_MSG_BUFFER |
+				     C4IW_QP_ATTR_MPA_ATTR |
+				     C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE)
+};
+
+int c4iw_modify_qp(struct c4iw_dev *rhp,
+				struct c4iw_qp *qhp,
+				enum c4iw_qp_attr_mask mask,
+				struct c4iw_qp_attributes *attrs,
+				int internal);
+
+enum c4iw_qp_state {
+	C4IW_QP_STATE_IDLE,
+	C4IW_QP_STATE_RTS,
+	C4IW_QP_STATE_ERROR,
+	C4IW_QP_STATE_TERMINATE,
+	C4IW_QP_STATE_CLOSING,
+	C4IW_QP_STATE_TOT
+};
+
+static inline int c4iw_convert_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET:
+	case IB_QPS_INIT:
+		return C4IW_QP_STATE_IDLE;
+	case IB_QPS_RTS:
+		return C4IW_QP_STATE_RTS;
+	case IB_QPS_SQD:
+		return C4IW_QP_STATE_CLOSING;
+	case IB_QPS_SQE:
+		return C4IW_QP_STATE_TERMINATE;
+	case IB_QPS_ERR:
+		return C4IW_QP_STATE_ERROR;
+	default:
+		return -1;
+	}
+}
+
+static inline u32 c4iw_ib_to_tpt_access(int a)
+{
+	return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
+	       (a & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0) |
+	       (a & IB_ACCESS_LOCAL_WRITE ? FW_RI_MEM_ACCESS_LOCAL_WRITE : 0) |
+	       FW_RI_MEM_ACCESS_LOCAL_READ;
+}
+
+static inline u32 c4iw_ib_to_tpt_bind_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0);
+}
+
+enum c4iw_mmid_state {
+	C4IW_STAG_STATE_VALID,
+	C4IW_STAG_STATE_INVALID
+};
+
+#define C4IW_NODE_DESC "cxgb4 Chelsio Communications"
+
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+
+#define MPA_MAX_PRIVATE_DATA	256
+#define MPA_REJECT		0x20
+#define MPA_CRC			0x40
+#define MPA_MARKERS		0x80
+#define MPA_FLAGS_MASK		0xE0
+
+#define c4iw_put_ep(ep) { \
+	PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__,  \
+	     ep, atomic_read(&((ep)->kref.refcount))); \
+	WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+	kref_put(&((ep)->kref), _c4iw_free_ep); \
+}
+
+#define c4iw_get_ep(ep) { \
+	PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
+	     ep, atomic_read(&((ep)->kref.refcount))); \
+	kref_get(&((ep)->kref));  \
+}
+void _c4iw_free_ep(struct kref *kref);
+
+struct mpa_message {
+	u8 key[16];
+	u8 flags;
+	u8 revision;
+	__be16 private_data_size;
+	u8 private_data[0];
+};
+
+struct terminate_message {
+	u8 layer_etype;
+	u8 ecode;
+	__be16 hdrct_rsvd;
+	u8 len_hdrs[0];
+};
+
+#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
+
+enum c4iw_layers_types {
+	LAYER_RDMAP		= 0x00,
+	LAYER_DDP		= 0x10,
+	LAYER_MPA		= 0x20,
+	RDMAP_LOCAL_CATA	= 0x00,
+	RDMAP_REMOTE_PROT	= 0x01,
+	RDMAP_REMOTE_OP		= 0x02,
+	DDP_LOCAL_CATA		= 0x00,
+	DDP_TAGGED_ERR		= 0x01,
+	DDP_UNTAGGED_ERR	= 0x02,
+	DDP_LLP			= 0x03
+};
+
+enum c4iw_rdma_ecodes {
+	RDMAP_INV_STAG		= 0x00,
+	RDMAP_BASE_BOUNDS	= 0x01,
+	RDMAP_ACC_VIOL		= 0x02,
+	RDMAP_STAG_NOT_ASSOC	= 0x03,
+	RDMAP_TO_WRAP		= 0x04,
+	RDMAP_INV_VERS		= 0x05,
+	RDMAP_INV_OPCODE	= 0x06,
+	RDMAP_STREAM_CATA	= 0x07,
+	RDMAP_GLOBAL_CATA	= 0x08,
+	RDMAP_CANT_INV_STAG	= 0x09,
+	RDMAP_UNSPECIFIED	= 0xff
+};
+
+enum c4iw_ddp_ecodes {
+	DDPT_INV_STAG		= 0x00,
+	DDPT_BASE_BOUNDS	= 0x01,
+	DDPT_STAG_NOT_ASSOC	= 0x02,
+	DDPT_TO_WRAP		= 0x03,
+	DDPT_INV_VERS		= 0x04,
+	DDPU_INV_QN		= 0x01,
+	DDPU_INV_MSN_NOBUF	= 0x02,
+	DDPU_INV_MSN_RANGE	= 0x03,
+	DDPU_INV_MO		= 0x04,
+	DDPU_MSG_TOOBIG		= 0x05,
+	DDPU_INV_VERS		= 0x06
+};
+
+enum c4iw_mpa_ecodes {
+	MPA_CRC_ERR		= 0x02,
+	MPA_MARKER_ERR		= 0x03
+};
+
+enum c4iw_ep_state {
+	IDLE = 0,
+	LISTEN,
+	CONNECTING,
+	MPA_REQ_WAIT,
+	MPA_REQ_SENT,
+	MPA_REQ_RCVD,
+	MPA_REP_SENT,
+	FPDU_MODE,
+	ABORTING,
+	CLOSING,
+	MORIBUND,
+	DEAD,
+};
+
+enum c4iw_ep_flags {
+	PEER_ABORT_IN_PROGRESS	= 0,
+	ABORT_REQ_IN_PROGRESS	= 1,
+	RELEASE_RESOURCES	= 2,
+	CLOSE_SENT		= 3,
+};
+
+struct c4iw_ep_common {
+	struct iw_cm_id *cm_id;
+	struct c4iw_qp *qp;
+	struct c4iw_dev *dev;
+	enum c4iw_ep_state state;
+	struct kref kref;
+	spinlock_t lock;
+	struct sockaddr_in local_addr;
+	struct sockaddr_in remote_addr;
+	wait_queue_head_t waitq;
+	int rpl_done;
+	int rpl_err;
+	unsigned long flags;
+};
+
+struct c4iw_listen_ep {
+	struct c4iw_ep_common com;
+	unsigned int stid;
+	int backlog;
+};
+
+struct c4iw_ep {
+	struct c4iw_ep_common com;
+	struct c4iw_ep *parent_ep;
+	struct timer_list timer;
+	unsigned int atid;
+	u32 hwtid;
+	u32 snd_seq;
+	u32 rcv_seq;
+	struct l2t_entry *l2t;
+	struct dst_entry *dst;
+	struct sk_buff *mpa_skb;
+	struct c4iw_mpa_attributes mpa_attr;
+	u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA];
+	unsigned int mpa_pkt_len;
+	u32 ird;
+	u32 ord;
+	u32 smac_idx;
+	u32 tx_chan;
+	u32 mtu;
+	u16 mss;
+	u16 emss;
+	u16 plen;
+	u16 rss_qid;
+	u16 txq_idx;
+	u8 tos;
+};
+
+static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)
+{
+	return cm_id->provider_data;
+}
+
+static inline struct c4iw_listen_ep *to_listen_ep(struct iw_cm_id *cm_id)
+{
+	return cm_id->provider_data;
+}
+
+static inline int compute_wscale(int win)
+{
+	int wscale = 0;
+
+	while (wscale < 14 && (65535<<wscale) < win)
+		wscale++;
+	return wscale;
+}
+
+typedef int (*c4iw_handler_func)(struct c4iw_dev *dev, struct sk_buff *skb);
+
+int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
+		     struct l2t_entry *l2t);
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid,
+		   struct c4iw_dev_ucontext *uctx);
+u32 c4iw_get_resource(struct kfifo *fifo, spinlock_t *lock);
+void c4iw_put_resource(struct kfifo *fifo, u32 entry, spinlock_t *lock);
+int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid);
+int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev);
+int c4iw_pblpool_create(struct c4iw_rdev *rdev);
+int c4iw_rqtpool_create(struct c4iw_rdev *rdev);
+void c4iw_pblpool_destroy(struct c4iw_rdev *rdev);
+void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev);
+void c4iw_destroy_resource(struct c4iw_resource *rscp);
+int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev);
+int c4iw_register_device(struct c4iw_dev *dev);
+void c4iw_unregister_device(struct c4iw_dev *dev);
+int __init c4iw_cm_init(void);
+void __exit c4iw_cm_term(void);
+void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
+			       struct c4iw_dev_ucontext *uctx);
+void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
+			    struct c4iw_dev_ucontext *uctx);
+int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr);
+int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr);
+int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+		 struct ib_mw_bind *mw_bind);
+int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog);
+int c4iw_destroy_listen(struct iw_cm_id *cm_id);
+int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
+void c4iw_qp_add_ref(struct ib_qp *qp);
+void c4iw_qp_rem_ref(struct ib_qp *qp);
+void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list);
+struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(
+					struct ib_device *device,
+					int page_list_len);
+struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth);
+int c4iw_dealloc_mw(struct ib_mw *mw);
+struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd);
+struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
+					   u64 length, u64 virt, int acc,
+					   struct ib_udata *udata);
+struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
+struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
+					struct ib_phys_buf *buffer_list,
+					int num_phys_buf,
+					int acc,
+					u64 *iova_start);
+int c4iw_reregister_phys_mem(struct ib_mr *mr,
+				     int mr_rereg_mask,
+				     struct ib_pd *pd,
+				     struct ib_phys_buf *buffer_list,
+				     int num_phys_buf,
+				     int acc, u64 *iova_start);
+int c4iw_dereg_mr(struct ib_mr *ib_mr);
+int c4iw_destroy_cq(struct ib_cq *ib_cq);
+struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,
+					int vector,
+					struct ib_ucontext *ib_context,
+					struct ib_udata *udata);
+int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+int c4iw_destroy_qp(struct ib_qp *ib_qp);
+struct ib_qp *c4iw_create_qp(struct ib_pd *pd,
+			     struct ib_qp_init_attr *attrs,
+			     struct ib_udata *udata);
+int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+				 int attr_mask, struct ib_udata *udata);
+struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn);
+u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb);
+void c4iw_flush_hw_cq(struct t4_cq *cq);
+void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count);
+void c4iw_count_scqes(struct t4_cq *cq, struct t4_wq *wq, int *count);
+int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp);
+int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count);
+int c4iw_flush_sq(struct t4_wq *wq, struct t4_cq *cq, int count);
+int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid);
+u16 c4iw_rqes_posted(struct c4iw_qp *qhp);
+int c4iw_post_zb_read(struct c4iw_qp *qhp);
+int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe);
+u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx);
+void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid,
+		struct c4iw_dev_ucontext *uctx);
+u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx);
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid,
+		struct c4iw_dev_ucontext *uctx);
+void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe);
+
+extern struct cxgb4_client t4c_client;
+extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS];
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
new file mode 100644
index 0000000..e54ff6d
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -0,0 +1,811 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_umem.h>
+#include <asm/atomic.h>
+
+#include "iw_cxgb4.h"
+
+#define T4_ULPTX_MIN_IO 32
+#define C4IW_MAX_INLINE_SIZE 96
+
+static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len,
+			     void *data)
+{
+	struct sk_buff *skb;
+	struct ulp_mem_io *req;
+	struct ulptx_idata *sc;
+	u8 wr_len, *to_dp, *from_dp;
+	int copy_len, num_wqe, i, ret = 0;
+	struct c4iw_wr_wait wr_wait;
+
+	addr &= 0x7FFFFFF;
+	PDBG("%s addr 0x%x len %u\n", __func__, addr, len);
+	num_wqe = DIV_ROUND_UP(len, C4IW_MAX_INLINE_SIZE);
+	c4iw_init_wr_wait(&wr_wait);
+	for (i = 0; i < num_wqe; i++) {
+
+		copy_len = len > C4IW_MAX_INLINE_SIZE ? C4IW_MAX_INLINE_SIZE :
+			   len;
+		wr_len = roundup(sizeof *req + sizeof *sc +
+				 roundup(copy_len, T4_ULPTX_MIN_IO), 16);
+
+		skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+		if (!skb)
+			return -ENOMEM;
+		set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+		req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
+		memset(req, 0, wr_len);
+		INIT_ULPTX_WR(req, wr_len, 0, 0);
+
+		if (i == (num_wqe-1)) {
+			req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) |
+						    FW_WR_COMPL(1));
+			req->wr.wr_lo = (__force __be64)&wr_wait;
+		} else
+			req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR));
+		req->wr.wr_mid = cpu_to_be32(
+				       FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16)));
+
+		req->cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE) | (1<<23));
+		req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN(
+				DIV_ROUND_UP(copy_len, T4_ULPTX_MIN_IO)));
+		req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr),
+						      16));
+		req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR(addr + i * 3));
+
+		sc = (struct ulptx_idata *)(req + 1);
+		sc->cmd_more = cpu_to_be32(ULPTX_CMD(ULP_TX_SC_IMM));
+		sc->len = cpu_to_be32(roundup(copy_len, T4_ULPTX_MIN_IO));
+
+		to_dp = (u8 *)(sc + 1);
+		from_dp = (u8 *)data + i * C4IW_MAX_INLINE_SIZE;
+		if (data)
+			memcpy(to_dp, from_dp, copy_len);
+		else
+			memset(to_dp, 0, copy_len);
+		if (copy_len % T4_ULPTX_MIN_IO)
+			memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO -
+			       (copy_len % T4_ULPTX_MIN_IO));
+		ret = c4iw_ofld_send(rdev, skb);
+		if (ret)
+			return ret;
+		len -= C4IW_MAX_INLINE_SIZE;
+	}
+
+	wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO);
+	if (!wr_wait.done) {
+		printk(KERN_ERR MOD "Device %s not responding!\n",
+		       pci_name(rdev->lldi.pdev));
+		rdev->flags = T4_FATAL_ERROR;
+		ret = -EIO;
+	} else
+		ret = wr_wait.ret;
+	return ret;
+}
+
+/*
+ * Build and write a TPT entry.
+ * IN: stag key, pdid, perm, bind_enabled, zbva, to, len, page_size,
+ *     pbl_size and pbl_addr
+ * OUT: stag index
+ */
+static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
+			   u32 *stag, u8 stag_state, u32 pdid,
+			   enum fw_ri_stag_type type, enum fw_ri_mem_perms perm,
+			   int bind_enabled, u32 zbva, u64 to,
+			   u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr)
+{
+	int err;
+	struct fw_ri_tpte tpt;
+	u32 stag_idx;
+	static atomic_t key;
+
+	if (c4iw_fatal_error(rdev))
+		return -EIO;
+
+	stag_state = stag_state > 0;
+	stag_idx = (*stag) >> 8;
+
+	if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) {
+		stag_idx = c4iw_get_resource(&rdev->resource.tpt_fifo,
+					     &rdev->resource.tpt_fifo_lock);
+		if (!stag_idx)
+			return -ENOMEM;
+		*stag = (stag_idx << 8) | (atomic_inc_return(&key) & 0xff);
+	}
+	PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n",
+	     __func__, stag_state, type, pdid, stag_idx);
+
+	/* write TPT entry */
+	if (reset_tpt_entry)
+		memset(&tpt, 0, sizeof(tpt));
+	else {
+		tpt.valid_to_pdid = cpu_to_be32(F_FW_RI_TPTE_VALID |
+			V_FW_RI_TPTE_STAGKEY((*stag & M_FW_RI_TPTE_STAGKEY)) |
+			V_FW_RI_TPTE_STAGSTATE(stag_state) |
+			V_FW_RI_TPTE_STAGTYPE(type) | V_FW_RI_TPTE_PDID(pdid));
+		tpt.locread_to_qpid = cpu_to_be32(V_FW_RI_TPTE_PERM(perm) |
+			(bind_enabled ? F_FW_RI_TPTE_MWBINDEN : 0) |
+			V_FW_RI_TPTE_ADDRTYPE((zbva ? FW_RI_ZERO_BASED_TO :
+						      FW_RI_VA_BASED_TO))|
+			V_FW_RI_TPTE_PS(page_size));
+		tpt.nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
+			V_FW_RI_TPTE_PBLADDR(PBL_OFF(rdev, pbl_addr)>>3));
+		tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
+		tpt.va_hi = cpu_to_be32((u32)(to >> 32));
+		tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
+		tpt.dca_mwbcnt_pstag = cpu_to_be32(0);
+		tpt.len_hi = cpu_to_be32((u32)(len >> 32));
+	}
+	err = write_adapter_mem(rdev, stag_idx +
+				(rdev->lldi.vr->stag.start >> 5),
+				sizeof(tpt), &tpt);
+
+	if (reset_tpt_entry)
+		c4iw_put_resource(&rdev->resource.tpt_fifo, stag_idx,
+				  &rdev->resource.tpt_fifo_lock);
+	return err;
+}
+
+static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
+		     u32 pbl_addr, u32 pbl_size)
+{
+	int err;
+
+	PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+	     __func__, pbl_addr, rdev->lldi.vr->pbl.start,
+	     pbl_size);
+
+	err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl);
+	return err;
+}
+
+static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size,
+		     u32 pbl_addr)
+{
+	return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0,
+			       pbl_size, pbl_addr);
+}
+
+static int allocate_window(struct c4iw_rdev *rdev, u32 * stag, u32 pdid)
+{
+	*stag = T4_STAG_UNSET;
+	return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0,
+			       0UL, 0, 0, 0, 0);
+}
+
+static int deallocate_window(struct c4iw_rdev *rdev, u32 stag)
+{
+	return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0,
+			       0);
+}
+
+static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
+			 u32 pbl_size, u32 pbl_addr)
+{
+	*stag = T4_STAG_UNSET;
+	return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0,
+			       0UL, 0, 0, pbl_size, pbl_addr);
+}
+
+static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag)
+{
+	u32 mmid;
+
+	mhp->attr.state = 1;
+	mhp->attr.stag = stag;
+	mmid = stag >> 8;
+	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+	PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
+	return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+}
+
+static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
+		      struct c4iw_mr *mhp, int shift)
+{
+	u32 stag = T4_STAG_UNSET;
+	int ret;
+
+	ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
+			      FW_RI_STAG_NSMR, mhp->attr.perms,
+			      mhp->attr.mw_bind_enable, mhp->attr.zbva,
+			      mhp->attr.va_fbo, mhp->attr.len, shift - 12,
+			      mhp->attr.pbl_size, mhp->attr.pbl_addr);
+	if (ret)
+		return ret;
+
+	ret = finish_mem_reg(mhp, stag);
+	if (ret)
+		dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+	return ret;
+}
+
+static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
+			  struct c4iw_mr *mhp, int shift, int npages)
+{
+	u32 stag;
+	int ret;
+
+	if (npages > mhp->attr.pbl_size)
+		return -ENOMEM;
+
+	stag = mhp->attr.stag;
+	ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
+			      FW_RI_STAG_NSMR, mhp->attr.perms,
+			      mhp->attr.mw_bind_enable, mhp->attr.zbva,
+			      mhp->attr.va_fbo, mhp->attr.len, shift - 12,
+			      mhp->attr.pbl_size, mhp->attr.pbl_addr);
+	if (ret)
+		return ret;
+
+	ret = finish_mem_reg(mhp, stag);
+	if (ret)
+		dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+
+	return ret;
+}
+
+static int alloc_pbl(struct c4iw_mr *mhp, int npages)
+{
+	mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev,
+						    npages << 3);
+
+	if (!mhp->attr.pbl_addr)
+		return -ENOMEM;
+
+	mhp->attr.pbl_size = npages;
+
+	return 0;
+}
+
+static int build_phys_page_list(struct ib_phys_buf *buffer_list,
+				int num_phys_buf, u64 *iova_start,
+				u64 *total_size, int *npages,
+				int *shift, __be64 **page_list)
+{
+	u64 mask;
+	int i, j, n;
+
+	mask = 0;
+	*total_size = 0;
+	for (i = 0; i < num_phys_buf; ++i) {
+		if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
+			return -EINVAL;
+		if (i != 0 && i != num_phys_buf - 1 &&
+		    (buffer_list[i].size & ~PAGE_MASK))
+			return -EINVAL;
+		*total_size += buffer_list[i].size;
+		if (i > 0)
+			mask |= buffer_list[i].addr;
+		else
+			mask |= buffer_list[i].addr & PAGE_MASK;
+		if (i != num_phys_buf - 1)
+			mask |= buffer_list[i].addr + buffer_list[i].size;
+		else
+			mask |= (buffer_list[i].addr + buffer_list[i].size +
+				PAGE_SIZE - 1) & PAGE_MASK;
+	}
+
+	if (*total_size > 0xFFFFFFFFULL)
+		return -ENOMEM;
+
+	/* Find largest page shift we can use to cover buffers */
+	for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
+		if ((1ULL << *shift) & mask)
+			break;
+
+	buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
+	buffer_list[0].addr &= ~0ull << *shift;
+
+	*npages = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		*npages += (buffer_list[i].size +
+			(1ULL << *shift) - 1) >> *shift;
+
+	if (!*npages)
+		return -EINVAL;
+
+	*page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
+	if (!*page_list)
+		return -ENOMEM;
+
+	n = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		for (j = 0;
+		     j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
+		     ++j)
+			(*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
+			    ((u64) j << *shift));
+
+	PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
+	     __func__, (unsigned long long)*iova_start,
+	     (unsigned long long)mask, *shift, (unsigned long long)*total_size,
+	     *npages);
+
+	return 0;
+
+}
+
+int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
+			     struct ib_pd *pd, struct ib_phys_buf *buffer_list,
+			     int num_phys_buf, int acc, u64 *iova_start)
+{
+
+	struct c4iw_mr mh, *mhp;
+	struct c4iw_pd *php;
+	struct c4iw_dev *rhp;
+	__be64 *page_list = NULL;
+	int shift = 0;
+	u64 total_size;
+	int npages;
+	int ret;
+
+	PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
+
+	/* There can be no memory windows */
+	if (atomic_read(&mr->usecnt))
+		return -EINVAL;
+
+	mhp = to_c4iw_mr(mr);
+	rhp = mhp->rhp;
+	php = to_c4iw_pd(mr->pd);
+
+	/* make sure we are on the same adapter */
+	if (rhp != php->rhp)
+		return -EINVAL;
+
+	memcpy(&mh, mhp, sizeof *mhp);
+
+	if (mr_rereg_mask & IB_MR_REREG_PD)
+		php = to_c4iw_pd(pd);
+	if (mr_rereg_mask & IB_MR_REREG_ACCESS) {
+		mh.attr.perms = c4iw_ib_to_tpt_access(acc);
+		mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) ==
+					 IB_ACCESS_MW_BIND;
+	}
+	if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+		ret = build_phys_page_list(buffer_list, num_phys_buf,
+						iova_start,
+						&total_size, &npages,
+						&shift, &page_list);
+		if (ret)
+			return ret;
+	}
+
+	ret = reregister_mem(rhp, php, &mh, shift, npages);
+	kfree(page_list);
+	if (ret)
+		return ret;
+	if (mr_rereg_mask & IB_MR_REREG_PD)
+		mhp->attr.pdid = php->pdid;
+	if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+		mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+	if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+		mhp->attr.zbva = 0;
+		mhp->attr.va_fbo = *iova_start;
+		mhp->attr.page_size = shift - 12;
+		mhp->attr.len = (u32) total_size;
+		mhp->attr.pbl_size = npages;
+	}
+
+	return 0;
+}
+
+struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
+				     struct ib_phys_buf *buffer_list,
+				     int num_phys_buf, int acc, u64 *iova_start)
+{
+	__be64 *page_list;
+	int shift;
+	u64 total_size;
+	int npages;
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mr *mhp;
+	int ret;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+
+	/* First check that we have enough alignment */
+	if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (num_phys_buf > 1 &&
+	    ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
+					&total_size, &npages, &shift,
+					&page_list);
+	if (ret)
+		goto err;
+
+	ret = alloc_pbl(mhp, npages);
+	if (ret) {
+		kfree(page_list);
+		goto err_pbl;
+	}
+
+	ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr,
+			     npages);
+	kfree(page_list);
+	if (ret)
+		goto err_pbl;
+
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.zbva = 0;
+
+	mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+	mhp->attr.va_fbo = *iova_start;
+	mhp->attr.page_size = shift - 12;
+
+	mhp->attr.len = (u32) total_size;
+	mhp->attr.pbl_size = npages;
+	ret = register_mem(rhp, php, mhp, shift);
+	if (ret)
+		goto err_pbl;
+
+	return &mhp->ibmr;
+
+err_pbl:
+	c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+			      mhp->attr.pbl_size << 3);
+
+err:
+	kfree(mhp);
+	return ERR_PTR(ret);
+
+}
+
+struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mr *mhp;
+	int ret;
+	u32 stag = T4_STAG_UNSET;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+	mhp->attr.mw_bind_enable = (acc&IB_ACCESS_MW_BIND) == IB_ACCESS_MW_BIND;
+	mhp->attr.zbva = 0;
+	mhp->attr.va_fbo = 0;
+	mhp->attr.page_size = 0;
+	mhp->attr.len = ~0UL;
+	mhp->attr.pbl_size = 0;
+
+	ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid,
+			      FW_RI_STAG_NSMR, mhp->attr.perms,
+			      mhp->attr.mw_bind_enable, 0, 0, ~0UL, 0, 0, 0);
+	if (ret)
+		goto err1;
+
+	ret = finish_mem_reg(mhp, stag);
+	if (ret)
+		goto err2;
+	return &mhp->ibmr;
+err2:
+	dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		  mhp->attr.pbl_addr);
+err1:
+	kfree(mhp);
+	return ERR_PTR(ret);
+}
+
+struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			       u64 virt, int acc, struct ib_udata *udata)
+{
+	__be64 *pages;
+	int shift, n, len;
+	int i, j, k;
+	int err = 0;
+	struct ib_umem_chunk *chunk;
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mr *mhp;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+
+	if (length == ~0ULL)
+		return ERR_PTR(-EINVAL);
+
+	if ((length + start) < start)
+		return ERR_PTR(-EINVAL);
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+
+	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	if (IS_ERR(mhp->umem)) {
+		err = PTR_ERR(mhp->umem);
+		kfree(mhp);
+		return ERR_PTR(err);
+	}
+
+	shift = ffs(mhp->umem->page_size) - 1;
+
+	n = 0;
+	list_for_each_entry(chunk, &mhp->umem->chunk_list, list)
+		n += chunk->nents;
+
+	err = alloc_pbl(mhp, n);
+	if (err)
+		goto err;
+
+	pages = (__be64 *) __get_free_page(GFP_KERNEL);
+	if (!pages) {
+		err = -ENOMEM;
+		goto err_pbl;
+	}
+
+	i = n = 0;
+
+	list_for_each_entry(chunk, &mhp->umem->chunk_list, list)
+		for (j = 0; j < chunk->nmap; ++j) {
+			len = sg_dma_len(&chunk->page_list[j]) >> shift;
+			for (k = 0; k < len; ++k) {
+				pages[i++] = cpu_to_be64(sg_dma_address(
+					&chunk->page_list[j]) +
+					mhp->umem->page_size * k);
+				if (i == PAGE_SIZE / sizeof *pages) {
+					err = write_pbl(&mhp->rhp->rdev,
+					      pages,
+					      mhp->attr.pbl_addr + (n << 3), i);
+					if (err)
+						goto pbl_done;
+					n += i;
+					i = 0;
+				}
+			}
+		}
+
+	if (i)
+		err = write_pbl(&mhp->rhp->rdev, pages,
+				     mhp->attr.pbl_addr + (n << 3), i);
+
+pbl_done:
+	free_page((unsigned long) pages);
+	if (err)
+		goto err_pbl;
+
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.zbva = 0;
+	mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+	mhp->attr.va_fbo = virt;
+	mhp->attr.page_size = shift - 12;
+	mhp->attr.len = (u32) length;
+
+	err = register_mem(rhp, php, mhp, shift);
+	if (err)
+		goto err_pbl;
+
+	return &mhp->ibmr;
+
+err_pbl:
+	c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+			      mhp->attr.pbl_size << 3);
+
+err:
+	ib_umem_release(mhp->umem);
+	kfree(mhp);
+	return ERR_PTR(err);
+}
+
+struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mw *mhp;
+	u32 mmid;
+	u32 stag = 0;
+	int ret;
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+	ret = allocate_window(&rhp->rdev, &stag, php->pdid);
+	if (ret) {
+		kfree(mhp);
+		return ERR_PTR(ret);
+	}
+	mhp->rhp = rhp;
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.type = FW_RI_STAG_MW;
+	mhp->attr.stag = stag;
+	mmid = (stag) >> 8;
+	mhp->ibmw.rkey = stag;
+	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+		deallocate_window(&rhp->rdev, mhp->attr.stag);
+		kfree(mhp);
+		return ERR_PTR(-ENOMEM);
+	}
+	PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+	return &(mhp->ibmw);
+}
+
+int c4iw_dealloc_mw(struct ib_mw *mw)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_mw *mhp;
+	u32 mmid;
+
+	mhp = to_c4iw_mw(mw);
+	rhp = mhp->rhp;
+	mmid = (mw->rkey) >> 8;
+	deallocate_window(&rhp->rdev, mhp->attr.stag);
+	remove_handle(rhp, &rhp->mmidr, mmid);
+	kfree(mhp);
+	PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
+	return 0;
+}
+
+struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mr *mhp;
+	u32 mmid;
+	u32 stag = 0;
+	int ret = 0;
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		goto err;
+
+	mhp->rhp = rhp;
+	ret = alloc_pbl(mhp, pbl_depth);
+	if (ret)
+		goto err1;
+	mhp->attr.pbl_size = pbl_depth;
+	ret = allocate_stag(&rhp->rdev, &stag, php->pdid,
+				 mhp->attr.pbl_size, mhp->attr.pbl_addr);
+	if (ret)
+		goto err2;
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.type = FW_RI_STAG_NSMR;
+	mhp->attr.stag = stag;
+	mhp->attr.state = 1;
+	mmid = (stag) >> 8;
+	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid))
+		goto err3;
+
+	PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+	return &(mhp->ibmr);
+err3:
+	dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+err2:
+	c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+			      mhp->attr.pbl_size << 3);
+err1:
+	kfree(mhp);
+err:
+	return ERR_PTR(ret);
+}
+
+struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(struct ib_device *device,
+						     int page_list_len)
+{
+	struct c4iw_fr_page_list *c4pl;
+	struct c4iw_dev *dev = to_c4iw_dev(device);
+	dma_addr_t dma_addr;
+	int size = sizeof *c4pl + page_list_len * sizeof(u64);
+
+	if (page_list_len > T4_MAX_FR_DEPTH)
+		return ERR_PTR(-EINVAL);
+
+	c4pl = dma_alloc_coherent(&dev->rdev.lldi.pdev->dev, size,
+				  &dma_addr, GFP_KERNEL);
+	if (!c4pl)
+		return ERR_PTR(-ENOMEM);
+
+	pci_unmap_addr_set(c4pl, mapping, dma_addr);
+	c4pl->dma_addr = dma_addr;
+	c4pl->dev = dev;
+	c4pl->size = size;
+	c4pl->ibpl.page_list = (u64 *)(c4pl + 1);
+	c4pl->ibpl.max_page_list_len = page_list_len;
+
+	return &c4pl->ibpl;
+}
+
+void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *ibpl)
+{
+	struct c4iw_fr_page_list *c4pl = to_c4iw_fr_page_list(ibpl);
+
+	dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev, c4pl->size,
+			  c4pl, pci_unmap_addr(c4pl, mapping));
+}
+
+int c4iw_dereg_mr(struct ib_mr *ib_mr)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_mr *mhp;
+	u32 mmid;
+
+	PDBG("%s ib_mr %p\n", __func__, ib_mr);
+	/* There can be no memory windows */
+	if (atomic_read(&ib_mr->usecnt))
+		return -EINVAL;
+
+	mhp = to_c4iw_mr(ib_mr);
+	rhp = mhp->rhp;
+	mmid = mhp->attr.stag >> 8;
+	dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+	if (mhp->attr.pbl_size)
+		c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+				  mhp->attr.pbl_size << 3);
+	remove_handle(rhp, &rhp->mmidr, mmid);
+	if (mhp->kva)
+		kfree((void *) (unsigned long) mhp->kva);
+	if (mhp->umem)
+		ib_umem_release(mhp->umem);
+	PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp);
+	kfree(mhp);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
new file mode 100644
index 0000000..3cb50af
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -0,0 +1,518 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/io.h>
+
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "iw_cxgb4.h"
+
+static int fastreg_support;
+module_param(fastreg_support, int, 0644);
+MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=0)");
+
+static int c4iw_modify_port(struct ib_device *ibdev,
+			    u8 port, int port_modify_mask,
+			    struct ib_port_modify *props)
+{
+	return -ENOSYS;
+}
+
+static struct ib_ah *c4iw_ah_create(struct ib_pd *pd,
+				    struct ib_ah_attr *ah_attr)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static int c4iw_ah_destroy(struct ib_ah *ah)
+{
+	return -ENOSYS;
+}
+
+static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	return -ENOSYS;
+}
+
+static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	return -ENOSYS;
+}
+
+static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags,
+			    u8 port_num, struct ib_wc *in_wc,
+			    struct ib_grh *in_grh, struct ib_mad *in_mad,
+			    struct ib_mad *out_mad)
+{
+	return -ENOSYS;
+}
+
+static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
+{
+	struct c4iw_dev *rhp = to_c4iw_dev(context->device);
+	struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
+	struct c4iw_mm_entry *mm, *tmp;
+
+	PDBG("%s context %p\n", __func__, context);
+	list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
+		kfree(mm);
+	c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
+	kfree(ucontext);
+	return 0;
+}
+
+static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
+					       struct ib_udata *udata)
+{
+	struct c4iw_ucontext *context;
+	struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+	c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
+	INIT_LIST_HEAD(&context->mmaps);
+	spin_lock_init(&context->mmap_lock);
+	return &context->ibucontext;
+}
+
+static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	int len = vma->vm_end - vma->vm_start;
+	u32 key = vma->vm_pgoff << PAGE_SHIFT;
+	struct c4iw_rdev *rdev;
+	int ret = 0;
+	struct c4iw_mm_entry *mm;
+	struct c4iw_ucontext *ucontext;
+	u64 addr;
+
+	PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff,
+	     key, len);
+
+	if (vma->vm_start & (PAGE_SIZE-1))
+		return -EINVAL;
+
+	rdev = &(to_c4iw_dev(context->device)->rdev);
+	ucontext = to_c4iw_ucontext(context);
+
+	mm = remove_mmap(ucontext, key, len);
+	if (!mm)
+		return -EINVAL;
+	addr = mm->addr;
+	kfree(mm);
+
+	if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) &&
+	    (addr < (pci_resource_start(rdev->lldi.pdev, 2) +
+		       pci_resource_len(rdev->lldi.pdev, 2)))) {
+
+		/*
+		 * Map T4 DB register.
+		 */
+		if (vma->vm_flags & VM_READ)
+			return -EPERM;
+
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		vma->vm_flags &= ~VM_MAYREAD;
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 addr >> PAGE_SHIFT,
+					 len, vma->vm_page_prot);
+	} else {
+
+		/*
+		 * Map WQ or CQ contig dma memory...
+		 */
+		ret = remap_pfn_range(vma, vma->vm_start,
+				      addr >> PAGE_SHIFT,
+				      len, vma->vm_page_prot);
+	}
+
+	return ret;
+}
+
+static int c4iw_deallocate_pd(struct ib_pd *pd)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
+	c4iw_put_resource(&rhp->rdev.resource.pdid_fifo, php->pdid,
+			  &rhp->rdev.resource.pdid_fifo_lock);
+	kfree(php);
+	return 0;
+}
+
+static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
+				      struct ib_ucontext *context,
+				      struct ib_udata *udata)
+{
+	struct c4iw_pd *php;
+	u32 pdid;
+	struct c4iw_dev *rhp;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	rhp = (struct c4iw_dev *) ibdev;
+	pdid =  c4iw_get_resource(&rhp->rdev.resource.pdid_fifo,
+				  &rhp->rdev.resource.pdid_fifo_lock);
+	if (!pdid)
+		return ERR_PTR(-EINVAL);
+	php = kzalloc(sizeof(*php), GFP_KERNEL);
+	if (!php) {
+		c4iw_put_resource(&rhp->rdev.resource.pdid_fifo, pdid,
+				  &rhp->rdev.resource.pdid_fifo_lock);
+		return ERR_PTR(-ENOMEM);
+	}
+	php->pdid = pdid;
+	php->rhp = rhp;
+	if (context) {
+		if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) {
+			c4iw_deallocate_pd(&php->ibpd);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+	PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
+	return &php->ibpd;
+}
+
+static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			   u16 *pkey)
+{
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	*pkey = 0;
+	return 0;
+}
+
+static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index,
+			  union ib_gid *gid)
+{
+	struct c4iw_dev *dev;
+
+	PDBG("%s ibdev %p, port %d, index %d, gid %p\n",
+	       __func__, ibdev, port, index, gid);
+	dev = to_c4iw_dev(ibdev);
+	BUG_ON(port == 0);
+	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+	memcpy(&(gid->raw[0]), dev->rdev.lldi.ports[port-1]->dev_addr, 6);
+	return 0;
+}
+
+static int c4iw_query_device(struct ib_device *ibdev,
+			     struct ib_device_attr *props)
+{
+
+	struct c4iw_dev *dev;
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+
+	dev = to_c4iw_dev(ibdev);
+	memset(props, 0, sizeof *props);
+	memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
+	props->hw_ver = dev->rdev.lldi.adapter_type;
+	props->fw_ver = dev->rdev.lldi.fw_vers;
+	props->device_cap_flags = dev->device_cap_flags;
+	props->page_size_cap = T4_PAGESIZE_MASK;
+	props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor;
+	props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device;
+	props->max_mr_size = T4_MAX_MR_SIZE;
+	props->max_qp = T4_MAX_NUM_QP;
+	props->max_qp_wr = T4_MAX_QP_DEPTH;
+	props->max_sge = T4_MAX_RECV_SGE;
+	props->max_sge_rd = 1;
+	props->max_qp_rd_atom = T4_MAX_READ_DEPTH;
+	props->max_qp_init_rd_atom = T4_MAX_READ_DEPTH;
+	props->max_cq = T4_MAX_NUM_CQ;
+	props->max_cqe = T4_MAX_CQ_DEPTH;
+	props->max_mr = c4iw_num_stags(&dev->rdev);
+	props->max_pd = T4_MAX_NUM_PD;
+	props->local_ca_ack_delay = 0;
+	props->max_fast_reg_page_list_len = T4_MAX_FR_DEPTH;
+
+	return 0;
+}
+
+static int c4iw_query_port(struct ib_device *ibdev, u8 port,
+			   struct ib_port_attr *props)
+{
+	struct c4iw_dev *dev;
+	struct net_device *netdev;
+	struct in_device *inetdev;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+
+	dev = to_c4iw_dev(ibdev);
+	netdev = dev->rdev.lldi.ports[port-1];
+
+	memset(props, 0, sizeof(struct ib_port_attr));
+	props->max_mtu = IB_MTU_4096;
+	if (netdev->mtu >= 4096)
+		props->active_mtu = IB_MTU_4096;
+	else if (netdev->mtu >= 2048)
+		props->active_mtu = IB_MTU_2048;
+	else if (netdev->mtu >= 1024)
+		props->active_mtu = IB_MTU_1024;
+	else if (netdev->mtu >= 512)
+		props->active_mtu = IB_MTU_512;
+	else
+		props->active_mtu = IB_MTU_256;
+
+	if (!netif_carrier_ok(netdev))
+		props->state = IB_PORT_DOWN;
+	else {
+		inetdev = in_dev_get(netdev);
+		if (inetdev) {
+			if (inetdev->ifa_list)
+				props->state = IB_PORT_ACTIVE;
+			else
+				props->state = IB_PORT_INIT;
+			in_dev_put(inetdev);
+		} else
+			props->state = IB_PORT_INIT;
+	}
+
+	props->port_cap_flags =
+	    IB_PORT_CM_SUP |
+	    IB_PORT_SNMP_TUNNEL_SUP |
+	    IB_PORT_REINIT_SUP |
+	    IB_PORT_DEVICE_MGMT_SUP |
+	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->active_width = 2;
+	props->active_speed = 2;
+	props->max_msg_sz = -1;
+
+	return 0;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+						 ibdev.dev);
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	return sprintf(buf, "%d\n", c4iw_dev->rdev.lldi.adapter_type);
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+						 ibdev.dev);
+	PDBG("%s dev 0x%p\n", __func__, dev);
+
+	return sprintf(buf, "%u.%u.%u.%u\n",
+			FW_HDR_FW_VER_MAJOR_GET(c4iw_dev->rdev.lldi.fw_vers),
+			FW_HDR_FW_VER_MINOR_GET(c4iw_dev->rdev.lldi.fw_vers),
+			FW_HDR_FW_VER_MICRO_GET(c4iw_dev->rdev.lldi.fw_vers),
+			FW_HDR_FW_VER_BUILD_GET(c4iw_dev->rdev.lldi.fw_vers));
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+						 ibdev.dev);
+	struct ethtool_drvinfo info;
+	struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
+
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+	return sprintf(buf, "%s\n", info.driver);
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+						 ibdev.dev);
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
+		       c4iw_dev->rdev.lldi.pdev->device);
+}
+
+static int c4iw_get_mib(struct ib_device *ibdev,
+			union rdma_protocol_stats *stats)
+{
+	return -ENOSYS;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *c4iw_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id,
+};
+
+int c4iw_register_device(struct c4iw_dev *dev)
+{
+	int ret;
+	int i;
+
+	PDBG("%s c4iw_dev %p\n", __func__, dev);
+	BUG_ON(!dev->rdev.lldi.ports[0]);
+	strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX);
+	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+	memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
+	dev->ibdev.owner = THIS_MODULE;
+	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
+	if (fastreg_support)
+		dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+	dev->ibdev.local_dma_lkey = 0;
+	dev->ibdev.uverbs_cmd_mask =
+	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+	    (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_REG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+	    (1ull << IB_USER_VERBS_CMD_POST_RECV);
+	dev->ibdev.node_type = RDMA_NODE_RNIC;
+	memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC));
+	dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
+	dev->ibdev.num_comp_vectors = 1;
+	dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev);
+	dev->ibdev.query_device = c4iw_query_device;
+	dev->ibdev.query_port = c4iw_query_port;
+	dev->ibdev.modify_port = c4iw_modify_port;
+	dev->ibdev.query_pkey = c4iw_query_pkey;
+	dev->ibdev.query_gid = c4iw_query_gid;
+	dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext;
+	dev->ibdev.dealloc_ucontext = c4iw_dealloc_ucontext;
+	dev->ibdev.mmap = c4iw_mmap;
+	dev->ibdev.alloc_pd = c4iw_allocate_pd;
+	dev->ibdev.dealloc_pd = c4iw_deallocate_pd;
+	dev->ibdev.create_ah = c4iw_ah_create;
+	dev->ibdev.destroy_ah = c4iw_ah_destroy;
+	dev->ibdev.create_qp = c4iw_create_qp;
+	dev->ibdev.modify_qp = c4iw_ib_modify_qp;
+	dev->ibdev.destroy_qp = c4iw_destroy_qp;
+	dev->ibdev.create_cq = c4iw_create_cq;
+	dev->ibdev.destroy_cq = c4iw_destroy_cq;
+	dev->ibdev.resize_cq = c4iw_resize_cq;
+	dev->ibdev.poll_cq = c4iw_poll_cq;
+	dev->ibdev.get_dma_mr = c4iw_get_dma_mr;
+	dev->ibdev.reg_phys_mr = c4iw_register_phys_mem;
+	dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem;
+	dev->ibdev.reg_user_mr = c4iw_reg_user_mr;
+	dev->ibdev.dereg_mr = c4iw_dereg_mr;
+	dev->ibdev.alloc_mw = c4iw_alloc_mw;
+	dev->ibdev.bind_mw = c4iw_bind_mw;
+	dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
+	dev->ibdev.alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr;
+	dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl;
+	dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl;
+	dev->ibdev.attach_mcast = c4iw_multicast_attach;
+	dev->ibdev.detach_mcast = c4iw_multicast_detach;
+	dev->ibdev.process_mad = c4iw_process_mad;
+	dev->ibdev.req_notify_cq = c4iw_arm_cq;
+	dev->ibdev.post_send = c4iw_post_send;
+	dev->ibdev.post_recv = c4iw_post_receive;
+	dev->ibdev.get_protocol_stats = c4iw_get_mib;
+
+	dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+	if (!dev->ibdev.iwcm)
+		return -ENOMEM;
+
+	dev->ibdev.iwcm->connect = c4iw_connect;
+	dev->ibdev.iwcm->accept = c4iw_accept_cr;
+	dev->ibdev.iwcm->reject = c4iw_reject_cr;
+	dev->ibdev.iwcm->create_listen = c4iw_create_listen;
+	dev->ibdev.iwcm->destroy_listen = c4iw_destroy_listen;
+	dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref;
+	dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref;
+	dev->ibdev.iwcm->get_qp = c4iw_get_qp;
+
+	ret = ib_register_device(&dev->ibdev);
+	if (ret)
+		goto bail1;
+
+	for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) {
+		ret = device_create_file(&dev->ibdev.dev,
+					 c4iw_class_attributes[i]);
+		if (ret)
+			goto bail2;
+	}
+	return 0;
+bail2:
+	ib_unregister_device(&dev->ibdev);
+bail1:
+	kfree(dev->ibdev.iwcm);
+	return ret;
+}
+
+void c4iw_unregister_device(struct c4iw_dev *dev)
+{
+	int i;
+
+	PDBG("%s c4iw_dev %p\n", __func__, dev);
+	for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i)
+		device_remove_file(&dev->ibdev.dev,
+				   c4iw_class_attributes[i]);
+	ib_unregister_device(&dev->ibdev);
+	kfree(dev->ibdev.iwcm);
+	return;
+}
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
new file mode 100644
index 0000000..bd56c84
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -0,0 +1,1577 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "iw_cxgb4.h"
+
+static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
+		      struct c4iw_dev_ucontext *uctx)
+{
+	/*
+	 * uP clears EQ contexts when the connection exits rdma mode,
+	 * so no need to post a RESET WR for these EQs.
+	 */
+	dma_free_coherent(&(rdev->lldi.pdev->dev),
+			  wq->rq.memsize, wq->rq.queue,
+			  pci_unmap_addr(&wq->rq, mapping));
+	dma_free_coherent(&(rdev->lldi.pdev->dev),
+			  wq->sq.memsize, wq->sq.queue,
+			  pci_unmap_addr(&wq->sq, mapping));
+	c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
+	kfree(wq->rq.sw_rq);
+	kfree(wq->sq.sw_sq);
+	c4iw_put_qpid(rdev, wq->rq.qid, uctx);
+	c4iw_put_qpid(rdev, wq->sq.qid, uctx);
+	return 0;
+}
+
+static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
+		     struct t4_cq *rcq, struct t4_cq *scq,
+		     struct c4iw_dev_ucontext *uctx)
+{
+	int user = (uctx != &rdev->uctx);
+	struct fw_ri_res_wr *res_wr;
+	struct fw_ri_res *res;
+	int wr_len;
+	struct c4iw_wr_wait wr_wait;
+	struct sk_buff *skb;
+	int ret;
+	int eqsize;
+
+	wq->sq.qid = c4iw_get_qpid(rdev, uctx);
+	if (!wq->sq.qid)
+		return -ENOMEM;
+
+	wq->rq.qid = c4iw_get_qpid(rdev, uctx);
+	if (!wq->rq.qid)
+		goto err1;
+
+	if (!user) {
+		wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq,
+				 GFP_KERNEL);
+		if (!wq->sq.sw_sq)
+			goto err2;
+
+		wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq,
+				 GFP_KERNEL);
+		if (!wq->rq.sw_rq)
+			goto err3;
+	}
+
+	/*
+	 * RQT must be a power of 2.
+	 */
+	wq->rq.rqt_size = roundup_pow_of_two(wq->rq.size);
+	wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size);
+	if (!wq->rq.rqt_hwaddr)
+		goto err4;
+
+	wq->sq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev),
+					  wq->sq.memsize, &(wq->sq.dma_addr),
+					  GFP_KERNEL);
+	if (!wq->sq.queue)
+		goto err5;
+	memset(wq->sq.queue, 0, wq->sq.memsize);
+	pci_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr);
+
+	wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev),
+					  wq->rq.memsize, &(wq->rq.dma_addr),
+					  GFP_KERNEL);
+	if (!wq->rq.queue)
+		goto err6;
+	PDBG("%s sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n",
+		__func__, wq->sq.queue,
+		(unsigned long long)virt_to_phys(wq->sq.queue),
+		wq->rq.queue,
+		(unsigned long long)virt_to_phys(wq->rq.queue));
+	memset(wq->rq.queue, 0, wq->rq.memsize);
+	pci_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr);
+
+	wq->db = rdev->lldi.db_reg;
+	wq->gts = rdev->lldi.gts_reg;
+	if (user) {
+		wq->sq.udb = (u64)pci_resource_start(rdev->lldi.pdev, 2) +
+					(wq->sq.qid << rdev->qpshift);
+		wq->sq.udb &= PAGE_MASK;
+		wq->rq.udb = (u64)pci_resource_start(rdev->lldi.pdev, 2) +
+					(wq->rq.qid << rdev->qpshift);
+		wq->rq.udb &= PAGE_MASK;
+	}
+	wq->rdev = rdev;
+	wq->rq.msn = 1;
+
+	/* build fw_ri_res_wr */
+	wr_len = sizeof *res_wr + 2 * sizeof *res;
+
+	skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto err7;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+	res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+	memset(res_wr, 0, wr_len);
+	res_wr->op_nres = cpu_to_be32(
+			FW_WR_OP(FW_RI_RES_WR) |
+			V_FW_RI_RES_WR_NRES(2) |
+			FW_WR_COMPL(1));
+	res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+	res_wr->cookie = (u64)&wr_wait;
+	res = res_wr->res;
+	res->u.sqrq.restype = FW_RI_RES_TYPE_SQ;
+	res->u.sqrq.op = FW_RI_RES_OP_WRITE;
+
+	/*
+	 * eqsize is the number of 64B entries plus the status page size.
+	 */
+	eqsize = wq->sq.size * T4_SQ_NUM_SLOTS + T4_EQ_STATUS_ENTRIES;
+
+	res->u.sqrq.fetchszm_to_iqid = cpu_to_be32(
+		V_FW_RI_RES_WR_HOSTFCMODE(0) |	/* no host cidx updates */
+		V_FW_RI_RES_WR_CPRIO(0) |	/* don't keep in chip cache */
+		V_FW_RI_RES_WR_PCIECHN(0) |	/* set by uP at ri_init time */
+		V_FW_RI_RES_WR_IQID(scq->cqid));
+	res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
+		V_FW_RI_RES_WR_DCAEN(0) |
+		V_FW_RI_RES_WR_DCACPU(0) |
+		V_FW_RI_RES_WR_FBMIN(3) |
+		V_FW_RI_RES_WR_FBMAX(3) |
+		V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
+		V_FW_RI_RES_WR_CIDXFTHRESH(0) |
+		V_FW_RI_RES_WR_EQSIZE(eqsize));
+	res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid);
+	res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr);
+	res++;
+	res->u.sqrq.restype = FW_RI_RES_TYPE_RQ;
+	res->u.sqrq.op = FW_RI_RES_OP_WRITE;
+
+	/*
+	 * eqsize is the number of 64B entries plus the status page size.
+	 */
+	eqsize = wq->rq.size * T4_RQ_NUM_SLOTS + T4_EQ_STATUS_ENTRIES;
+	res->u.sqrq.fetchszm_to_iqid = cpu_to_be32(
+		V_FW_RI_RES_WR_HOSTFCMODE(0) |	/* no host cidx updates */
+		V_FW_RI_RES_WR_CPRIO(0) |	/* don't keep in chip cache */
+		V_FW_RI_RES_WR_PCIECHN(0) |	/* set by uP at ri_init time */
+		V_FW_RI_RES_WR_IQID(rcq->cqid));
+	res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
+		V_FW_RI_RES_WR_DCAEN(0) |
+		V_FW_RI_RES_WR_DCACPU(0) |
+		V_FW_RI_RES_WR_FBMIN(3) |
+		V_FW_RI_RES_WR_FBMAX(3) |
+		V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
+		V_FW_RI_RES_WR_CIDXFTHRESH(0) |
+		V_FW_RI_RES_WR_EQSIZE(eqsize));
+	res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid);
+	res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr);
+
+	c4iw_init_wr_wait(&wr_wait);
+
+	ret = c4iw_ofld_send(rdev, skb);
+	if (ret)
+		goto err7;
+	wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO);
+	if (!wr_wait.done) {
+		printk(KERN_ERR MOD "Device %s not responding!\n",
+		       pci_name(rdev->lldi.pdev));
+		rdev->flags = T4_FATAL_ERROR;
+		ret = -EIO;
+	} else
+		ret = wr_wait.ret;
+	if (ret)
+		goto err7;
+
+	PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%llx rqudb 0x%llx\n",
+	     __func__, wq->sq.qid, wq->rq.qid, wq->db,
+	     (unsigned long long)wq->sq.udb, (unsigned long long)wq->rq.udb);
+
+	return 0;
+err7:
+	dma_free_coherent(&(rdev->lldi.pdev->dev),
+			  wq->rq.memsize, wq->rq.queue,
+			  pci_unmap_addr(&wq->rq, mapping));
+err6:
+	dma_free_coherent(&(rdev->lldi.pdev->dev),
+			  wq->sq.memsize, wq->sq.queue,
+			  pci_unmap_addr(&wq->sq, mapping));
+err5:
+	c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
+err4:
+	kfree(wq->rq.sw_rq);
+err3:
+	kfree(wq->sq.sw_sq);
+err2:
+	c4iw_put_qpid(rdev, wq->rq.qid, uctx);
+err1:
+	c4iw_put_qpid(rdev, wq->sq.qid, uctx);
+	return -ENOMEM;
+}
+
+static int build_rdma_send(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16)
+{
+	int i;
+	u32 plen;
+	int size;
+	u8 *datap;
+
+	if (wr->num_sge > T4_MAX_SEND_SGE)
+		return -EINVAL;
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			wqe->send.sendop_pkd = cpu_to_be32(
+				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE));
+		else
+			wqe->send.sendop_pkd = cpu_to_be32(
+				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND));
+		wqe->send.stag_inv = 0;
+		break;
+	case IB_WR_SEND_WITH_INV:
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			wqe->send.sendop_pkd = cpu_to_be32(
+				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE_INV));
+		else
+			wqe->send.sendop_pkd = cpu_to_be32(
+				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_INV));
+		wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	plen = 0;
+	if (wr->num_sge) {
+		if (wr->send_flags & IB_SEND_INLINE) {
+			datap = (u8 *)wqe->send.u.immd_src[0].data;
+			for (i = 0; i < wr->num_sge; i++) {
+				if ((plen + wr->sg_list[i].length) >
+				    T4_MAX_SEND_INLINE) {
+					return -EMSGSIZE;
+				}
+				plen += wr->sg_list[i].length;
+				memcpy(datap,
+				     (void *)(unsigned long)wr->sg_list[i].addr,
+				     wr->sg_list[i].length);
+				datap += wr->sg_list[i].length;
+			}
+			wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD;
+			wqe->send.u.immd_src[0].r1 = 0;
+			wqe->send.u.immd_src[0].r2 = 0;
+			wqe->send.u.immd_src[0].immdlen = cpu_to_be32(plen);
+			size = sizeof wqe->send + sizeof(struct fw_ri_immd) +
+			       plen;
+		} else {
+			for (i = 0; i < wr->num_sge; i++) {
+				if ((plen + wr->sg_list[i].length) < plen)
+					return -EMSGSIZE;
+				plen += wr->sg_list[i].length;
+				wqe->send.u.isgl_src[0].sge[i].stag =
+					cpu_to_be32(wr->sg_list[i].lkey);
+				wqe->send.u.isgl_src[0].sge[i].len =
+					cpu_to_be32(wr->sg_list[i].length);
+				wqe->send.u.isgl_src[0].sge[i].to =
+					cpu_to_be64(wr->sg_list[i].addr);
+			}
+			wqe->send.u.isgl_src[0].op = FW_RI_DATA_ISGL;
+			wqe->send.u.isgl_src[0].r1 = 0;
+			wqe->send.u.isgl_src[0].nsge = cpu_to_be16(wr->num_sge);
+			wqe->send.u.isgl_src[0].r2 = 0;
+			size = sizeof wqe->send + sizeof(struct fw_ri_isgl) +
+			       wr->num_sge * sizeof(struct fw_ri_sge);
+		}
+	} else {
+		wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD;
+		wqe->send.u.immd_src[0].r1 = 0;
+		wqe->send.u.immd_src[0].r2 = 0;
+		wqe->send.u.immd_src[0].immdlen = 0;
+		size = sizeof wqe->send + sizeof(struct fw_ri_immd);
+	}
+	*len16 = DIV_ROUND_UP(size, 16);
+	wqe->send.plen = cpu_to_be32(plen);
+	return 0;
+}
+
+static int build_rdma_write(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16)
+{
+	int i;
+	u32 plen;
+	int size;
+	u8 *datap;
+
+	if (wr->num_sge > T4_MAX_WRITE_SGE)
+		return -EINVAL;
+	wqe->write.r2 = 0;
+	wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey);
+	wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr);
+	plen = 0;
+	if (wr->num_sge) {
+		if (wr->send_flags & IB_SEND_INLINE) {
+			datap = (u8 *)wqe->write.u.immd_src[0].data;
+			for (i = 0; i < wr->num_sge; i++) {
+				if ((plen + wr->sg_list[i].length) >
+				    T4_MAX_WRITE_INLINE) {
+					return -EMSGSIZE;
+				}
+				plen += wr->sg_list[i].length;
+				memcpy(datap,
+				     (void *)(unsigned long)wr->sg_list[i].addr,
+				     wr->sg_list[i].length);
+				datap += wr->sg_list[i].length;
+			}
+			wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD;
+			wqe->write.u.immd_src[0].r1 = 0;
+			wqe->write.u.immd_src[0].r2 = 0;
+			wqe->write.u.immd_src[0].immdlen = cpu_to_be32(plen);
+			size = sizeof wqe->write + sizeof(struct fw_ri_immd) +
+			       plen;
+		} else {
+			for (i = 0; i < wr->num_sge; i++) {
+				if ((plen + wr->sg_list[i].length) < plen)
+					return -EMSGSIZE;
+				plen += wr->sg_list[i].length;
+				wqe->write.u.isgl_src[0].sge[i].stag =
+					cpu_to_be32(wr->sg_list[i].lkey);
+				wqe->write.u.isgl_src[0].sge[i].len =
+					cpu_to_be32(wr->sg_list[i].length);
+				wqe->write.u.isgl_src[0].sge[i].to =
+					cpu_to_be64(wr->sg_list[i].addr);
+			}
+			wqe->write.u.isgl_src[0].op = FW_RI_DATA_ISGL;
+			wqe->write.u.isgl_src[0].r1 = 0;
+			wqe->write.u.isgl_src[0].nsge =
+						       cpu_to_be16(wr->num_sge);
+			wqe->write.u.isgl_src[0].r2 = 0;
+			size = sizeof wqe->write + sizeof(struct fw_ri_isgl) +
+			       wr->num_sge * sizeof(struct fw_ri_sge);
+		}
+	} else {
+		wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD;
+		wqe->write.u.immd_src[0].r1 = 0;
+		wqe->write.u.immd_src[0].r2 = 0;
+		wqe->write.u.immd_src[0].immdlen = 0;
+		size = sizeof wqe->write + sizeof(struct fw_ri_immd);
+	}
+	*len16 = DIV_ROUND_UP(size, 16);
+	wqe->write.plen = cpu_to_be32(plen);
+	return 0;
+}
+
+static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16)
+{
+	if (wr->num_sge > 1)
+		return -EINVAL;
+	if (wr->num_sge) {
+		wqe->read.stag_src = cpu_to_be32(wr->wr.rdma.rkey);
+		wqe->read.to_src_hi = cpu_to_be32((u32)(wr->wr.rdma.remote_addr
+							>> 32));
+		wqe->read.to_src_lo = cpu_to_be32((u32)wr->wr.rdma.remote_addr);
+		wqe->read.stag_sink = cpu_to_be32(wr->sg_list[0].lkey);
+		wqe->read.plen = cpu_to_be32(wr->sg_list[0].length);
+		wqe->read.to_sink_hi = cpu_to_be32((u32)(wr->sg_list[0].addr
+							 >> 32));
+		wqe->read.to_sink_lo = cpu_to_be32((u32)(wr->sg_list[0].addr));
+	} else {
+		wqe->read.stag_src = cpu_to_be32(2);
+		wqe->read.to_src_hi = 0;
+		wqe->read.to_src_lo = 0;
+		wqe->read.stag_sink = cpu_to_be32(2);
+		wqe->read.plen = 0;
+		wqe->read.to_sink_hi = 0;
+		wqe->read.to_sink_lo = 0;
+	}
+	wqe->read.r2 = 0;
+	wqe->read.r5 = 0;
+	*len16 = DIV_ROUND_UP(sizeof wqe->read, 16);
+	return 0;
+}
+
+static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe,
+			   struct ib_recv_wr *wr, u8 *len16)
+{
+	int i;
+	int plen = 0;
+
+	for (i = 0; i < wr->num_sge; i++) {
+		if ((plen + wr->sg_list[i].length) < plen)
+			return -EMSGSIZE;
+		plen += wr->sg_list[i].length;
+		wqe->recv.isgl.sge[i].stag =
+			cpu_to_be32(wr->sg_list[i].lkey);
+		wqe->recv.isgl.sge[i].len =
+			cpu_to_be32(wr->sg_list[i].length);
+		wqe->recv.isgl.sge[i].to =
+			cpu_to_be64(wr->sg_list[i].addr);
+	}
+	for (; i < T4_MAX_RECV_SGE; i++) {
+		wqe->recv.isgl.sge[i].stag = 0;
+		wqe->recv.isgl.sge[i].len = 0;
+		wqe->recv.isgl.sge[i].to = 0;
+	}
+	wqe->recv.isgl.op = FW_RI_DATA_ISGL;
+	wqe->recv.isgl.r1 = 0;
+	wqe->recv.isgl.nsge = cpu_to_be16(wr->num_sge);
+	wqe->recv.isgl.r2 = 0;
+	*len16 = DIV_ROUND_UP(sizeof wqe->recv +
+			      wr->num_sge * sizeof(struct fw_ri_sge), 16);
+	return 0;
+}
+
+static int build_fastreg(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16)
+{
+
+	struct fw_ri_immd *imdp;
+	__be64 *p;
+	int i;
+	int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32);
+
+	if (wr->wr.fast_reg.page_list_len > T4_MAX_FR_DEPTH)
+		return -EINVAL;
+
+	wqe->fr.qpbinde_to_dcacpu = 0;
+	wqe->fr.pgsz_shift = wr->wr.fast_reg.page_shift - 12;
+	wqe->fr.addr_type = FW_RI_VA_BASED_TO;
+	wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->wr.fast_reg.access_flags);
+	wqe->fr.len_hi = 0;
+	wqe->fr.len_lo = cpu_to_be32(wr->wr.fast_reg.length);
+	wqe->fr.stag = cpu_to_be32(wr->wr.fast_reg.rkey);
+	wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32);
+	wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start &
+					0xffffffff);
+	if (pbllen > T4_MAX_FR_IMMD) {
+		struct c4iw_fr_page_list *c4pl =
+				to_c4iw_fr_page_list(wr->wr.fast_reg.page_list);
+		struct fw_ri_dsgl *sglp;
+
+		sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1);
+		sglp->op = FW_RI_DATA_DSGL;
+		sglp->r1 = 0;
+		sglp->nsge = cpu_to_be16(1);
+		sglp->addr0 = cpu_to_be64(c4pl->dma_addr);
+		sglp->len0 = cpu_to_be32(pbllen);
+
+		*len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *sglp, 16);
+	} else {
+		imdp = (struct fw_ri_immd *)(&wqe->fr + 1);
+		imdp->op = FW_RI_DATA_IMMD;
+		imdp->r1 = 0;
+		imdp->r2 = 0;
+		imdp->immdlen = cpu_to_be32(pbllen);
+		p = (__be64 *)(imdp + 1);
+		for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++)
+			*p = cpu_to_be64(
+				(u64)wr->wr.fast_reg.page_list->page_list[i]);
+		*len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *imdp + pbllen,
+				      16);
+	}
+	return 0;
+}
+
+static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr,
+			  u8 *len16)
+{
+	wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
+	wqe->inv.r2 = 0;
+	*len16 = DIV_ROUND_UP(sizeof wqe->inv, 16);
+	return 0;
+}
+
+void c4iw_qp_add_ref(struct ib_qp *qp)
+{
+	PDBG("%s ib_qp %p\n", __func__, qp);
+	atomic_inc(&(to_c4iw_qp(qp)->refcnt));
+}
+
+void c4iw_qp_rem_ref(struct ib_qp *qp)
+{
+	PDBG("%s ib_qp %p\n", __func__, qp);
+	if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt)))
+		wake_up(&(to_c4iw_qp(qp)->wait));
+}
+
+int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		   struct ib_send_wr **bad_wr)
+{
+	int err = 0;
+	u8 len16 = 0;
+	enum fw_wr_opcodes fw_opcode = 0;
+	enum fw_ri_wr_flags fw_flags;
+	struct c4iw_qp *qhp;
+	union t4_wr *wqe;
+	u32 num_wrs;
+	struct t4_swsqe *swsqe;
+	unsigned long flag;
+	u16 idx = 0;
+
+	qhp = to_c4iw_qp(ibqp);
+	spin_lock_irqsave(&qhp->lock, flag);
+	if (t4_wq_in_error(&qhp->wq)) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -EINVAL;
+	}
+	num_wrs = t4_sq_avail(&qhp->wq);
+	if (num_wrs == 0) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -ENOMEM;
+	}
+	while (wr) {
+		if (num_wrs == 0) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+		wqe = &qhp->wq.sq.queue[qhp->wq.sq.pidx];
+		fw_flags = 0;
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			fw_flags |= FW_RI_SOLICITED_EVENT_FLAG;
+		if (wr->send_flags & IB_SEND_SIGNALED)
+			fw_flags |= FW_RI_COMPLETION_FLAG;
+		swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
+		switch (wr->opcode) {
+		case IB_WR_SEND_WITH_INV:
+		case IB_WR_SEND:
+			if (wr->send_flags & IB_SEND_FENCE)
+				fw_flags |= FW_RI_READ_FENCE_FLAG;
+			fw_opcode = FW_RI_SEND_WR;
+			if (wr->opcode == IB_WR_SEND)
+				swsqe->opcode = FW_RI_SEND;
+			else
+				swsqe->opcode = FW_RI_SEND_WITH_INV;
+			err = build_rdma_send(wqe, wr, &len16);
+			break;
+		case IB_WR_RDMA_WRITE:
+			fw_opcode = FW_RI_RDMA_WRITE_WR;
+			swsqe->opcode = FW_RI_RDMA_WRITE;
+			err = build_rdma_write(wqe, wr, &len16);
+			break;
+		case IB_WR_RDMA_READ:
+			fw_opcode = FW_RI_RDMA_READ_WR;
+			swsqe->opcode = FW_RI_READ_REQ;
+			fw_flags = 0;
+			err = build_rdma_read(wqe, wr, &len16);
+			if (err)
+				break;
+			swsqe->read_len = wr->sg_list[0].length;
+			if (!qhp->wq.sq.oldest_read)
+				qhp->wq.sq.oldest_read = swsqe;
+			break;
+		case IB_WR_FAST_REG_MR:
+			fw_opcode = FW_RI_FR_NSMR_WR;
+			swsqe->opcode = FW_RI_FAST_REGISTER;
+			err = build_fastreg(wqe, wr, &len16);
+			break;
+		case IB_WR_LOCAL_INV:
+			fw_opcode = FW_RI_INV_LSTAG_WR;
+			swsqe->opcode = FW_RI_LOCAL_INV;
+			err = build_inv_stag(wqe, wr, &len16);
+			break;
+		default:
+			PDBG("%s post of type=%d TBD!\n", __func__,
+			     wr->opcode);
+			err = -EINVAL;
+		}
+		if (err) {
+			*bad_wr = wr;
+			break;
+		}
+		swsqe->idx = qhp->wq.sq.pidx;
+		swsqe->complete = 0;
+		swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED);
+		swsqe->wr_id = wr->wr_id;
+
+		init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16);
+
+		PDBG("%s cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u\n",
+		     __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx,
+		     swsqe->opcode, swsqe->read_len);
+		wr = wr->next;
+		num_wrs--;
+		t4_sq_produce(&qhp->wq);
+		idx++;
+	}
+	if (t4_wq_db_enabled(&qhp->wq))
+		t4_ring_sq_db(&qhp->wq, idx);
+	spin_unlock_irqrestore(&qhp->lock, flag);
+	return err;
+}
+
+int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	int err = 0;
+	struct c4iw_qp *qhp;
+	union t4_recv_wr *wqe;
+	u32 num_wrs;
+	u8 len16 = 0;
+	unsigned long flag;
+	u16 idx = 0;
+
+	qhp = to_c4iw_qp(ibqp);
+	spin_lock_irqsave(&qhp->lock, flag);
+	if (t4_wq_in_error(&qhp->wq)) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -EINVAL;
+	}
+	num_wrs = t4_rq_avail(&qhp->wq);
+	if (num_wrs == 0) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -ENOMEM;
+	}
+	while (wr) {
+		if (wr->num_sge > T4_MAX_RECV_SGE) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+		wqe = &qhp->wq.rq.queue[qhp->wq.rq.pidx];
+		if (num_wrs)
+			err = build_rdma_recv(qhp, wqe, wr, &len16);
+		else
+			err = -ENOMEM;
+		if (err) {
+			*bad_wr = wr;
+			break;
+		}
+
+		qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id;
+
+		wqe->recv.opcode = FW_RI_RECV_WR;
+		wqe->recv.r1 = 0;
+		wqe->recv.wrid = qhp->wq.rq.pidx;
+		wqe->recv.r2[0] = 0;
+		wqe->recv.r2[1] = 0;
+		wqe->recv.r2[2] = 0;
+		wqe->recv.len16 = len16;
+		if (len16 < 5)
+			wqe->flits[8] = 0;
+
+		PDBG("%s cookie 0x%llx pidx %u\n", __func__,
+		     (unsigned long long) wr->wr_id, qhp->wq.rq.pidx);
+		t4_rq_produce(&qhp->wq);
+		wr = wr->next;
+		num_wrs--;
+		idx++;
+	}
+	if (t4_wq_db_enabled(&qhp->wq))
+		t4_ring_rq_db(&qhp->wq, idx);
+	spin_unlock_irqrestore(&qhp->lock, flag);
+	return err;
+}
+
+int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind)
+{
+	return -ENOSYS;
+}
+
+static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type,
+				    u8 *ecode)
+{
+	int status;
+	int tagged;
+	int opcode;
+	int rqtype;
+	int send_inv;
+
+	if (!err_cqe) {
+		*layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+		*ecode = 0;
+		return;
+	}
+
+	status = CQE_STATUS(err_cqe);
+	opcode = CQE_OPCODE(err_cqe);
+	rqtype = RQ_TYPE(err_cqe);
+	send_inv = (opcode == FW_RI_SEND_WITH_INV) ||
+		   (opcode == FW_RI_SEND_WITH_SE_INV);
+	tagged = (opcode == FW_RI_RDMA_WRITE) ||
+		 (rqtype && (opcode == FW_RI_READ_RESP));
+
+	switch (status) {
+	case T4_ERR_STAG:
+		if (send_inv) {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+			*ecode = RDMAP_CANT_INV_STAG;
+		} else {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+			*ecode = RDMAP_INV_STAG;
+		}
+		break;
+	case T4_ERR_PDID:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		if ((opcode == FW_RI_SEND_WITH_INV) ||
+		    (opcode == FW_RI_SEND_WITH_SE_INV))
+			*ecode = RDMAP_CANT_INV_STAG;
+		else
+			*ecode = RDMAP_STAG_NOT_ASSOC;
+		break;
+	case T4_ERR_QPID:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_STAG_NOT_ASSOC;
+		break;
+	case T4_ERR_ACCESS:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_ACC_VIOL;
+		break;
+	case T4_ERR_WRAP:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_TO_WRAP;
+		break;
+	case T4_ERR_BOUND:
+		if (tagged) {
+			*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+			*ecode = DDPT_BASE_BOUNDS;
+		} else {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+			*ecode = RDMAP_BASE_BOUNDS;
+		}
+		break;
+	case T4_ERR_INVALIDATE_SHARED_MR:
+	case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_CANT_INV_STAG;
+		break;
+	case T4_ERR_ECC:
+	case T4_ERR_ECC_PSTAG:
+	case T4_ERR_INTERNAL_ERR:
+		*layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	case T4_ERR_OUT_OF_RQE:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MSN_NOBUF;
+		break;
+	case T4_ERR_PBL_ADDR_BOUND:
+		*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+		*ecode = DDPT_BASE_BOUNDS;
+		break;
+	case T4_ERR_CRC:
+		*layer_type = LAYER_MPA|DDP_LLP;
+		*ecode = MPA_CRC_ERR;
+		break;
+	case T4_ERR_MARKER:
+		*layer_type = LAYER_MPA|DDP_LLP;
+		*ecode = MPA_MARKER_ERR;
+		break;
+	case T4_ERR_PDU_LEN_ERR:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_MSG_TOOBIG;
+		break;
+	case T4_ERR_DDP_VERSION:
+		if (tagged) {
+			*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+			*ecode = DDPT_INV_VERS;
+		} else {
+			*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+			*ecode = DDPU_INV_VERS;
+		}
+		break;
+	case T4_ERR_RDMA_VERSION:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_INV_VERS;
+		break;
+	case T4_ERR_OPCODE:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_INV_OPCODE;
+		break;
+	case T4_ERR_DDP_QUEUE_NUM:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_QN;
+		break;
+	case T4_ERR_MSN:
+	case T4_ERR_MSN_GAP:
+	case T4_ERR_MSN_RANGE:
+	case T4_ERR_IRD_OVERFLOW:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MSN_RANGE;
+		break;
+	case T4_ERR_TBIT:
+		*layer_type = LAYER_DDP|DDP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	case T4_ERR_MO:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MO;
+		break;
+	default:
+		*layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	}
+}
+
+int c4iw_post_zb_read(struct c4iw_qp *qhp)
+{
+	union t4_wr *wqe;
+	struct sk_buff *skb;
+	u8 len16;
+
+	PDBG("%s enter\n", __func__);
+	skb = alloc_skb(40, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR "%s cannot send zb_read!!\n", __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
+
+	wqe = (union t4_wr *)skb_put(skb, sizeof wqe->read);
+	memset(wqe, 0, sizeof wqe->read);
+	wqe->read.r2 = cpu_to_be64(0);
+	wqe->read.stag_sink = cpu_to_be32(1);
+	wqe->read.to_sink_hi = cpu_to_be32(0);
+	wqe->read.to_sink_lo = cpu_to_be32(1);
+	wqe->read.stag_src = cpu_to_be32(1);
+	wqe->read.plen = cpu_to_be32(0);
+	wqe->read.to_src_hi = cpu_to_be32(0);
+	wqe->read.to_src_lo = cpu_to_be32(1);
+	len16 = DIV_ROUND_UP(sizeof wqe->read, 16);
+	init_wr_hdr(wqe, 0, FW_RI_RDMA_READ_WR, FW_RI_COMPLETION_FLAG, len16);
+
+	return c4iw_ofld_send(&qhp->rhp->rdev, skb);
+}
+
+int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe)
+{
+	struct fw_ri_wr *wqe;
+	struct sk_buff *skb;
+	struct terminate_message *term;
+
+	PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
+	     qhp->ep->hwtid);
+
+	skb = alloc_skb(sizeof *wqe, GFP_KERNEL | __GFP_NOFAIL);
+	if (!skb)
+		return -ENOMEM;
+	set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
+
+	wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+	memset(wqe, 0, sizeof *wqe);
+	wqe->op_compl = cpu_to_be32(FW_WR_OP(FW_RI_INIT_WR));
+	wqe->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(qhp->ep->hwtid) |
+		FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
+
+	wqe->u.terminate.type = FW_RI_TYPE_TERMINATE;
+	wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term);
+	term = (struct terminate_message *)wqe->u.terminate.termmsg;
+	build_term_codes(err_cqe, &term->layer_etype, &term->ecode);
+	return c4iw_ofld_send(&qhp->rhp->rdev, skb);
+}
+
+/*
+ * Assumes qhp lock is held.
+ */
+static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp,
+		       struct c4iw_cq *schp, unsigned long *flag)
+{
+	int count;
+	int flushed;
+
+	PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp);
+	/* take a ref on the qhp since we must release the lock */
+	atomic_inc(&qhp->refcnt);
+	spin_unlock_irqrestore(&qhp->lock, *flag);
+
+	/* locking heirarchy: cq lock first, then qp lock. */
+	spin_lock_irqsave(&rchp->lock, *flag);
+	spin_lock(&qhp->lock);
+	c4iw_flush_hw_cq(&rchp->cq);
+	c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
+	flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
+	spin_unlock(&qhp->lock);
+	spin_unlock_irqrestore(&rchp->lock, *flag);
+	if (flushed)
+		(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+
+	/* locking heirarchy: cq lock first, then qp lock. */
+	spin_lock_irqsave(&schp->lock, *flag);
+	spin_lock(&qhp->lock);
+	c4iw_flush_hw_cq(&schp->cq);
+	c4iw_count_scqes(&schp->cq, &qhp->wq, &count);
+	flushed = c4iw_flush_sq(&qhp->wq, &schp->cq, count);
+	spin_unlock(&qhp->lock);
+	spin_unlock_irqrestore(&schp->lock, *flag);
+	if (flushed)
+		(*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
+
+	/* deref */
+	if (atomic_dec_and_test(&qhp->refcnt))
+		wake_up(&qhp->wait);
+
+	spin_lock_irqsave(&qhp->lock, *flag);
+}
+
+static void flush_qp(struct c4iw_qp *qhp, unsigned long *flag)
+{
+	struct c4iw_cq *rchp, *schp;
+
+	rchp = get_chp(qhp->rhp, qhp->attr.rcq);
+	schp = get_chp(qhp->rhp, qhp->attr.scq);
+
+	if (qhp->ibqp.uobject) {
+		t4_set_wq_in_error(&qhp->wq);
+		t4_set_cq_in_error(&rchp->cq);
+		if (schp != rchp)
+			t4_set_cq_in_error(&schp->cq);
+		return;
+	}
+	__flush_qp(qhp, rchp, schp, flag);
+}
+
+static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp)
+{
+	struct fw_ri_wr *wqe;
+	int ret;
+	struct c4iw_wr_wait wr_wait;
+	struct sk_buff *skb;
+
+	PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
+	     qhp->ep->hwtid);
+
+	skb = alloc_skb(sizeof *wqe, GFP_KERNEL | __GFP_NOFAIL);
+	if (!skb)
+		return -ENOMEM;
+	set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
+
+	wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+	memset(wqe, 0, sizeof *wqe);
+	wqe->op_compl = cpu_to_be32(
+		FW_WR_OP(FW_RI_INIT_WR) |
+		FW_WR_COMPL(1));
+	wqe->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(qhp->ep->hwtid) |
+		FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
+	wqe->cookie = (u64)&wr_wait;
+
+	wqe->u.fini.type = FW_RI_TYPE_FINI;
+	c4iw_init_wr_wait(&wr_wait);
+	ret = c4iw_ofld_send(&rhp->rdev, skb);
+	if (ret)
+		goto out;
+
+	wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO);
+	if (!wr_wait.done) {
+		printk(KERN_ERR MOD "Device %s not responding!\n",
+		       pci_name(rhp->rdev.lldi.pdev));
+		rhp->rdev.flags = T4_FATAL_ERROR;
+		ret = -EIO;
+	} else {
+		ret = wr_wait.ret;
+		if (ret)
+			printk(KERN_WARNING MOD
+			       "%s: Abnormal close qpid %d ret %u\n",
+			       pci_name(rhp->rdev.lldi.pdev), qhp->wq.sq.qid,
+			       ret);
+	}
+out:
+	PDBG("%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init)
+{
+	memset(&init->u, 0, sizeof init->u);
+	switch (p2p_type) {
+	case FW_RI_INIT_P2PTYPE_RDMA_WRITE:
+		init->u.write.opcode = FW_RI_RDMA_WRITE_WR;
+		init->u.write.stag_sink = cpu_to_be32(1);
+		init->u.write.to_sink = cpu_to_be64(1);
+		init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD;
+		init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write +
+						   sizeof(struct fw_ri_immd),
+						   16);
+		break;
+	case FW_RI_INIT_P2PTYPE_READ_REQ:
+		init->u.write.opcode = FW_RI_RDMA_READ_WR;
+		init->u.read.stag_src = cpu_to_be32(1);
+		init->u.read.to_src_lo = cpu_to_be32(1);
+		init->u.read.stag_sink = cpu_to_be32(1);
+		init->u.read.to_sink_lo = cpu_to_be32(1);
+		init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16);
+		break;
+	}
+}
+
+static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp)
+{
+	struct fw_ri_wr *wqe;
+	int ret;
+	struct c4iw_wr_wait wr_wait;
+	struct sk_buff *skb;
+
+	PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
+	     qhp->ep->hwtid);
+
+	skb = alloc_skb(sizeof *wqe, GFP_KERNEL | __GFP_NOFAIL);
+	if (!skb)
+		return -ENOMEM;
+	set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
+
+	wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+	memset(wqe, 0, sizeof *wqe);
+	wqe->op_compl = cpu_to_be32(
+		FW_WR_OP(FW_RI_INIT_WR) |
+		FW_WR_COMPL(1));
+	wqe->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(qhp->ep->hwtid) |
+		FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
+
+	wqe->cookie = (u64)&wr_wait;
+
+	wqe->u.init.type = FW_RI_TYPE_INIT;
+	wqe->u.init.mpareqbit_p2ptype =
+		V_FW_RI_WR_MPAREQBIT(qhp->attr.mpa_attr.initiator) |
+		V_FW_RI_WR_P2PTYPE(qhp->attr.mpa_attr.p2p_type);
+	wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE;
+	if (qhp->attr.mpa_attr.recv_marker_enabled)
+		wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE;
+	if (qhp->attr.mpa_attr.xmit_marker_enabled)
+		wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE;
+	if (qhp->attr.mpa_attr.crc_enabled)
+		wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE;
+
+	wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE |
+			    FW_RI_QP_RDMA_WRITE_ENABLE |
+			    FW_RI_QP_BIND_ENABLE;
+	if (!qhp->ibqp.uobject)
+		wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE |
+				     FW_RI_QP_STAG0_ENABLE;
+	wqe->u.init.nrqe = cpu_to_be16(t4_rqes_posted(&qhp->wq));
+	wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd);
+	wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid);
+	wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid);
+	wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid);
+	wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq);
+	wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq);
+	wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord);
+	wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird);
+	wqe->u.init.iss = cpu_to_be32(qhp->ep->snd_seq);
+	wqe->u.init.irs = cpu_to_be32(qhp->ep->rcv_seq);
+	wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size);
+	wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr -
+					 rhp->rdev.lldi.vr->rq.start);
+	if (qhp->attr.mpa_attr.initiator)
+		build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init);
+
+	c4iw_init_wr_wait(&wr_wait);
+	ret = c4iw_ofld_send(&rhp->rdev, skb);
+	if (ret)
+		goto out;
+
+	wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO);
+	if (!wr_wait.done) {
+		printk(KERN_ERR MOD "Device %s not responding!\n",
+		       pci_name(rhp->rdev.lldi.pdev));
+		rhp->rdev.flags = T4_FATAL_ERROR;
+		ret = -EIO;
+	} else
+		ret = wr_wait.ret;
+out:
+	PDBG("%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
+		   enum c4iw_qp_attr_mask mask,
+		   struct c4iw_qp_attributes *attrs,
+		   int internal)
+{
+	int ret = 0;
+	struct c4iw_qp_attributes newattr = qhp->attr;
+	unsigned long flag;
+	int disconnect = 0;
+	int terminate = 0;
+	int abort = 0;
+	int free = 0;
+	struct c4iw_ep *ep = NULL;
+
+	PDBG("%s qhp %p sqid 0x%x rqid 0x%x ep %p state %d -> %d\n", __func__,
+	     qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep, qhp->attr.state,
+	     (mask & C4IW_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1);
+
+	spin_lock_irqsave(&qhp->lock, flag);
+
+	/* Process attr changes if in IDLE */
+	if (mask & C4IW_QP_ATTR_VALID_MODIFY) {
+		if (qhp->attr.state != C4IW_QP_STATE_IDLE) {
+			ret = -EIO;
+			goto out;
+		}
+		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ)
+			newattr.enable_rdma_read = attrs->enable_rdma_read;
+		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE)
+			newattr.enable_rdma_write = attrs->enable_rdma_write;
+		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND)
+			newattr.enable_bind = attrs->enable_bind;
+		if (mask & C4IW_QP_ATTR_MAX_ORD) {
+			if (attrs->max_ord > T4_MAX_READ_DEPTH) {
+				ret = -EINVAL;
+				goto out;
+			}
+			newattr.max_ord = attrs->max_ord;
+		}
+		if (mask & C4IW_QP_ATTR_MAX_IRD) {
+			if (attrs->max_ird > T4_MAX_READ_DEPTH) {
+				ret = -EINVAL;
+				goto out;
+			}
+			newattr.max_ird = attrs->max_ird;
+		}
+		qhp->attr = newattr;
+	}
+
+	if (!(mask & C4IW_QP_ATTR_NEXT_STATE))
+		goto out;
+	if (qhp->attr.state == attrs->next_state)
+		goto out;
+
+	switch (qhp->attr.state) {
+	case C4IW_QP_STATE_IDLE:
+		switch (attrs->next_state) {
+		case C4IW_QP_STATE_RTS:
+			if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			qhp->attr.mpa_attr = attrs->mpa_attr;
+			qhp->attr.llp_stream_handle = attrs->llp_stream_handle;
+			qhp->ep = qhp->attr.llp_stream_handle;
+			qhp->attr.state = C4IW_QP_STATE_RTS;
+
+			/*
+			 * Ref the endpoint here and deref when we
+			 * disassociate the endpoint from the QP.  This
+			 * happens in CLOSING->IDLE transition or *->ERROR
+			 * transition.
+			 */
+			c4iw_get_ep(&qhp->ep->com);
+			spin_unlock_irqrestore(&qhp->lock, flag);
+			ret = rdma_init(rhp, qhp);
+			spin_lock_irqsave(&qhp->lock, flag);
+			if (ret)
+				goto err;
+			break;
+		case C4IW_QP_STATE_ERROR:
+			qhp->attr.state = C4IW_QP_STATE_ERROR;
+			flush_qp(qhp, &flag);
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+		}
+		break;
+	case C4IW_QP_STATE_RTS:
+		switch (attrs->next_state) {
+		case C4IW_QP_STATE_CLOSING:
+			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+			qhp->attr.state = C4IW_QP_STATE_CLOSING;
+			if (!internal) {
+				abort = 0;
+				disconnect = 1;
+				ep = qhp->ep;
+				c4iw_get_ep(&ep->com);
+			}
+			spin_unlock_irqrestore(&qhp->lock, flag);
+			ret = rdma_fini(rhp, qhp);
+			spin_lock_irqsave(&qhp->lock, flag);
+			if (ret) {
+				ep = qhp->ep;
+				c4iw_get_ep(&ep->com);
+				disconnect = abort = 1;
+				goto err;
+			}
+			break;
+		case C4IW_QP_STATE_TERMINATE:
+			qhp->attr.state = C4IW_QP_STATE_TERMINATE;
+			if (qhp->ibqp.uobject)
+				t4_set_wq_in_error(&qhp->wq);
+			if (!internal) {
+				ep = qhp->ep;
+				c4iw_get_ep(&ep->com);
+				terminate = 1;
+				disconnect = 1;
+			}
+			break;
+		case C4IW_QP_STATE_ERROR:
+			qhp->attr.state = C4IW_QP_STATE_ERROR;
+			if (!internal) {
+				abort = 1;
+				disconnect = 1;
+				ep = qhp->ep;
+				c4iw_get_ep(&ep->com);
+			}
+			goto err;
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+		}
+		break;
+	case C4IW_QP_STATE_CLOSING:
+		if (!internal) {
+			ret = -EINVAL;
+			goto out;
+		}
+		switch (attrs->next_state) {
+		case C4IW_QP_STATE_IDLE:
+			flush_qp(qhp, &flag);
+			qhp->attr.state = C4IW_QP_STATE_IDLE;
+			qhp->attr.llp_stream_handle = NULL;
+			c4iw_put_ep(&qhp->ep->com);
+			qhp->ep = NULL;
+			wake_up(&qhp->wait);
+			break;
+		case C4IW_QP_STATE_ERROR:
+			goto err;
+		default:
+			ret = -EINVAL;
+			goto err;
+		}
+		break;
+	case C4IW_QP_STATE_ERROR:
+		if (attrs->next_state != C4IW_QP_STATE_IDLE) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) {
+			ret = -EINVAL;
+			goto out;
+		}
+		qhp->attr.state = C4IW_QP_STATE_IDLE;
+		break;
+	case C4IW_QP_STATE_TERMINATE:
+		if (!internal) {
+			ret = -EINVAL;
+			goto out;
+		}
+		goto err;
+		break;
+	default:
+		printk(KERN_ERR "%s in a bad state %d\n",
+		       __func__, qhp->attr.state);
+		ret = -EINVAL;
+		goto err;
+		break;
+	}
+	goto out;
+err:
+	PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep,
+	     qhp->wq.sq.qid);
+
+	/* disassociate the LLP connection */
+	qhp->attr.llp_stream_handle = NULL;
+	ep = qhp->ep;
+	qhp->ep = NULL;
+	qhp->attr.state = C4IW_QP_STATE_ERROR;
+	free = 1;
+	wake_up(&qhp->wait);
+	BUG_ON(!ep);
+	flush_qp(qhp, &flag);
+out:
+	spin_unlock_irqrestore(&qhp->lock, flag);
+
+	if (terminate)
+		c4iw_post_terminate(qhp, NULL);
+
+	/*
+	 * If disconnect is 1, then we need to initiate a disconnect
+	 * on the EP.  This can be a normal close (RTS->CLOSING) or
+	 * an abnormal close (RTS/CLOSING->ERROR).
+	 */
+	if (disconnect) {
+		c4iw_ep_disconnect(ep, abort, GFP_KERNEL);
+		c4iw_put_ep(&ep->com);
+	}
+
+	/*
+	 * If free is 1, then we've disassociated the EP from the QP
+	 * and we need to dereference the EP.
+	 */
+	if (free)
+		c4iw_put_ep(&ep->com);
+
+	PDBG("%s exit state %d\n", __func__, qhp->attr.state);
+	return ret;
+}
+
+int c4iw_destroy_qp(struct ib_qp *ib_qp)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_qp *qhp;
+	struct c4iw_qp_attributes attrs;
+	struct c4iw_ucontext *ucontext;
+
+	qhp = to_c4iw_qp(ib_qp);
+	rhp = qhp->rhp;
+
+	attrs.next_state = C4IW_QP_STATE_ERROR;
+	c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);
+	wait_event(qhp->wait, !qhp->ep);
+
+	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+	remove_handle(rhp, &rhp->qpidr, qhp->wq.rq.qid);
+	atomic_dec(&qhp->refcnt);
+	wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
+
+	ucontext = ib_qp->uobject ?
+		   to_c4iw_ucontext(ib_qp->uobject->context) : NULL;
+	destroy_qp(&rhp->rdev, &qhp->wq,
+		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+
+	PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid);
+	kfree(qhp);
+	return 0;
+}
+
+struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
+			     struct ib_udata *udata)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_qp *qhp;
+	struct c4iw_pd *php;
+	struct c4iw_cq *schp;
+	struct c4iw_cq *rchp;
+	struct c4iw_create_qp_resp uresp;
+	int sqsize, rqsize;
+	struct c4iw_ucontext *ucontext;
+	int ret;
+	struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+
+	if (attrs->qp_type != IB_QPT_RC)
+		return ERR_PTR(-EINVAL);
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid);
+	rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid);
+	if (!schp || !rchp)
+		return ERR_PTR(-EINVAL);
+
+	if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE)
+		return ERR_PTR(-EINVAL);
+
+	rqsize = roundup(attrs->cap.max_recv_wr + 1, 16);
+	if (rqsize > T4_MAX_RQ_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	sqsize = roundup(attrs->cap.max_send_wr + 1, 16);
+	if (sqsize > T4_MAX_SQ_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;
+
+
+	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
+	if (!qhp)
+		return ERR_PTR(-ENOMEM);
+	qhp->wq.sq.size = sqsize;
+	qhp->wq.sq.memsize = (sqsize + 1) * sizeof *qhp->wq.sq.queue;
+	qhp->wq.rq.size = rqsize;
+	qhp->wq.rq.memsize = (rqsize + 1) * sizeof *qhp->wq.rq.queue;
+
+	if (ucontext) {
+		qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE);
+		qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE);
+	}
+
+	PDBG("%s sqsize %u sqmemsize %zu rqsize %u rqmemsize %zu\n",
+	     __func__, sqsize, qhp->wq.sq.memsize, rqsize, qhp->wq.rq.memsize);
+
+	ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq,
+			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+	if (ret)
+		goto err1;
+
+	attrs->cap.max_recv_wr = rqsize - 1;
+	attrs->cap.max_send_wr = sqsize - 1;
+	attrs->cap.max_inline_data = T4_MAX_SEND_INLINE;
+
+	qhp->rhp = rhp;
+	qhp->attr.pd = php->pdid;
+	qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid;
+	qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid;
+	qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
+	qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
+	qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
+	qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
+	qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
+	qhp->attr.state = C4IW_QP_STATE_IDLE;
+	qhp->attr.next_state = C4IW_QP_STATE_IDLE;
+	qhp->attr.enable_rdma_read = 1;
+	qhp->attr.enable_rdma_write = 1;
+	qhp->attr.enable_bind = 1;
+	qhp->attr.max_ord = 1;
+	qhp->attr.max_ird = 1;
+	spin_lock_init(&qhp->lock);
+	init_waitqueue_head(&qhp->wait);
+	atomic_set(&qhp->refcnt, 1);
+
+	ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
+	if (ret)
+		goto err2;
+
+	ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.rq.qid);
+	if (ret)
+		goto err3;
+
+	if (udata) {
+		mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
+		if (!mm1) {
+			ret = -ENOMEM;
+			goto err4;
+		}
+		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+		if (!mm2) {
+			ret = -ENOMEM;
+			goto err5;
+		}
+		mm3 = kmalloc(sizeof *mm3, GFP_KERNEL);
+		if (!mm3) {
+			ret = -ENOMEM;
+			goto err6;
+		}
+		mm4 = kmalloc(sizeof *mm4, GFP_KERNEL);
+		if (!mm4) {
+			ret = -ENOMEM;
+			goto err7;
+		}
+
+		uresp.qid_mask = rhp->rdev.qpmask;
+		uresp.sqid = qhp->wq.sq.qid;
+		uresp.sq_size = qhp->wq.sq.size;
+		uresp.sq_memsize = qhp->wq.sq.memsize;
+		uresp.rqid = qhp->wq.rq.qid;
+		uresp.rq_size = qhp->wq.rq.size;
+		uresp.rq_memsize = qhp->wq.rq.memsize;
+		spin_lock(&ucontext->mmap_lock);
+		uresp.sq_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.rq_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.sq_db_gts_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.rq_db_gts_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		spin_unlock(&ucontext->mmap_lock);
+		ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+		if (ret)
+			goto err8;
+		mm1->key = uresp.sq_key;
+		mm1->addr = virt_to_phys(qhp->wq.sq.queue);
+		mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize);
+		insert_mmap(ucontext, mm1);
+		mm2->key = uresp.rq_key;
+		mm2->addr = virt_to_phys(qhp->wq.rq.queue);
+		mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize);
+		insert_mmap(ucontext, mm2);
+		mm3->key = uresp.sq_db_gts_key;
+		mm3->addr = qhp->wq.sq.udb;
+		mm3->len = PAGE_SIZE;
+		insert_mmap(ucontext, mm3);
+		mm4->key = uresp.rq_db_gts_key;
+		mm4->addr = qhp->wq.rq.udb;
+		mm4->len = PAGE_SIZE;
+		insert_mmap(ucontext, mm4);
+	}
+	qhp->ibqp.qp_num = qhp->wq.sq.qid;
+	init_timer(&(qhp->timer));
+	PDBG("%s qhp %p sq_num_entries %d, rq_num_entries %d qpid 0x%0x\n",
+	     __func__, qhp, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
+	     qhp->wq.sq.qid);
+	return &qhp->ibqp;
+err8:
+	kfree(mm4);
+err7:
+	kfree(mm3);
+err6:
+	kfree(mm2);
+err5:
+	kfree(mm1);
+err4:
+	remove_handle(rhp, &rhp->qpidr, qhp->wq.rq.qid);
+err3:
+	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+err2:
+	destroy_qp(&rhp->rdev, &qhp->wq,
+		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+err1:
+	kfree(qhp);
+	return ERR_PTR(ret);
+}
+
+int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_qp *qhp;
+	enum c4iw_qp_attr_mask mask = 0;
+	struct c4iw_qp_attributes attrs;
+
+	PDBG("%s ib_qp %p\n", __func__, ibqp);
+
+	/* iwarp does not support the RTR state */
+	if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR))
+		attr_mask &= ~IB_QP_STATE;
+
+	/* Make sure we still have something left to do */
+	if (!attr_mask)
+		return 0;
+
+	memset(&attrs, 0, sizeof attrs);
+	qhp = to_c4iw_qp(ibqp);
+	rhp = qhp->rhp;
+
+	attrs.next_state = c4iw_convert_state(attr->qp_state);
+	attrs.enable_rdma_read = (attr->qp_access_flags &
+			       IB_ACCESS_REMOTE_READ) ?  1 : 0;
+	attrs.enable_rdma_write = (attr->qp_access_flags &
+				IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+	attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0;
+
+
+	mask |= (attr_mask & IB_QP_STATE) ? C4IW_QP_ATTR_NEXT_STATE : 0;
+	mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ?
+			(C4IW_QP_ATTR_ENABLE_RDMA_READ |
+			 C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
+			 C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0;
+
+	return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0);
+}
+
+struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn)
+{
+	PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn);
+	return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn);
+}
diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c
new file mode 100644
index 0000000..fb195d1
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/resource.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* Crude resource management */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/genalloc.h>
+#include "iw_cxgb4.h"
+
+#define RANDOM_SIZE 16
+
+static int __c4iw_init_resource_fifo(struct kfifo *fifo,
+				   spinlock_t *fifo_lock,
+				   u32 nr, u32 skip_low,
+				   u32 skip_high,
+				   int random)
+{
+	u32 i, j, entry = 0, idx;
+	u32 random_bytes;
+	u32 rarray[16];
+	spin_lock_init(fifo_lock);
+
+	if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL))
+		return -ENOMEM;
+
+	for (i = 0; i < skip_low + skip_high; i++)
+		kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32));
+	if (random) {
+		j = 0;
+		random_bytes = random32();
+		for (i = 0; i < RANDOM_SIZE; i++)
+			rarray[i] = i + skip_low;
+		for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
+			if (j >= RANDOM_SIZE) {
+				j = 0;
+				random_bytes = random32();
+			}
+			idx = (random_bytes >> (j * 2)) & 0xF;
+			kfifo_in(fifo,
+				(unsigned char *) &rarray[idx],
+				sizeof(u32));
+			rarray[idx] = i;
+			j++;
+		}
+		for (i = 0; i < RANDOM_SIZE; i++)
+			kfifo_in(fifo,
+				(unsigned char *) &rarray[i],
+				sizeof(u32));
+	} else
+		for (i = skip_low; i < nr - skip_high; i++)
+			kfifo_in(fifo, (unsigned char *) &i, sizeof(u32));
+
+	for (i = 0; i < skip_low + skip_high; i++)
+		if (kfifo_out_locked(fifo, (unsigned char *) &entry,
+				     sizeof(u32), fifo_lock))
+			break;
+	return 0;
+}
+
+static int c4iw_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock,
+				   u32 nr, u32 skip_low, u32 skip_high)
+{
+	return __c4iw_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
+					  skip_high, 0);
+}
+
+static int c4iw_init_resource_fifo_random(struct kfifo *fifo,
+				   spinlock_t *fifo_lock,
+				   u32 nr, u32 skip_low, u32 skip_high)
+{
+	return __c4iw_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
+					  skip_high, 1);
+}
+
+static int c4iw_init_qid_fifo(struct c4iw_rdev *rdev)
+{
+	u32 i;
+
+	spin_lock_init(&rdev->resource.qid_fifo_lock);
+
+	if (kfifo_alloc(&rdev->resource.qid_fifo, T4_MAX_QIDS * sizeof(u32),
+			GFP_KERNEL))
+		return -ENOMEM;
+
+	for (i = T4_QID_BASE; i < T4_QID_BASE + T4_MAX_QIDS; i++)
+		if (!(i & rdev->qpmask))
+			kfifo_in(&rdev->resource.qid_fifo,
+				    (unsigned char *) &i, sizeof(u32));
+	return 0;
+}
+
+/* nr_* must be power of 2 */
+int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid)
+{
+	int err = 0;
+	err = c4iw_init_resource_fifo_random(&rdev->resource.tpt_fifo,
+					     &rdev->resource.tpt_fifo_lock,
+					     nr_tpt, 1, 0);
+	if (err)
+		goto tpt_err;
+	err = c4iw_init_qid_fifo(rdev);
+	if (err)
+		goto qid_err;
+	err = c4iw_init_resource_fifo(&rdev->resource.pdid_fifo,
+				      &rdev->resource.pdid_fifo_lock,
+				      nr_pdid, 1, 0);
+	if (err)
+		goto pdid_err;
+	return 0;
+pdid_err:
+	kfifo_free(&rdev->resource.qid_fifo);
+qid_err:
+	kfifo_free(&rdev->resource.tpt_fifo);
+tpt_err:
+	return -ENOMEM;
+}
+
+/*
+ * returns 0 if no resource available
+ */
+u32 c4iw_get_resource(struct kfifo *fifo, spinlock_t *lock)
+{
+	u32 entry;
+	if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock))
+		return entry;
+	else
+		return 0;
+}
+
+void c4iw_put_resource(struct kfifo *fifo, u32 entry, spinlock_t *lock)
+{
+	PDBG("%s entry 0x%x\n", __func__, entry);
+	kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock);
+}
+
+u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+	u32 qid;
+	int i;
+
+	mutex_lock(&uctx->lock);
+	if (!list_empty(&uctx->cqids)) {
+		entry = list_entry(uctx->cqids.next, struct c4iw_qid_list,
+				   entry);
+		list_del(&entry->entry);
+		qid = entry->qid;
+		kfree(entry);
+	} else {
+		qid = c4iw_get_resource(&rdev->resource.qid_fifo,
+					&rdev->resource.qid_fifo_lock);
+		if (!qid)
+			goto out;
+		for (i = qid+1; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->cqids);
+		}
+
+		/*
+		 * now put the same ids on the qp list since they all
+		 * map to the same db/gts page.
+		 */
+		entry = kmalloc(sizeof *entry, GFP_KERNEL);
+		if (!entry)
+			goto out;
+		entry->qid = qid;
+		list_add_tail(&entry->entry, &uctx->qpids);
+		for (i = qid+1; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->qpids);
+		}
+	}
+out:
+	mutex_unlock(&uctx->lock);
+	PDBG("%s qid 0x%x\n", __func__, qid);
+	return qid;
+}
+
+void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid,
+		   struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry)
+		return;
+	PDBG("%s qid 0x%x\n", __func__, qid);
+	entry->qid = qid;
+	mutex_lock(&uctx->lock);
+	list_add_tail(&entry->entry, &uctx->cqids);
+	mutex_unlock(&uctx->lock);
+}
+
+u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+	u32 qid;
+	int i;
+
+	mutex_lock(&uctx->lock);
+	if (!list_empty(&uctx->qpids)) {
+		entry = list_entry(uctx->qpids.next, struct c4iw_qid_list,
+				   entry);
+		list_del(&entry->entry);
+		qid = entry->qid;
+		kfree(entry);
+	} else {
+		qid = c4iw_get_resource(&rdev->resource.qid_fifo,
+					&rdev->resource.qid_fifo_lock);
+		if (!qid)
+			goto out;
+		for (i = qid+1; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->qpids);
+		}
+
+		/*
+		 * now put the same ids on the cq list since they all
+		 * map to the same db/gts page.
+		 */
+		entry = kmalloc(sizeof *entry, GFP_KERNEL);
+		if (!entry)
+			goto out;
+		entry->qid = qid;
+		list_add_tail(&entry->entry, &uctx->cqids);
+		for (i = qid; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->cqids);
+		}
+	}
+out:
+	mutex_unlock(&uctx->lock);
+	PDBG("%s qid 0x%x\n", __func__, qid);
+	return qid;
+}
+
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid,
+		   struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry)
+		return;
+	PDBG("%s qid 0x%x\n", __func__, qid);
+	entry->qid = qid;
+	mutex_lock(&uctx->lock);
+	list_add_tail(&entry->entry, &uctx->qpids);
+	mutex_unlock(&uctx->lock);
+}
+
+void c4iw_destroy_resource(struct c4iw_resource *rscp)
+{
+	kfifo_free(&rscp->tpt_fifo);
+	kfifo_free(&rscp->qid_fifo);
+	kfifo_free(&rscp->pdid_fifo);
+}
+
+/*
+ * PBL Memory Manager.  Uses Linux generic allocator.
+ */
+
+#define MIN_PBL_SHIFT 8			/* 256B == min PBL size (32 entries) */
+
+u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size)
+{
+	unsigned long addr = gen_pool_alloc(rdev->pbl_pool, size);
+	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
+	return (u32)addr;
+}
+
+void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+	PDBG("%s addr 0x%x size %d\n", __func__, addr, size);
+	gen_pool_free(rdev->pbl_pool, (unsigned long)addr, size);
+}
+
+int c4iw_pblpool_create(struct c4iw_rdev *rdev)
+{
+	unsigned pbl_start, pbl_chunk, pbl_top;
+
+	rdev->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1);
+	if (!rdev->pbl_pool)
+		return -ENOMEM;
+
+	pbl_start = rdev->lldi.vr->pbl.start;
+	pbl_chunk = rdev->lldi.vr->pbl.size;
+	pbl_top = pbl_start + pbl_chunk;
+
+	while (pbl_start < pbl_top) {
+		pbl_chunk = min(pbl_top - pbl_start + 1, pbl_chunk);
+		if (gen_pool_add(rdev->pbl_pool, pbl_start, pbl_chunk, -1)) {
+			PDBG("%s failed to add PBL chunk (%x/%x)\n",
+			     __func__, pbl_start, pbl_chunk);
+			if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) {
+				printk(KERN_WARNING MOD
+				       "Failed to add all PBL chunks (%x/%x)\n",
+				       pbl_start,
+				       pbl_top - pbl_start);
+				return 0;
+			}
+			pbl_chunk >>= 1;
+		} else {
+			PDBG("%s added PBL chunk (%x/%x)\n",
+			     __func__, pbl_start, pbl_chunk);
+			pbl_start += pbl_chunk;
+		}
+	}
+
+	return 0;
+}
+
+void c4iw_pblpool_destroy(struct c4iw_rdev *rdev)
+{
+	gen_pool_destroy(rdev->pbl_pool);
+}
+
+/*
+ * RQT Memory Manager.  Uses Linux generic allocator.
+ */
+
+#define MIN_RQT_SHIFT 10	/* 1KB == min RQT size (16 entries) */
+
+u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size)
+{
+	unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6);
+	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6);
+	return (u32)addr;
+}
+
+void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+	PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6);
+	gen_pool_free(rdev->rqt_pool, (unsigned long)addr, size << 6);
+}
+
+int c4iw_rqtpool_create(struct c4iw_rdev *rdev)
+{
+	unsigned rqt_start, rqt_chunk, rqt_top;
+
+	rdev->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1);
+	if (!rdev->rqt_pool)
+		return -ENOMEM;
+
+	rqt_start = rdev->lldi.vr->rq.start;
+	rqt_chunk = rdev->lldi.vr->rq.size;
+	rqt_top = rqt_start + rqt_chunk;
+
+	while (rqt_start < rqt_top) {
+		rqt_chunk = min(rqt_top - rqt_start + 1, rqt_chunk);
+		if (gen_pool_add(rdev->rqt_pool, rqt_start, rqt_chunk, -1)) {
+			PDBG("%s failed to add RQT chunk (%x/%x)\n",
+			     __func__, rqt_start, rqt_chunk);
+			if (rqt_chunk <= 1024 << MIN_RQT_SHIFT) {
+				printk(KERN_WARNING MOD
+				       "Failed to add all RQT chunks (%x/%x)\n",
+				       rqt_start, rqt_top - rqt_start);
+				return 0;
+			}
+			rqt_chunk >>= 1;
+		} else {
+			PDBG("%s added RQT chunk (%x/%x)\n",
+			     __func__, rqt_start, rqt_chunk);
+			rqt_start += rqt_chunk;
+		}
+	}
+	return 0;
+}
+
+void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev)
+{
+	gen_pool_destroy(rdev->rqt_pool);
+}
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
new file mode 100644
index 0000000..3f0d217
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -0,0 +1,536 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __T4_H__
+#define __T4_H__
+
+#include "t4_hw.h"
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "t4fw_ri_api.h"
+
+#define T4_MAX_READ_DEPTH 16
+#define T4_QID_BASE 1024
+#define T4_MAX_QIDS 256
+#define T4_MAX_NUM_QP (1<<16)
+#define T4_MAX_NUM_CQ (1<<15)
+#define T4_MAX_NUM_PD (1<<15)
+#define T4_MAX_PBL_SIZE 256
+#define T4_MAX_RQ_SIZE 1024
+#define T4_MAX_SQ_SIZE 1024
+#define T4_MAX_QP_DEPTH (T4_MAX_RQ_SIZE-1)
+#define T4_MAX_CQ_DEPTH 8192
+#define T4_MAX_NUM_STAG (1<<15)
+#define T4_MAX_MR_SIZE (~0ULL - 1)
+#define T4_PAGESIZE_MASK 0xffff000  /* 4KB-128MB */
+#define T4_STAG_UNSET 0xffffffff
+#define T4_FW_MAJ 0
+#define T4_EQ_STATUS_ENTRIES (L1_CACHE_BYTES > 64 ? 2 : 1)
+
+struct t4_status_page {
+	__be32 rsvd1;	/* flit 0 - hw owns */
+	__be16 rsvd2;
+	__be16 qid;
+	__be16 cidx;
+	__be16 pidx;
+	u8 qp_err;	/* flit 1 - sw owns */
+	u8 db_off;
+};
+
+#define T4_EQ_SIZE 64
+
+#define T4_SQ_NUM_SLOTS 4
+#define T4_SQ_NUM_BYTES (T4_EQ_SIZE * T4_SQ_NUM_SLOTS)
+#define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
+			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
+#define T4_MAX_SEND_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
+			sizeof(struct fw_ri_immd)))
+#define T4_MAX_WRITE_INLINE ((T4_SQ_NUM_BYTES - \
+			sizeof(struct fw_ri_rdma_write_wr) - \
+			sizeof(struct fw_ri_immd)))
+#define T4_MAX_WRITE_SGE ((T4_SQ_NUM_BYTES - \
+			sizeof(struct fw_ri_rdma_write_wr) - \
+			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
+#define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \
+			sizeof(struct fw_ri_immd)))
+#define T4_MAX_FR_DEPTH 255
+
+#define T4_RQ_NUM_SLOTS 2
+#define T4_RQ_NUM_BYTES (T4_EQ_SIZE * T4_RQ_NUM_SLOTS)
+#define T4_MAX_RECV_SGE ((T4_RQ_NUM_BYTES - sizeof(struct fw_ri_recv_wr) - \
+			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
+
+union t4_wr {
+	struct fw_ri_res_wr res;
+	struct fw_ri_wr ri;
+	struct fw_ri_rdma_write_wr write;
+	struct fw_ri_send_wr send;
+	struct fw_ri_rdma_read_wr read;
+	struct fw_ri_bind_mw_wr bind;
+	struct fw_ri_fr_nsmr_wr fr;
+	struct fw_ri_inv_lstag_wr inv;
+	struct t4_status_page status;
+	__be64 flits[T4_EQ_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS];
+};
+
+union t4_recv_wr {
+	struct fw_ri_recv_wr recv;
+	struct t4_status_page status;
+	__be64 flits[T4_EQ_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS];
+};
+
+static inline void init_wr_hdr(union t4_wr *wqe, u16 wrid,
+			       enum fw_wr_opcodes opcode, u8 flags, u8 len16)
+{
+	int slots_used;
+
+	wqe->send.opcode = (u8)opcode;
+	wqe->send.flags = flags;
+	wqe->send.wrid = wrid;
+	wqe->send.r1[0] = 0;
+	wqe->send.r1[1] = 0;
+	wqe->send.r1[2] = 0;
+	wqe->send.len16 = len16;
+
+	slots_used = DIV_ROUND_UP(len16*16, T4_EQ_SIZE);
+	while (slots_used < T4_SQ_NUM_SLOTS) {
+		wqe->flits[slots_used * T4_EQ_SIZE / sizeof(__be64)] = 0;
+		slots_used++;
+	}
+}
+
+/* CQE/AE status codes */
+#define T4_ERR_SUCCESS                     0x0
+#define T4_ERR_STAG                        0x1	/* STAG invalid: either the */
+						/* STAG is offlimt, being 0, */
+						/* or STAG_key mismatch */
+#define T4_ERR_PDID                        0x2	/* PDID mismatch */
+#define T4_ERR_QPID                        0x3	/* QPID mismatch */
+#define T4_ERR_ACCESS                      0x4	/* Invalid access right */
+#define T4_ERR_WRAP                        0x5	/* Wrap error */
+#define T4_ERR_BOUND                       0x6	/* base and bounds voilation */
+#define T4_ERR_INVALIDATE_SHARED_MR        0x7	/* attempt to invalidate a  */
+						/* shared memory region */
+#define T4_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8	/* attempt to invalidate a  */
+						/* shared memory region */
+#define T4_ERR_ECC                         0x9	/* ECC error detected */
+#define T4_ERR_ECC_PSTAG                   0xA	/* ECC error detected when  */
+						/* reading PSTAG for a MW  */
+						/* Invalidate */
+#define T4_ERR_PBL_ADDR_BOUND              0xB	/* pbl addr out of bounds:  */
+						/* software error */
+#define T4_ERR_SWFLUSH			   0xC	/* SW FLUSHED */
+#define T4_ERR_CRC                         0x10 /* CRC error */
+#define T4_ERR_MARKER                      0x11 /* Marker error */
+#define T4_ERR_PDU_LEN_ERR                 0x12 /* invalid PDU length */
+#define T4_ERR_OUT_OF_RQE                  0x13 /* out of RQE */
+#define T4_ERR_DDP_VERSION                 0x14 /* wrong DDP version */
+#define T4_ERR_RDMA_VERSION                0x15 /* wrong RDMA version */
+#define T4_ERR_OPCODE                      0x16 /* invalid rdma opcode */
+#define T4_ERR_DDP_QUEUE_NUM               0x17 /* invalid ddp queue number */
+#define T4_ERR_MSN                         0x18 /* MSN error */
+#define T4_ERR_TBIT                        0x19 /* tag bit not set correctly */
+#define T4_ERR_MO                          0x1A /* MO not 0 for TERMINATE  */
+						/* or READ_REQ */
+#define T4_ERR_MSN_GAP                     0x1B
+#define T4_ERR_MSN_RANGE                   0x1C
+#define T4_ERR_IRD_OVERFLOW                0x1D
+#define T4_ERR_RQE_ADDR_BOUND              0x1E /* RQE addr out of bounds:  */
+						/* software error */
+#define T4_ERR_INTERNAL_ERR                0x1F /* internal error (opcode  */
+						/* mismatch) */
+/*
+ * CQE defs
+ */
+struct t4_cqe {
+	__be32 header;
+	__be32 len;
+	union {
+		struct {
+			__be32 stag;
+			__be32 msn;
+		} rcqe;
+		struct {
+			u32 nada1;
+			u16 nada2;
+			u16 cidx;
+		} scqe;
+		struct {
+			__be32 wrid_hi;
+			__be32 wrid_low;
+		} gen;
+	} u;
+	__be64 reserved;
+	__be64 bits_type_ts;
+};
+
+/* macros for flit 0 of the cqe */
+
+#define S_CQE_QPID        12
+#define M_CQE_QPID        0xFFFFF
+#define G_CQE_QPID(x)     ((((x) >> S_CQE_QPID)) & M_CQE_QPID)
+#define V_CQE_QPID(x)	  ((x)<<S_CQE_QPID)
+
+#define S_CQE_SWCQE       11
+#define M_CQE_SWCQE       0x1
+#define G_CQE_SWCQE(x)    ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE)
+#define V_CQE_SWCQE(x)	  ((x)<<S_CQE_SWCQE)
+
+#define S_CQE_STATUS      5
+#define M_CQE_STATUS      0x1F
+#define G_CQE_STATUS(x)   ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS)
+#define V_CQE_STATUS(x)   ((x)<<S_CQE_STATUS)
+
+#define S_CQE_TYPE        4
+#define M_CQE_TYPE        0x1
+#define G_CQE_TYPE(x)     ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE)
+#define V_CQE_TYPE(x)     ((x)<<S_CQE_TYPE)
+
+#define S_CQE_OPCODE      0
+#define M_CQE_OPCODE      0xF
+#define G_CQE_OPCODE(x)   ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE)
+#define V_CQE_OPCODE(x)   ((x)<<S_CQE_OPCODE)
+
+#define SW_CQE(x)         (G_CQE_SWCQE(be32_to_cpu((x)->header)))
+#define CQE_QPID(x)       (G_CQE_QPID(be32_to_cpu((x)->header)))
+#define CQE_TYPE(x)       (G_CQE_TYPE(be32_to_cpu((x)->header)))
+#define SQ_TYPE(x)	  (CQE_TYPE((x)))
+#define RQ_TYPE(x)	  (!CQE_TYPE((x)))
+#define CQE_STATUS(x)     (G_CQE_STATUS(be32_to_cpu((x)->header)))
+#define CQE_OPCODE(x)     (G_CQE_OPCODE(be32_to_cpu((x)->header)))
+
+#define CQE_SEND_OPCODE(x)( \
+	(G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND) || \
+	(G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE) || \
+	(G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_INV) || \
+	(G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE_INV))
+
+#define CQE_LEN(x)        (be32_to_cpu((x)->len))
+
+/* used for RQ completion processing */
+#define CQE_WRID_STAG(x)  (be32_to_cpu((x)->u.rcqe.stag))
+#define CQE_WRID_MSN(x)   (be32_to_cpu((x)->u.rcqe.msn))
+
+/* used for SQ completion processing */
+#define CQE_WRID_SQ_IDX(x)	((x)->u.scqe.cidx)
+
+/* generic accessor macros */
+#define CQE_WRID_HI(x)		((x)->u.gen.wrid_hi)
+#define CQE_WRID_LOW(x)		((x)->u.gen.wrid_low)
+
+/* macros for flit 3 of the cqe */
+#define S_CQE_GENBIT	63
+#define M_CQE_GENBIT	0x1
+#define G_CQE_GENBIT(x)	(((x) >> S_CQE_GENBIT) & M_CQE_GENBIT)
+#define V_CQE_GENBIT(x) ((x)<<S_CQE_GENBIT)
+
+#define S_CQE_OVFBIT	62
+#define M_CQE_OVFBIT	0x1
+#define G_CQE_OVFBIT(x)	((((x) >> S_CQE_OVFBIT)) & M_CQE_OVFBIT)
+
+#define S_CQE_IQTYPE	60
+#define M_CQE_IQTYPE	0x3
+#define G_CQE_IQTYPE(x)	((((x) >> S_CQE_IQTYPE)) & M_CQE_IQTYPE)
+
+#define M_CQE_TS	0x0fffffffffffffffULL
+#define G_CQE_TS(x)	((x) & M_CQE_TS)
+
+#define CQE_OVFBIT(x)	((unsigned)G_CQE_OVFBIT(be64_to_cpu((x)->bits_type_ts)))
+#define CQE_GENBIT(x)	((unsigned)G_CQE_GENBIT(be64_to_cpu((x)->bits_type_ts)))
+#define CQE_TS(x)	(G_CQE_TS(be64_to_cpu((x)->bits_type_ts)))
+
+struct t4_swsqe {
+	u64			wr_id;
+	struct t4_cqe		cqe;
+	int			read_len;
+	int			opcode;
+	int			complete;
+	int			signaled;
+	u16			idx;
+};
+
+struct t4_sq {
+	union t4_wr *queue;
+	dma_addr_t dma_addr;
+	DECLARE_PCI_UNMAP_ADDR(mapping);
+	struct t4_swsqe *sw_sq;
+	struct t4_swsqe *oldest_read;
+	u64 udb;
+	size_t memsize;
+	u32 qid;
+	u16 in_use;
+	u16 size;
+	u16 cidx;
+	u16 pidx;
+};
+
+struct t4_swrqe {
+	u64 wr_id;
+};
+
+struct t4_rq {
+	union  t4_recv_wr *queue;
+	dma_addr_t dma_addr;
+	DECLARE_PCI_UNMAP_ADDR(mapping);
+	struct t4_swrqe *sw_rq;
+	u64 udb;
+	size_t memsize;
+	u32 qid;
+	u32 msn;
+	u32 rqt_hwaddr;
+	u16 rqt_size;
+	u16 in_use;
+	u16 size;
+	u16 cidx;
+	u16 pidx;
+};
+
+struct t4_wq {
+	struct t4_sq sq;
+	struct t4_rq rq;
+	void __iomem *db;
+	void __iomem *gts;
+	struct c4iw_rdev *rdev;
+};
+
+static inline int t4_rqes_posted(struct t4_wq *wq)
+{
+	return wq->rq.in_use;
+}
+
+static inline int t4_rq_empty(struct t4_wq *wq)
+{
+	return wq->rq.in_use == 0;
+}
+
+static inline int t4_rq_full(struct t4_wq *wq)
+{
+	return wq->rq.in_use == (wq->rq.size - 1);
+}
+
+static inline u32 t4_rq_avail(struct t4_wq *wq)
+{
+	return wq->rq.size - 1 - wq->rq.in_use;
+}
+
+static inline void t4_rq_produce(struct t4_wq *wq)
+{
+	wq->rq.in_use++;
+	if (++wq->rq.pidx == wq->rq.size)
+		wq->rq.pidx = 0;
+}
+
+static inline void t4_rq_consume(struct t4_wq *wq)
+{
+	wq->rq.in_use--;
+	wq->rq.msn++;
+	if (++wq->rq.cidx == wq->rq.size)
+		wq->rq.cidx = 0;
+}
+
+static inline int t4_sq_empty(struct t4_wq *wq)
+{
+	return wq->sq.in_use == 0;
+}
+
+static inline int t4_sq_full(struct t4_wq *wq)
+{
+	return wq->sq.in_use == (wq->sq.size - 1);
+}
+
+static inline u32 t4_sq_avail(struct t4_wq *wq)
+{
+	return wq->sq.size - 1 - wq->sq.in_use;
+}
+
+static inline void t4_sq_produce(struct t4_wq *wq)
+{
+	wq->sq.in_use++;
+	if (++wq->sq.pidx == wq->sq.size)
+		wq->sq.pidx = 0;
+}
+
+static inline void t4_sq_consume(struct t4_wq *wq)
+{
+	wq->sq.in_use--;
+	if (++wq->sq.cidx == wq->sq.size)
+		wq->sq.cidx = 0;
+}
+
+static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc)
+{
+	inc *= T4_SQ_NUM_SLOTS;
+	wmb();
+	writel(QID(wq->sq.qid) | PIDX(inc), wq->db);
+}
+
+static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc)
+{
+	inc *= T4_RQ_NUM_SLOTS;
+	wmb();
+	writel(QID(wq->rq.qid) | PIDX(inc), wq->db);
+}
+
+static inline int t4_wq_in_error(struct t4_wq *wq)
+{
+	return wq->sq.queue[wq->sq.size].status.qp_err;
+}
+
+static inline void t4_set_wq_in_error(struct t4_wq *wq)
+{
+	wq->sq.queue[wq->sq.size].status.qp_err = 1;
+	wq->rq.queue[wq->rq.size].status.qp_err = 1;
+}
+
+static inline void t4_disable_wq_db(struct t4_wq *wq)
+{
+	wq->sq.queue[wq->sq.size].status.db_off = 1;
+	wq->rq.queue[wq->rq.size].status.db_off = 1;
+}
+
+static inline void t4_enable_wq_db(struct t4_wq *wq)
+{
+	wq->sq.queue[wq->sq.size].status.db_off = 0;
+	wq->rq.queue[wq->rq.size].status.db_off = 0;
+}
+
+static inline int t4_wq_db_enabled(struct t4_wq *wq)
+{
+	return !wq->sq.queue[wq->sq.size].status.db_off;
+}
+
+struct t4_cq {
+	struct t4_cqe *queue;
+	dma_addr_t dma_addr;
+	DECLARE_PCI_UNMAP_ADDR(mapping);
+	struct t4_cqe *sw_queue;
+	void __iomem *gts;
+	struct c4iw_rdev *rdev;
+	u64 ugts;
+	size_t memsize;
+	u64 timestamp;
+	u32 cqid;
+	u16 size; /* including status page */
+	u16 cidx;
+	u16 sw_pidx;
+	u16 sw_cidx;
+	u16 sw_in_use;
+	u16 cidx_inc;
+	u8 gen;
+	u8 error;
+};
+
+static inline int t4_arm_cq(struct t4_cq *cq, int se)
+{
+	u32 val;
+
+	val = SEINTARM(se) | CIDXINC(cq->cidx_inc) | TIMERREG(6) |
+	      INGRESSQID(cq->cqid);
+	cq->cidx_inc = 0;
+	writel(val, cq->gts);
+	return 0;
+}
+
+static inline void t4_swcq_produce(struct t4_cq *cq)
+{
+	cq->sw_in_use++;
+	if (++cq->sw_pidx == cq->size)
+		cq->sw_pidx = 0;
+}
+
+static inline void t4_swcq_consume(struct t4_cq *cq)
+{
+	cq->sw_in_use--;
+	if (++cq->sw_cidx == cq->size)
+		cq->sw_cidx = 0;
+}
+
+static inline void t4_hwcq_consume(struct t4_cq *cq)
+{
+	cq->cidx_inc++;
+	if (++cq->cidx == cq->size) {
+		cq->cidx = 0;
+		cq->gen ^= 1;
+	}
+}
+
+static inline int t4_valid_cqe(struct t4_cq *cq, struct t4_cqe *cqe)
+{
+	return (CQE_GENBIT(cqe) == cq->gen);
+}
+
+static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
+{
+	int ret = 0;
+
+	if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) {
+		*cqe = &cq->queue[cq->cidx];
+		cq->timestamp = CQE_TS(*cqe);
+	} else if (CQE_TS(&cq->queue[cq->cidx]) > cq->timestamp)
+		ret = -EOVERFLOW;
+	else
+		ret = -ENODATA;
+	if (ret == -EOVERFLOW) {
+		printk(KERN_ERR MOD "cq overflow cqid %u\n", cq->cqid);
+		cq->error = 1;
+	}
+	return ret;
+}
+
+static inline struct t4_cqe *t4_next_sw_cqe(struct t4_cq *cq)
+{
+	if (cq->sw_in_use)
+		return &cq->sw_queue[cq->sw_cidx];
+	return NULL;
+}
+
+static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
+{
+	int ret = 0;
+
+	if (cq->error)
+		ret = -ENODATA;
+	else if (cq->sw_in_use)
+		*cqe = &cq->sw_queue[cq->sw_cidx];
+	else
+		ret = t4_next_hw_cqe(cq, cqe);
+	return ret;
+}
+
+static inline int t4_cq_in_error(struct t4_cq *cq)
+{
+	return ((struct t4_status_page *)&cq->queue[cq->size])->qp_err;
+}
+
+static inline void t4_set_cq_in_error(struct t4_cq *cq)
+{
+	((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1;
+}
+#endif
diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
new file mode 100644
index 0000000..fc706bd
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _T4FW_RI_API_H_
+#define _T4FW_RI_API_H_
+
+#include "t4fw_api.h"
+
+enum fw_ri_wr_opcode {
+	FW_RI_RDMA_WRITE		= 0x0,	/* IETF RDMAP v1.0 ... */
+	FW_RI_READ_REQ			= 0x1,
+	FW_RI_READ_RESP			= 0x2,
+	FW_RI_SEND			= 0x3,
+	FW_RI_SEND_WITH_INV		= 0x4,
+	FW_RI_SEND_WITH_SE		= 0x5,
+	FW_RI_SEND_WITH_SE_INV		= 0x6,
+	FW_RI_TERMINATE			= 0x7,
+	FW_RI_RDMA_INIT			= 0x8,	/* CHELSIO RI specific ... */
+	FW_RI_BIND_MW			= 0x9,
+	FW_RI_FAST_REGISTER		= 0xa,
+	FW_RI_LOCAL_INV			= 0xb,
+	FW_RI_QP_MODIFY			= 0xc,
+	FW_RI_BYPASS			= 0xd,
+	FW_RI_RECEIVE			= 0xe,
+
+	FW_RI_SGE_EC_CR_RETURN		= 0xf
+};
+
+enum fw_ri_wr_flags {
+	FW_RI_COMPLETION_FLAG		= 0x01,
+	FW_RI_NOTIFICATION_FLAG		= 0x02,
+	FW_RI_SOLICITED_EVENT_FLAG	= 0x04,
+	FW_RI_READ_FENCE_FLAG		= 0x08,
+	FW_RI_LOCAL_FENCE_FLAG		= 0x10,
+	FW_RI_RDMA_READ_INVALIDATE	= 0x20
+};
+
+enum fw_ri_mpa_attrs {
+	FW_RI_MPA_RX_MARKER_ENABLE	= 0x01,
+	FW_RI_MPA_TX_MARKER_ENABLE	= 0x02,
+	FW_RI_MPA_CRC_ENABLE		= 0x04,
+	FW_RI_MPA_IETF_ENABLE		= 0x08
+};
+
+enum fw_ri_qp_caps {
+	FW_RI_QP_RDMA_READ_ENABLE	= 0x01,
+	FW_RI_QP_RDMA_WRITE_ENABLE	= 0x02,
+	FW_RI_QP_BIND_ENABLE		= 0x04,
+	FW_RI_QP_FAST_REGISTER_ENABLE	= 0x08,
+	FW_RI_QP_STAG0_ENABLE		= 0x10
+};
+
+enum fw_ri_addr_type {
+	FW_RI_ZERO_BASED_TO		= 0x00,
+	FW_RI_VA_BASED_TO		= 0x01
+};
+
+enum fw_ri_mem_perms {
+	FW_RI_MEM_ACCESS_REM_WRITE	= 0x01,
+	FW_RI_MEM_ACCESS_REM_READ	= 0x02,
+	FW_RI_MEM_ACCESS_REM		= 0x03,
+	FW_RI_MEM_ACCESS_LOCAL_WRITE	= 0x04,
+	FW_RI_MEM_ACCESS_LOCAL_READ	= 0x08,
+	FW_RI_MEM_ACCESS_LOCAL		= 0x0C
+};
+
+enum fw_ri_stag_type {
+	FW_RI_STAG_NSMR			= 0x00,
+	FW_RI_STAG_SMR			= 0x01,
+	FW_RI_STAG_MW			= 0x02,
+	FW_RI_STAG_MW_RELAXED		= 0x03
+};
+
+enum fw_ri_data_op {
+	FW_RI_DATA_IMMD			= 0x81,
+	FW_RI_DATA_DSGL			= 0x82,
+	FW_RI_DATA_ISGL			= 0x83
+};
+
+enum fw_ri_sgl_depth {
+	FW_RI_SGL_DEPTH_MAX_SQ		= 16,
+	FW_RI_SGL_DEPTH_MAX_RQ		= 4
+};
+
+struct fw_ri_dsge_pair {
+	__be32	len[2];
+	__be64	addr[2];
+};
+
+struct fw_ri_dsgl {
+	__u8	op;
+	__u8	r1;
+	__be16	nsge;
+	__be32	len0;
+	__be64	addr0;
+#ifndef C99_NOT_SUPPORTED
+	struct fw_ri_dsge_pair sge[0];
+#endif
+};
+
+struct fw_ri_sge {
+	__be32 stag;
+	__be32 len;
+	__be64 to;
+};
+
+struct fw_ri_isgl {
+	__u8	op;
+	__u8	r1;
+	__be16	nsge;
+	__be32	r2;
+#ifndef C99_NOT_SUPPORTED
+	struct fw_ri_sge sge[0];
+#endif
+};
+
+struct fw_ri_immd {
+	__u8	op;
+	__u8	r1;
+	__be16	r2;
+	__be32	immdlen;
+#ifndef C99_NOT_SUPPORTED
+	__u8	data[0];
+#endif
+};
+
+struct fw_ri_tpte {
+	__be32 valid_to_pdid;
+	__be32 locread_to_qpid;
+	__be32 nosnoop_pbladdr;
+	__be32 len_lo;
+	__be32 va_hi;
+	__be32 va_lo_fbo;
+	__be32 dca_mwbcnt_pstag;
+	__be32 len_hi;
+};
+
+#define S_FW_RI_TPTE_VALID		31
+#define M_FW_RI_TPTE_VALID		0x1
+#define V_FW_RI_TPTE_VALID(x)		((x) << S_FW_RI_TPTE_VALID)
+#define G_FW_RI_TPTE_VALID(x)		\
+    (((x) >> S_FW_RI_TPTE_VALID) & M_FW_RI_TPTE_VALID)
+#define F_FW_RI_TPTE_VALID		V_FW_RI_TPTE_VALID(1U)
+
+#define S_FW_RI_TPTE_STAGKEY		23
+#define M_FW_RI_TPTE_STAGKEY		0xff
+#define V_FW_RI_TPTE_STAGKEY(x)		((x) << S_FW_RI_TPTE_STAGKEY)
+#define G_FW_RI_TPTE_STAGKEY(x)		\
+    (((x) >> S_FW_RI_TPTE_STAGKEY) & M_FW_RI_TPTE_STAGKEY)
+
+#define S_FW_RI_TPTE_STAGSTATE		22
+#define M_FW_RI_TPTE_STAGSTATE		0x1
+#define V_FW_RI_TPTE_STAGSTATE(x)	((x) << S_FW_RI_TPTE_STAGSTATE)
+#define G_FW_RI_TPTE_STAGSTATE(x)	\
+    (((x) >> S_FW_RI_TPTE_STAGSTATE) & M_FW_RI_TPTE_STAGSTATE)
+#define F_FW_RI_TPTE_STAGSTATE		V_FW_RI_TPTE_STAGSTATE(1U)
+
+#define S_FW_RI_TPTE_STAGTYPE		20
+#define M_FW_RI_TPTE_STAGTYPE		0x3
+#define V_FW_RI_TPTE_STAGTYPE(x)	((x) << S_FW_RI_TPTE_STAGTYPE)
+#define G_FW_RI_TPTE_STAGTYPE(x)	\
+    (((x) >> S_FW_RI_TPTE_STAGTYPE) & M_FW_RI_TPTE_STAGTYPE)
+
+#define S_FW_RI_TPTE_PDID		0
+#define M_FW_RI_TPTE_PDID		0xfffff
+#define V_FW_RI_TPTE_PDID(x)		((x) << S_FW_RI_TPTE_PDID)
+#define G_FW_RI_TPTE_PDID(x)		\
+    (((x) >> S_FW_RI_TPTE_PDID) & M_FW_RI_TPTE_PDID)
+
+#define S_FW_RI_TPTE_PERM		28
+#define M_FW_RI_TPTE_PERM		0xf
+#define V_FW_RI_TPTE_PERM(x)		((x) << S_FW_RI_TPTE_PERM)
+#define G_FW_RI_TPTE_PERM(x)		\
+    (((x) >> S_FW_RI_TPTE_PERM) & M_FW_RI_TPTE_PERM)
+
+#define S_FW_RI_TPTE_REMINVDIS		27
+#define M_FW_RI_TPTE_REMINVDIS		0x1
+#define V_FW_RI_TPTE_REMINVDIS(x)	((x) << S_FW_RI_TPTE_REMINVDIS)
+#define G_FW_RI_TPTE_REMINVDIS(x)	\
+    (((x) >> S_FW_RI_TPTE_REMINVDIS) & M_FW_RI_TPTE_REMINVDIS)
+#define F_FW_RI_TPTE_REMINVDIS		V_FW_RI_TPTE_REMINVDIS(1U)
+
+#define S_FW_RI_TPTE_ADDRTYPE		26
+#define M_FW_RI_TPTE_ADDRTYPE		1
+#define V_FW_RI_TPTE_ADDRTYPE(x)	((x) << S_FW_RI_TPTE_ADDRTYPE)
+#define G_FW_RI_TPTE_ADDRTYPE(x)	\
+    (((x) >> S_FW_RI_TPTE_ADDRTYPE) & M_FW_RI_TPTE_ADDRTYPE)
+#define F_FW_RI_TPTE_ADDRTYPE		V_FW_RI_TPTE_ADDRTYPE(1U)
+
+#define S_FW_RI_TPTE_MWBINDEN		25
+#define M_FW_RI_TPTE_MWBINDEN		0x1
+#define V_FW_RI_TPTE_MWBINDEN(x)	((x) << S_FW_RI_TPTE_MWBINDEN)
+#define G_FW_RI_TPTE_MWBINDEN(x)	\
+    (((x) >> S_FW_RI_TPTE_MWBINDEN) & M_FW_RI_TPTE_MWBINDEN)
+#define F_FW_RI_TPTE_MWBINDEN		V_FW_RI_TPTE_MWBINDEN(1U)
+
+#define S_FW_RI_TPTE_PS			20
+#define M_FW_RI_TPTE_PS			0x1f
+#define V_FW_RI_TPTE_PS(x)		((x) << S_FW_RI_TPTE_PS)
+#define G_FW_RI_TPTE_PS(x)		\
+    (((x) >> S_FW_RI_TPTE_PS) & M_FW_RI_TPTE_PS)
+
+#define S_FW_RI_TPTE_QPID		0
+#define M_FW_RI_TPTE_QPID		0xfffff
+#define V_FW_RI_TPTE_QPID(x)		((x) << S_FW_RI_TPTE_QPID)
+#define G_FW_RI_TPTE_QPID(x)		\
+    (((x) >> S_FW_RI_TPTE_QPID) & M_FW_RI_TPTE_QPID)
+
+#define S_FW_RI_TPTE_NOSNOOP		30
+#define M_FW_RI_TPTE_NOSNOOP		0x1
+#define V_FW_RI_TPTE_NOSNOOP(x)		((x) << S_FW_RI_TPTE_NOSNOOP)
+#define G_FW_RI_TPTE_NOSNOOP(x)		\
+    (((x) >> S_FW_RI_TPTE_NOSNOOP) & M_FW_RI_TPTE_NOSNOOP)
+#define F_FW_RI_TPTE_NOSNOOP		V_FW_RI_TPTE_NOSNOOP(1U)
+
+#define S_FW_RI_TPTE_PBLADDR		0
+#define M_FW_RI_TPTE_PBLADDR		0x1fffffff
+#define V_FW_RI_TPTE_PBLADDR(x)		((x) << S_FW_RI_TPTE_PBLADDR)
+#define G_FW_RI_TPTE_PBLADDR(x)		\
+    (((x) >> S_FW_RI_TPTE_PBLADDR) & M_FW_RI_TPTE_PBLADDR)
+
+#define S_FW_RI_TPTE_DCA		24
+#define M_FW_RI_TPTE_DCA		0x1f
+#define V_FW_RI_TPTE_DCA(x)		((x) << S_FW_RI_TPTE_DCA)
+#define G_FW_RI_TPTE_DCA(x)		\
+    (((x) >> S_FW_RI_TPTE_DCA) & M_FW_RI_TPTE_DCA)
+
+#define S_FW_RI_TPTE_MWBCNT_PSTAG	0
+#define M_FW_RI_TPTE_MWBCNT_PSTAG	0xffffff
+#define V_FW_RI_TPTE_MWBCNT_PSTAT(x)	\
+    ((x) << S_FW_RI_TPTE_MWBCNT_PSTAG)
+#define G_FW_RI_TPTE_MWBCNT_PSTAG(x)	\
+    (((x) >> S_FW_RI_TPTE_MWBCNT_PSTAG) & M_FW_RI_TPTE_MWBCNT_PSTAG)
+
+enum fw_ri_res_type {
+	FW_RI_RES_TYPE_SQ,
+	FW_RI_RES_TYPE_RQ,
+	FW_RI_RES_TYPE_CQ,
+};
+
+enum fw_ri_res_op {
+	FW_RI_RES_OP_WRITE,
+	FW_RI_RES_OP_RESET,
+};
+
+struct fw_ri_res {
+	union fw_ri_restype {
+		struct fw_ri_res_sqrq {
+			__u8   restype;
+			__u8   op;
+			__be16 r3;
+			__be32 eqid;
+			__be32 r4[2];
+			__be32 fetchszm_to_iqid;
+			__be32 dcaen_to_eqsize;
+			__be64 eqaddr;
+		} sqrq;
+		struct fw_ri_res_cq {
+			__u8   restype;
+			__u8   op;
+			__be16 r3;
+			__be32 iqid;
+			__be32 r4[2];
+			__be32 iqandst_to_iqandstindex;
+			__be16 iqdroprss_to_iqesize;
+			__be16 iqsize;
+			__be64 iqaddr;
+			__be32 iqns_iqro;
+			__be32 r6_lo;
+			__be64 r7;
+		} cq;
+	} u;
+};
+
+struct fw_ri_res_wr {
+	__be32 op_nres;
+	__be32 len16_pkd;
+	__u64  cookie;
+#ifndef C99_NOT_SUPPORTED
+	struct fw_ri_res res[0];
+#endif
+};
+
+#define S_FW_RI_RES_WR_NRES	0
+#define M_FW_RI_RES_WR_NRES	0xff
+#define V_FW_RI_RES_WR_NRES(x)	((x) << S_FW_RI_RES_WR_NRES)
+#define G_FW_RI_RES_WR_NRES(x)	\
+    (((x) >> S_FW_RI_RES_WR_NRES) & M_FW_RI_RES_WR_NRES)
+
+#define S_FW_RI_RES_WR_FETCHSZM		26
+#define M_FW_RI_RES_WR_FETCHSZM		0x1
+#define V_FW_RI_RES_WR_FETCHSZM(x)	((x) << S_FW_RI_RES_WR_FETCHSZM)
+#define G_FW_RI_RES_WR_FETCHSZM(x)	\
+    (((x) >> S_FW_RI_RES_WR_FETCHSZM) & M_FW_RI_RES_WR_FETCHSZM)
+#define F_FW_RI_RES_WR_FETCHSZM	V_FW_RI_RES_WR_FETCHSZM(1U)
+
+#define S_FW_RI_RES_WR_STATUSPGNS	25
+#define M_FW_RI_RES_WR_STATUSPGNS	0x1
+#define V_FW_RI_RES_WR_STATUSPGNS(x)	((x) << S_FW_RI_RES_WR_STATUSPGNS)
+#define G_FW_RI_RES_WR_STATUSPGNS(x)	\
+    (((x) >> S_FW_RI_RES_WR_STATUSPGNS) & M_FW_RI_RES_WR_STATUSPGNS)
+#define F_FW_RI_RES_WR_STATUSPGNS	V_FW_RI_RES_WR_STATUSPGNS(1U)
+
+#define S_FW_RI_RES_WR_STATUSPGRO	24
+#define M_FW_RI_RES_WR_STATUSPGRO	0x1
+#define V_FW_RI_RES_WR_STATUSPGRO(x)	((x) << S_FW_RI_RES_WR_STATUSPGRO)
+#define G_FW_RI_RES_WR_STATUSPGRO(x)	\
+    (((x) >> S_FW_RI_RES_WR_STATUSPGRO) & M_FW_RI_RES_WR_STATUSPGRO)
+#define F_FW_RI_RES_WR_STATUSPGRO	V_FW_RI_RES_WR_STATUSPGRO(1U)
+
+#define S_FW_RI_RES_WR_FETCHNS		23
+#define M_FW_RI_RES_WR_FETCHNS		0x1
+#define V_FW_RI_RES_WR_FETCHNS(x)	((x) << S_FW_RI_RES_WR_FETCHNS)
+#define G_FW_RI_RES_WR_FETCHNS(x)	\
+    (((x) >> S_FW_RI_RES_WR_FETCHNS) & M_FW_RI_RES_WR_FETCHNS)
+#define F_FW_RI_RES_WR_FETCHNS	V_FW_RI_RES_WR_FETCHNS(1U)
+
+#define S_FW_RI_RES_WR_FETCHRO		22
+#define M_FW_RI_RES_WR_FETCHRO		0x1
+#define V_FW_RI_RES_WR_FETCHRO(x)	((x) << S_FW_RI_RES_WR_FETCHRO)
+#define G_FW_RI_RES_WR_FETCHRO(x)	\
+    (((x) >> S_FW_RI_RES_WR_FETCHRO) & M_FW_RI_RES_WR_FETCHRO)
+#define F_FW_RI_RES_WR_FETCHRO	V_FW_RI_RES_WR_FETCHRO(1U)
+
+#define S_FW_RI_RES_WR_HOSTFCMODE	20
+#define M_FW_RI_RES_WR_HOSTFCMODE	0x3
+#define V_FW_RI_RES_WR_HOSTFCMODE(x)	((x) << S_FW_RI_RES_WR_HOSTFCMODE)
+#define G_FW_RI_RES_WR_HOSTFCMODE(x)	\
+    (((x) >> S_FW_RI_RES_WR_HOSTFCMODE) & M_FW_RI_RES_WR_HOSTFCMODE)
+
+#define S_FW_RI_RES_WR_CPRIO	19
+#define M_FW_RI_RES_WR_CPRIO	0x1
+#define V_FW_RI_RES_WR_CPRIO(x)	((x) << S_FW_RI_RES_WR_CPRIO)
+#define G_FW_RI_RES_WR_CPRIO(x)	\
+    (((x) >> S_FW_RI_RES_WR_CPRIO) & M_FW_RI_RES_WR_CPRIO)
+#define F_FW_RI_RES_WR_CPRIO	V_FW_RI_RES_WR_CPRIO(1U)
+
+#define S_FW_RI_RES_WR_ONCHIP		18
+#define M_FW_RI_RES_WR_ONCHIP		0x1
+#define V_FW_RI_RES_WR_ONCHIP(x)	((x) << S_FW_RI_RES_WR_ONCHIP)
+#define G_FW_RI_RES_WR_ONCHIP(x)	\
+    (((x) >> S_FW_RI_RES_WR_ONCHIP) & M_FW_RI_RES_WR_ONCHIP)
+#define F_FW_RI_RES_WR_ONCHIP	V_FW_RI_RES_WR_ONCHIP(1U)
+
+#define S_FW_RI_RES_WR_PCIECHN		16
+#define M_FW_RI_RES_WR_PCIECHN		0x3
+#define V_FW_RI_RES_WR_PCIECHN(x)	((x) << S_FW_RI_RES_WR_PCIECHN)
+#define G_FW_RI_RES_WR_PCIECHN(x)	\
+    (((x) >> S_FW_RI_RES_WR_PCIECHN) & M_FW_RI_RES_WR_PCIECHN)
+
+#define S_FW_RI_RES_WR_IQID	0
+#define M_FW_RI_RES_WR_IQID	0xffff
+#define V_FW_RI_RES_WR_IQID(x)	((x) << S_FW_RI_RES_WR_IQID)
+#define G_FW_RI_RES_WR_IQID(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQID) & M_FW_RI_RES_WR_IQID)
+
+#define S_FW_RI_RES_WR_DCAEN	31
+#define M_FW_RI_RES_WR_DCAEN	0x1
+#define V_FW_RI_RES_WR_DCAEN(x)	((x) << S_FW_RI_RES_WR_DCAEN)
+#define G_FW_RI_RES_WR_DCAEN(x)	\
+    (((x) >> S_FW_RI_RES_WR_DCAEN) & M_FW_RI_RES_WR_DCAEN)
+#define F_FW_RI_RES_WR_DCAEN	V_FW_RI_RES_WR_DCAEN(1U)
+
+#define S_FW_RI_RES_WR_DCACPU		26
+#define M_FW_RI_RES_WR_DCACPU		0x1f
+#define V_FW_RI_RES_WR_DCACPU(x)	((x) << S_FW_RI_RES_WR_DCACPU)
+#define G_FW_RI_RES_WR_DCACPU(x)	\
+    (((x) >> S_FW_RI_RES_WR_DCACPU) & M_FW_RI_RES_WR_DCACPU)
+
+#define S_FW_RI_RES_WR_FBMIN	23
+#define M_FW_RI_RES_WR_FBMIN	0x7
+#define V_FW_RI_RES_WR_FBMIN(x)	((x) << S_FW_RI_RES_WR_FBMIN)
+#define G_FW_RI_RES_WR_FBMIN(x)	\
+    (((x) >> S_FW_RI_RES_WR_FBMIN) & M_FW_RI_RES_WR_FBMIN)
+
+#define S_FW_RI_RES_WR_FBMAX	20
+#define M_FW_RI_RES_WR_FBMAX	0x7
+#define V_FW_RI_RES_WR_FBMAX(x)	((x) << S_FW_RI_RES_WR_FBMAX)
+#define G_FW_RI_RES_WR_FBMAX(x)	\
+    (((x) >> S_FW_RI_RES_WR_FBMAX) & M_FW_RI_RES_WR_FBMAX)
+
+#define S_FW_RI_RES_WR_CIDXFTHRESHO	19
+#define M_FW_RI_RES_WR_CIDXFTHRESHO	0x1
+#define V_FW_RI_RES_WR_CIDXFTHRESHO(x)	((x) << S_FW_RI_RES_WR_CIDXFTHRESHO)
+#define G_FW_RI_RES_WR_CIDXFTHRESHO(x)	\
+    (((x) >> S_FW_RI_RES_WR_CIDXFTHRESHO) & M_FW_RI_RES_WR_CIDXFTHRESHO)
+#define F_FW_RI_RES_WR_CIDXFTHRESHO	V_FW_RI_RES_WR_CIDXFTHRESHO(1U)
+
+#define S_FW_RI_RES_WR_CIDXFTHRESH	16
+#define M_FW_RI_RES_WR_CIDXFTHRESH	0x7
+#define V_FW_RI_RES_WR_CIDXFTHRESH(x)	((x) << S_FW_RI_RES_WR_CIDXFTHRESH)
+#define G_FW_RI_RES_WR_CIDXFTHRESH(x)	\
+    (((x) >> S_FW_RI_RES_WR_CIDXFTHRESH) & M_FW_RI_RES_WR_CIDXFTHRESH)
+
+#define S_FW_RI_RES_WR_EQSIZE		0
+#define M_FW_RI_RES_WR_EQSIZE		0xffff
+#define V_FW_RI_RES_WR_EQSIZE(x)	((x) << S_FW_RI_RES_WR_EQSIZE)
+#define G_FW_RI_RES_WR_EQSIZE(x)	\
+    (((x) >> S_FW_RI_RES_WR_EQSIZE) & M_FW_RI_RES_WR_EQSIZE)
+
+#define S_FW_RI_RES_WR_IQANDST		15
+#define M_FW_RI_RES_WR_IQANDST		0x1
+#define V_FW_RI_RES_WR_IQANDST(x)	((x) << S_FW_RI_RES_WR_IQANDST)
+#define G_FW_RI_RES_WR_IQANDST(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQANDST) & M_FW_RI_RES_WR_IQANDST)
+#define F_FW_RI_RES_WR_IQANDST	V_FW_RI_RES_WR_IQANDST(1U)
+
+#define S_FW_RI_RES_WR_IQANUS		14
+#define M_FW_RI_RES_WR_IQANUS		0x1
+#define V_FW_RI_RES_WR_IQANUS(x)	((x) << S_FW_RI_RES_WR_IQANUS)
+#define G_FW_RI_RES_WR_IQANUS(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQANUS) & M_FW_RI_RES_WR_IQANUS)
+#define F_FW_RI_RES_WR_IQANUS	V_FW_RI_RES_WR_IQANUS(1U)
+
+#define S_FW_RI_RES_WR_IQANUD		12
+#define M_FW_RI_RES_WR_IQANUD		0x3
+#define V_FW_RI_RES_WR_IQANUD(x)	((x) << S_FW_RI_RES_WR_IQANUD)
+#define G_FW_RI_RES_WR_IQANUD(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQANUD) & M_FW_RI_RES_WR_IQANUD)
+
+#define S_FW_RI_RES_WR_IQANDSTINDEX	0
+#define M_FW_RI_RES_WR_IQANDSTINDEX	0xfff
+#define V_FW_RI_RES_WR_IQANDSTINDEX(x)	((x) << S_FW_RI_RES_WR_IQANDSTINDEX)
+#define G_FW_RI_RES_WR_IQANDSTINDEX(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQANDSTINDEX) & M_FW_RI_RES_WR_IQANDSTINDEX)
+
+#define S_FW_RI_RES_WR_IQDROPRSS	15
+#define M_FW_RI_RES_WR_IQDROPRSS	0x1
+#define V_FW_RI_RES_WR_IQDROPRSS(x)	((x) << S_FW_RI_RES_WR_IQDROPRSS)
+#define G_FW_RI_RES_WR_IQDROPRSS(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQDROPRSS) & M_FW_RI_RES_WR_IQDROPRSS)
+#define F_FW_RI_RES_WR_IQDROPRSS	V_FW_RI_RES_WR_IQDROPRSS(1U)
+
+#define S_FW_RI_RES_WR_IQGTSMODE	14
+#define M_FW_RI_RES_WR_IQGTSMODE	0x1
+#define V_FW_RI_RES_WR_IQGTSMODE(x)	((x) << S_FW_RI_RES_WR_IQGTSMODE)
+#define G_FW_RI_RES_WR_IQGTSMODE(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQGTSMODE) & M_FW_RI_RES_WR_IQGTSMODE)
+#define F_FW_RI_RES_WR_IQGTSMODE	V_FW_RI_RES_WR_IQGTSMODE(1U)
+
+#define S_FW_RI_RES_WR_IQPCIECH		12
+#define M_FW_RI_RES_WR_IQPCIECH		0x3
+#define V_FW_RI_RES_WR_IQPCIECH(x)	((x) << S_FW_RI_RES_WR_IQPCIECH)
+#define G_FW_RI_RES_WR_IQPCIECH(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQPCIECH) & M_FW_RI_RES_WR_IQPCIECH)
+
+#define S_FW_RI_RES_WR_IQDCAEN		11
+#define M_FW_RI_RES_WR_IQDCAEN		0x1
+#define V_FW_RI_RES_WR_IQDCAEN(x)	((x) << S_FW_RI_RES_WR_IQDCAEN)
+#define G_FW_RI_RES_WR_IQDCAEN(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQDCAEN) & M_FW_RI_RES_WR_IQDCAEN)
+#define F_FW_RI_RES_WR_IQDCAEN	V_FW_RI_RES_WR_IQDCAEN(1U)
+
+#define S_FW_RI_RES_WR_IQDCACPU		6
+#define M_FW_RI_RES_WR_IQDCACPU		0x1f
+#define V_FW_RI_RES_WR_IQDCACPU(x)	((x) << S_FW_RI_RES_WR_IQDCACPU)
+#define G_FW_RI_RES_WR_IQDCACPU(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQDCACPU) & M_FW_RI_RES_WR_IQDCACPU)
+
+#define S_FW_RI_RES_WR_IQINTCNTTHRESH		4
+#define M_FW_RI_RES_WR_IQINTCNTTHRESH		0x3
+#define V_FW_RI_RES_WR_IQINTCNTTHRESH(x)	\
+    ((x) << S_FW_RI_RES_WR_IQINTCNTTHRESH)
+#define G_FW_RI_RES_WR_IQINTCNTTHRESH(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQINTCNTTHRESH) & M_FW_RI_RES_WR_IQINTCNTTHRESH)
+
+#define S_FW_RI_RES_WR_IQO	3
+#define M_FW_RI_RES_WR_IQO	0x1
+#define V_FW_RI_RES_WR_IQO(x)	((x) << S_FW_RI_RES_WR_IQO)
+#define G_FW_RI_RES_WR_IQO(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQO) & M_FW_RI_RES_WR_IQO)
+#define F_FW_RI_RES_WR_IQO	V_FW_RI_RES_WR_IQO(1U)
+
+#define S_FW_RI_RES_WR_IQCPRIO		2
+#define M_FW_RI_RES_WR_IQCPRIO		0x1
+#define V_FW_RI_RES_WR_IQCPRIO(x)	((x) << S_FW_RI_RES_WR_IQCPRIO)
+#define G_FW_RI_RES_WR_IQCPRIO(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQCPRIO) & M_FW_RI_RES_WR_IQCPRIO)
+#define F_FW_RI_RES_WR_IQCPRIO	V_FW_RI_RES_WR_IQCPRIO(1U)
+
+#define S_FW_RI_RES_WR_IQESIZE		0
+#define M_FW_RI_RES_WR_IQESIZE		0x3
+#define V_FW_RI_RES_WR_IQESIZE(x)	((x) << S_FW_RI_RES_WR_IQESIZE)
+#define G_FW_RI_RES_WR_IQESIZE(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQESIZE) & M_FW_RI_RES_WR_IQESIZE)
+
+#define S_FW_RI_RES_WR_IQNS	31
+#define M_FW_RI_RES_WR_IQNS	0x1
+#define V_FW_RI_RES_WR_IQNS(x)	((x) << S_FW_RI_RES_WR_IQNS)
+#define G_FW_RI_RES_WR_IQNS(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQNS) & M_FW_RI_RES_WR_IQNS)
+#define F_FW_RI_RES_WR_IQNS	V_FW_RI_RES_WR_IQNS(1U)
+
+#define S_FW_RI_RES_WR_IQRO	30
+#define M_FW_RI_RES_WR_IQRO	0x1
+#define V_FW_RI_RES_WR_IQRO(x)	((x) << S_FW_RI_RES_WR_IQRO)
+#define G_FW_RI_RES_WR_IQRO(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQRO) & M_FW_RI_RES_WR_IQRO)
+#define F_FW_RI_RES_WR_IQRO	V_FW_RI_RES_WR_IQRO(1U)
+
+struct fw_ri_rdma_write_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__be64 r2;
+	__be32 plen;
+	__be32 stag_sink;
+	__be64 to_sink;
+#ifndef C99_NOT_SUPPORTED
+	union {
+		struct fw_ri_immd immd_src[0];
+		struct fw_ri_isgl isgl_src[0];
+	} u;
+#endif
+};
+
+struct fw_ri_send_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__be32 sendop_pkd;
+	__be32 stag_inv;
+	__be32 plen;
+	__be32 r3;
+	__be64 r4;
+#ifndef C99_NOT_SUPPORTED
+	union {
+		struct fw_ri_immd immd_src[0];
+		struct fw_ri_isgl isgl_src[0];
+	} u;
+#endif
+};
+
+#define S_FW_RI_SEND_WR_SENDOP		0
+#define M_FW_RI_SEND_WR_SENDOP		0xf
+#define V_FW_RI_SEND_WR_SENDOP(x)	((x) << S_FW_RI_SEND_WR_SENDOP)
+#define G_FW_RI_SEND_WR_SENDOP(x)	\
+    (((x) >> S_FW_RI_SEND_WR_SENDOP) & M_FW_RI_SEND_WR_SENDOP)
+
+struct fw_ri_rdma_read_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__be64 r2;
+	__be32 stag_sink;
+	__be32 to_sink_hi;
+	__be32 to_sink_lo;
+	__be32 plen;
+	__be32 stag_src;
+	__be32 to_src_hi;
+	__be32 to_src_lo;
+	__be32 r5;
+};
+
+struct fw_ri_recv_wr {
+	__u8   opcode;
+	__u8   r1;
+	__u16  wrid;
+	__u8   r2[3];
+	__u8   len16;
+	struct fw_ri_isgl isgl;
+};
+
+struct fw_ri_bind_mw_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__u8   qpbinde_to_dcacpu;
+	__u8   pgsz_shift;
+	__u8   addr_type;
+	__u8   mem_perms;
+	__be32 stag_mr;
+	__be32 stag_mw;
+	__be32 r3;
+	__be64 len_mw;
+	__be64 va_fbo;
+	__be64 r4;
+};
+
+#define S_FW_RI_BIND_MW_WR_QPBINDE	6
+#define M_FW_RI_BIND_MW_WR_QPBINDE	0x1
+#define V_FW_RI_BIND_MW_WR_QPBINDE(x)	((x) << S_FW_RI_BIND_MW_WR_QPBINDE)
+#define G_FW_RI_BIND_MW_WR_QPBINDE(x)	\
+    (((x) >> S_FW_RI_BIND_MW_WR_QPBINDE) & M_FW_RI_BIND_MW_WR_QPBINDE)
+#define F_FW_RI_BIND_MW_WR_QPBINDE	V_FW_RI_BIND_MW_WR_QPBINDE(1U)
+
+#define S_FW_RI_BIND_MW_WR_NS		5
+#define M_FW_RI_BIND_MW_WR_NS		0x1
+#define V_FW_RI_BIND_MW_WR_NS(x)	((x) << S_FW_RI_BIND_MW_WR_NS)
+#define G_FW_RI_BIND_MW_WR_NS(x)	\
+    (((x) >> S_FW_RI_BIND_MW_WR_NS) & M_FW_RI_BIND_MW_WR_NS)
+#define F_FW_RI_BIND_MW_WR_NS	V_FW_RI_BIND_MW_WR_NS(1U)
+
+#define S_FW_RI_BIND_MW_WR_DCACPU	0
+#define M_FW_RI_BIND_MW_WR_DCACPU	0x1f
+#define V_FW_RI_BIND_MW_WR_DCACPU(x)	((x) << S_FW_RI_BIND_MW_WR_DCACPU)
+#define G_FW_RI_BIND_MW_WR_DCACPU(x)	\
+    (((x) >> S_FW_RI_BIND_MW_WR_DCACPU) & M_FW_RI_BIND_MW_WR_DCACPU)
+
+struct fw_ri_fr_nsmr_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__u8   qpbinde_to_dcacpu;
+	__u8   pgsz_shift;
+	__u8   addr_type;
+	__u8   mem_perms;
+	__be32 stag;
+	__be32 len_hi;
+	__be32 len_lo;
+	__be32 va_hi;
+	__be32 va_lo_fbo;
+};
+
+#define S_FW_RI_FR_NSMR_WR_QPBINDE	6
+#define M_FW_RI_FR_NSMR_WR_QPBINDE	0x1
+#define V_FW_RI_FR_NSMR_WR_QPBINDE(x)	((x) << S_FW_RI_FR_NSMR_WR_QPBINDE)
+#define G_FW_RI_FR_NSMR_WR_QPBINDE(x)	\
+    (((x) >> S_FW_RI_FR_NSMR_WR_QPBINDE) & M_FW_RI_FR_NSMR_WR_QPBINDE)
+#define F_FW_RI_FR_NSMR_WR_QPBINDE	V_FW_RI_FR_NSMR_WR_QPBINDE(1U)
+
+#define S_FW_RI_FR_NSMR_WR_NS		5
+#define M_FW_RI_FR_NSMR_WR_NS		0x1
+#define V_FW_RI_FR_NSMR_WR_NS(x)	((x) << S_FW_RI_FR_NSMR_WR_NS)
+#define G_FW_RI_FR_NSMR_WR_NS(x)	\
+    (((x) >> S_FW_RI_FR_NSMR_WR_NS) & M_FW_RI_FR_NSMR_WR_NS)
+#define F_FW_RI_FR_NSMR_WR_NS	V_FW_RI_FR_NSMR_WR_NS(1U)
+
+#define S_FW_RI_FR_NSMR_WR_DCACPU	0
+#define M_FW_RI_FR_NSMR_WR_DCACPU	0x1f
+#define V_FW_RI_FR_NSMR_WR_DCACPU(x)	((x) << S_FW_RI_FR_NSMR_WR_DCACPU)
+#define G_FW_RI_FR_NSMR_WR_DCACPU(x)	\
+    (((x) >> S_FW_RI_FR_NSMR_WR_DCACPU) & M_FW_RI_FR_NSMR_WR_DCACPU)
+
+struct fw_ri_inv_lstag_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__be32 r2;
+	__be32 stag_inv;
+};
+
+enum fw_ri_type {
+	FW_RI_TYPE_INIT,
+	FW_RI_TYPE_FINI,
+	FW_RI_TYPE_TERMINATE
+};
+
+enum fw_ri_init_p2ptype {
+	FW_RI_INIT_P2PTYPE_RDMA_WRITE		= FW_RI_RDMA_WRITE,
+	FW_RI_INIT_P2PTYPE_READ_REQ		= FW_RI_READ_REQ,
+	FW_RI_INIT_P2PTYPE_SEND			= FW_RI_SEND,
+	FW_RI_INIT_P2PTYPE_SEND_WITH_INV	= FW_RI_SEND_WITH_INV,
+	FW_RI_INIT_P2PTYPE_SEND_WITH_SE		= FW_RI_SEND_WITH_SE,
+	FW_RI_INIT_P2PTYPE_SEND_WITH_SE_INV	= FW_RI_SEND_WITH_SE_INV,
+	FW_RI_INIT_P2PTYPE_DISABLED		= 0xf,
+};
+
+struct fw_ri_wr {
+	__be32 op_compl;
+	__be32 flowid_len16;
+	__u64  cookie;
+	union fw_ri {
+		struct fw_ri_init {
+			__u8   type;
+			__u8   mpareqbit_p2ptype;
+			__u8   r4[2];
+			__u8   mpa_attrs;
+			__u8   qp_caps;
+			__be16 nrqe;
+			__be32 pdid;
+			__be32 qpid;
+			__be32 sq_eqid;
+			__be32 rq_eqid;
+			__be32 scqid;
+			__be32 rcqid;
+			__be32 ord_max;
+			__be32 ird_max;
+			__be32 iss;
+			__be32 irs;
+			__be32 hwrqsize;
+			__be32 hwrqaddr;
+			__be64 r5;
+			union fw_ri_init_p2p {
+				struct fw_ri_rdma_write_wr write;
+				struct fw_ri_rdma_read_wr read;
+				struct fw_ri_send_wr send;
+			} u;
+		} init;
+		struct fw_ri_fini {
+			__u8   type;
+			__u8   r3[7];
+			__be64 r4;
+		} fini;
+		struct fw_ri_terminate {
+			__u8   type;
+			__u8   r3[3];
+			__be32 immdlen;
+			__u8   termmsg[40];
+		} terminate;
+	} u;
+};
+
+#define S_FW_RI_WR_MPAREQBIT	7
+#define M_FW_RI_WR_MPAREQBIT	0x1
+#define V_FW_RI_WR_MPAREQBIT(x)	((x) << S_FW_RI_WR_MPAREQBIT)
+#define G_FW_RI_WR_MPAREQBIT(x)	\
+    (((x) >> S_FW_RI_WR_MPAREQBIT) & M_FW_RI_WR_MPAREQBIT)
+#define F_FW_RI_WR_MPAREQBIT	V_FW_RI_WR_MPAREQBIT(1U)
+
+#define S_FW_RI_WR_P2PTYPE	0
+#define M_FW_RI_WR_P2PTYPE	0xf
+#define V_FW_RI_WR_P2PTYPE(x)	((x) << S_FW_RI_WR_P2PTYPE)
+#define G_FW_RI_WR_P2PTYPE(x)	\
+    (((x) >> S_FW_RI_WR_P2PTYPE) & M_FW_RI_WR_P2PTYPE)
+
+struct tcp_options {
+	__be16 mss;
+	__u8 wsf;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8:4;
+	__u8 unknown:1;
+	__u8:1;
+	__u8 sack:1;
+	__u8 tstamp:1;
+#else
+	__u8 tstamp:1;
+	__u8 sack:1;
+	__u8:1;
+	__u8 unknown:1;
+	__u8:4;
+#endif
+};
+
+struct cpl_pass_accept_req {
+	union opcode_tid ot;
+	__be16 rsvd;
+	__be16 len;
+	__be32 hdr_len;
+	__be16 vlan;
+	__be16 l2info;
+	__be32 tos_stid;
+	struct tcp_options tcpopt;
+};
+
+/* cpl_pass_accept_req.hdr_len fields */
+#define S_SYN_RX_CHAN    0
+#define M_SYN_RX_CHAN    0xF
+#define V_SYN_RX_CHAN(x) ((x) << S_SYN_RX_CHAN)
+#define G_SYN_RX_CHAN(x) (((x) >> S_SYN_RX_CHAN) & M_SYN_RX_CHAN)
+
+#define S_TCP_HDR_LEN    10
+#define M_TCP_HDR_LEN    0x3F
+#define V_TCP_HDR_LEN(x) ((x) << S_TCP_HDR_LEN)
+#define G_TCP_HDR_LEN(x) (((x) >> S_TCP_HDR_LEN) & M_TCP_HDR_LEN)
+
+#define S_IP_HDR_LEN    16
+#define M_IP_HDR_LEN    0x3FF
+#define V_IP_HDR_LEN(x) ((x) << S_IP_HDR_LEN)
+#define G_IP_HDR_LEN(x) (((x) >> S_IP_HDR_LEN) & M_IP_HDR_LEN)
+
+#define S_ETH_HDR_LEN    26
+#define M_ETH_HDR_LEN    0x1F
+#define V_ETH_HDR_LEN(x) ((x) << S_ETH_HDR_LEN)
+#define G_ETH_HDR_LEN(x) (((x) >> S_ETH_HDR_LEN) & M_ETH_HDR_LEN)
+
+/* cpl_pass_accept_req.l2info fields */
+#define S_SYN_MAC_IDX    0
+#define M_SYN_MAC_IDX    0x1FF
+#define V_SYN_MAC_IDX(x) ((x) << S_SYN_MAC_IDX)
+#define G_SYN_MAC_IDX(x) (((x) >> S_SYN_MAC_IDX) & M_SYN_MAC_IDX)
+
+#define S_SYN_XACT_MATCH    9
+#define V_SYN_XACT_MATCH(x) ((x) << S_SYN_XACT_MATCH)
+#define F_SYN_XACT_MATCH    V_SYN_XACT_MATCH(1U)
+
+#define S_SYN_INTF    12
+#define M_SYN_INTF    0xF
+#define V_SYN_INTF(x) ((x) << S_SYN_INTF)
+#define G_SYN_INTF(x) (((x) >> S_SYN_INTF) & M_SYN_INTF)
+
+struct ulptx_idata {
+	__be32 cmd_more;
+	__be32 len;
+};
+
+#define S_ULPTX_NSGE    0
+#define M_ULPTX_NSGE    0xFFFF
+#define V_ULPTX_NSGE(x) ((x) << S_ULPTX_NSGE)
+#endif /* _T4FW_RI_API_H_ */
diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h
new file mode 100644
index 0000000..ed6414a
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/user.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __C4IW_USER_H__
+#define __C4IW_USER_H__
+
+#define C4IW_UVERBS_ABI_VERSION	1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+struct c4iw_create_cq_resp {
+	__u64 key;
+	__u64 gts_key;
+	__u64 memsize;
+	__u32 cqid;
+	__u32 size;
+	__u32 qid_mask;
+};
+
+struct c4iw_create_qp_resp {
+	__u64 sq_key;
+	__u64 rq_key;
+	__u64 sq_db_gts_key;
+	__u64 rq_db_gts_key;
+	__u64 sq_memsize;
+	__u64 rq_memsize;
+	__u32 sqid;
+	__u32 rqid;
+	__u32 sq_size;
+	__u32 rq_size;
+	__u32 qid_mask;
+};
+#endif
-- 
cgit v0.10.2


From ce6e74f23d8018f50609f694b6177c139486ebe5 Mon Sep 17 00:00:00 2001
From: Chien Tung <chien.tin.tung@intel.com>
Date: Tue, 9 Mar 2010 21:50:40 +0000
Subject: RDMA/nes: Make nesadapter->phy_lock usage consistent

nes_{read,write}_1G_phy_reg() are using phy_lock while
nes_{read,write}_10G_phy_reg() leave that to the caller.

Remove phy_lock from 1G routines and leave the locking to the caller.
Add additional phy_lock calls around 1G read/write.

Signed-off-by: Chien Tung <chien.tin.tung@intel.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
index c36a3f5..8b67207 100644
--- a/drivers/infiniband/hw/nes/nes_hw.c
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -2458,7 +2458,6 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
 		return;
 	}
 	nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT;
-	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
 
 	/* ack the MAC interrupt */
 	mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200));
@@ -2469,11 +2468,9 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
 
 	if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) {
 		nesdev->link_status_interrupts++;
-		if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS))) {
-			spin_lock_irqsave(&nesadapter->phy_lock, flags);
+		if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS)))
 			nes_reset_link(nesdev, mac_index);
-			spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-		}
+
 		/* read the PHY interrupt status register */
 		if ((nesadapter->OneG_Mode) &&
 		(nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
@@ -2587,6 +2584,7 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
 				break;
 			}
 		}
+		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
 
 		if (phy_data & 0x0004) {
 			if (wide_ppm_offset &&
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
index b7c813f..9f4cadf 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -1461,11 +1461,14 @@ static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd
 			et_cmd->transceiver = XCVR_INTERNAL;
 			et_cmd->phy_address = mac_index;
 		} else {
+			unsigned long flags;
 			et_cmd->supported   = SUPPORTED_1000baseT_Full
 					    | SUPPORTED_Autoneg;
 			et_cmd->advertising = ADVERTISED_1000baseT_Full
 					    | ADVERTISED_Autoneg;
+			spin_lock_irqsave(&nesadapter->phy_lock, flags);
 			nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+			spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
 			if (phy_data & 0x1000)
 				et_cmd->autoneg = AUTONEG_ENABLE;
 			else
@@ -1503,12 +1506,15 @@ static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd
 	struct nes_vnic *nesvnic = netdev_priv(netdev);
 	struct nes_device *nesdev = nesvnic->nesdev;
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u16 phy_data;
 
 	if ((nesadapter->OneG_Mode) &&
 	    (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) {
-		nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index],
-				&phy_data);
+		unsigned long flags;
+		u16 phy_data;
+		u8 phy_index = nesadapter->phy_index[nesdev->mac_index];
+
+		spin_lock_irqsave(&nesadapter->phy_lock, flags);
+		nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
 		if (et_cmd->autoneg) {
 			/* Turn on Full duplex, Autoneg, and restart autonegotiation */
 			phy_data |= 0x1300;
@@ -1516,8 +1522,8 @@ static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd
 			/* Turn off autoneg */
 			phy_data &= ~0x1000;
 		}
-		nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index],
-				phy_data);
+		nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data);
+		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
 	}
 
 	return 0;
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
index 186623d..a9f5dd2 100644
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -381,12 +381,8 @@ static u16 nes_read16_eeprom(void __iomem *addr, u16 offset)
  */
 void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data)
 {
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
 	u32 u32temp;
 	u32 counter;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nesadapter->phy_lock, flags);
 
 	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
 			0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
@@ -402,8 +398,6 @@ void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u1
 	if (!(u32temp & 1))
 		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
 				u32temp);
-
-	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
 }
 
 
@@ -414,14 +408,11 @@ void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u1
  */
 void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data)
 {
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
 	u32 u32temp;
 	u32 counter;
-	unsigned long flags;
 
 	/* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n",
 			phy_addr, nesdev->mac_index); */
-	spin_lock_irqsave(&nesadapter->phy_lock, flags);
 
 	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
 			0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
@@ -441,7 +432,6 @@ void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16
 	} else {
 		*data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
 	}
-	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
 }
 
 
-- 
cgit v0.10.2


From 53978b46cd946ef1dba96ed6d0276ff656dd5d42 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Wed, 21 Apr 2010 15:58:28 -0700
Subject: RDMA/nes: Make unnecessarily global functions static

This allows the compiler to do a bit better; on my x86-64 build:

add/remove: 0/2 grow/shrink: 1/0 up/down: 2288/-2365 (-77)
function                                     old     new   delta
nes_init_phy                                 273    2561   +2288
nes_init_1g_phy                              469       -    -469
nes_init_2025_phy                           1896       -   -1896

Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
index 8b67207..86acb7d 100644
--- a/drivers/infiniband/hw/nes/nes_hw.c
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -1297,7 +1297,7 @@ int nes_destroy_cqp(struct nes_device *nesdev)
 /**
  * nes_init_1g_phy
  */
-int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
+static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
 {
 	u32 counter = 0;
 	u16 phy_data;
@@ -1351,7 +1351,7 @@ int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
 /**
  * nes_init_2025_phy
  */
-int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
+static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
 {
 	u32 temp_phy_data = 0;
 	u32 temp_phy_data2 = 0;
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index e54f312..925e1f2 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -374,7 +374,7 @@ static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 /*
  * nes_alloc_fast_reg_mr
  */
-struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len)
+static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len)
 {
 	struct nes_pd *nespd = to_nespd(ibpd);
 	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-- 
cgit v0.10.2


From 5d7220e8dc24feed4bbd66667b7696906a147ac4 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 15 Apr 2010 11:29:04 +0900
Subject: RDMA/cma: Randomize local port allocation

Randomize local port allocation in the way sctp_get_port_local() does.
Update rover at the end of loop since we're likely to pick a valid port
on the first try.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 6d77706..6ae418e 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -79,7 +79,6 @@ static DEFINE_IDR(sdp_ps);
 static DEFINE_IDR(tcp_ps);
 static DEFINE_IDR(udp_ps);
 static DEFINE_IDR(ipoib_ps);
-static int next_port;
 
 struct cma_device {
 	struct list_head	list;
@@ -1970,47 +1969,33 @@ err1:
 
 static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
 {
-	struct rdma_bind_list *bind_list;
-	int port, ret, low, high;
-
-	bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
-	if (!bind_list)
-		return -ENOMEM;
-
-retry:
-	/* FIXME: add proper port randomization per like inet_csk_get_port */
-	do {
-		ret = idr_get_new_above(ps, bind_list, next_port, &port);
-	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
-
-	if (ret)
-		goto err1;
+	static unsigned int last_used_port;
+	int low, high, remaining;
+	unsigned int rover;
 
 	inet_get_local_port_range(&low, &high);
-	if (port > high) {
-		if (next_port != low) {
-			idr_remove(ps, port);
-			next_port = low;
-			goto retry;
-		}
-		ret = -EADDRNOTAVAIL;
-		goto err2;
+	remaining = (high - low) + 1;
+	rover = net_random() % remaining + low;
+retry:
+	if (last_used_port != rover &&
+	    !idr_find(ps, (unsigned short) rover)) {
+		int ret = cma_alloc_port(ps, id_priv, rover);
+		/*
+		 * Remember previously used port number in order to avoid
+		 * re-using same port immediately after it is closed.
+		 */
+		if (!ret)
+			last_used_port = rover;
+		if (ret != -EADDRNOTAVAIL)
+			return ret;
 	}
-
-	if (port == high)
-		next_port = low;
-	else
-		next_port = port + 1;
-
-	bind_list->ps = ps;
-	bind_list->port = (unsigned short) port;
-	cma_bind_port(bind_list, id_priv);
-	return 0;
-err2:
-	idr_remove(ps, port);
-err1:
-	kfree(bind_list);
-	return ret;
+	if (--remaining) {
+		rover++;
+		if ((rover < low) || (rover > high))
+			rover = low;
+		goto retry;
+	}
+	return -EADDRNOTAVAIL;
 }
 
 static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
@@ -2995,12 +2980,7 @@ static void cma_remove_one(struct ib_device *device)
 
 static int __init cma_init(void)
 {
-	int ret, low, high, remaining;
-
-	get_random_bytes(&next_port, sizeof next_port);
-	inet_get_local_port_range(&low, &high);
-	remaining = (high - low) + 1;
-	next_port = ((unsigned int) next_port % remaining) + low;
+	int ret;
 
 	cma_wq = create_singlethread_workqueue("rdma_cm");
 	if (!cma_wq)
-- 
cgit v0.10.2


From 5e80ba8ff0bd33ff4af2365969a231cbdb98cafb Mon Sep 17 00:00:00 2001
From: Vladimir Sokolovsky <vlad@mellanox.co.il>
Date: Wed, 14 Apr 2010 17:23:01 +0300
Subject: IB/core: Add support for masked atomic operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 - Add new IB_WR_MASKED_ATOMIC_CMP_AND_SWP and IB_WR_MASKED_ATOMIC_FETCH_AND_ADD
   send opcodes that can be used to post "masked atomic compare and
   swap" and "masked atomic fetch and add" work request respectively.
 - Add masked_atomic_cap capability.
 - Add mask fields to atomic struct of ib_send_wr
 - Add new opcodes to ib_wc_opcode

The new operations are described more precisely below:

* Masked Compare and Swap (MskCmpSwap)

The MskCmpSwap atomic operation is an extension to the CmpSwap
operation defined in the IB spec.  MskCmpSwap allows the user to
select a portion of the 64 bit target data for the “compare” check as
well as to restrict the swap to a (possibly different) portion.  The
pseudo code below describes the operation:

| atomic_response = *va
| if (!((compare_add ^ *va) & compare_add_mask)) then
|     *va = (*va & ~(swap_mask)) | (swap & swap_mask)
|
| return atomic_response

The additional operands are carried in the Extended Transport Header.
Atomic response generation and packet format for MskCmpSwap is as for
standard IB Atomic operations.

* Masked Fetch and Add (MFetchAdd)

The MFetchAdd Atomic operation extends the functionality of the
standard IB FetchAdd by allowing the user to split the target into
multiple fields of selectable length. The atomic add is done
independently on each one of this fields. A bit set in the
field_boundary parameter specifies the field boundaries. The pseudo
code below describes the operation:

| bit_adder(ci, b1, b2, *co)
| {
|	value = ci + b1 + b2
|	*co = !!(value & 2)
|
|	return value & 1
| }
|
| #define MASK_IS_SET(mask, attr)      (!!((mask)&(attr)))
| bit_position = 1
| carry = 0
| atomic_response = 0
|
| for i = 0 to 63
| {
|         if ( i != 0 )
|                 bit_position =  bit_position << 1
|
|         bit_add_res = bit_adder(carry, MASK_IS_SET(*va, bit_position),
|                                 MASK_IS_SET(compare_add, bit_position), &new_carry)
|         if (bit_add_res)
|                 atomic_response |= bit_position
|
|         carry = ((new_carry) && (!MASK_IS_SET(compare_add_mask, bit_position)))
| }
|
| return atomic_response

Signed-off-by: Vladimir Sokolovsky <vlad@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index a585e0f..310d314 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -136,6 +136,7 @@ struct ib_device_attr {
 	int			max_qp_init_rd_atom;
 	int			max_ee_init_rd_atom;
 	enum ib_atomic_cap	atomic_cap;
+	enum ib_atomic_cap	masked_atomic_cap;
 	int			max_ee;
 	int			max_rdd;
 	int			max_mw;
@@ -467,6 +468,8 @@ enum ib_wc_opcode {
 	IB_WC_LSO,
 	IB_WC_LOCAL_INV,
 	IB_WC_FAST_REG_MR,
+	IB_WC_MASKED_COMP_SWAP,
+	IB_WC_MASKED_FETCH_ADD,
 /*
  * Set value of IB_WC_RECV so consumers can test if a completion is a
  * receive by testing (opcode & IB_WC_RECV).
@@ -689,6 +692,8 @@ enum ib_wr_opcode {
 	IB_WR_RDMA_READ_WITH_INV,
 	IB_WR_LOCAL_INV,
 	IB_WR_FAST_REG_MR,
+	IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
+	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
 };
 
 enum ib_send_flags {
@@ -731,6 +736,8 @@ struct ib_send_wr {
 			u64	remote_addr;
 			u64	compare_add;
 			u64	swap;
+			u64	compare_add_mask;
+			u64	swap_mask;
 			u32	rkey;
 		} atomic;
 		struct {
-- 
cgit v0.10.2


From 6fa8f719844b8455033e295f720e739c1dc3804a Mon Sep 17 00:00:00 2001
From: Vladimir Sokolovsky <vlad@mellanox.co.il>
Date: Wed, 14 Apr 2010 17:23:39 +0300
Subject: IB/mlx4: Add support for masked atomic operations

Add support for masked atomic operations (masked compare and swap,
masked fetch and add).

Signed-off-by: Vladimir Sokolovsky <vlad@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index cc2ddd2..5a219a2 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -661,6 +661,14 @@ repoll:
 			wc->opcode    = IB_WC_FETCH_ADD;
 			wc->byte_len  = 8;
 			break;
+		case MLX4_OPCODE_MASKED_ATOMIC_CS:
+			wc->opcode    = IB_WC_MASKED_COMP_SWAP;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_MASKED_ATOMIC_FA:
+			wc->opcode    = IB_WC_MASKED_FETCH_ADD;
+			wc->byte_len  = 8;
+			break;
 		case MLX4_OPCODE_BIND_MW:
 			wc->opcode    = IB_WC_BIND_MW;
 			break;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 01f2a3f..3905141 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -139,6 +139,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 	props->local_ca_ack_delay  = dev->dev->caps.local_ca_ack_delay;
 	props->atomic_cap	   = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
 		IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+	props->masked_atomic_cap   = IB_ATOMIC_HCA;
 	props->max_pkeys	   = dev->dev->caps.pkey_table_len[1];
 	props->max_mcast_grp	   = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
 	props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 5643f4a..6a60827 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -74,17 +74,19 @@ enum {
 };
 
 static const __be32 mlx4_ib_opcode[] = {
-	[IB_WR_SEND]			= cpu_to_be32(MLX4_OPCODE_SEND),
-	[IB_WR_LSO]			= cpu_to_be32(MLX4_OPCODE_LSO),
-	[IB_WR_SEND_WITH_IMM]		= cpu_to_be32(MLX4_OPCODE_SEND_IMM),
-	[IB_WR_RDMA_WRITE]		= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
-	[IB_WR_RDMA_WRITE_WITH_IMM]	= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
-	[IB_WR_RDMA_READ]		= cpu_to_be32(MLX4_OPCODE_RDMA_READ),
-	[IB_WR_ATOMIC_CMP_AND_SWP]	= cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
-	[IB_WR_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
-	[IB_WR_SEND_WITH_INV]		= cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
-	[IB_WR_LOCAL_INV]		= cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
-	[IB_WR_FAST_REG_MR]		= cpu_to_be32(MLX4_OPCODE_FMR),
+	[IB_WR_SEND]				= cpu_to_be32(MLX4_OPCODE_SEND),
+	[IB_WR_LSO]				= cpu_to_be32(MLX4_OPCODE_LSO),
+	[IB_WR_SEND_WITH_IMM]			= cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+	[IB_WR_RDMA_WRITE]			= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+	[IB_WR_RDMA_WRITE_WITH_IMM]		= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+	[IB_WR_RDMA_READ]			= cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+	[IB_WR_ATOMIC_CMP_AND_SWP]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+	[IB_WR_ATOMIC_FETCH_AND_ADD]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+	[IB_WR_SEND_WITH_INV]			= cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+	[IB_WR_LOCAL_INV]			= cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+	[IB_WR_FAST_REG_MR]			= cpu_to_be32(MLX4_OPCODE_FMR),
+	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
+	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
 };
 
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
@@ -1407,6 +1409,9 @@ static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *
 	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
 		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
 		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
+	} else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add_mask);
 	} else {
 		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
 		aseg->compare  = 0;
@@ -1414,6 +1419,15 @@ static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *
 
 }
 
+static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
+				  struct ib_send_wr *wr)
+{
+	aseg->swap_add		= cpu_to_be64(wr->wr.atomic.swap);
+	aseg->swap_add_mask	= cpu_to_be64(wr->wr.atomic.swap_mask);
+	aseg->compare		= cpu_to_be64(wr->wr.atomic.compare_add);
+	aseg->compare_mask	= cpu_to_be64(wr->wr.atomic.compare_add_mask);
+}
+
 static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
 			     struct ib_send_wr *wr)
 {
@@ -1567,6 +1581,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 			switch (wr->opcode) {
 			case IB_WR_ATOMIC_CMP_AND_SWP:
 			case IB_WR_ATOMIC_FETCH_AND_ADD:
+			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
 				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
 					      wr->wr.atomic.rkey);
 				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
@@ -1579,6 +1594,19 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 
 				break;
 
+			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+
+				set_masked_atomic_seg(wqe, wr);
+				wqe  += sizeof (struct mlx4_wqe_masked_atomic_seg);
+
+				size += (sizeof (struct mlx4_wqe_raddr_seg) +
+					 sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
+
+				break;
+
 			case IB_WR_RDMA_READ:
 			case IB_WR_RDMA_WRITE:
 			case IB_WR_RDMA_WRITE_WITH_IMM:
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index e92d1bf..7a7f9c1 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -123,8 +123,8 @@ enum {
 	MLX4_OPCODE_RDMA_READ		= 0x10,
 	MLX4_OPCODE_ATOMIC_CS		= 0x11,
 	MLX4_OPCODE_ATOMIC_FA		= 0x12,
-	MLX4_OPCODE_ATOMIC_MASK_CS	= 0x14,
-	MLX4_OPCODE_ATOMIC_MASK_FA	= 0x15,
+	MLX4_OPCODE_MASKED_ATOMIC_CS	= 0x14,
+	MLX4_OPCODE_MASKED_ATOMIC_FA	= 0x15,
 	MLX4_OPCODE_BIND_MW		= 0x18,
 	MLX4_OPCODE_FMR			= 0x19,
 	MLX4_OPCODE_LOCAL_INVAL		= 0x1b,
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 9f29d86..7abe643 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -285,6 +285,13 @@ struct mlx4_wqe_atomic_seg {
 	__be64			compare;
 };
 
+struct mlx4_wqe_masked_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+	__be64			swap_add_mask;
+	__be64			compare_mask;
+};
+
 struct mlx4_wqe_data_seg {
 	__be32			byte_count;
 	__be32			lkey;
-- 
cgit v0.10.2


From d414371795d54fa916938f948105d08928abfbb9 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Thu, 4 Mar 2010 13:16:52 +0000
Subject: IPoIB: Allow disabling/enabling TSO on the fly through ethtool

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index d10b4ec..40e8584 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -49,6 +49,25 @@ static u32 ipoib_get_rx_csum(struct net_device *dev)
 		!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 }
 
+static int ipoib_set_tso(struct net_device *dev, u32 data)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (data) {
+		if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
+		    (dev->features & NETIF_F_SG) &&
+		    (priv->hca_caps & IB_DEVICE_UD_TSO)) {
+			dev->features |= NETIF_F_TSO;
+		} else {
+			ipoib_warn(priv, "can't set TSO on\n");
+			return -EOPNOTSUPP;
+		}
+	} else
+		dev->features &= ~NETIF_F_TSO;
+
+	return 0;
+}
+
 static int ipoib_get_coalesce(struct net_device *dev,
 			      struct ethtool_coalesce *coal)
 {
@@ -131,6 +150,7 @@ static void ipoib_get_ethtool_stats(struct net_device *dev,
 static const struct ethtool_ops ipoib_ethtool_ops = {
 	.get_drvinfo		= ipoib_get_drvinfo,
 	.get_rx_csum		= ipoib_get_rx_csum,
+	.set_tso		= ipoib_set_tso,
 	.get_coalesce		= ipoib_get_coalesce,
 	.set_coalesce		= ipoib_set_coalesce,
 	.get_flags		= ethtool_op_get_flags,
-- 
cgit v0.10.2


From 617c9a7e398878d036a3aa9a063ccba145854b45 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Wed, 28 Apr 2010 14:57:40 -0700
Subject: RDMA/cxgb3: Shrink .text with compile-time init of handlers arrays

Using compile-time designated initializers for the handler arrays
instead of open-coding the initialization in iwch_cm_init() is (IMHO)
cleaner, and leads to substantially smaller code: on my x86-64 build,
bloat-o-meter shows:

add/remove: 0/1 grow/shrink: 4/3 up/down: 4/-1682 (-1678)
function                                     old     new   delta
tx_ack                                       167     168      +1
state_set                                     55      56      +1
start_ep_timer                                99     100      +1
pass_establish                               177     178      +1
act_open_req_arp_failure                      39      38      -1
sched                                         84      82      -2
iwch_cm_init                                 442      91    -351
work_handlers                               1328       -   -1328

Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
index 63f975f..8e77dc5 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -47,8 +47,6 @@ MODULE_DESCRIPTION("Chelsio T3 RDMA Driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(DRV_VERSION);
 
-cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
-
 static void open_rnic_dev(struct t3cdev *);
 static void close_rnic_dev(struct t3cdev *);
 static void iwch_event_handler(struct t3cdev *, u32, u32);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index cfd6db0..ebfb117 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -102,12 +102,9 @@ static unsigned int cong_flavor = 1;
 module_param(cong_flavor, uint, 0644);
 MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)");
 
-static void process_work(struct work_struct *work);
 static struct workqueue_struct *workq;
-static DECLARE_WORK(skb_work, process_work);
 
 static struct sk_buff_head rxq;
-static cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS];
 
 static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
 static void ep_timeout(unsigned long arg);
@@ -302,27 +299,6 @@ static void release_ep_resources(struct iwch_ep *ep)
 	put_ep(&ep->com);
 }
 
-static void process_work(struct work_struct *work)
-{
-	struct sk_buff *skb = NULL;
-	void *ep;
-	struct t3cdev *tdev;
-	int ret;
-
-	while ((skb = skb_dequeue(&rxq))) {
-		ep = *((void **) (skb->cb));
-		tdev = *((struct t3cdev **) (skb->cb + sizeof(void *)));
-		ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep);
-		if (ret & CPL_RET_BUF_DONE)
-			kfree_skb(skb);
-
-		/*
-		 * ep was referenced in sched(), and is freed here.
-		 */
-		put_ep((struct iwch_ep_common *)ep);
-	}
-}
-
 static int status2errno(int status)
 {
 	switch (status) {
@@ -2157,7 +2133,49 @@ int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
 
 /*
  * All the CM events are handled on a work queue to have a safe context.
+ * These are the real handlers that are called from the work queue.
  */
+static const cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH]	= act_establish,
+	[CPL_ACT_OPEN_RPL]	= act_open_rpl,
+	[CPL_RX_DATA]		= rx_data,
+	[CPL_TX_DMA_ACK]	= tx_ack,
+	[CPL_ABORT_RPL_RSS]	= abort_rpl,
+	[CPL_ABORT_RPL]		= abort_rpl,
+	[CPL_PASS_OPEN_RPL]	= pass_open_rpl,
+	[CPL_CLOSE_LISTSRV_RPL]	= close_listsrv_rpl,
+	[CPL_PASS_ACCEPT_REQ]	= pass_accept_req,
+	[CPL_PASS_ESTABLISH]	= pass_establish,
+	[CPL_PEER_CLOSE]	= peer_close,
+	[CPL_ABORT_REQ_RSS]	= peer_abort,
+	[CPL_CLOSE_CON_RPL]	= close_con_rpl,
+	[CPL_RDMA_TERMINATE]	= terminate,
+	[CPL_RDMA_EC_STATUS]	= ec_status,
+};
+
+static void process_work(struct work_struct *work)
+{
+	struct sk_buff *skb = NULL;
+	void *ep;
+	struct t3cdev *tdev;
+	int ret;
+
+	while ((skb = skb_dequeue(&rxq))) {
+		ep = *((void **) (skb->cb));
+		tdev = *((struct t3cdev **) (skb->cb + sizeof(void *)));
+		ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep);
+		if (ret & CPL_RET_BUF_DONE)
+			kfree_skb(skb);
+
+		/*
+		 * ep was referenced in sched(), and is freed here.
+		 */
+		put_ep((struct iwch_ep_common *)ep);
+	}
+}
+
+static DECLARE_WORK(skb_work, process_work);
+
 static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 {
 	struct iwch_ep_common *epc = ctx;
@@ -2189,6 +2207,29 @@ static int set_tcb_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 	return CPL_RET_BUF_DONE;
 }
 
+/*
+ * All upcalls from the T3 Core go to sched() to schedule the
+ * processing on a work queue.
+ */
+cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH]	= sched,
+	[CPL_ACT_OPEN_RPL]	= sched,
+	[CPL_RX_DATA]		= sched,
+	[CPL_TX_DMA_ACK]	= sched,
+	[CPL_ABORT_RPL_RSS]	= sched,
+	[CPL_ABORT_RPL]		= sched,
+	[CPL_PASS_OPEN_RPL]	= sched,
+	[CPL_CLOSE_LISTSRV_RPL]	= sched,
+	[CPL_PASS_ACCEPT_REQ]	= sched,
+	[CPL_PASS_ESTABLISH]	= sched,
+	[CPL_PEER_CLOSE]	= sched,
+	[CPL_CLOSE_CON_RPL]	= sched,
+	[CPL_ABORT_REQ_RSS]	= sched,
+	[CPL_RDMA_TERMINATE]	= sched,
+	[CPL_RDMA_EC_STATUS]	= sched,
+	[CPL_SET_TCB_RPL]	= set_tcb_rpl,
+};
+
 int __init iwch_cm_init(void)
 {
 	skb_queue_head_init(&rxq);
@@ -2197,46 +2238,6 @@ int __init iwch_cm_init(void)
 	if (!workq)
 		return -ENOMEM;
 
-	/*
-	 * All upcalls from the T3 Core go to sched() to
-	 * schedule the processing on a work queue.
-	 */
-	t3c_handlers[CPL_ACT_ESTABLISH] = sched;
-	t3c_handlers[CPL_ACT_OPEN_RPL] = sched;
-	t3c_handlers[CPL_RX_DATA] = sched;
-	t3c_handlers[CPL_TX_DMA_ACK] = sched;
-	t3c_handlers[CPL_ABORT_RPL_RSS] = sched;
-	t3c_handlers[CPL_ABORT_RPL] = sched;
-	t3c_handlers[CPL_PASS_OPEN_RPL] = sched;
-	t3c_handlers[CPL_CLOSE_LISTSRV_RPL] = sched;
-	t3c_handlers[CPL_PASS_ACCEPT_REQ] = sched;
-	t3c_handlers[CPL_PASS_ESTABLISH] = sched;
-	t3c_handlers[CPL_PEER_CLOSE] = sched;
-	t3c_handlers[CPL_CLOSE_CON_RPL] = sched;
-	t3c_handlers[CPL_ABORT_REQ_RSS] = sched;
-	t3c_handlers[CPL_RDMA_TERMINATE] = sched;
-	t3c_handlers[CPL_RDMA_EC_STATUS] = sched;
-	t3c_handlers[CPL_SET_TCB_RPL] = set_tcb_rpl;
-
-	/*
-	 * These are the real handlers that are called from a
-	 * work queue.
-	 */
-	work_handlers[CPL_ACT_ESTABLISH] = act_establish;
-	work_handlers[CPL_ACT_OPEN_RPL] = act_open_rpl;
-	work_handlers[CPL_RX_DATA] = rx_data;
-	work_handlers[CPL_TX_DMA_ACK] = tx_ack;
-	work_handlers[CPL_ABORT_RPL_RSS] = abort_rpl;
-	work_handlers[CPL_ABORT_RPL] = abort_rpl;
-	work_handlers[CPL_PASS_OPEN_RPL] = pass_open_rpl;
-	work_handlers[CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl;
-	work_handlers[CPL_PASS_ACCEPT_REQ] = pass_accept_req;
-	work_handlers[CPL_PASS_ESTABLISH] = pass_establish;
-	work_handlers[CPL_PEER_CLOSE] = peer_close;
-	work_handlers[CPL_ABORT_REQ_RSS] = peer_abort;
-	work_handlers[CPL_CLOSE_CON_RPL] = close_con_rpl;
-	work_handlers[CPL_RDMA_TERMINATE] = terminate;
-	work_handlers[CPL_RDMA_EC_STATUS] = ec_status;
 	return 0;
 }
 
-- 
cgit v0.10.2


From be4c9bad9d0edb6bc3bd8fffc2f98e0e2112da39 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Wed, 5 May 2010 14:45:40 -0700
Subject: MAINTAINERS: Add cxgb4 and iw_cxgb4 entries

Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index 7a9ccda..8a32aec 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1719,6 +1719,20 @@ W:	http://www.openfabrics.org
 S:	Supported
 F:	drivers/infiniband/hw/cxgb3/
 
+CXGB4 ETHERNET DRIVER (CXGB4)
+M:	Dimitris Michailidis <dm@chelsio.com>
+L:	netdev@vger.kernel.org
+W:	http://www.chelsio.com
+S:	Supported
+F:	drivers/net/cxgb4/
+
+CXGB4 IWARP RNIC DRIVER (IW_CXGB4)
+M:	Steve Wise <swise@chelsio.com>
+L:	linux-rdma@vger.kernel.org
+W:	http://www.openfabrics.org
+S:	Supported
+F:	drivers/infiniband/hw/cxgb4/
+
 CYBERPRO FB DRIVER
 M:	Russell King <linux@arm.linux.org.uk>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 07b068b..30ce0a8 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -61,6 +61,10 @@ static char *states[] = {
 	NULL,
 };
 
+int c4iw_max_read_depth = 8;
+module_param(c4iw_max_read_depth, int, 0644);
+MODULE_PARM_DESC(c4iw_max_read_depth, "Per-connection max ORD/IRD (default=8)");
+
 static int enable_tcp_timestamps;
 module_param(enable_tcp_timestamps, int, 0644);
 MODULE_PARM_DESC(enable_tcp_timestamps, "Enable tcp timestamps (default=0)");
@@ -113,18 +117,17 @@ static int snd_win = 32 * 1024;
 module_param(snd_win, int, 0644);
 MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
 
-static void process_work(struct work_struct *work);
 static struct workqueue_struct *workq;
-static DECLARE_WORK(skb_work, process_work);
 
 static struct sk_buff_head rxq;
-static c4iw_handler_func work_handlers[NUM_CPL_CMDS];
-c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS];
 
 static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
 static void ep_timeout(unsigned long arg);
 static void connect_reply_upcall(struct c4iw_ep *ep, int status);
 
+static LIST_HEAD(timeout_list);
+static spinlock_t timeout_lock;
+
 static void start_ep_timer(struct c4iw_ep *ep)
 {
 	PDBG("%s ep %p\n", __func__, ep);
@@ -271,26 +274,6 @@ static void release_ep_resources(struct c4iw_ep *ep)
 	c4iw_put_ep(&ep->com);
 }
 
-static void process_work(struct work_struct *work)
-{
-	struct sk_buff *skb = NULL;
-	struct c4iw_dev *dev;
-	struct cpl_act_establish *rpl = cplhdr(skb);
-	unsigned int opcode;
-	int ret;
-
-	while ((skb = skb_dequeue(&rxq))) {
-		rpl = cplhdr(skb);
-		dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
-		opcode = rpl->ot.opcode;
-
-		BUG_ON(!work_handlers[opcode]);
-		ret = work_handlers[opcode](dev, skb);
-		if (!ret)
-			kfree_skb(skb);
-	}
-}
-
 static int status2errno(int status)
 {
 	switch (status) {
@@ -1795,76 +1778,6 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
 	return 0;
 }
 
-static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
-{
-	struct cpl_fw6_msg *rpl = cplhdr(skb);
-	struct c4iw_wr_wait *wr_waitp;
-	int ret;
-
-	PDBG("%s type %u\n", __func__, rpl->type);
-
-	switch (rpl->type) {
-	case 1:
-		ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff);
-		wr_waitp = (__force struct c4iw_wr_wait *)rpl->data[1];
-		PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret);
-		if (wr_waitp) {
-			wr_waitp->ret = ret;
-			wr_waitp->done = 1;
-			wake_up(&wr_waitp->wait);
-		}
-		break;
-	case 2:
-		c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]);
-		break;
-	default:
-		printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__,
-		       rpl->type);
-		break;
-	}
-	return 0;
-}
-
-static void ep_timeout(unsigned long arg)
-{
-	struct c4iw_ep *ep = (struct c4iw_ep *)arg;
-	struct c4iw_qp_attributes attrs;
-	unsigned long flags;
-	int abort = 1;
-
-	spin_lock_irqsave(&ep->com.lock, flags);
-	PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
-	     ep->com.state);
-	switch (ep->com.state) {
-	case MPA_REQ_SENT:
-		__state_set(&ep->com, ABORTING);
-		connect_reply_upcall(ep, -ETIMEDOUT);
-		break;
-	case MPA_REQ_WAIT:
-		__state_set(&ep->com, ABORTING);
-		break;
-	case CLOSING:
-	case MORIBUND:
-		if (ep->com.cm_id && ep->com.qp) {
-			attrs.next_state = C4IW_QP_STATE_ERROR;
-			c4iw_modify_qp(ep->com.qp->rhp,
-				     ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
-				     &attrs, 1);
-		}
-		__state_set(&ep->com, ABORTING);
-		break;
-	default:
-		printk(KERN_ERR "%s unexpected state ep %p tid %u state %u\n",
-			__func__, ep, ep->hwtid, ep->com.state);
-		WARN_ON(1);
-		abort = 0;
-	}
-	spin_unlock_irqrestore(&ep->com.lock, flags);
-	if (abort)
-		abort_connection(ep, NULL, GFP_ATOMIC);
-	c4iw_put_ep(&ep->com);
-}
-
 int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
 {
 	int err;
@@ -1904,8 +1817,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
 	BUG_ON(!qp);
 
-	if ((conn_param->ord > T4_MAX_READ_DEPTH) ||
-	    (conn_param->ird > T4_MAX_READ_DEPTH)) {
+	if ((conn_param->ord > c4iw_max_read_depth) ||
+	    (conn_param->ird > c4iw_max_read_depth)) {
 		abort_connection(ep, NULL, GFP_KERNEL);
 		err = -EINVAL;
 		goto err;
@@ -1968,6 +1881,11 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	struct net_device *pdev;
 	int step;
 
+	if ((conn_param->ord > c4iw_max_read_depth) ||
+	    (conn_param->ird > c4iw_max_read_depth)) {
+		err = -EINVAL;
+		goto out;
+	}
 	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
 	if (!ep) {
 		printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
@@ -2115,7 +2033,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
 	 */
 	ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, PF_INET, ep);
 	if (ep->stid == -1) {
-		printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
+		printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__);
 		err = -ENOMEM;
 		goto fail2;
 	}
@@ -2244,6 +2162,116 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 }
 
 /*
+ * These are the real handlers that are called from a
+ * work queue.
+ */
+static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH] = act_establish,
+	[CPL_ACT_OPEN_RPL] = act_open_rpl,
+	[CPL_RX_DATA] = rx_data,
+	[CPL_ABORT_RPL_RSS] = abort_rpl,
+	[CPL_ABORT_RPL] = abort_rpl,
+	[CPL_PASS_OPEN_RPL] = pass_open_rpl,
+	[CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl,
+	[CPL_PASS_ACCEPT_REQ] = pass_accept_req,
+	[CPL_PASS_ESTABLISH] = pass_establish,
+	[CPL_PEER_CLOSE] = peer_close,
+	[CPL_ABORT_REQ_RSS] = peer_abort,
+	[CPL_CLOSE_CON_RPL] = close_con_rpl,
+	[CPL_RDMA_TERMINATE] = terminate,
+	[CPL_FW4_ACK] = fw4_ack
+};
+
+static void process_timeout(struct c4iw_ep *ep)
+{
+	struct c4iw_qp_attributes attrs;
+	int abort = 1;
+
+	spin_lock_irq(&ep->com.lock);
+	PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
+	     ep->com.state);
+	switch (ep->com.state) {
+	case MPA_REQ_SENT:
+		__state_set(&ep->com, ABORTING);
+		connect_reply_upcall(ep, -ETIMEDOUT);
+		break;
+	case MPA_REQ_WAIT:
+		__state_set(&ep->com, ABORTING);
+		break;
+	case CLOSING:
+	case MORIBUND:
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = C4IW_QP_STATE_ERROR;
+			c4iw_modify_qp(ep->com.qp->rhp,
+				     ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
+				     &attrs, 1);
+		}
+		__state_set(&ep->com, ABORTING);
+		break;
+	default:
+		printk(KERN_ERR "%s unexpected state ep %p tid %u state %u\n",
+			__func__, ep, ep->hwtid, ep->com.state);
+		WARN_ON(1);
+		abort = 0;
+	}
+	spin_unlock_irq(&ep->com.lock);
+	if (abort)
+		abort_connection(ep, NULL, GFP_KERNEL);
+	c4iw_put_ep(&ep->com);
+}
+
+static void process_timedout_eps(void)
+{
+	struct c4iw_ep *ep;
+
+	spin_lock_irq(&timeout_lock);
+	while (!list_empty(&timeout_list)) {
+		struct list_head *tmp;
+
+		tmp = timeout_list.next;
+		list_del(tmp);
+		spin_unlock_irq(&timeout_lock);
+		ep = list_entry(tmp, struct c4iw_ep, entry);
+		process_timeout(ep);
+		spin_lock_irq(&timeout_lock);
+	}
+	spin_unlock_irq(&timeout_lock);
+}
+
+static void process_work(struct work_struct *work)
+{
+	struct sk_buff *skb = NULL;
+	struct c4iw_dev *dev;
+	struct cpl_act_establish *rpl = cplhdr(skb);
+	unsigned int opcode;
+	int ret;
+
+	while ((skb = skb_dequeue(&rxq))) {
+		rpl = cplhdr(skb);
+		dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
+		opcode = rpl->ot.opcode;
+
+		BUG_ON(!work_handlers[opcode]);
+		ret = work_handlers[opcode](dev, skb);
+		if (!ret)
+			kfree_skb(skb);
+	}
+	process_timedout_eps();
+}
+
+static DECLARE_WORK(skb_work, process_work);
+
+static void ep_timeout(unsigned long arg)
+{
+	struct c4iw_ep *ep = (struct c4iw_ep *)arg;
+
+	spin_lock(&timeout_lock);
+	list_add_tail(&ep->entry, &timeout_list);
+	spin_unlock(&timeout_lock);
+	queue_work(workq, &skb_work);
+}
+
+/*
  * All the CM events are handled on a work queue to have a safe context.
  */
 static int sched(struct c4iw_dev *dev, struct sk_buff *skb)
@@ -2273,58 +2301,74 @@ static int set_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	return 0;
 }
 
+static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_fw6_msg *rpl = cplhdr(skb);
+	struct c4iw_wr_wait *wr_waitp;
+	int ret;
+
+	PDBG("%s type %u\n", __func__, rpl->type);
+
+	switch (rpl->type) {
+	case 1:
+		ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff);
+		wr_waitp = (__force struct c4iw_wr_wait *)rpl->data[1];
+		PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret);
+		if (wr_waitp) {
+			wr_waitp->ret = ret;
+			wr_waitp->done = 1;
+			wake_up(&wr_waitp->wait);
+		}
+		break;
+	case 2:
+		c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]);
+		break;
+	default:
+		printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__,
+		       rpl->type);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Most upcalls from the T4 Core go to sched() to
+ * schedule the processing on a work queue.
+ */
+c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH] = sched,
+	[CPL_ACT_OPEN_RPL] = sched,
+	[CPL_RX_DATA] = sched,
+	[CPL_ABORT_RPL_RSS] = sched,
+	[CPL_ABORT_RPL] = sched,
+	[CPL_PASS_OPEN_RPL] = sched,
+	[CPL_CLOSE_LISTSRV_RPL] = sched,
+	[CPL_PASS_ACCEPT_REQ] = sched,
+	[CPL_PASS_ESTABLISH] = sched,
+	[CPL_PEER_CLOSE] = sched,
+	[CPL_CLOSE_CON_RPL] = sched,
+	[CPL_ABORT_REQ_RSS] = sched,
+	[CPL_RDMA_TERMINATE] = sched,
+	[CPL_FW4_ACK] = sched,
+	[CPL_SET_TCB_RPL] = set_tcb_rpl,
+	[CPL_FW6_MSG] = fw6_msg
+};
+
 int __init c4iw_cm_init(void)
 {
+	spin_lock_init(&timeout_lock);
 	skb_queue_head_init(&rxq);
 
 	workq = create_singlethread_workqueue("iw_cxgb4");
 	if (!workq)
 		return -ENOMEM;
 
-	/*
-	 * Most upcalls from the T4 Core go to sched() to
-	 * schedule the processing on a work queue.
-	 */
-	c4iw_handlers[CPL_ACT_ESTABLISH] = sched;
-	c4iw_handlers[CPL_ACT_OPEN_RPL] = sched;
-	c4iw_handlers[CPL_RX_DATA] = sched;
-	c4iw_handlers[CPL_ABORT_RPL_RSS] = sched;
-	c4iw_handlers[CPL_ABORT_RPL] = sched;
-	c4iw_handlers[CPL_PASS_OPEN_RPL] = sched;
-	c4iw_handlers[CPL_CLOSE_LISTSRV_RPL] = sched;
-	c4iw_handlers[CPL_PASS_ACCEPT_REQ] = sched;
-	c4iw_handlers[CPL_PASS_ESTABLISH] = sched;
-	c4iw_handlers[CPL_PEER_CLOSE] = sched;
-	c4iw_handlers[CPL_CLOSE_CON_RPL] = sched;
-	c4iw_handlers[CPL_ABORT_REQ_RSS] = sched;
-	c4iw_handlers[CPL_RDMA_TERMINATE] = sched;
-	c4iw_handlers[CPL_FW4_ACK] = sched;
-	c4iw_handlers[CPL_SET_TCB_RPL] = set_tcb_rpl;
-	c4iw_handlers[CPL_FW6_MSG] = fw6_msg;
-
-	/*
-	 * These are the real handlers that are called from a
-	 * work queue.
-	 */
-	work_handlers[CPL_ACT_ESTABLISH] = act_establish;
-	work_handlers[CPL_ACT_OPEN_RPL] = act_open_rpl;
-	work_handlers[CPL_RX_DATA] = rx_data;
-	work_handlers[CPL_ABORT_RPL_RSS] = abort_rpl;
-	work_handlers[CPL_ABORT_RPL] = abort_rpl;
-	work_handlers[CPL_PASS_OPEN_RPL] = pass_open_rpl;
-	work_handlers[CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl;
-	work_handlers[CPL_PASS_ACCEPT_REQ] = pass_accept_req;
-	work_handlers[CPL_PASS_ESTABLISH] = pass_establish;
-	work_handlers[CPL_PEER_CLOSE] = peer_close;
-	work_handlers[CPL_ABORT_REQ_RSS] = peer_abort;
-	work_handlers[CPL_CLOSE_CON_RPL] = close_con_rpl;
-	work_handlers[CPL_RDMA_TERMINATE] = terminate;
-	work_handlers[CPL_FW4_ACK] = fw4_ack;
 	return 0;
 }
 
 void __exit c4iw_cm_term(void)
 {
+	WARN_ON(!list_empty(&timeout_list));
 	flush_workqueue(workq);
 	destroy_workqueue(workq);
 }
diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c
index 1bd6a3e..491e76a 100644
--- a/drivers/infiniband/hw/cxgb4/ev.c
+++ b/drivers/infiniband/hw/cxgb4/ev.c
@@ -51,8 +51,8 @@ static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp,
 		return;
 	}
 
-	printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x "
-	       "type %d wrid.hi 0x%x wrid.lo 0x%x\n", __func__,
+	printk(KERN_ERR MOD "AE qpid 0x%x opcode %d status 0x%x "
+	       "type %d wrid.hi 0x%x wrid.lo 0x%x\n",
 	       CQE_QPID(err_cqe), CQE_OPCODE(err_cqe),
 	       CQE_STATUS(err_cqe), CQE_TYPE(err_cqe),
 	       CQE_WRID_HI(err_cqe), CQE_WRID_LOW(err_cqe));
@@ -60,7 +60,7 @@ static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp,
 	if (qhp->attr.state == C4IW_QP_STATE_RTS) {
 		attrs.next_state = C4IW_QP_STATE_TERMINATE;
 		c4iw_modify_qp(qhp->rhp, qhp, C4IW_QP_ATTR_NEXT_STATE,
-			       &attrs, 0);
+			       &attrs, 1);
 	}
 
 	event.event = ib_event;
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index ccce6fe..a626998 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -597,6 +597,7 @@ struct c4iw_ep {
 	struct c4iw_ep_common com;
 	struct c4iw_ep *parent_ep;
 	struct timer_list timer;
+	struct list_head entry;
 	unsigned int atid;
 	u32 hwtid;
 	u32 snd_seq;
@@ -739,5 +740,6 @@ void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe);
 
 extern struct cxgb4_client t4c_client;
 extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS];
+extern int c4iw_max_read_depth;
 
 #endif
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 3cb50af..dfc4902 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -267,8 +267,8 @@ static int c4iw_query_device(struct ib_device *ibdev,
 	props->max_qp_wr = T4_MAX_QP_DEPTH;
 	props->max_sge = T4_MAX_RECV_SGE;
 	props->max_sge_rd = 1;
-	props->max_qp_rd_atom = T4_MAX_READ_DEPTH;
-	props->max_qp_init_rd_atom = T4_MAX_READ_DEPTH;
+	props->max_qp_rd_atom = c4iw_max_read_depth;
+	props->max_qp_init_rd_atom = c4iw_max_read_depth;
 	props->max_cq = T4_MAX_NUM_CQ;
 	props->max_cqe = T4_MAX_CQ_DEPTH;
 	props->max_mr = c4iw_num_stags(&dev->rdev);
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index bd56c84..83a01dc 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -856,7 +856,8 @@ int c4iw_post_zb_read(struct c4iw_qp *qhp)
 	return c4iw_ofld_send(&qhp->rhp->rdev, skb);
 }
 
-int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe)
+static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe,
+			   gfp_t gfp)
 {
 	struct fw_ri_wr *wqe;
 	struct sk_buff *skb;
@@ -865,9 +866,9 @@ int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe)
 	PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
 	     qhp->ep->hwtid);
 
-	skb = alloc_skb(sizeof *wqe, GFP_KERNEL | __GFP_NOFAIL);
+	skb = alloc_skb(sizeof *wqe, gfp);
 	if (!skb)
-		return -ENOMEM;
+		return;
 	set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
 
 	wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
@@ -881,7 +882,7 @@ int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe)
 	wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term);
 	term = (struct terminate_message *)wqe->u.terminate.termmsg;
 	build_term_codes(err_cqe, &term->layer_etype, &term->ecode);
-	return c4iw_ofld_send(&qhp->rhp->rdev, skb);
+	c4iw_ofld_send(&qhp->rhp->rdev, skb);
 }
 
 /*
@@ -1130,14 +1131,14 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
 		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND)
 			newattr.enable_bind = attrs->enable_bind;
 		if (mask & C4IW_QP_ATTR_MAX_ORD) {
-			if (attrs->max_ord > T4_MAX_READ_DEPTH) {
+			if (attrs->max_ord > c4iw_max_read_depth) {
 				ret = -EINVAL;
 				goto out;
 			}
 			newattr.max_ord = attrs->max_ord;
 		}
 		if (mask & C4IW_QP_ATTR_MAX_IRD) {
-			if (attrs->max_ird > T4_MAX_READ_DEPTH) {
+			if (attrs->max_ird > c4iw_max_read_depth) {
 				ret = -EINVAL;
 				goto out;
 			}
@@ -1215,12 +1216,10 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
 			qhp->attr.state = C4IW_QP_STATE_TERMINATE;
 			if (qhp->ibqp.uobject)
 				t4_set_wq_in_error(&qhp->wq);
-			if (!internal) {
-				ep = qhp->ep;
-				c4iw_get_ep(&ep->com);
-				terminate = 1;
-				disconnect = 1;
-			}
+			ep = qhp->ep;
+			c4iw_get_ep(&ep->com);
+			terminate = 1;
+			disconnect = 1;
 			break;
 		case C4IW_QP_STATE_ERROR:
 			qhp->attr.state = C4IW_QP_STATE_ERROR;
@@ -1301,7 +1300,7 @@ out:
 	spin_unlock_irqrestore(&qhp->lock, flag);
 
 	if (terminate)
-		c4iw_post_terminate(qhp, NULL);
+		post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL);
 
 	/*
 	 * If disconnect is 1, then we need to initiate a disconnect
@@ -1309,7 +1308,8 @@ out:
 	 * an abnormal close (RTS/CLOSING->ERROR).
 	 */
 	if (disconnect) {
-		c4iw_ep_disconnect(ep, abort, GFP_KERNEL);
+		c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC :
+							 GFP_KERNEL);
 		c4iw_put_ep(&ep->com);
 	}
 
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index 3f0d217..d0e8af3 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -36,7 +36,6 @@
 #include "t4_msg.h"
 #include "t4fw_ri_api.h"
 
-#define T4_MAX_READ_DEPTH 16
 #define T4_QID_BASE 1024
 #define T4_MAX_QIDS 256
 #define T4_MAX_NUM_QP (1<<16)
@@ -450,11 +449,25 @@ struct t4_cq {
 static inline int t4_arm_cq(struct t4_cq *cq, int se)
 {
 	u32 val;
-
-	val = SEINTARM(se) | CIDXINC(cq->cidx_inc) | TIMERREG(6) |
-	      INGRESSQID(cq->cqid);
-	cq->cidx_inc = 0;
-	writel(val, cq->gts);
+	u16 inc;
+
+	do {
+		/*
+		 * inc must be less the both the max update value -and-
+		 * the size of the CQ.
+		 */
+		inc = cq->cidx_inc <= CIDXINC_MASK ? cq->cidx_inc :
+						     CIDXINC_MASK;
+		inc = inc <= (cq->size - 1) ? inc : (cq->size - 1);
+		if (inc == cq->cidx_inc)
+			val = SEINTARM(se) | CIDXINC(inc) | TIMERREG(6) |
+			      INGRESSQID(cq->cqid);
+		else
+			val = SEINTARM(0) | CIDXINC(inc) | TIMERREG(7) |
+			      INGRESSQID(cq->cqid);
+		cq->cidx_inc -= inc;
+		writel(val, cq->gts);
+	} while (cq->cidx_inc);
 	return 0;
 }
 
@@ -489,11 +502,12 @@ static inline int t4_valid_cqe(struct t4_cq *cq, struct t4_cqe *cqe)
 static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
 {
 	int ret = 0;
+	u64 bits_type_ts = be64_to_cpu(cq->queue[cq->cidx].bits_type_ts);
 
-	if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) {
+	if (G_CQE_GENBIT(bits_type_ts) == cq->gen) {
 		*cqe = &cq->queue[cq->cidx];
-		cq->timestamp = CQE_TS(*cqe);
-	} else if (CQE_TS(&cq->queue[cq->cidx]) > cq->timestamp)
+		cq->timestamp = G_CQE_TS(bits_type_ts);
+	} else if (G_CQE_TS(bits_type_ts) > cq->timestamp)
 		ret = -EOVERFLOW;
 	else
 		ret = -ENODATA;
-- 
cgit v0.10.2


From 2110f9bf37511df06220bb7e977f417baecf2950 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Wed, 5 May 2010 17:30:10 +0300
Subject: IB/iser: Add asynchronous event handler

Add handler to handle events such as port up and down.  This is useful
when testing high-availability schemes such as multi-pathing.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 036934c..53da74b 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -232,6 +232,7 @@ struct iser_device {
 	struct ib_cq	             *tx_cq;
 	struct ib_mr	             *mr;
 	struct tasklet_struct	     cq_tasklet;
+	struct ib_event_handler      event_handler;
 	struct list_head             ig_list; /* entry in ig devices list */
 	int                          refcount;
 };
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index b89d76b..b9d6aa1 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -54,6 +54,13 @@ static void iser_qp_event_callback(struct ib_event *cause, void *context)
 	iser_err("got qp event %d\n",cause->event);
 }
 
+static void iser_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	iser_err("async event %d on device %s port %d\n", event->event,
+		event->device->name, event->element.port_num);
+}
+
 /**
  * iser_create_device_ib_res - creates Protection Domain (PD), Completion
  * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
@@ -96,8 +103,15 @@ static int iser_create_device_ib_res(struct iser_device *device)
 	if (IS_ERR(device->mr))
 		goto dma_mr_err;
 
+	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
+				iser_event_handler);
+	if (ib_register_event_handler(&device->event_handler))
+		goto handler_err;
+
 	return 0;
 
+handler_err:
+	ib_dereg_mr(device->mr);
 dma_mr_err:
 	tasklet_kill(&device->cq_tasklet);
 cq_arm_err:
@@ -120,7 +134,7 @@ static void iser_free_device_ib_res(struct iser_device *device)
 	BUG_ON(device->mr == NULL);
 
 	tasklet_kill(&device->cq_tasklet);
-
+	(void)ib_unregister_event_handler(&device->event_handler);
 	(void)ib_dereg_mr(device->mr);
 	(void)ib_destroy_cq(device->tx_cq);
 	(void)ib_destroy_cq(device->rx_cq);
-- 
cgit v0.10.2


From d265b9808272c9f25e1c36d3fb5ddb466efd90e9 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Wed, 5 May 2010 17:30:34 +0300
Subject: IB/iser: Remove buggy back-pointer setting

The iscsi connection object life cycle includes binding and unbinding
(conn_stop) to/from the iscsi transport connection object.  Since
iscsi connection objects are recycled, at the time the transport
connection (e.g iser's IB connection) is released, it is not valid to
touch the iscsi connection tied to the transport back-pointer since it
may already point to a different transport connection.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index b9d6aa1..ed7c901 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -346,8 +346,6 @@ static void iser_conn_release(struct iser_conn *ib_conn)
 	/* on EVENT_ADDR_ERROR there's no device yet for this conn */
 	if (device != NULL)
 		iser_device_try_release(device);
-	if (ib_conn->iser_conn)
-		ib_conn->iser_conn->ib_conn = NULL;
 	iscsi_destroy_endpoint(ib_conn->ep);
 }
 
-- 
cgit v0.10.2


From 39ff05dbbbdb082bbabf06206c56b3cd4ef73904 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Wed, 5 May 2010 17:31:44 +0300
Subject: IB/iser: Enhance disconnection logic for multi-pathing

The iser connection teardown flow isn't over until the underlying
Connection Manager (e.g the IB CM) delivers a disconnected or timeout
event through the RDMA-CM.  When the remote (target) side isn't
reachable, e.g when some HW e.g port/hca/switch isn't functioning or
taken down administratively, the CM timeout flow is used and the event
may be generated only after relatively long time -- on the order of
tens of seconds.

The current iser code exposes this possibly long delay to higher
layers, specifically to the iscsid daemon and iscsi kernel stack. As a
result, the iscsi stack doesn't respond well: this low-level CM delay
is added to the fail-over time under HA schemes such as the one
provided by DM multipath through the multipathd(8) service.

This patch enhances the reference counting scheme on iser's IB
connections so that the disconnect flow initiated by iscsid from user
space (ep_disconnect) doesn't wait for the CM to deliver the
disconnect/timeout event.  (The connection teardown isn't done from
iser's view point until the event is delivered)

The iser ib (rdma) connection object is destroyed when its reference
count reaches zero.  When this happens on the RDMA-CM callback
context, extra care is taken so that the RDMA-CM does the actual
destroying of the associated ID, since doing it in the callback is
prohibited.

The reference count of iser ib connection normally reaches three,
where the <ref, deref> relations are

 1. conn <init, terminate>
 2. conn <bind, stop/destroy>
 3. cma id <create, disconnect/error/timeout callbacks>

With this patch, multipath fail-over time is about 30 seconds, while
without this patch, multipath fail-over time is about 130 seconds.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 93399df..7b2fc98 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -325,7 +325,7 @@ iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn)
 	 */
 	if (ib_conn) {
 		ib_conn->iser_conn = NULL;
-		iser_conn_put(ib_conn);
+		iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */
 	}
 }
 
@@ -357,11 +357,12 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
 	/* binds the iSER connection retrieved from the previously
 	 * connected ep_handle to the iSCSI layer connection. exchanges
 	 * connection pointers */
-	iser_err("binding iscsi conn %p to iser_conn %p\n",conn,ib_conn);
+	iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n",
+					conn, conn->dd_data, ib_conn);
 	iser_conn = conn->dd_data;
 	ib_conn->iser_conn = iser_conn;
 	iser_conn->ib_conn  = ib_conn;
-	iser_conn_get(ib_conn);
+	iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */
 	return 0;
 }
 
@@ -382,7 +383,7 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
 		 * There is no unbind event so the stop callback
 		 * must release the ref from the bind.
 		 */
-		iser_conn_put(ib_conn);
+		iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */
 	}
 	iser_conn->ib_conn = NULL;
 }
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 53da74b..f1df015 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -247,7 +247,6 @@ struct iser_conn {
 	struct rdma_cm_id            *cma_id;       /* CMA ID		       */
 	struct ib_qp	             *qp;           /* QP 		       */
 	struct ib_fmr_pool           *fmr_pool;     /* pool of IB FMRs         */
-	int                          disc_evt_flag; /* disconn event delivered */
 	wait_queue_head_t	     wait;          /* waitq for conn/disconn  */
 	int                          post_recv_buf_count; /* posted rx count  */
 	atomic_t                     post_send_buf_count; /* posted tx count   */
@@ -321,7 +320,7 @@ void iser_conn_init(struct iser_conn *ib_conn);
 
 void iser_conn_get(struct iser_conn *ib_conn);
 
-void iser_conn_put(struct iser_conn *ib_conn);
+int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed);
 
 void iser_conn_terminate(struct iser_conn *ib_conn);
 
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index ed7c901..78fdeca 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -238,7 +238,7 @@ alloc_err:
  * releases the FMR pool, QP and CMA ID objects, returns 0 on success,
  * -1 on failure
  */
-static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
+static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id)
 {
 	BUG_ON(ib_conn == NULL);
 
@@ -253,7 +253,8 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
 	if (ib_conn->qp != NULL)
 		rdma_destroy_qp(ib_conn->cma_id);
 
-	if (ib_conn->cma_id != NULL)
+	/* if cma handler context, the caller acts s.t the cma destroy the id */
+	if (ib_conn->cma_id != NULL && can_destroy_id)
 		rdma_destroy_id(ib_conn->cma_id);
 
 	ib_conn->fmr_pool = NULL;
@@ -331,7 +332,7 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
 /**
  * Frees all conn objects and deallocs conn descriptor
  */
-static void iser_conn_release(struct iser_conn *ib_conn)
+static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id)
 {
 	struct iser_device  *device = ib_conn->device;
 
@@ -341,7 +342,7 @@ static void iser_conn_release(struct iser_conn *ib_conn)
 	list_del(&ib_conn->conn_list);
 	mutex_unlock(&ig.connlist_mutex);
 	iser_free_rx_descriptors(ib_conn);
-	iser_free_ib_conn_res(ib_conn);
+	iser_free_ib_conn_res(ib_conn, can_destroy_id);
 	ib_conn->device = NULL;
 	/* on EVENT_ADDR_ERROR there's no device yet for this conn */
 	if (device != NULL)
@@ -354,10 +355,13 @@ void iser_conn_get(struct iser_conn *ib_conn)
 	atomic_inc(&ib_conn->refcount);
 }
 
-void iser_conn_put(struct iser_conn *ib_conn)
+int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id)
 {
-	if (atomic_dec_and_test(&ib_conn->refcount))
-		iser_conn_release(ib_conn);
+	if (atomic_dec_and_test(&ib_conn->refcount)) {
+		iser_conn_release(ib_conn, can_destroy_id);
+		return 1;
+	}
+	return 0;
 }
 
 /**
@@ -381,19 +385,20 @@ void iser_conn_terminate(struct iser_conn *ib_conn)
 	wait_event_interruptible(ib_conn->wait,
 				 ib_conn->state == ISER_CONN_DOWN);
 
-	iser_conn_put(ib_conn);
+	iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */
 }
 
-static void iser_connect_error(struct rdma_cm_id *cma_id)
+static int iser_connect_error(struct rdma_cm_id *cma_id)
 {
 	struct iser_conn *ib_conn;
 	ib_conn = (struct iser_conn *)cma_id->context;
 
 	ib_conn->state = ISER_CONN_DOWN;
 	wake_up_interruptible(&ib_conn->wait);
+	return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
 }
 
-static void iser_addr_handler(struct rdma_cm_id *cma_id)
+static int iser_addr_handler(struct rdma_cm_id *cma_id)
 {
 	struct iser_device *device;
 	struct iser_conn   *ib_conn;
@@ -402,8 +407,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
 	device = iser_device_find_by_ib_device(cma_id);
 	if (!device) {
 		iser_err("device lookup/creation failed\n");
-		iser_connect_error(cma_id);
-		return;
+		return iser_connect_error(cma_id);
 	}
 
 	ib_conn = (struct iser_conn *)cma_id->context;
@@ -412,11 +416,13 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
 	ret = rdma_resolve_route(cma_id, 1000);
 	if (ret) {
 		iser_err("resolve route failed: %d\n", ret);
-		iser_connect_error(cma_id);
+		return iser_connect_error(cma_id);
 	}
+
+	return 0;
 }
 
-static void iser_route_handler(struct rdma_cm_id *cma_id)
+static int iser_route_handler(struct rdma_cm_id *cma_id)
 {
 	struct rdma_conn_param conn_param;
 	int    ret;
@@ -437,9 +443,9 @@ static void iser_route_handler(struct rdma_cm_id *cma_id)
 		goto failure;
 	}
 
-	return;
+	return 0;
 failure:
-	iser_connect_error(cma_id);
+	return iser_connect_error(cma_id);
 }
 
 static void iser_connected_handler(struct rdma_cm_id *cma_id)
@@ -451,12 +457,12 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id)
 	wake_up_interruptible(&ib_conn->wait);
 }
 
-static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
+static int iser_disconnected_handler(struct rdma_cm_id *cma_id)
 {
 	struct iser_conn *ib_conn;
+	int ret;
 
 	ib_conn = (struct iser_conn *)cma_id->context;
-	ib_conn->disc_evt_flag = 1;
 
 	/* getting here when the state is UP means that the conn is being *
 	 * terminated asynchronously from the iSCSI layer's perspective.  */
@@ -471,20 +477,24 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
 		ib_conn->state = ISER_CONN_DOWN;
 		wake_up_interruptible(&ib_conn->wait);
 	}
+
+	ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
+	return ret;
 }
 
 static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 {
 	int ret = 0;
 
-	iser_err("event %d conn %p id %p\n",event->event,cma_id->context,cma_id);
+	iser_err("event %d status %d conn %p id %p\n",
+		event->event, event->status, cma_id->context, cma_id);
 
 	switch (event->event) {
 	case RDMA_CM_EVENT_ADDR_RESOLVED:
-		iser_addr_handler(cma_id);
+		ret = iser_addr_handler(cma_id);
 		break;
 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
-		iser_route_handler(cma_id);
+		ret = iser_route_handler(cma_id);
 		break;
 	case RDMA_CM_EVENT_ESTABLISHED:
 		iser_connected_handler(cma_id);
@@ -494,13 +504,12 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 	case RDMA_CM_EVENT_UNREACHABLE:
 	case RDMA_CM_EVENT_REJECTED:
-		iser_err("event: %d, error: %d\n", event->event, event->status);
-		iser_connect_error(cma_id);
+		ret = iser_connect_error(cma_id);
 		break;
 	case RDMA_CM_EVENT_DISCONNECTED:
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 	case RDMA_CM_EVENT_ADDR_CHANGE:
-		iser_disconnected_handler(cma_id);
+		ret = iser_disconnected_handler(cma_id);
 		break;
 	default:
 		iser_err("Unexpected RDMA CM event (%d)\n", event->event);
@@ -515,7 +524,7 @@ void iser_conn_init(struct iser_conn *ib_conn)
 	init_waitqueue_head(&ib_conn->wait);
 	ib_conn->post_recv_buf_count = 0;
 	atomic_set(&ib_conn->post_send_buf_count, 0);
-	atomic_set(&ib_conn->refcount, 1);
+	atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */
 	INIT_LIST_HEAD(&ib_conn->conn_list);
 	spin_lock_init(&ib_conn->lock);
 }
@@ -543,6 +552,7 @@ int iser_connect(struct iser_conn   *ib_conn,
 
 	ib_conn->state = ISER_CONN_PENDING;
 
+	iser_conn_get(ib_conn); /* ref ib conn's cma id */
 	ib_conn->cma_id = rdma_create_id(iser_cma_handler,
 					     (void *)ib_conn,
 					     RDMA_PS_TCP);
@@ -580,7 +590,7 @@ id_failure:
 addr_failure:
 	ib_conn->state = ISER_CONN_DOWN;
 connect_failure:
-	iser_conn_release(ib_conn);
+	iser_conn_release(ib_conn, 1);
 	return err;
 }
 
@@ -749,12 +759,10 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc,
 			iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn,
 					   ISCSI_ERR_CONN_FAILED);
 
-		/* complete the termination process if disconnect event was delivered *
-		 * note there are no more non completed posts to the QP               */
-		if (ib_conn->disc_evt_flag) {
-			ib_conn->state = ISER_CONN_DOWN;
-			wake_up_interruptible(&ib_conn->wait);
-		}
+		/* no more non completed posts to the QP, complete the
+		 * termination process w.o worrying on disconnect event */
+		ib_conn->state = ISER_CONN_DOWN;
+		wake_up_interruptible(&ib_conn->wait);
 	}
 }
 
-- 
cgit v0.10.2


From 9fda1ac5fa09c49e9148f85be14f55e2bb856c0f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 6 May 2010 16:22:21 +0300
Subject: IB/iser: Fix error flow in iser_create_ib_conn_res()

We shouldn't free things here because we free them later.
The call tree looks like this:
	iser_connect() ==> initiating the connection establishment
and later
	iser_cma_handler() => iser_route_handler() => iser_create_ib_conn_res()
if we fail here, eventually iser_conn_release() is called, resulting
in a double free.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 78fdeca..98768657 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -163,10 +163,8 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 	device = ib_conn->device;
 
 	ib_conn->login_buf = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL);
-	if (!ib_conn->login_buf) {
-		goto alloc_err;
-		ret = -ENOMEM;
-	}
+	if (!ib_conn->login_buf)
+		goto out_err;
 
 	ib_conn->login_dma = ib_dma_map_single(ib_conn->device->ib_device,
 				(void *)ib_conn->login_buf, ISER_RX_LOGIN_SIZE,
@@ -175,10 +173,9 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 	ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) +
 				    (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)),
 				    GFP_KERNEL);
-	if (!ib_conn->page_vec) {
-		ret = -ENOMEM;
-		goto alloc_err;
-	}
+	if (!ib_conn->page_vec)
+		goto out_err;
+
 	ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1);
 
 	params.page_shift        = SHIFT_4K;
@@ -198,7 +195,8 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 	ib_conn->fmr_pool = ib_create_fmr_pool(device->pd, &params);
 	if (IS_ERR(ib_conn->fmr_pool)) {
 		ret = PTR_ERR(ib_conn->fmr_pool);
-		goto fmr_pool_err;
+		ib_conn->fmr_pool = NULL;
+		goto out_err;
 	}
 
 	memset(&init_attr, 0, sizeof init_attr);
@@ -216,7 +214,7 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 
 	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
 	if (ret)
-		goto qp_err;
+		goto out_err;
 
 	ib_conn->qp = ib_conn->cma_id->qp;
 	iser_err("setting conn %p cma_id %p: fmr_pool %p qp %p\n",
@@ -224,12 +222,7 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 		 ib_conn->fmr_pool, ib_conn->cma_id->qp);
 	return ret;
 
-qp_err:
-	(void)ib_destroy_fmr_pool(ib_conn->fmr_pool);
-fmr_pool_err:
-	kfree(ib_conn->page_vec);
-	kfree(ib_conn->login_buf);
-alloc_err:
+out_err:
 	iser_err("unable to alloc mem or create resource, err %d\n", ret);
 	return ret;
 }
-- 
cgit v0.10.2


From 9893e742a0d942dda2277e9f3e19b726900adf27 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Sat, 15 May 2010 23:22:38 +0200
Subject: IB/core: Use kmemdup() instead of kmalloc()+memcpy()

Use kmemdup when some other buffer is immediately copied into the
allocated region.

A simplified version of the semantic patch that makes this change is as
follows: (http://coccinelle.lip6.fr/)

// <smpl>
@@
expression from,to,size,flag;
statement S;
@@

-  to = \(kmalloc\|kzalloc\)(size,flag);
+  to = kmemdup(from,size,flag);
   if (to==NULL || ...) S
-  memcpy(to, from, size);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 6d77706..ce2e87b 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1677,13 +1677,13 @@ int rdma_set_ib_paths(struct rdma_cm_id *id,
 	if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
 		return -EINVAL;
 
-	id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL);
+	id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths,
+				     GFP_KERNEL);
 	if (!id->route.path_rec) {
 		ret = -ENOMEM;
 		goto err;
 	}
 
-	memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths);
 	id->route.num_paths = num_paths;
 	return 0;
 err:
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 1df1194..6dc7b77 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -291,13 +291,11 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 	}
 
 	if (mad_reg_req) {
-		reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL);
+		reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
 		if (!reg_req) {
 			ret = ERR_PTR(-ENOMEM);
 			goto error3;
 		}
-		/* Make a copy of the MAD registration request */
-		memcpy(reg_req, mad_reg_req, sizeof *reg_req);
 	}
 
 	/* Now, fill in the various structures */
-- 
cgit v0.10.2