From 8b7e15772a286d0ef8e4f8eca422ce5368b6fa97 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@dev.mellanox.co.il>
Date: Sun, 27 May 2007 18:06:42 +0300
Subject: IB/mthca: Fix handling of send CQE with error for QPs connected to
 SRQ

mthca_free_err_wqe() currently treats both send and receive CQEs
identically if a QP is using an SRQ.  But for Tavor hardware, send
CQEs with error can be chained together even if the RQ is part of SRQ,
so we may miss some CQEs.

Fix by following the WQE chain for all send CQEs even for non-SRQ QPs.

This fixes crashes in IPoIB CM:
<https://bugs.openfabrics.org//show_bug.cgi?id=604>

Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 0276649..eef415b 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -2284,10 +2284,10 @@ void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
 	struct mthca_next_seg *next;
 
 	/*
-	 * For SRQs, all WQEs generate a CQE, so we're always at the
-	 * end of the doorbell chain.
+	 * For SRQs, all receive WQEs generate a CQE, so we're always
+	 * at the end of the doorbell chain.
 	 */
-	if (qp->ibqp.srq) {
+	if (qp->ibqp.srq && !is_send) {
 		*new_wqe = 0;
 		return;
 	}
-- 
cgit v0.10.2


From ec56dc0b7f6c3fec20bbc2e98ff1a06edf2fc9b9 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@dev.mellanox.co.il>
Date: Mon, 28 May 2007 14:37:27 +0300
Subject: IPoIB/cm: Fix performance regression on Mellanox

commit 518b1646 ("IPoIB/cm: Fix SRQ WR leak") introduced a severe
performance regression on Mellanox cards, because keeping a QP in the
error state for extended periods of time moves hardware to the slow
path (until the QP is destroyed).  For example, MPI latency goes from
~3 usecs to ~7 usecs.

Fix this by posting a send WR on one of the QPs that are being
flushed, instead of using a separate drain QP that is kept in the
error state.

This fixes bug <https://bugs.openfabrics.org/show_bug.cgi?id=636>,
reported and bisected by Scott Weitzenkamp at Cisco and debugged by
Sasha Mikheev at Voltaire.

Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 158759e..285c143 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -156,7 +156,7 @@ struct ipoib_cm_data {
  * - and then invoke a Destroy QP or Reset QP.
  *
  * We use the second option and wait for a completion on the
- * rx_drain_qp before destroying QPs attached to our SRQ.
+ * same CQ before destroying QPs attached to our SRQ.
  */
 
 enum ipoib_cm_state {
@@ -199,7 +199,6 @@ struct ipoib_cm_dev_priv {
 	struct ib_srq  	       *srq;
 	struct ipoib_cm_rx_buf *srq_ring;
 	struct ib_cm_id        *id;
-	struct ib_qp           *rx_drain_qp;   /* generates WR described in 10.3.1 */
 	struct list_head        passive_ids;   /* state: LIVE */
 	struct list_head        rx_error_list; /* state: ERROR */
 	struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index f133b56..076a0bb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -69,8 +69,9 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
 
 #define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
 
-static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
-	.wr_id = IPOIB_CM_RX_DRAIN_WRID
+static struct ib_send_wr ipoib_cm_rx_drain_wr = {
+	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
+	.opcode = IB_WR_SEND,
 };
 
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
@@ -163,16 +164,22 @@ partial_error:
 
 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
 {
-	struct ib_recv_wr *bad_wr;
+	struct ib_send_wr *bad_wr;
+	struct ipoib_cm_rx *p;
 
-	/* rx_drain_qp send queue depth is 1, so
+	/* We only reserved 1 extra slot in CQ for drain WRs, so
 	 * make sure we have at most 1 outstanding WR. */
 	if (list_empty(&priv->cm.rx_flush_list) ||
 	    !list_empty(&priv->cm.rx_drain_list))
 		return;
 
-	if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
-		ipoib_warn(priv, "failed to post rx_drain wr\n");
+	/*
+	 * QPs on flush list are error state.  This way, a "flush
+	 * error" WC will be immediately generated for each WR we post.
+	 */
+	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+		ipoib_warn(priv, "failed to post drain wr\n");
 
 	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
 }
@@ -199,10 +206,10 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
 		.event_handler = ipoib_cm_rx_event_handler,
-		.send_cq = priv->cq, /* does not matter, we never send anything */
+		.send_cq = priv->cq, /* For drain WR */
 		.recv_cq = priv->cq,
 		.srq = priv->cm.srq,
-		.cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
+		.cap.max_send_wr = 1, /* For drain WR */
 		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
 		.sq_sig_type = IB_SIGNAL_ALL_WR,
 		.qp_type = IB_QPT_RC,
@@ -242,6 +249,27 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
 		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
 		return ret;
 	}
+
+	/*
+	 * Current Mellanox HCA firmware won't generate completions
+	 * with error for drain WRs unless the QP has been moved to
+	 * RTS first. This work-around leaves a window where a QP has
+	 * moved to error asynchronously, but this will eventually get
+	 * fixed in firmware, so let's not error out if modify QP
+	 * fails.
+	 */
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+		return 0;
+	}
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+		return 0;
+	}
+
 	return 0;
 }
 
@@ -623,38 +651,11 @@ static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
 int ipoib_cm_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_qp_init_attr qp_init_attr = {
-		.send_cq = priv->cq,   /* does not matter, we never send anything */
-		.recv_cq = priv->cq,
-		.cap.max_send_wr = 1,  /* FIXME: 0 Seems not to work */
-		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
-		.cap.max_recv_wr = 1,
-		.cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
-		.sq_sig_type = IB_SIGNAL_ALL_WR,
-		.qp_type = IB_QPT_UC,
-	};
 	int ret;
 
 	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
 		return 0;
 
-	priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
-	if (IS_ERR(priv->cm.rx_drain_qp)) {
-		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
-		ret = PTR_ERR(priv->cm.rx_drain_qp);
-		return ret;
-	}
-
-	/*
-	 * We put the QP in error state directly.  This way, a "flush
-	 * error" WC will be immediately generated for each WR we post.
-	 */
-	ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
-	if (ret) {
-		ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
-		goto err_qp;
-	}
-
 	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
 	if (IS_ERR(priv->cm.id)) {
 		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
@@ -676,8 +677,6 @@ err_listen:
 	ib_destroy_cm_id(priv->cm.id);
 err_cm:
 	priv->cm.id = NULL;
-err_qp:
-	ib_destroy_qp(priv->cm.rx_drain_qp);
 	return ret;
 }
 
@@ -740,7 +739,6 @@ void ipoib_cm_dev_stop(struct net_device *dev)
 		kfree(p);
 	}
 
-	ib_destroy_qp(priv->cm.rx_drain_qp);
 	cancel_delayed_work(&priv->cm.stale_task);
 }
 
-- 
cgit v0.10.2


From d998ccce020e2cfcf11c6b57503532930ede2894 Mon Sep 17 00:00:00 2001
From: Sean Hefty <sean.hefty@intel.com>
Date: Mon, 21 May 2007 17:38:02 -0700
Subject: IB/cm: Fix stale connection detection

The ib_cm can incorrectly detect a stale connection (a new connection
request for a QPN that is already connected) as a duplicate connection
request.  Separate the handling of potential duplicate REQs from stale
connections.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index e840434..40c004a 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1297,26 +1297,29 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
 
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
 
-	/* Check for duplicate REQ and stale connections. */
+	/* Check for possible duplicate REQ. */
 	spin_lock_irqsave(&cm.lock, flags);
 	timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
-	if (!timewait_info)
-		timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
-
 	if (timewait_info) {
 		cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
 					   timewait_info->work.remote_id);
-		cm_cleanup_timewait(cm_id_priv->timewait_info);
 		spin_unlock_irqrestore(&cm.lock, flags);
 		if (cur_cm_id_priv) {
 			cm_dup_req_handler(work, cur_cm_id_priv);
 			cm_deref_id(cur_cm_id_priv);
-		} else
-			cm_issue_rej(work->port, work->mad_recv_wc,
-				     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
-				     NULL, 0);
-		listen_cm_id_priv = NULL;
-		goto out;
+		}
+		return NULL;
+	}
+
+	/* Check for stale connections. */
+	timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+	if (timewait_info) {
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irqrestore(&cm.lock, flags);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+			     NULL, 0);
+		return NULL;
 	}
 
 	/* Find matching listen request. */
-- 
cgit v0.10.2


From a2cb4a98f243d01f2c8d5799c764bb96ffa66c44 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Tue, 29 May 2007 16:07:09 -0700
Subject: IB/mlx4: Fix last allocated object tracking in bitmap allocator

Set last allocated object to the object after the one just allocated
before ORing in the extra top bits.  Also handle the case where this
wraps around.

Signed-off-by: Roland Dreier <rolandd@cisco.com>

diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index dfbd580..f8d63d3 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -51,8 +51,8 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
 
 	if (obj < bitmap->max) {
 		set_bit(obj, bitmap->table);
+		bitmap->last = (obj + 1) & (bitmap->max - 1);
 		obj |= bitmap->top;
-		bitmap->last = obj + 1;
 	} else
 		obj = -1;
 
-- 
cgit v0.10.2