From b8a1b1ce14252b59b2d5c89de25b54f9bfd4cc5e Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 14 Jan 2009 14:55:41 -0800 Subject: IPoIB: Fix hang in napi_disable() if P_Key is never found After commit fe25c561 ("IPoIB: Don't enable NAPI when it's already enabled"), if an interface is brought up but the corresponding P_Key never appears, then ipoib_stop() will hang in napi_disable(), because ipoib_open() returns before it does napi_enable(). Fix this by changing ipoib_open() to call napi_enable() even if the P_Key isn't present. Reported-by: Yossi Etigin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index dce0443..0bd2a4f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -106,23 +106,17 @@ int ipoib_open(struct net_device *dev) ipoib_dbg(priv, "bringing up interface\n"); - set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + if (!test_and_set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + napi_enable(&priv->napi); if (ipoib_pkey_dev_delay_open(dev)) return 0; - napi_enable(&priv->napi); + if (ipoib_ib_dev_open(dev)) + goto err_disable; - if (ipoib_ib_dev_open(dev)) { - napi_disable(&priv->napi); - return -EINVAL; - } - - if (ipoib_ib_dev_up(dev)) { - ipoib_ib_dev_stop(dev, 1); - napi_disable(&priv->napi); - return -EINVAL; - } + if (ipoib_ib_dev_up(dev)) + goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; @@ -144,6 +138,15 @@ int ipoib_open(struct net_device *dev) netif_start_queue(dev); return 0; + +err_stop: + ipoib_ib_dev_stop(dev, 1); + +err_disable: + napi_disable(&priv->napi); + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + return -EINVAL; } static int ipoib_stop(struct net_device *dev) -- cgit v0.10.2 From cbbe1efa4972350286b52cb48aefaa11e198c0fb Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 14 Jan 2009 21:44:39 -0800 Subject: IPoIB: Fix deadlock between ipoib_open() and child interface create Fix a deadlock between child interface creation/deletion and ipoib start/stop. The former takes vlan_mutex, and then might take RTNL via register_netdev()/unregister_netdev(). The latter is executed with RTNL held, and tries to take vlan_mutex, which can lead to an AB-BA deadlock. Fix this by having the child interface creation/deletion code take the RTNL first so vlan_mutex always nests inside RTNL. We can use register_netdevice() for child interfaces because we form the interface name from the parent interface and hence don't need the '%' expansion of register_netdev(). Reported-by: Yossi Etigin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 2cf1a40..5a76a55 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -61,6 +61,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ppriv = netdev_priv(pdev); + rtnl_lock(); mutex_lock(&ppriv->vlan_mutex); /* @@ -111,7 +112,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) goto device_init_failed; } - result = register_netdev(priv->dev); + result = register_netdevice(priv->dev); if (result) { ipoib_warn(priv, "failed to initialize; error %i", result); goto register_failed; @@ -134,12 +135,13 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) list_add_tail(&priv->list, &ppriv->child_intfs); mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); return 0; sysfs_failed: ipoib_delete_debug_files(priv->dev); - unregister_netdev(priv->dev); + unregister_netdevice(priv->dev); register_failed: ipoib_dev_cleanup(priv->dev); @@ -149,6 +151,7 @@ device_init_failed: err: mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); return result; } @@ -162,10 +165,11 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) ppriv = netdev_priv(pdev); + rtnl_lock(); mutex_lock(&ppriv->vlan_mutex); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey) { - unregister_netdev(priv->dev); + unregister_netdevice(priv->dev); ipoib_dev_cleanup(priv->dev); list_del(&priv->list); free_netdev(priv->dev); @@ -175,6 +179,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) } } mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); return ret; } -- cgit v0.10.2 From d3b924d960a808105180d229b4667061123cc4ef Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 15 Jan 2009 20:43:56 -0800 Subject: mlx4_core: Fix min() warning Fix drivers/net/mlx4/profile.c: In function `mlx4_make_profile': drivers/net/mlx4/profile.c:110: warning: comparison of distinct pointer types lacks a cast This happened because num_possible_cpus() was secretly changed by commit ae7a47e7 ("cpumask: make cpumask.h eat its own dogfood.") from returning "int" to (now) returning "unsigned int". I think that was a good change, so we should just swallow the fallout. Cc: "David S. Miller" Cc: Roland Dreier Cc: Yevgeny Petrilin Cc: Jack Morgenstein Cc: Vladimir Sokolovsky Cc: Michael S. Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index 919fb9e..cebdf32 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -107,9 +107,9 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_AUXC].num = request->num_qp; profile[MLX4_RES_SRQ].num = request->num_srq; profile[MLX4_RES_CQ].num = request->num_cq; - profile[MLX4_RES_EQ].num = min(dev_cap->max_eqs, - dev_cap->reserved_eqs + - num_possible_cpus() + 1); + profile[MLX4_RES_EQ].num = min_t(unsigned, dev_cap->max_eqs, + dev_cap->reserved_eqs + + num_possible_cpus() + 1); profile[MLX4_RES_DMPT].num = request->num_mpt; profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; profile[MLX4_RES_MTT].num = request->num_mtt; -- cgit v0.10.2 From 0fd7e1d8559f45a6838cee93ea49adc0c5bda8f0 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 16 Jan 2009 12:47:47 -0800 Subject: IB/mlx4: Fix memory ordering problem when posting LSO sends The current work request posting code writes the LSO segment before writing any data segments. This leaves a window where the LSO segment overwrites the stamping in one cacheline that the HCA prefetches before the rest of the cacheline is filled with the correct data segments. When the HCA processes this work request, a local protection error may result. Fix this by saving the LSO header size field off and writing it only after all data segments are written. This fix is a cleaned-up version of a patch from Jack Morgenstein . This fixes . Reported-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 39167a7..a91cb4c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1462,7 +1462,8 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) } static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, - struct mlx4_ib_qp *qp, unsigned *lso_seg_len) + struct mlx4_ib_qp *qp, unsigned *lso_seg_len, + __be32 *lso_hdr_sz) { unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); @@ -1479,12 +1480,8 @@ static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); - /* make sure LSO header is written before overwriting stamping */ - wmb(); - - wqe->mss_hdr_size = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | - wr->wr.ud.hlen); - + *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | + wr->wr.ud.hlen); *lso_seg_len = halign; return 0; } @@ -1518,6 +1515,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int uninitialized_var(stamp); int uninitialized_var(size); unsigned uninitialized_var(seglen); + __be32 dummy; + __be32 *lso_wqe; + __be32 uninitialized_var(lso_hdr_sz); int i; spin_lock_irqsave(&qp->sq.lock, flags); @@ -1525,6 +1525,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, ind = qp->sq_next_wqe; for (nreq = 0; wr; ++nreq, wr = wr->next) { + lso_wqe = &dummy; + if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { err = -ENOMEM; *bad_wr = wr; @@ -1606,11 +1608,12 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_datagram_seg) / 16; if (wr->opcode == IB_WR_LSO) { - err = build_lso_seg(wqe, wr, qp, &seglen); + err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz); if (unlikely(err)) { *bad_wr = wr; goto out; } + lso_wqe = (__be32 *) wqe; wqe += seglen; size += seglen / 16; } @@ -1652,6 +1655,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, for (i = wr->num_sge - 1; i >= 0; --i, --dseg) set_data_seg(dseg, wr->sg_list + i); + /* + * Possibly overwrite stamping in cacheline with LSO + * segment only after making sure all data segments + * are written. + */ + wmb(); + *lso_wqe = lso_hdr_sz; + ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? MLX4_WQE_CTRL_FENCE : 0) | size; @@ -1686,7 +1697,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, stamp_send_wqe(qp, stamp, size * 16); ind = pad_wraparound(qp, ind); } - } out: -- cgit v0.10.2 From 3c20962086b0ceb5498ba840e5a91bf4a692aae9 Mon Sep 17 00:00:00 2001 From: Yossi Etigin Date: Fri, 16 Jan 2009 13:42:59 -0800 Subject: IPoIB: Do not print error messages for multicast join retries When IPoIB tries to join a multicast group, and the SA module's SM address handle is NULL (because of an SM change, etc), the join returns with -EAGAIN status. In that case, don't print an error message unless multicast debugging is enabled. Signed-off-by: Yossi Etigin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 59d02e0b..425e311 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -409,7 +409,7 @@ static int ipoib_mcast_join_complete(int status, } if (mcast->logcount++ < 20) { - if (status == -ETIMEDOUT) { + if (status == -ETIMEDOUT || status == -EAGAIN) { ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", mcast->mcmember.mgid.raw, status); } else { -- cgit v0.10.2