From 50d40b8e53fab58b0141a75f7448eb28f9e21338 Mon Sep 17 00:00:00 2001 From: Alexander Schmidt Date: Sat, 5 Sep 2009 20:22:54 -0700 Subject: IB/ehca: Make port autodetect mode the default Make port autodetect mode the default for the ehca driver. The autodetect code has been in the kernel for several releases now and has proved to be stable. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index fab18a2..5b635aa 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -52,7 +52,7 @@ #include "ehca_tools.h" #include "hcp_if.h" -#define HCAD_VERSION "0028" +#define HCAD_VERSION "0029" MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Christoph Raisch "); @@ -64,7 +64,7 @@ static int ehca_hw_level = 0; static int ehca_poll_all_eqs = 1; int ehca_debug_level = 0; -int ehca_nr_ports = 2; +int ehca_nr_ports = -1; int ehca_use_hp_mr = 0; int ehca_port_act_time = 30; int ehca_static_rate = -1; @@ -95,8 +95,8 @@ MODULE_PARM_DESC(hw_level, "Hardware level (0: autosensing (default), " "0x10..0x14: eHCA, 0x20..0x23: eHCA2)"); MODULE_PARM_DESC(nr_ports, - "number of connected ports (-1: autodetect, 1: port one only, " - "2: two ports (default)"); + "number of connected ports (-1: autodetect (default), " + "1: port one only, 2: two ports)"); MODULE_PARM_DESC(use_hp_mr, "Use high performance MRs (default: no)"); MODULE_PARM_DESC(port_act_time, -- cgit v0.10.2 From d706834d995939c96b4952bf042918879b4db18b Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Sat, 5 Sep 2009 20:22:55 -0700 Subject: IB/ehca: Construct MAD redirect replies from request MAD The old code used a lot of hard-coded values, which might not be valid in all environments (especially routed fabrics or partitioned subnets). Copy as much information as possible from the incoming request to correct that. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c index c568b28..8c1213f 100644 --- a/drivers/infiniband/hw/ehca/ehca_sqp.c +++ b/drivers/infiniband/hw/ehca/ehca_sqp.c @@ -125,14 +125,30 @@ struct ib_perf { u8 data[192]; } __attribute__ ((packed)); +/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */ +struct tcslfl { + u32 tc:8; + u32 sl:4; + u32 fl:20; +} __attribute__ ((packed)); + +/* IP Version/TC/FL packed into 32 bits, as in GRH */ +struct vertcfl { + u32 ver:4; + u32 tc:8; + u32 fl:20; +} __attribute__ ((packed)); static int ehca_process_perf(struct ib_device *ibdev, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { struct ib_perf *in_perf = (struct ib_perf *)in_mad; struct ib_perf *out_perf = (struct ib_perf *)out_mad; struct ib_class_port_info *poi = (struct ib_class_port_info *)out_perf->data; + struct tcslfl *tcslfl = + (struct tcslfl *)&poi->redirect_tcslfl; struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, ib_device); struct ehca_sport *sport = &shca->sport[port_num - 1]; @@ -158,10 +174,29 @@ static int ehca_process_perf(struct ib_device *ibdev, u8 port_num, poi->base_version = 1; poi->class_version = 1; poi->resp_time_value = 18; - poi->redirect_lid = sport->saved_attr.lid; - poi->redirect_qp = sport->pma_qp_nr; + + /* copy local routing information from WC where applicable */ + tcslfl->sl = in_wc->sl; + poi->redirect_lid = + sport->saved_attr.lid | in_wc->dlid_path_bits; + poi->redirect_qp = sport->pma_qp_nr; poi->redirect_qkey = IB_QP1_QKEY; - poi->redirect_pkey = IB_DEFAULT_PKEY_FULL; + + ehca_query_pkey(ibdev, port_num, in_wc->pkey_index, + &poi->redirect_pkey); + + /* if request was globally routed, copy route info */ + if (in_grh) { + struct vertcfl *vertcfl = + (struct vertcfl *)&in_grh->version_tclass_flow; + memcpy(poi->redirect_gid, in_grh->dgid.raw, + sizeof(poi->redirect_gid)); + tcslfl->tc = vertcfl->tc; + tcslfl->fl = vertcfl->fl; + } else + /* else only fill in default GID */ + ehca_query_gid(ibdev, port_num, 0, + (union ib_gid *)&poi->redirect_gid); ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x", sport->saved_attr.lid, sport->pma_qp_nr); @@ -183,8 +218,7 @@ perf_reply: int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, - struct ib_mad *out_mad) + struct ib_mad *in_mad, struct ib_mad *out_mad) { int ret; @@ -196,7 +230,8 @@ int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS; ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp); - ret = ehca_process_perf(ibdev, port_num, in_mad, out_mad); + ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh, + in_mad, out_mad); return ret; } -- cgit v0.10.2 From 6303e74c699d6ba8bd3d44ec6898b2d6aa55d788 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Sat, 5 Sep 2009 20:22:55 -0700 Subject: IB/ehca: Fix CQE flags reporting The driver was reporting CQE flags in the wrong bit positions, causing consumers to miss incoming immediate data. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index 5a3d96f..8fd88cd 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -786,7 +786,11 @@ repoll: wc->slid = cqe->rlid; wc->dlid_path_bits = cqe->dlid; wc->src_qp = cqe->remote_qp_number; - wc->wc_flags = cqe->w_completion_flags; + /* + * HW has "Immed data present" and "GRH present" in bits 6 and 5. + * SW defines those in bits 1 and 0, so we can just shift and mask. + */ + wc->wc_flags = (cqe->w_completion_flags >> 5) & 3; wc->ex.imm_data = cpu_to_be32(cqe->immediate_data); wc->sl = cqe->service_level; -- cgit v0.10.2 From 286b63d09660de0fbd0d7748984d7ae491c7fdb6 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sat, 5 Sep 2009 20:23:21 -0700 Subject: IB/ipath: strncpy() doesn't always NUL-terminate strlcpy() will always null terminate the string. node_desc is not guaranteed to be NUL-terminated so just use memcpy(). Signed-off-by: Roel Kluin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 2317398..38a2870 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1616,7 +1616,7 @@ static int try_alloc_port(struct ipath_devdata *dd, int port, pd->port_cnt = 1; port_fp(fp) = pd; pd->port_pid = get_pid(task_pid(current)); - strncpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); + strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); ipath_stats.sps_ports++; ret = 0; } else diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 16a702d..ceb98ee 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -60,7 +60,7 @@ static int recv_subn_get_nodedescription(struct ib_smp *smp, if (smp->attr_mod) smp->status |= IB_SMP_INVALID_FIELD; - strncpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); return reply(smp); } -- cgit v0.10.2 From cd0bcf4cb963a147baf0b79d94c25ba86220f708 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:23:38 -0700 Subject: IPoIB: Remove unused includes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 181b1f3..8f4b4fc 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -31,7 +31,6 @@ */ #include -#include #include #include #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index e7e5adf..e35f4a0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -36,7 +36,6 @@ #include #include -#include #include #include -- cgit v0.10.2 From 721d67cdca5b7642b380ca0584de8dceecf6102f Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:23:40 -0700 Subject: IPoIB: Drop priv->lock before calling ipoib_send() IPoIB currently must use irqsave locking for priv->lock, since it is taken from interrupt context in one path. However, ipoib_send() does skb_orphan(), and the network stack locking is not IRQ-safe. Therefore we need to make sure we don't hold priv->lock when calling ipoib_send() to avoid lockdep warnings (the code was almost certainly safe in practice, since the only code path that takes priv->lock from interrupt context would never call into the network stack). Addresses: http://bugzilla.kernel.org/show_bug.cgi?id=13757 Reported-by: Bart Van Assche Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index e319d91..2bf5116 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -604,8 +604,11 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) skb_queue_len(&neigh->queue)); goto err_drop; } - } else + } else { + spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha)); + return; + } } else { neigh->ah = NULL; @@ -688,7 +691,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, ipoib_dbg(priv, "Send unicast ARP to %04x\n", be16_to_cpu(path->pathrec.dlid)); + spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); + return; } else if ((path->query || !path_rec_start(dev, path)) && skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { /* put pseudoheader back on for next time */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index a0e9753..a0825fe 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -720,7 +720,9 @@ out: } } + spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + return; } unlock: -- cgit v0.10.2 From 5e47596bee12597824a3b5b21e20f80b61e58a35 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sat, 5 Sep 2009 20:23:40 -0700 Subject: IPoIB: Check multicast address format Check that the format of multicast link addresses is correct before taking them from dev->mc_list to priv->multicast_list. This way we never try to send a bogus address to the SA, which prevents badness from erronous 'ip maddr addr add', broken bonding drivers, etc. Signed-off-by: Jason Gunthorpe Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index a0825fe..25874fc 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -760,6 +760,20 @@ void ipoib_mcast_dev_flush(struct net_device *dev) } } +static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen, + const u8 *broadcast) +{ + if (addrlen != INFINIBAND_ALEN) + return 0; + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 6)) + return 0; + /* signature lower, pkey */ + if (memcmp(addr + 7, broadcast + 7, 3)) + return 0; + return 1; +} + void ipoib_mcast_restart_task(struct work_struct *work) { struct ipoib_dev_priv *priv = @@ -793,6 +807,11 @@ void ipoib_mcast_restart_task(struct work_struct *work) for (mclist = dev->mc_list; mclist; mclist = mclist->next) { union ib_gid mgid; + if (!ipoib_mcast_addr_is_valid(mclist->dmi_addr, + mclist->dmi_addrlen, + dev->broadcast)) + continue; + memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid); mcast = __ipoib_mcast_find(dev, &mgid); -- cgit v0.10.2 From 6276e08a9bdf645b71a092fb4530baf4f6c4c6eb Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:24:23 -0700 Subject: IB: Use DEFINE_SPINLOCK() for static spinlocks Rather than just defining static spinlock_t variables and then initializing them later in init functions, simply define them with DEFINE_SPINLOCK() and remove the calls to spin_lock_init(). This cleans up the source a tad and also shrinks the compiled code; eg on x86-64: add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-40 (-40) function old new delta ib_uverbs_init 336 326 -10 ib_mad_init_module 147 137 -10 ib_sa_init 123 103 -20 Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index de922a0..5cef8f8 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -51,8 +51,7 @@ static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; /* Port list lock */ -static spinlock_t ib_mad_port_list_lock; - +static DEFINE_SPINLOCK(ib_mad_port_list_lock); /* Forward declarations */ static int method_in_use(struct ib_mad_mgmt_method_table **method, @@ -2984,8 +2983,6 @@ static int __init ib_mad_init_module(void) { int ret; - spin_lock_init(&ib_mad_port_list_lock); - ib_mad_cache = kmem_cache_create("ib_mad", sizeof(struct ib_mad_private), 0, @@ -3021,4 +3018,3 @@ static void __exit ib_mad_cleanup_module(void) module_init(ib_mad_init_module); module_exit(ib_mad_cleanup_module); - diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 1865049..8254371 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -109,10 +109,10 @@ static struct ib_client sa_client = { .remove = ib_sa_remove_one }; -static spinlock_t idr_lock; +static DEFINE_SPINLOCK(idr_lock); static DEFINE_IDR(query_idr); -static spinlock_t tid_lock; +static DEFINE_SPINLOCK(tid_lock); static u32 tid; #define PATH_REC_FIELD(field) \ @@ -1077,9 +1077,6 @@ static int __init ib_sa_init(void) { int ret; - spin_lock_init(&idr_lock); - spin_lock_init(&tid_lock); - get_random_bytes(&tid, sizeof tid); ret = ib_register_client(&sa_client); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index eb36a81..1a3ac3d 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -73,7 +73,7 @@ DEFINE_IDR(ib_uverbs_cq_idr); DEFINE_IDR(ib_uverbs_qp_idr); DEFINE_IDR(ib_uverbs_srq_idr); -static spinlock_t map_lock; +static DEFINE_SPINLOCK(map_lock); static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES]; static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -836,8 +836,6 @@ static int __init ib_uverbs_init(void) { int ret; - spin_lock_init(&map_lock); - ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { -- cgit v0.10.2 From 181c74e87eb93df447a759af93cf0a279875ea7d Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Sat, 5 Sep 2009 20:24:23 -0700 Subject: RDMA/amso1100: Use %pM conversion specifier Use the %pM conversion specifier to print a MAC address. Signed-off-by: Tobias Klauser Acked-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c index 0cfbb6d..8250740 100644 --- a/drivers/infiniband/hw/amso1100/c2.c +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -86,11 +86,7 @@ MODULE_DEVICE_TABLE(pci, c2_pci_table); static void c2_print_macaddr(struct net_device *netdev) { - pr_debug("%s: MAC %02X:%02X:%02X:%02X:%02X:%02X, " - "IRQ %u\n", netdev->name, - netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2], - netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5], - netdev->irq); + pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq); } static void c2_set_rxbufsize(struct c2_port *c2_port) -- cgit v0.10.2 From f1aa78b26e8dabc2956be94a93c40c6cc08eb4a3 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sat, 5 Sep 2009 20:24:24 -0700 Subject: IB: Use printk_once() for driver versions Replace open-coded reimplementations with printk_once(). Signed-off-by: Marcin Slusarz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c index 26fc0a4..9cc99df 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.c +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -105,11 +105,9 @@ static void rnic_init(struct iwch_dev *rnicp) static void open_rnic_dev(struct t3cdev *tdev) { struct iwch_dev *rnicp; - static int vers_printed; PDBG("%s t3cdev %p\n", __func__, tdev); - if (!vers_printed++) - printk(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n", + printk_once(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION); rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); if (!rnicp) { diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ae3d759..0b2f77a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -540,15 +540,11 @@ static struct device_attribute *mlx4_class_attributes[] = { static void *mlx4_ib_add(struct mlx4_dev *dev) { - static int mlx4_ib_version_printed; struct mlx4_ib_dev *ibdev; int num_ports = 0; int i; - if (!mlx4_ib_version_printed) { - printk(KERN_INFO "%s", mlx4_ib_version); - ++mlx4_ib_version_printed; - } + printk_once(KERN_INFO "%s", mlx4_ib_version); mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) num_ports++; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 13da9f1..2e4e043 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -1215,15 +1215,11 @@ int __mthca_restart_one(struct pci_dev *pdev) static int __devinit mthca_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { - static int mthca_version_printed = 0; int ret; mutex_lock(&mthca_device_mutex); - if (!mthca_version_printed) { - printk(KERN_INFO "%s", mthca_version); - ++mthca_version_printed; - } + printk_once(KERN_INFO "%s", mthca_version); if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { printk(KERN_ERR PFX "%s has invalid driver data %lx\n", -- cgit v0.10.2 From e1d7806df32bd247af6a2fe52433ecdd34fee773 Mon Sep 17 00:00:00 2001 From: Yossi Etigin Date: Sat, 5 Sep 2009 20:24:24 -0700 Subject: IB/core: Fix send multicast group leave retry Until now, retries were only sent when joining a multicast group. This patch will adds retries when leaving a multicast group as well. Signed-off-by: Ron Livne Signed-off-by: Yossi Etigin Acked-by: Sean Hefty Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 107f170..8d82ba1 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -106,6 +106,8 @@ struct mcast_group { struct ib_sa_query *query; int query_id; u16 pkey_index; + u8 leave_state; + int retries; }; struct mcast_member { @@ -350,6 +352,7 @@ static int send_leave(struct mcast_group *group, u8 leave_state) rec = group->rec; rec.join_state = leave_state; + group->leave_state = leave_state; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_SA_METHOD_DELETE, &rec, @@ -542,7 +545,11 @@ static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, { struct mcast_group *group = context; - mcast_work_handler(&group->work); + if (status && group->retries > 0 && + !send_leave(group, group->leave_state)) + group->retries--; + else + mcast_work_handler(&group->work); } static struct mcast_group *acquire_group(struct mcast_port *port, @@ -565,6 +572,7 @@ static struct mcast_group *acquire_group(struct mcast_port *port, if (!group) return NULL; + group->retries = 3; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; -- cgit v0.10.2 From b1b8afb83336b66ac7198111aad4ead41132b53f Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sat, 5 Sep 2009 20:24:24 -0700 Subject: IB/uverbs: Return ENOSYS for unimplemented commands (not EINVAL) Since the original commit 883a99c7 ("[IB] uverbs: Add a mask of device methods allowed for userspace"), the uverbs core returns EINVAL for commands not implemented by a specific low-level driver. This creates a problem that there is no way to tell the difference between an unimplemented command and an implemented one which is incorrectly invoked (which also returns EINVAL). The fix is to have unimplemented commands return ENOSYS. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 1a3ac3d..d3fff9e 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -584,14 +584,16 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (hdr.command < 0 || hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command] || - !(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) + !uverbs_cmd_table[hdr.command]) return -EINVAL; if (!file->ucontext && hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) return -EINVAL; + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) + return -ENOSYS; + return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr, hdr.in_words * 4, hdr.out_words * 4); } -- cgit v0.10.2 From 1493ab4083c315a978e51b8957bf87859703745d Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sat, 5 Sep 2009 20:24:24 -0700 Subject: RDMA/amso1100: Check kmalloc() result in c2_register_device() dev->ibdev.iwcm allocation may fail, prevent a dereference. Signed-off-by: Roel Kluin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index f1948fa..ad723bd 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -780,11 +780,11 @@ int c2_register_device(struct c2_dev *dev) /* Register pseudo network device */ dev->pseudo_netdev = c2_pseudo_netdev_init(dev); if (!dev->pseudo_netdev) - goto out3; + goto out; ret = register_netdev(dev->pseudo_netdev); if (ret) - goto out2; + goto out_free_netdev; pr_debug("%s:%u\n", __func__, __LINE__); strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX); @@ -851,6 +851,10 @@ int c2_register_device(struct c2_dev *dev) dev->ibdev.post_recv = c2_post_receive; dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL); + if (dev->ibdev.iwcm == NULL) { + ret = -ENOMEM; + goto out_unregister_netdev; + } dev->ibdev.iwcm->add_ref = c2_add_ref; dev->ibdev.iwcm->rem_ref = c2_rem_ref; dev->ibdev.iwcm->get_qp = c2_get_qp; @@ -862,23 +866,25 @@ int c2_register_device(struct c2_dev *dev) ret = ib_register_device(&dev->ibdev); if (ret) - goto out1; + goto out_free_iwcm; for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) { ret = device_create_file(&dev->ibdev.dev, c2_dev_attributes[i]); if (ret) - goto out0; + goto out_unregister_ibdev; } - goto out3; + goto out; -out0: +out_unregister_ibdev: ib_unregister_device(&dev->ibdev); -out1: +out_free_iwcm: + kfree(dev->ibdev.iwcm); +out_unregister_netdev: unregister_netdev(dev->pseudo_netdev); -out2: +out_free_netdev: free_netdev(dev->pseudo_netdev); -out3: +out: pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret); return ret; } -- cgit v0.10.2 From a01df0fe5e4db8c272dcc395354884ff7c15fc10 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:24:48 -0700 Subject: mlx4_core: Use pci_request_regions() The old code used two calls to pci_request_region() to get the two BARs for the mlx4 device, for no particularly good reason. Clean up the code a little by converting this to a single call to pci_request_regions(). Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index dac621b..5c1afe0 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -1070,18 +1070,12 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_disable_pdev; } - err = pci_request_region(pdev, 0, DRV_NAME); + err = pci_request_regions(pdev, DRV_NAME); if (err) { - dev_err(&pdev->dev, "Cannot request control region, aborting.\n"); + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); goto err_disable_pdev; } - err = pci_request_region(pdev, 2, DRV_NAME); - if (err) { - dev_err(&pdev->dev, "Cannot request UAR region, aborting.\n"); - goto err_release_bar0; - } - pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); @@ -1090,7 +1084,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); - goto err_release_bar2; + goto err_release_regions; } } err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); @@ -1101,7 +1095,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); - goto err_release_bar2; + goto err_release_regions; } } @@ -1110,7 +1104,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; - goto err_release_bar2; + goto err_release_regions; } dev = &priv->dev; @@ -1205,11 +1199,8 @@ err_cmd: err_free_dev: kfree(priv); -err_release_bar2: - pci_release_region(pdev, 2); - -err_release_bar0: - pci_release_region(pdev, 0); +err_release_regions: + pci_release_regions(pdev); err_disable_pdev: pci_disable_device(pdev); @@ -1265,8 +1256,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) pci_disable_msix(pdev); kfree(priv); - pci_release_region(pdev, 2); - pci_release_region(pdev, 0); + pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } -- cgit v0.10.2 From ff149b2a168296c74763cb4a6e7054bdb0a426a1 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:24:49 -0700 Subject: mlx4_core: Remove unnecessary includes of Lots of mlx4 files with no function annotations included for no reason. Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c index ac57b6a..ccfe276 100644 --- a/drivers/net/mlx4/cq.c +++ b/drivers/net/mlx4/cq.c @@ -34,7 +34,6 @@ * SOFTWARE. */ -#include #include #include diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index b9ceddd..c11a052 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include #include #include #include diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c index baf4bf6..04b382f 100644 --- a/drivers/net/mlx4/icm.c +++ b/drivers/net/mlx4/icm.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include #include #include #include diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c index 6053c35..5ccbce9 100644 --- a/drivers/net/mlx4/mcg.c +++ b/drivers/net/mlx4/mcg.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include #include #include diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index f96948b..ca7ab8e 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -32,7 +32,6 @@ * SOFTWARE. */ -#include #include #include diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c index 26d1a7a..c4988d6 100644 --- a/drivers/net/mlx4/pd.c +++ b/drivers/net/mlx4/pd.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include #include #include diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index bd22df9..ca25b9d 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -32,8 +32,6 @@ * SOFTWARE. */ -#include - #include "mlx4.h" #include "fw.h" diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c index 1c565ef..42ab9fc 100644 --- a/drivers/net/mlx4/qp.c +++ b/drivers/net/mlx4/qp.c @@ -33,8 +33,6 @@ * SOFTWARE. */ -#include - #include #include diff --git a/drivers/net/mlx4/reset.c b/drivers/net/mlx4/reset.c index 3951b88..e5741da 100644 --- a/drivers/net/mlx4/reset.c +++ b/drivers/net/mlx4/reset.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include #include #include #include diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c index fe9f218..1377d0d 100644 --- a/drivers/net/mlx4/srq.c +++ b/drivers/net/mlx4/srq.c @@ -31,8 +31,6 @@ * SOFTWARE. */ -#include - #include #include "mlx4.h" -- cgit v0.10.2 From 338a8fad27908f64a0d249cc9f5c7d4ddb7e5684 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:24:49 -0700 Subject: IB/mlx4: Annotate CQ locking mlx4_ib_lock_cqs()/mlx4_ib_unlock_cqs() are helper functions that lock/unlock both CQs attached to a QP in the proper order to avoid AB-BA deadlocks. Annotate this so sparse can understand what's going on (and warn us if we misuse these functions). Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index c4a0264..219b103 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -615,10 +615,12 @@ static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) } static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { spin_lock_irq(&send_cq->lock); - else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + __acquire(&recv_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { @@ -628,10 +630,12 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv } static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { + __release(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); - else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else { -- cgit v0.10.2 From fa0681d2129732027355d6b7083dd8932b9b799d Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:24:49 -0700 Subject: mlx4_core: Allocate and map sufficient ICM memory for EQ context The current implementation allocates a single host page for EQ context memory, which was OK when we only allocated a few EQs. However, since we now allocate an EQ for each CPU core, this patch removes the hard-coded limit (which we exceed with 4 KB pages and 128 byte EQ context entries with 32 CPUs) and uses the same ICM table code as all other context tables, which ends up simplifying the code quite a bit while fixing the problem. This problem was actually hit in practice on a dual-socket Nehalem box with 16 real hardware threads and sufficiently odd ACPI tables that it shows on boot SMP: Allowing 32 CPUs, 16 hotplug CPUs so num_possible_cpus() ends up 32, and mlx4 ends up creating 33 MSI-X interrupts and 33 EQs. This mlx4 bug means that mlx4 can't even initialize at all on this quite mainstream system. Cc: Reported-by: Eli Cohen Tested-by: Christoph Lameter Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index c11a052..d7974a6 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -525,48 +525,6 @@ static void mlx4_unmap_clr_int(struct mlx4_dev *dev) iounmap(priv->clr_base); } -int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - int ret; - - /* - * We assume that mapping one page is enough for the whole EQ - * context table. This is fine with all current HCAs, because - * we only use 32 EQs and each EQ uses 64 bytes of context - * memory, or 1 KB total. - */ - priv->eq_table.icm_virt = icm_virt; - priv->eq_table.icm_page = alloc_page(GFP_HIGHUSER); - if (!priv->eq_table.icm_page) - return -ENOMEM; - priv->eq_table.icm_dma = pci_map_page(dev->pdev, priv->eq_table.icm_page, 0, - PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); - if (pci_dma_mapping_error(dev->pdev, priv->eq_table.icm_dma)) { - __free_page(priv->eq_table.icm_page); - return -ENOMEM; - } - - ret = mlx4_MAP_ICM_page(dev, priv->eq_table.icm_dma, icm_virt); - if (ret) { - pci_unmap_page(dev->pdev, priv->eq_table.icm_dma, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL); - __free_page(priv->eq_table.icm_page); - } - - return ret; -} - -void mlx4_unmap_eq_icm(struct mlx4_dev *dev) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - - mlx4_UNMAP_ICM(dev, priv->eq_table.icm_virt, 1); - pci_unmap_page(dev->pdev, priv->eq_table.icm_dma, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL); - __free_page(priv->eq_table.icm_page); -} - int mlx4_alloc_eq_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 5c1afe0..528f89b 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -525,7 +525,10 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, goto err_unmap_aux; } - err = mlx4_map_eq_icm(dev, init_hca->eqc_base); + err = mlx4_init_icm_table(dev, &priv->eq_table.table, + init_hca->eqc_base, dev_cap->eqc_entry_sz, + dev->caps.num_eqs, dev->caps.num_eqs, + 0, 0); if (err) { mlx4_err(dev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_cmpt; @@ -668,7 +671,7 @@ err_unmap_mtt: mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); err_unmap_eq: - mlx4_unmap_eq_icm(dev); + mlx4_cleanup_icm_table(dev, &priv->eq_table.table); err_unmap_cmpt: mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); @@ -698,11 +701,11 @@ static void mlx4_free_icms(struct mlx4_dev *dev) mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); + mlx4_cleanup_icm_table(dev, &priv->eq_table.table); mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); - mlx4_unmap_eq_icm(dev); mlx4_UNMAP_ICM_AUX(dev); mlx4_free_icm(dev, priv->fw.aux_icm, 0); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 5bd79c2..bc72d6e 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -205,9 +205,7 @@ struct mlx4_eq_table { void __iomem **uar_map; u32 clr_mask; struct mlx4_eq *eq; - u64 icm_virt; - struct page *icm_page; - dma_addr_t icm_dma; + struct mlx4_icm_table table; struct mlx4_icm_table cmpt_table; int have_irq; u8 inta_pin; @@ -373,9 +371,6 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *init_hca); -int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt); -void mlx4_unmap_eq_icm(struct mlx4_dev *dev); - int mlx4_cmd_init(struct mlx4_dev *dev); void mlx4_cmd_cleanup(struct mlx4_dev *dev); void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); -- cgit v0.10.2 From 1af92e2a211b0d46ef60ea9f30f989fd7b5725fa Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Sat, 5 Sep 2009 20:24:49 -0700 Subject: mlx4_core: Avoid double free_icms On the error path of mlx4_init_hca(), mlx4_close_hca() is called, followed by mlx4_free_icms() and mlx4_UNMAP_FA(). But both those functions are also called from mlx4_close_hca(), which leads to a double free. Signed-off-by: Yevgeny Petrilin Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 528f89b..3dd481e 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -789,7 +789,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev) return 0; err_close: - mlx4_close_hca(dev); + mlx4_CLOSE_HCA(dev, 0); err_free_icm: mlx4_free_icms(dev); -- cgit v0.10.2 From f5f5951c7494b6ae89ec53ca7ca6b0177ebd1308 Mon Sep 17 00:00:00 2001 From: Arputham Benjamin Date: Sat, 5 Sep 2009 20:24:50 -0700 Subject: mlx4_core: Distinguish multiple devices in /proc/interrupts When the mlx4 driver uses the same name for interrupts for every device in the system. This can make it very confusing trying to work out exactly which device MSI-X interrupts are for. Change the driver to add the PCI name of the device to the interrupt name. Signed-off-by: Arputham Benjamin Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index d7974a6..bffb799 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -41,6 +41,10 @@ #include "fw.h" enum { + MLX4_IRQNAME_SIZE = 64 +}; + +enum { MLX4_NUM_ASYNC_EQE = 0x100, MLX4_NUM_SPARE_EQE = 0x80, MLX4_EQ_ENTRY_SIZE = 0x20 @@ -572,7 +576,9 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) priv->eq_table.clr_int = priv->clr_base + (priv->eq_table.inta_pin < 32 ? 4 : 0); - priv->eq_table.irq_names = kmalloc(16 * dev->caps.num_comp_vectors, GFP_KERNEL); + priv->eq_table.irq_names = + kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1), + GFP_KERNEL); if (!priv->eq_table.irq_names) { err = -ENOMEM; goto err_out_bitmap; @@ -595,17 +601,25 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) goto err_out_comp; if (dev->flags & MLX4_FLAG_MSI_X) { - static const char async_eq_name[] = "mlx4-async"; const char *eq_name; for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { if (i < dev->caps.num_comp_vectors) { - snprintf(priv->eq_table.irq_names + i * 16, 16, - "mlx4-comp-%d", i); - eq_name = priv->eq_table.irq_names + i * 16; - } else - eq_name = async_eq_name; + snprintf(priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-comp-%d@pci:%s", i, + pci_name(dev->pdev)); + } else { + snprintf(priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-async@pci:%s", + pci_name(dev->pdev)); + } + eq_name = priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE; err = request_irq(priv->eq_table.eq[i].irq, mlx4_msi_x_interrupt, 0, eq_name, priv->eq_table.eq + i); @@ -615,8 +629,12 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) priv->eq_table.eq[i].have_irq = 1; } } else { + snprintf(priv->eq_table.irq_names, + MLX4_IRQNAME_SIZE, + DRV_NAME "@pci:%s", + pci_name(dev->pdev)); err = request_irq(dev->pdev->irq, mlx4_interrupt, - IRQF_SHARED, DRV_NAME, dev); + IRQF_SHARED, priv->eq_table.irq_names, dev); if (err) goto err_out_async; -- cgit v0.10.2 From 3b4a8cd51e59c1c342c51b241bbb96c6ac24a147 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sat, 5 Sep 2009 20:24:50 -0700 Subject: IB/mlx4: Don't allow userspace open while recovering from catastrophic error Userspace apps are supposed to release all ib device resources if they receive a fatal async event (IBV_EVENT_DEVICE_FATAL). However, the app has no way of knowing when the device has come back up, except to repeatedly attempt ibv_open_device() until it succeeds. However, currently there is no protection against the open succeeding while the device is in being removed following the fatal event. In this case, the open will succeed, but as a result the device waits in the middle of its removal until the new app releases its resources -- and the new app will not do so, since the open succeeded at a point following the fatal event generation. This patch adds an "active" flag to the device. The active flag is set to false (in the fatal event flow) before the "fatal" event is generated, so any subsequent ibv_dev_open() call to the device will fail until the device comes back up, thus preventing the above deadlock. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ae3d759..313ce7f 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -342,6 +342,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx4_ib_alloc_ucontext_resp resp; int err; + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + resp.qp_tab_size = dev->dev->caps.num_qps; resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; @@ -673,6 +676,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_reg; } + ibdev->ib_active = true; + return ibdev; err_reg: @@ -729,6 +734,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: + ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; break; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 8a7dd67..3486d76 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -175,6 +175,7 @@ struct mlx4_ib_dev { spinlock_t sm_lock; struct mutex cap_mask_mutex; + bool ib_active; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) -- cgit v0.10.2 From fc1285585f5bfda18e3fe7f90afd47da51b82bd6 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:36:13 -0700 Subject: IB/mthca: Remove unnecessary include of mthca_config_reg.h was including for no reason -- the whole file is just defines of constants, so it's entirely self-contained. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h index 75671f7..155bc66 100644 --- a/drivers/infiniband/hw/mthca/mthca_config_reg.h +++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h @@ -34,8 +34,6 @@ #ifndef MTHCA_CONFIG_REG_H #define MTHCA_CONFIG_REG_H -#include - #define MTHCA_HCR_BASE 0x80680 #define MTHCA_HCR_SIZE 0x0001c #define MTHCA_ECR_BASE 0x80700 -- cgit v0.10.2 From deecb5d6728fff5288b6f7241ee8225ba0d8c6d2 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:36:15 -0700 Subject: IB/mthca: Remove unnecessary include of mthca_reset.c doesn't have any function annotations, so there's no reason to include . Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c index acb6817f..2a13a16 100644 --- a/drivers/infiniband/hw/mthca/mthca_reset.c +++ b/drivers/infiniband/hw/mthca/mthca_reset.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include #include #include -- cgit v0.10.2 From ffe063f32bacad7d5111ae9c2b3e31addb47ce39 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 5 Sep 2009 20:36:15 -0700 Subject: IB/mthca: Annotate CQ locking mthca_ib_lock_cqs()/mthca_ib_unlock_cqs() are helper functions that lock/unlock both CQs attached to a QP in the proper order to avoid AB-BA deadlocks. Annotate this so sparse can understand what's going on (and warn us if we misuse these functions). Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index f5081bf..c10576f 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1319,10 +1319,12 @@ int mthca_alloc_qp(struct mthca_dev *dev, } static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { spin_lock_irq(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { + __acquire(&recv_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { @@ -1332,10 +1334,12 @@ static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) } static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { + __release(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { + } else if (send_cq->cqn < recv_cq->cqn) { spin_unlock(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else { -- cgit v0.10.2 From d94a86890137fabcc97eaa324bfef3f1827744c8 Mon Sep 17 00:00:00 2001 From: Arputham Benjamin Date: Sat, 5 Sep 2009 20:36:15 -0700 Subject: IB/mthca: Distinguish multiple devices in /proc/interrupts When the mthca driver uses the same name for interrupts for every device in the system. This can make it very confusing trying to work out exactly which device MSI-X interrupts are for. Change the driver to add the PCI name of the device to the interrupt name. Signed-off-by: Arputham Benjamin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index 90e4e45..8c31fa3 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -829,27 +829,34 @@ int mthca_init_eq_table(struct mthca_dev *dev) if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { static const char *eq_name[] = { - [MTHCA_EQ_COMP] = DRV_NAME " (comp)", - [MTHCA_EQ_ASYNC] = DRV_NAME " (async)", - [MTHCA_EQ_CMD] = DRV_NAME " (cmd)" + [MTHCA_EQ_COMP] = DRV_NAME "-comp", + [MTHCA_EQ_ASYNC] = DRV_NAME "-async", + [MTHCA_EQ_CMD] = DRV_NAME "-cmd" }; for (i = 0; i < MTHCA_NUM_EQ; ++i) { + snprintf(dev->eq_table.eq[i].irq_name, + IB_DEVICE_NAME_MAX, + "%s@pci:%s", eq_name[i], + pci_name(dev->pdev)); err = request_irq(dev->eq_table.eq[i].msi_x_vector, mthca_is_memfree(dev) ? mthca_arbel_msi_x_interrupt : mthca_tavor_msi_x_interrupt, - 0, eq_name[i], dev->eq_table.eq + i); + 0, dev->eq_table.eq[i].irq_name, + dev->eq_table.eq + i); if (err) goto err_out_cmd; dev->eq_table.eq[i].have_irq = 1; } } else { + snprintf(dev->eq_table.eq[0].irq_name, IB_DEVICE_NAME_MAX, + DRV_NAME "@pci:%s", pci_name(dev->pdev)); err = request_irq(dev->pdev->irq, mthca_is_memfree(dev) ? mthca_arbel_interrupt : mthca_tavor_interrupt, - IRQF_SHARED, DRV_NAME, dev); + IRQF_SHARED, dev->eq_table.eq[0].irq_name, dev); if (err) goto err_out_cmd; dev->eq_table.have_irq = 1; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index c621f87..90f4c4d 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -113,6 +113,7 @@ struct mthca_eq { int nent; struct mthca_buf_list *page_list; struct mthca_mr mr; + char irq_name[IB_DEVICE_NAME_MAX]; }; struct mthca_av; -- cgit v0.10.2 From d84106477733cb155c5dcaea664ddf120bf69eb7 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sat, 5 Sep 2009 20:36:16 -0700 Subject: IB/mthca: Don't allow userspace open while recovering from catastrophic error Userspace apps are supposed to release all ib device resources if they receive a fatal async event (IBV_EVENT_DEVICE_FATAL). However, the app has no way of knowing when the device has come back up, except to repeatedly attempt ibv_open_device() until it succeeds. However, currently there is no protection against the open succeeding while the device is in being removed following the fatal event. In this case, the open will succeed, but as a result the device waits in the middle of its removal until the new app releases its resources -- and the new app will not do so, since the open succeeded at a point following the fatal event generation. This patch adds an "active" flag to the device. The active flag is set to false (in the fatal event flow) before the "fatal" event is generated, so any subsequent ibv_dev_open() call to the device will fail until the device comes back up, thus preventing the above deadlock. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c index 65ad359..056b2a4 100644 --- a/drivers/infiniband/hw/mthca/mthca_catas.c +++ b/drivers/infiniband/hw/mthca/mthca_catas.c @@ -88,6 +88,7 @@ static void handle_catas(struct mthca_dev *dev) event.device = &dev->ib_dev; event.event = IB_EVENT_DEVICE_FATAL; event.element.port_num = 0; + dev->active = false; ib_dispatch_event(&event); diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 9ef611f..7e6a6d6 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -357,6 +357,7 @@ struct mthca_dev { struct ib_ah *sm_ah[MTHCA_MAX_PORTS]; spinlock_t sm_lock; u8 rate[MTHCA_MAX_PORTS]; + bool active; }; #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 13da9f1..518cc54 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -1116,6 +1116,8 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) pci_set_drvdata(pdev, mdev); mdev->hca_type = hca_type; + mdev->active = true; + return 0; err_unregister: diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 87ad889..bcf7a40 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -334,6 +334,9 @@ static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, struct mthca_ucontext *context; int err; + if (!(to_mdev(ibdev)->active)) + return ERR_PTR(-EAGAIN); + memset(&uresp, 0, sizeof uresp); uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps; -- cgit v0.10.2 From c4c3f279cd8e9cc1d3e2f364a27beadb2e69cda8 Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:36 -0700 Subject: RDMA/nes: Update refcnt during disconnect During termination, it is possible for the refcnt to go to zero while the worker thread is posting events upward. This fix increments the refcnt before the request is passed to the worker thread. The thread decrements the refcnt when the request is completed. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 114b802..fe08eb5 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -2456,6 +2456,7 @@ int nes_cm_disconn(struct nes_qp *nesqp) if (nesqp->disconn_pending == 0) { nesqp->disconn_pending++; spin_unlock_irqrestore(&nesqp->lock, flags); + nes_add_ref(&nesqp->ibqp); /* init our disconnect work element, to */ INIT_WORK(&nesqp->disconn_work, nes_disconnect_worker); @@ -2477,6 +2478,7 @@ static void nes_disconnect_worker(struct work_struct *work) nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", nesqp->last_aeq, nesqp->hwqp.qp_id); nes_cm_disconn_true(nesqp); + nes_rem_ref(&nesqp->ibqp); } -- cgit v0.10.2 From 873fcdd4bfc75880888a7d148a71d70ed87c9ebf Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:37 -0700 Subject: RDMA/nes: Allocate work item for disconnect event handling The code currently has a work structure in the QP. This requires a lock and a pending flag to ensure there is never more than one request active. When two events happen quickly (such as FIN and LLP CLOSE), it causes unnecessary timeouts since the second one is dropped. This fix allocates memory for the work request so the second one can be queued. A lock is removed since it is no longer needed. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index fe08eb5..d89bdee 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -2450,20 +2450,16 @@ static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_nod */ int nes_cm_disconn(struct nes_qp *nesqp) { - unsigned long flags; + struct disconn_work *work; - spin_lock_irqsave(&nesqp->lock, flags); - if (nesqp->disconn_pending == 0) { - nesqp->disconn_pending++; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_add_ref(&nesqp->ibqp); - /* init our disconnect work element, to */ - INIT_WORK(&nesqp->disconn_work, nes_disconnect_worker); - - queue_work(g_cm_core->disconn_wq, &nesqp->disconn_work); - } else - spin_unlock_irqrestore(&nesqp->lock, flags); + work = kzalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; /* Timer will clean up */ + nes_add_ref(&nesqp->ibqp); + work->nesqp = nesqp; + INIT_WORK(&work->work, nes_disconnect_worker); + queue_work(g_cm_core->disconn_wq, &work->work); return 0; } @@ -2473,8 +2469,10 @@ int nes_cm_disconn(struct nes_qp *nesqp) */ static void nes_disconnect_worker(struct work_struct *work) { - struct nes_qp *nesqp = container_of(work, struct nes_qp, disconn_work); + struct disconn_work *dwork = container_of(work, struct disconn_work, work); + struct nes_qp *nesqp = dwork->nesqp; + kfree(dwork); nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", nesqp->last_aeq, nesqp->hwqp.qp_id); nes_cm_disconn_true(nesqp); @@ -2557,7 +2555,6 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) spin_lock_irqsave(&nesqp->lock, flags); } - nesqp->disconn_pending = 0; /* There might have been another AE while the lock was released */ original_hw_tcp_state = nesqp->hw_tcp_state; original_ibqp_state = nesqp->ibqp_state; @@ -2610,7 +2607,6 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) } } } else { - nesqp->disconn_pending = 0; spin_unlock_irqrestore(&nesqp->lock, flags); } diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 41c07f2..7df34fe 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -119,6 +119,11 @@ struct nes_wq { spinlock_t lock; }; +struct disconn_work { + struct work_struct work; + struct nes_qp *nesqp; +}; + struct iw_cm_id; struct ietf_mpa_frame; @@ -127,7 +132,6 @@ struct nes_qp { void *allocated_buffer; struct iw_cm_id *cm_id; struct workqueue_struct *wq; - struct work_struct disconn_work; struct nes_cq *nesscq; struct nes_cq *nesrcq; struct nes_pd *nespd; @@ -165,7 +169,6 @@ struct nes_qp { u8 hw_iwarp_state; u8 flush_issued; u8 hw_tcp_state; - u8 disconn_pending; u8 destroyed; }; #endif /* NES_VERBS_H */ -- cgit v0.10.2 From ba0c5d9a8975cf740a4a4b8c579cc4b325f8b852 Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:37 -0700 Subject: RDMA/nes: Change memory allocation for cqp request to GFP_ATOMIC The routine to allocate a cqp request is not called from process context code. Since it is not OK to sleep, it needs to use GFP_ATOMIC not GFP_KERNEL. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index a282031..b34072b 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -548,7 +548,7 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) spin_unlock_irqrestore(&nesdev->cqp.lock, flags); } if (cqp_request == NULL) { - cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_KERNEL); + cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC); if (cqp_request) { cqp_request->dynamic = 1; INIT_LIST_HEAD(&cqp_request->list); -- cgit v0.10.2 From 5ee21fe0eaf68fb840f442131ab7addced1a31c3 Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:37 -0700 Subject: RDMA/nes: Clean out CQ completions when QP is destroyed When a QP is destroyed, unprocessed CQ entries could still reference the QP. This change zeroes the context value at QP destroy time. By skipping over cqe's with a zero context, poll_cq no longer processes a cqe for a destroyed QP. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 21e0fd3..c6b5873 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1506,12 +1506,45 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, /** + * nes_clean_cq + */ +static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq) +{ + u32 cq_head; + u32 lo; + u32 hi; + u64 u64temp; + unsigned long flags = 0; + + spin_lock_irqsave(&nescq->lock, flags); + + cq_head = nescq->hw_cq.cq_head; + while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { + rmb(); + lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]); + u64temp = (((u64)hi) << 32) | ((u64)lo); + u64temp &= ~(NES_SW_CONTEXT_ALIGN-1); + if (u64temp == (u64)(unsigned long)nesqp) { + /* Zero the context value so cqe will be ignored */ + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0; + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0; + } + + if (++cq_head >= nescq->hw_cq.cq_size) + cq_head = 0; + } + + spin_unlock_irqrestore(&nescq->lock, flags); +} + + +/** * nes_destroy_qp */ static int nes_destroy_qp(struct ib_qp *ibqp) { struct nes_qp *nesqp = to_nesqp(ibqp); - /* struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); */ struct nes_ucontext *nes_ucontext; struct ib_qp_attr attr; struct iw_cm_id *cm_id; @@ -1548,7 +1581,6 @@ static int nes_destroy_qp(struct ib_qp *ibqp) nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret); } - if (nesqp->user_mode) { if ((ibqp->uobject)&&(ibqp->uobject->context)) { nes_ucontext = to_nesucontext(ibqp->uobject->context); @@ -1560,6 +1592,13 @@ static int nes_destroy_qp(struct ib_qp *ibqp) } if (nesqp->pbl_pbase) kunmap(nesqp->page); + } else { + /* Clean any pending completions from the cq(s) */ + if (nesqp->nesscq) + nes_clean_cq(nesqp, nesqp->nesscq); + + if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq)) + nes_clean_cq(nesqp, nesqp->nesrcq); } nes_rem_ref(&nesqp->ibqp); @@ -3547,7 +3586,6 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) { u64 u64temp; u64 wrid; - /* u64 u64temp; */ unsigned long flags = 0; struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); struct nes_device *nesdev = nesvnic->nesdev; @@ -3560,7 +3598,6 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) u32 cqe_count = 0; u32 wqe_index; u32 u32temp; - /* u32 counter; */ nes_debug(NES_DBG_CQ, "\n"); @@ -3570,24 +3607,27 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) cq_size = nescq->hw_cq.cq_size; while (cqe_count < num_entries) { - if (le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & - NES_CQE_VALID) { - /* - * Make sure we read CQ entry contents *after* - * we've checked the valid bit. - */ - rmb(); - - cqe = nescq->hw_cq.cq_vbase[head]; - nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; - u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); - wqe_index = u32temp & - (nesdev->nesadapter->max_qp_wr - 1); - u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); - /* parse CQE, get completion context from WQE (either rq or sq */ - u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | - ((u64)u32temp); - nesqp = *((struct nes_qp **)&u64temp); + if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & + NES_CQE_VALID) == 0) + break; + + /* + * Make sure we read CQ entry contents *after* + * we've checked the valid bit. + */ + rmb(); + + cqe = nescq->hw_cq.cq_vbase[head]; + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1); + u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); + /* parse CQE, get completion context from WQE (either rq or sq) */ + u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | + ((u64)u32temp); + + if (u64temp) { + nesqp = (struct nes_qp *)(unsigned long)u64temp; memset(entry, 0, sizeof *entry); if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) { entry->status = IB_WC_SUCCESS; @@ -3601,7 +3641,7 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) { if (nesqp->skip_lsmm) { nesqp->skip_lsmm = 0; - wq_tail = nesqp->hwqp.sq_tail++; + nesqp->hwqp.sq_tail++; } /* Working on a SQ Completion*/ @@ -3643,24 +3683,25 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); entry->opcode = IB_WC_RECV; } - entry->wr_id = wrid; - if (++head >= cq_size) - head = 0; - cqe_count++; - nescq->polled_completions++; - if ((nescq->polled_completions > (cq_size / 2)) || - (nescq->polled_completions == 255)) { - nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" - " are pending %u of %u.\n", - nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); - nes_write32(nesdev->regs+NES_CQE_ALLOC, - nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); - nescq->polled_completions = 0; - } + entry->wr_id = wrid; entry++; - } else - break; + cqe_count++; + } + + if (++head >= cq_size) + head = 0; + nescq->polled_completions++; + + if ((nescq->polled_completions > (cq_size / 2)) || + (nescq->polled_completions == 255)) { + nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" + " are pending %u of %u.\n", + nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } } if (nescq->polled_completions) { -- cgit v0.10.2 From 3c28b4457a4cf95e982ca13578a5613a11009394 Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:38 -0700 Subject: RDMA/nes: Add CQ error handling CQ errors are not being handled correctly. Put in the the upcall for CQ errors. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 4a84d02..2a0c5a1 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -2913,6 +2913,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, u64 aeqe_context = 0; unsigned long flags; struct nes_qp *nesqp; + struct nes_hw_cq *hw_cq; + struct nes_cq *nescq; int resource_allocated; /* struct iw_cm_id *cm_id; */ struct nes_adapter *nesadapter = nesdev->nesadapter; @@ -3153,6 +3155,16 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, if (resource_allocated) { printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n", __func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + hw_cq = (struct nes_hw_cq *)(unsigned long)context; + if (hw_cq) { + nescq = container_of(hw_cq, struct nes_cq, hw_cq); + if (nescq->ibcq.event_handler) { + ibevent.device = nescq->ibcq.device; + ibevent.event = IB_EVENT_CQ_ERR; + ibevent.element.cq = &nescq->ibcq; + nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context); + } + } } break; case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: -- cgit v0.10.2 From 8b1c9dc4ba713985d33aba87c761bf71d5a96491 Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:38 -0700 Subject: RDMA/nes: Implement Terminate Packet Implement the sending and receiving of Terminate packets. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index bf1720f..bcc6abc 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -523,7 +523,7 @@ int nes_cm_disconn(struct nes_qp *); void nes_cm_disconn_worker(void *); /* nes_verbs.c */ -int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32); +int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32); int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *); struct nes_ib_device *nes_init_ofa_device(struct net_device *); void nes_destroy_ofa_device(struct nes_ib_device *); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 8b7e7c0..90e8e4d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -410,8 +410,6 @@ struct nes_cm_ops { int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *, enum nes_timer_type, int, int); -int nes_cm_disconn(struct nes_qp *); - int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *); int nes_reject(struct iw_cm_id *, const void *, u8); int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *); diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 2a0c5a1..297026f 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -74,6 +74,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, static void process_critical_error(struct nes_device *nesdev); static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); +static void nes_terminate_timeout(unsigned long context); +static void nes_terminate_start_timer(struct nes_qp *nesqp); #ifdef CONFIG_INFINIBAND_NES_DEBUG static unsigned char *nes_iwarp_state_str[] = { @@ -2903,6 +2905,383 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) } +static u8 *locate_mpa(u8 *pkt, u32 aeq_info) +{ + u16 pkt_len; + + if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) { + /* skip over ethernet header */ + pkt_len = be16_to_cpu(*(u16 *)(pkt + ETH_HLEN - 2)); + pkt += ETH_HLEN; + + /* Skip over IP and TCP headers */ + pkt += 4 * (pkt[0] & 0x0f); + pkt += 4 * ((pkt[12] >> 4) & 0x0f); + } + return pkt; +} + +/* Determine if incoming error pkt is rdma layer */ +static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info) +{ + u8 *pkt; + u16 *mpa; + u32 opcode = 0xffffffff; + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u16 *)locate_mpa(pkt, aeq_info); + opcode = be16_to_cpu(mpa[1]) & 0xf; + } + + return opcode; +} + +/* Build iWARP terminate header */ +static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info) +{ + u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + u16 ddp_seg_len; + int copy_len = 0; + u8 is_tagged = 0; + struct nes_terminate_hdr *termhdr; + + termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase; + memset(termhdr, 0, 64); + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + + /* Use data from offending packet to fill in ddp & rdma hdrs */ + pkt = locate_mpa(pkt, aeq_info); + ddp_seg_len = be16_to_cpu(*(u16 *)pkt); + if (ddp_seg_len) { + copy_len = 2; + termhdr->hdrct = DDP_LEN_FLAG; + if (pkt[2] & 0x80) { + is_tagged = 1; + if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) { + copy_len += TERM_DDP_LEN_TAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + } else { + if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) { + copy_len += TERM_DDP_LEN_UNTAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + + if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) { + if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) { + copy_len += TERM_RDMA_LEN; + termhdr->hdrct |= RDMA_HDR_FLAG; + } + } + } + } + } + + switch (async_event_id) { + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_STAG; + break; + default: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_INVALID_STAG: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + break; + case NES_AEQE_AEID_AMP_BAD_QP: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_BOUNDS; + } else { + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_BOUNDS; + } + break; + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_ACCESS; + break; + case NES_AEQE_AEID_AMP_TO_WRAP: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_TO_WRAP; + break; + case NES_AEQE_AEID_AMP_BAD_PD: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_UNASSOC_STAG; + break; + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_UNASSOC_STAG; + } + break; + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_MARKER; + break; + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_CRC; + break; + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE; + break; + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + if (is_tagged) { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_DDP_VER; + } else { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER; + } + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MO; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_INV_RDMAP_VER; + break; + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNEXPECTED_OP; + break; + default: + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNSPECIFIED; + break; + } + + if (copy_len) + memcpy(termhdr + 1, pkt, copy_len); + + return sizeof(struct nes_terminate_hdr) + copy_len; +} + +static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp, + struct nes_hw_aeqe *aeqe, enum ib_event_type eventtype) +{ + u64 context; + unsigned long flags; + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + u32 termlen = 0; + u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_FIN; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if (nesqp->term_flags & NES_TERM_SENT) + return; /* Sanity check */ + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; + if (!context) { + WARN_ON(!context); + return; + } + + nesqp = (struct nes_qp *)(unsigned long)context; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + nesqp->terminate_eventtype = eventtype; + spin_unlock_irqrestore(&nesqp->lock, flags); + + if (nesadapter->send_term_ok) + termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info); + else + mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG; + + nes_terminate_start_timer(nesqp); + nesqp->term_flags |= NES_TERM_SENT; + nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0); +} + +static void nes_terminate_send_fin(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + unsigned long flags; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Send the fin only */ + nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0); +} + +/* Cleanup after a terminate sent or received */ +static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred) +{ + u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + unsigned long flags; + struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device); + struct nes_device *nesdev = nesvnic->nesdev; + u8 first_time = 0; + + spin_lock_irqsave(&nesqp->lock, flags); + if (nesqp->hte_added) { + nesqp->hte_added = 0; + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + } + + first_time = (nesqp->term_flags & NES_TERM_DONE) == 0; + nesqp->term_flags |= NES_TERM_DONE; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Make sure we go through this only once */ + if (first_time) { + if (timeout_occurred == 0) + del_timer(&nesqp->terminate_timer); + else + next_iwarp_state |= NES_CQP_QP_RESET; + + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } +} + +static void nes_terminate_received(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u8 *pkt; + u32 *mpa; + u8 ddp_ctl; + u8 rdma_ctl; + u16 aeq_id = 0; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + /* Terminate is not a performance path so the silicon */ + /* did not validate the frame - do it now */ + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u32 *)locate_mpa(pkt, aeq_info); + ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff; + rdma_ctl = be32_to_cpu(mpa[0]) & 0xff; + if ((ddp_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC; + else if ((ddp_ctl & 0x03) != 1) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION; + else if (be32_to_cpu(mpa[2]) != 2) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN; + else if (be32_to_cpu(mpa[3]) != 1) + aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN; + else if (be32_to_cpu(mpa[4]) != 0) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO; + else if ((rdma_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION; + + if (aeq_id) { + /* Bad terminate recvd - send back a terminate */ + aeq_info = (aeq_info & 0xffff0000) | aeq_id; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + return; + } + } + + nesqp->term_flags |= NES_TERM_RCVD; + nesqp->terminate_eventtype = IB_EVENT_QP_FATAL; + nes_terminate_start_timer(nesqp); + nes_terminate_send_fin(nesdev, nesqp, aeqe); +} + +/* Timeout routine in case terminate fails to complete */ +static void nes_terminate_timeout(unsigned long context) +{ + struct nes_qp *nesqp = (struct nes_qp *)(unsigned long)context; + + nes_terminate_done(nesqp, 1); +} + +/* Set a timer in case hw cannot complete the terminate sequence */ +static void nes_terminate_start_timer(struct nes_qp *nesqp) +{ + init_timer(&nesqp->terminate_timer); + nesqp->terminate_timer.function = nes_terminate_timeout; + nesqp->terminate_timer.expires = jiffies + HZ; + nesqp->terminate_timer.data = (unsigned long)nesqp; + add_timer(&nesqp->terminate_timer); +} + /** * nes_process_iwarp_aeqe */ @@ -2910,30 +3289,27 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, struct nes_hw_aeqe *aeqe) { u64 context; - u64 aeqe_context = 0; unsigned long flags; struct nes_qp *nesqp; struct nes_hw_cq *hw_cq; struct nes_cq *nescq; int resource_allocated; - /* struct iw_cm_id *cm_id; */ struct nes_adapter *nesadapter = nesdev->nesadapter; - struct ib_event ibevent; - /* struct iw_cm_event cm_event; */ u32 aeq_info; u32 next_iwarp_state = 0; u16 async_event_id; u8 tcp_state; u8 iwarp_state; + int must_disconn = 1; + int must_terminate = 0; + struct ib_event ibevent; nes_debug(NES_DBG_AEQ, "\n"); aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); - if ((NES_AEQE_INBOUND_RDMA&aeq_info) || (!(NES_AEQE_QP&aeq_info))) { + if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) { context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; } else { - aeqe_context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); - aeqe_context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; context = (unsigned long)nesadapter->qp_table[le32_to_cpu( aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; BUG_ON(!context); @@ -2950,7 +3326,11 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, switch (async_event_id) { case NES_AEQE_AEID_LLP_FIN_RECEIVED: - nesqp = *((struct nes_qp **)&context); + nesqp = (struct nes_qp *)(unsigned long)context; + + if (nesqp->term_flags) + return; /* Ignore it, wait for close complete */ + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { nesqp->cm_id->add_ref(nesqp->cm_id); schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp, @@ -2961,18 +3341,24 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), async_event_id, nesqp->last_aeq, tcp_state); } + if ((tcp_state != NES_AEQE_TCP_STATE_CLOSE_WAIT) || (nesqp->ibqp_state != IB_QPS_RTS)) { /* FIN Received but tcp state or IB state moved on, should expect a close complete */ return; } + case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: + nesqp = (struct nes_qp *)(unsigned long)context; + if (nesqp->term_flags) { + nes_terminate_done(nesqp, 0); + return; + } + case NES_AEQE_AEID_LLP_CONNECTION_RESET: - case NES_AEQE_AEID_TERMINATE_SENT: - case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: case NES_AEQE_AEID_RESET_SENT: - nesqp = *((struct nes_qp **)&context); + nesqp = (struct nes_qp *)(unsigned long)context; if (async_event_id == NES_AEQE_AEID_RESET_SENT) { tcp_state = NES_AEQE_TCP_STATE_CLOSED; } @@ -2984,12 +3370,7 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, if ((tcp_state == NES_AEQE_TCP_STATE_CLOSED) || (tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT)) { nesqp->hte_added = 0; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u to remove hte\n", - nesqp->hwqp.qp_id); - nes_hw_modify_qp(nesdev, nesqp, - NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE, 0); - spin_lock_irqsave(&nesqp->lock, flags); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE; } if ((nesqp->ibqp_state == IB_QPS_RTS) && @@ -3001,151 +3382,106 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; break; case NES_AEQE_IWARP_STATE_TERMINATE: - next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; - if (async_event_id == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { - next_iwarp_state |= 0x02000000; - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; - } + must_disconn = 0; /* terminate path takes care of disconn */ + if (nesqp->term_flags == 0) + must_terminate = 1; break; - default: - next_iwarp_state = 0; - } - spin_unlock_irqrestore(&nesqp->lock, flags); - if (next_iwarp_state) { - nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X," - " also added another reference\n", - nesqp->hwqp.qp_id, next_iwarp_state); - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); } - nes_cm_disconn(nesqp); } else { if (async_event_id == NES_AEQE_AEID_LLP_FIN_RECEIVED) { /* FIN Received but ib state not RTS, close complete will be on its way */ - spin_unlock_irqrestore(&nesqp->lock, flags); - return; - } - spin_unlock_irqrestore(&nesqp->lock, flags); - if (async_event_id == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { - next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000; - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; - nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X," - " also added another reference\n", - nesqp->hwqp.qp_id, next_iwarp_state); - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); + must_disconn = 0; } - nes_cm_disconn(nesqp); } - break; - case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TERMINATE_RECEIVED" - " event on QP%u \n Q2 Data:\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_FATAL; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || - ((nesqp->ibqp_state == IB_QPS_RTS)&& - (async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + + if (must_terminate) + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + else if (must_disconn) { + if (next_iwarp_state) { + nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X\n", + nesqp->hwqp.qp_id, next_iwarp_state); + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + } nes_cm_disconn(nesqp); - } else { - nesqp->in_disconnect = 0; - wake_up(&nesqp->kick_waitq); } break; - case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR; - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; - nesqp->last_aeq = async_event_id; - if (nesqp->cm_id) { - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TOO_MANY_RETRIES" - " event on QP%u, remote IP = 0x%08X \n", - nesqp->hwqp.qp_id, - ntohl(nesqp->cm_id->remote_addr.sin_addr.s_addr)); - } else { - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TOO_MANY_RETRIES" - " event on QP%u \n", - nesqp->hwqp.qp_id); - } - spin_unlock_irqrestore(&nesqp->lock, flags); - next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_RESET; - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_FATAL; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } + + case NES_AEQE_AEID_TERMINATE_SENT: + nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_send_fin(nesdev, nesqp, aeqe); break; - case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: - if (NES_AEQE_INBOUND_RDMA&aeq_info) { - nesqp = nesadapter->qp_table[le32_to_cpu( - aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; - } else { - /* TODO: get the actual WQE and mask off wqe index */ - context &= ~((u64)511); - nesqp = *((struct nes_qp **)&context); - } - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_AMP_BAD_STAG_INDEX event on QP%u\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } + + case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: + nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_received(nesdev, nesqp, aeqe); break; + + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_AMP_UNALLOCATED_STAG event on QP%u\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - break; + case NES_AEQE_AEID_AMP_INVALID_STAG: + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: case NES_AEQE_AEID_PRIV_OPERATION_DENIED: - nesqp = nesadapter->qp_table[le32_to_cpu(aeqe->aeqe_words - [NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_PRIV_OPERATION_DENIED event on QP%u," - " nesqp = %p, AE reported %p\n", - nesqp->hwqp.qp_id, nesqp, *((struct nes_qp **)&context)); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + case NES_AEQE_AEID_AMP_TO_WRAP: + nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR); + break; + + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + nesqp = (struct nes_qp *)(unsigned long)context; + if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) { + aeq_info &= 0xffff0000; + aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); } + + case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: + case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + case NES_AEQE_AEID_AMP_BAD_QP: + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + case NES_AEQE_AEID_AMP_BAD_PD: + case NES_AEQE_AEID_AMP_FASTREG_SHARED: + case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG: + case NES_AEQE_AEID_AMP_FASTREG_MW_STAG: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH: + case NES_AEQE_AEID_AMP_INVALIDATE_SHARED: + case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS: + case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG: + case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS: + case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT: + case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED: + case NES_AEQE_AEID_BAD_CLOSE: + case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO: + case NES_AEQE_AEID_STAG_ZERO_INVALID: + case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST: + case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: + nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); break; + case NES_AEQE_AEID_CQ_OPERATION_ERROR: context <<= 1; nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n", @@ -3167,81 +3503,7 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, } } break; - case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: - nesqp = nesadapter->qp_table[le32_to_cpu( - aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG" - "_FOR_AVAILABLE_BUFFER event on QP%u\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - /* tell cm to disconnect, cm will queue work to thread */ - nes_cm_disconn(nesqp); - break; - case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_DDP_UBE_INVALID_MSN" - "_NO_BUFFER_AVAILABLE event on QP%u\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_FATAL; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - /* tell cm to disconnect, cm will queue work to thread */ - nes_cm_disconn(nesqp); - break; - case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR" - " event on QP%u \n Q2 Data:\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_FATAL; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - /* tell cm to disconnect, cm will queue work to thread */ - nes_cm_disconn(nesqp); - break; - /* TODO: additional AEs need to be here */ - case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, - nesqp->ibqp.qp_context); - } - nes_cm_disconn(nesqp); - break; + default: nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n", async_event_id); @@ -3250,7 +3512,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, } - /** * nes_iwarp_ce_handler */ diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index c3654c6..4a0bfcd 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -241,6 +241,7 @@ enum nes_cqp_stag_wqeword_idx { }; #define NES_CQP_OP_IWARP_STATE_SHIFT 28 +#define NES_CQP_OP_TERMLEN_SHIFT 28 enum nes_cqp_qp_bits { NES_CQP_QP_ARP_VALID = (1<<8), @@ -265,6 +266,8 @@ enum nes_cqp_qp_bits { NES_CQP_QP_IWARP_STATE_TERMINATE = (5< 21)) || ((major_ver > 2) && (major_ver != 255))) { nesadapter->virtwq = 1; } + if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3)) + nesadapter->send_term_ok = 1; + nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) + (u32)((u8)eeprom_data); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index c6b5873..36666ac 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2923,7 +2923,7 @@ static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, * nes_hw_modify_qp */ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, - u32 next_iwarp_state, u32 wait_completion) + u32 next_iwarp_state, u32 termlen, u32 wait_completion) { struct nes_hw_cqp_wqe *cqp_wqe; /* struct iw_cm_id *cm_id = nesqp->cm_id; */ @@ -2955,6 +2955,13 @@ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase); + /* If sending a terminate message, fill in the length (in words) */ + if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) && + !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) { + termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen); + } + atomic_set(&cqp_request->refcount, 2); nes_post_cqp_request(nesdev, cqp_request); @@ -3125,6 +3132,9 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n", nesqp->hwqp.qp_id); + if (nesqp->term_flags) + del_timer(&nesqp->terminate_timer); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ if (nesqp->hte_added) { @@ -3202,7 +3212,7 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (issue_modify_qp) { nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n"); - ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 1); + ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1); if (ret) nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)" " failed for QP%u.\n", @@ -3367,6 +3377,12 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, head = nesqp->hwqp.sq_head; while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + /* Check for SQ overflow */ if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { err = -EINVAL; @@ -3523,6 +3539,12 @@ static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, head = nesqp->hwqp.rq_head; while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { err = -EINVAL; break; diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 7df34fe..d92b1ef 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -40,6 +40,10 @@ struct nes_device; #define NES_MAX_USER_DB_REGIONS 4096 #define NES_MAX_USER_WQ_REGIONS 4096 +#define NES_TERM_SENT 0x01 +#define NES_TERM_RCVD 0x02 +#define NES_TERM_DONE 0x04 + struct nes_ucontext { struct ib_ucontext ibucontext; struct nes_device *nesdev; @@ -159,6 +163,8 @@ struct nes_qp { void *pbl_vbase; dma_addr_t pbl_pbase; struct page *page; + struct timer_list terminate_timer; + enum ib_event_type terminate_eventtype; wait_queue_head_t kick_waitq; u16 in_disconnect; u16 private_data_len; @@ -169,6 +175,7 @@ struct nes_qp { u8 hw_iwarp_state; u8 flush_issued; u8 hw_tcp_state; + u8 term_flags; u8 destroyed; }; #endif /* NES_VERBS_H */ -- cgit v0.10.2 From 4b281faec3ad00f7fb00080078321e4d819795eb Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:38 -0700 Subject: RDMA/nes: Use flush mechanism to set status for wqe in error When an asynchronous event occurs that requires a terminate, it is sometimes possible to identify the wqe in error. This change uses flush to get this information to the poll routine. The flush operation puts the status into the cqe. If this information is not available, it continues to use the more generic flush code as before. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 297026f..63a1a8e 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -2944,6 +2944,7 @@ static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 a u16 ddp_seg_len; int copy_len = 0; u8 is_tagged = 0; + u8 flush_code = 0; struct nes_terminate_hdr *termhdr; termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase; @@ -2983,19 +2984,23 @@ static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 a case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: switch (iwarp_opcode(nesqp, aeq_info)) { case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; termhdr->error_code = DDP_TAGGED_INV_STAG; break; default: + flush_code = IB_WC_REM_ACCESS_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; termhdr->error_code = RDMAP_INV_STAG; } break; case NES_AEQE_AEID_AMP_INVALID_STAG: + flush_code = IB_WC_REM_ACCESS_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; termhdr->error_code = RDMAP_INV_STAG; break; case NES_AEQE_AEID_AMP_BAD_QP: + flush_code = IB_WC_LOC_QP_OP_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; termhdr->error_code = DDP_UNTAGGED_INV_QN; break; @@ -3004,19 +3009,23 @@ static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 a switch (iwarp_opcode(nesqp, aeq_info)) { case IWARP_OPCODE_SEND_INV: case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_OP_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; termhdr->error_code = RDMAP_CANT_INV_STAG; break; default: + flush_code = IB_WC_REM_ACCESS_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; termhdr->error_code = RDMAP_INV_STAG; } break; case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) { + flush_code = IB_WC_LOC_PROT_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; termhdr->error_code = DDP_TAGGED_BOUNDS; } else { + flush_code = IB_WC_REM_ACCESS_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; termhdr->error_code = RDMAP_INV_BOUNDS; } @@ -3024,57 +3033,69 @@ static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 a case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + flush_code = IB_WC_REM_ACCESS_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; termhdr->error_code = RDMAP_ACCESS; break; case NES_AEQE_AEID_AMP_TO_WRAP: + flush_code = IB_WC_REM_ACCESS_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; termhdr->error_code = RDMAP_TO_WRAP; break; case NES_AEQE_AEID_AMP_BAD_PD: switch (iwarp_opcode(nesqp, aeq_info)) { case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; termhdr->error_code = DDP_TAGGED_UNASSOC_STAG; break; case IWARP_OPCODE_SEND_INV: case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_ACCESS_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; termhdr->error_code = RDMAP_CANT_INV_STAG; break; default: + flush_code = IB_WC_REM_ACCESS_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; termhdr->error_code = RDMAP_UNASSOC_STAG; } break; case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + flush_code = IB_WC_LOC_LEN_ERR; termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; termhdr->error_code = MPA_MARKER; break; case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + flush_code = IB_WC_GENERAL_ERR; termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; termhdr->error_code = MPA_CRC; break; case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + flush_code = IB_WC_LOC_LEN_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; termhdr->error_code = DDP_CATASTROPHIC_LOCAL; break; case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: case NES_AEQE_AEID_DDP_NO_L_BIT: + flush_code = IB_WC_FATAL_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; termhdr->error_code = DDP_CATASTROPHIC_LOCAL; break; case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + flush_code = IB_WC_GENERAL_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE; break; case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + flush_code = IB_WC_LOC_LEN_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG; break; case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + flush_code = IB_WC_GENERAL_ERR; if (is_tagged) { termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; termhdr->error_code = DDP_TAGGED_INV_DDP_VER; @@ -3084,26 +3105,32 @@ static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 a } break; case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + flush_code = IB_WC_GENERAL_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; termhdr->error_code = DDP_UNTAGGED_INV_MO; break; case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + flush_code = IB_WC_REM_OP_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF; break; case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + flush_code = IB_WC_GENERAL_ERR; termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; termhdr->error_code = DDP_UNTAGGED_INV_QN; break; case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + flush_code = IB_WC_GENERAL_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; termhdr->error_code = RDMAP_INV_RDMAP_VER; break; case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + flush_code = IB_WC_LOC_QP_OP_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; termhdr->error_code = RDMAP_UNEXPECTED_OP; break; default: + flush_code = IB_WC_FATAL_ERR; termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; termhdr->error_code = RDMAP_UNSPECIFIED; break; @@ -3112,6 +3139,13 @@ static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 a if (copy_len) memcpy(termhdr + 1, pkt, copy_len); + if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) { + if (aeq_info & NES_AEQE_SQ) + nesqp->term_sq_flush_code = flush_code; + else + nesqp->term_rq_flush_code = flush_code; + } + return sizeof(struct nes_terminate_hdr) + copy_len; } @@ -3646,6 +3680,8 @@ void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, { struct nes_cqp_request *cqp_request; struct nes_hw_cqp_wqe *cqp_wqe; + u32 sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; + u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; int ret; cqp_request = nes_get_cqp_request(nesdev); @@ -3662,6 +3698,24 @@ void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, cqp_wqe = &cqp_request->cqp_wqe; nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + /* If wqe in error was identified, set code to be put into cqe */ + if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code; + nesqp->term_sq_flush_code = 0; + } + + if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code; + nesqp->term_rq_flush_code = 0; + } + + if (which_wq & NES_CQP_FLUSH_MAJ_MIN) { + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code); + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code); + } + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq); cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id); diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index 4a0bfcd..f28a41b 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -274,6 +274,8 @@ enum nes_cqp_qp_bits { enum nes_cqp_qp_wqe_word_idx { NES_CQP_QP_WQE_CONTEXT_LOW_IDX = 6, NES_CQP_QP_WQE_CONTEXT_HIGH_IDX = 7, + NES_CQP_QP_WQE_FLUSH_SQ_CODE = 8, + NES_CQP_QP_WQE_FLUSH_RQ_CODE = 9, NES_CQP_QP_WQE_NEW_MSS_IDX = 15, }; @@ -364,6 +366,7 @@ enum nes_cqp_arp_bits { enum nes_cqp_flush_bits { NES_CQP_FLUSH_SQ = (1<<30), NES_CQP_FLUSH_RQ = (1<<31), + NES_CQP_FLUSH_MAJ_MIN = (1<<28), }; enum nes_cqe_opcode_bits { @@ -757,6 +760,15 @@ enum nes_iwarp_sq_wqe_bits { NES_IWARP_SQ_OP_NOP = 12, }; +enum nes_iwarp_cqe_major_code { + NES_IWARP_CQE_MAJOR_FLUSH = 1, + NES_IWARP_CQE_MAJOR_DRV = 0x8000 +}; + +enum nes_iwarp_cqe_minor_code { + NES_IWARP_CQE_MINOR_FLUSH = 1 +}; + #define NES_EEPROM_READ_REQUEST (1<<16) #define NES_MAC_ADDR_VALID (1<<20) diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index d92b1ef..89822d7 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -168,6 +168,8 @@ struct nes_qp { wait_queue_head_t kick_waitq; u16 in_disconnect; u16 private_data_len; + u16 term_sq_flush_code; + u16 term_rq_flush_code; u8 active_conn; u8 skip_lsmm; u8 user_mode; -- cgit v0.10.2 From 6eed5e7c8bdce6ee21bbe0be4a3f3dce4d4b392a Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:39 -0700 Subject: RDMA/nes: Make poll_cq return correct number of wqes during flush When a flush request is given to the hw, it will place one cqe marked as flushed (unless there is nothing to flush). An application that is waiting for all wqe's to complete will be left hanging. This modifies poll_cq to return the correct number of flushes for the pending elements on the wq. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 36666ac..ad3c891 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3615,11 +3615,13 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) struct nes_qp *nesqp; struct nes_hw_cqe cqe; u32 head; - u32 wq_tail; + u32 wq_tail = 0; u32 cq_size; u32 cqe_count = 0; u32 wqe_index; u32 u32temp; + u32 move_cq_head = 1; + u32 err_code; nes_debug(NES_DBG_CQ, "\n"); @@ -3640,7 +3642,6 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) rmb(); cqe = nescq->hw_cq.cq_vbase[head]; - nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1); u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); @@ -3667,16 +3668,14 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) } /* Working on a SQ Completion*/ - wq_tail = wqe_index; - nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); - wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wq_tail]. + wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | - ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wq_tail]. + ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX]))); - entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]); - switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. + switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) { case NES_IWARP_SQ_OP_RDMAW: nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n"); @@ -3685,7 +3684,7 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) case NES_IWARP_SQ_OP_RDMAR: nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n"); entry->opcode = IB_WC_RDMA_READ; - entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]); break; case NES_IWARP_SQ_OP_SENDINV: @@ -3696,14 +3695,24 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) entry->opcode = IB_WC_SEND; break; } + + nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.sq_tail; + } } else { /* Working on a RQ Completion*/ - wq_tail = wqe_index; - nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]); - wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | - ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); + wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | + ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); entry->opcode = IB_WC_RECV; + + nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.rq_tail; + } } entry->wr_id = wrid; @@ -3711,18 +3720,28 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) cqe_count++; } - if (++head >= cq_size) - head = 0; - nescq->polled_completions++; + if (move_cq_head) { + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + if (++head >= cq_size) + head = 0; + nescq->polled_completions++; - if ((nescq->polled_completions > (cq_size / 2)) || - (nescq->polled_completions == 255)) { - nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" + if ((nescq->polled_completions > (cq_size / 2)) || + (nescq->polled_completions == 255)) { + nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" " are pending %u of %u.\n", nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); - nes_write32(nesdev->regs+NES_CQE_ALLOC, + nes_write32(nesdev->regs+NES_CQE_ALLOC, nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); - nescq->polled_completions = 0; + nescq->polled_completions = 0; + } + } else { + /* Update the wqe index and set status to flush */ + wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail; + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = + cpu_to_le32(wqe_index); + move_cq_head = 1; /* ready for next pass */ } } -- cgit v0.10.2 From 320cdfd21d4a9f6ef54b74871e0d6b19a0e86fd6 Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:39 -0700 Subject: RDMA/nes: Use the flush code to fill in cqe error Use the flush status to fill in cqe status when a specific error has been identified. Subsequent flushed completions still use the flushed value. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index ad3c891..993c1d4 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3655,7 +3655,16 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) { entry->status = IB_WC_SUCCESS; } else { - entry->status = IB_WC_WR_FLUSH_ERR; + err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) { + entry->status = err_code & 0x0000ffff; + + /* The rest of the cqe's will be marked as flushed */ + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] = + cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) | + NES_IWARP_CQE_MINOR_FLUSH); + } else + entry->status = IB_WC_WR_FLUSH_ERR; } entry->qp = &nesqp->ibqp; -- cgit v0.10.2 From b29a4fc49b028dbdab53b679826ed1eb658dde59 Mon Sep 17 00:00:00 2001 From: Don Wood Date: Sat, 5 Sep 2009 20:36:39 -0700 Subject: RDMA/nes: Rework the disconn routine for terminate and flushing The disconn routine has been reworked to acoomodate the terminate and flushing changes. The routine has been reorganized to make all the decisions at the start then it performs all the required operations. This simplified the lock handling and is easier to follow. Signed-off-by: Don Wood Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index d89bdee..73473db 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -2493,7 +2493,12 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) u16 last_ae; u8 original_hw_tcp_state; u8 original_ibqp_state; - u8 issued_disconnect_reset = 0; + enum iw_cm_event_type disconn_status = IW_CM_EVENT_STATUS_OK; + int issue_disconn = 0; + int issue_close = 0; + int issue_flush = 0; + u32 flush_q = NES_CQP_FLUSH_RQ; + struct ib_event ibevent; if (!nesqp) { nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n"); @@ -2517,24 +2522,55 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) original_ibqp_state = nesqp->ibqp_state; last_ae = nesqp->last_aeq; + if (nesqp->term_flags) { + issue_disconn = 1; + issue_close = 1; + nesqp->cm_id = NULL; + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || + ((original_ibqp_state == IB_QPS_RTS) && + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_disconn = 1; + if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) + disconn_status = IW_CM_EVENT_STATUS_RESET; + } + + if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || + (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || + (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_close = 1; + nesqp->cm_id = NULL; + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } + + spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_CM, "set ibqp_state=%u\n", nesqp->ibqp_state); + if ((issue_flush) && (nesqp->destroyed == 0)) { + /* Flush the queue(s) */ + if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE) + flush_q |= NES_CQP_FLUSH_SQ; + flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1); - if ((nesqp->cm_id) && (cm_id->event_handler)) { - if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || - ((original_ibqp_state == IB_QPS_RTS) && - (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + if (nesqp->term_flags) { + ibevent.device = nesqp->ibqp.device; + ibevent.event = nesqp->terminate_eventtype; + ibevent.element.qp = &nesqp->ibqp; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + } + + if ((cm_id) && (cm_id->event_handler)) { + if (issue_disconn) { atomic_inc(&cm_disconnects); cm_event.event = IW_CM_EVENT_DISCONNECT; - if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) { - cm_event.status = IW_CM_EVENT_STATUS_RESET; - nes_debug(NES_DBG_CM, "Generating a CM " - "Disconnect Event (status reset) for " - "QP%u, cm_id = %p. \n", - nesqp->hwqp.qp_id, cm_id); - } else - cm_event.status = IW_CM_EVENT_STATUS_OK; - + cm_event.status = disconn_status; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; cm_event.private_data = NULL; @@ -2547,28 +2583,14 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) nesqp->hwqp.sq_tail, cm_id, atomic_read(&nesqp->refcount)); - spin_unlock_irqrestore(&nesqp->lock, flags); ret = cm_id->event_handler(cm_id, &cm_event); if (ret) nes_debug(NES_DBG_CM, "OFA CM event_handler " "returned, ret=%d\n", ret); - spin_lock_irqsave(&nesqp->lock, flags); } - /* There might have been another AE while the lock was released */ - original_hw_tcp_state = nesqp->hw_tcp_state; - original_ibqp_state = nesqp->ibqp_state; - last_ae = nesqp->last_aeq; - - if ((issued_disconnect_reset == 0) && (nesqp->cm_id) && - ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || - (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || - (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || - (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + if (issue_close) { atomic_inc(&cm_closes); - nesqp->cm_id = NULL; - nesqp->in_disconnect = 0; - spin_unlock_irqrestore(&nesqp->lock, flags); nes_disconnect(nesqp, 1); cm_id->provider_data = nesqp; @@ -2587,27 +2609,7 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) } cm_id->rem_ref(cm_id); - - spin_lock_irqsave(&nesqp->lock, flags); - if (nesqp->flush_issued == 0) { - nesqp->flush_issued = 1; - spin_unlock_irqrestore(&nesqp->lock, flags); - flush_wqes(nesvnic->nesdev, nesqp, - NES_CQP_FLUSH_RQ, 1); - } else - spin_unlock_irqrestore(&nesqp->lock, flags); - } else { - cm_id = nesqp->cm_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - /* check to see if the inbound reset beat the outbound reset */ - if ((!cm_id) && (last_ae==NES_AEQE_AEID_RESET_SENT)) { - nes_debug(NES_DBG_CM, "QP%u: Decing refcount " - "due to inbound reset beating the " - "outbound reset.\n", nesqp->hwqp.qp_id); - } } - } else { - spin_unlock_irqrestore(&nesqp->lock, flags); } return 0; -- cgit v0.10.2 From cd1d3f7abec19719949ec5b5189a821cd52af868 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Sat, 5 Sep 2009 20:36:39 -0700 Subject: RDMA/nes: Map MTU to IB_MTU_* and correctly report link state Old query_port code reports static MTU and link state values. Instead, map actual MTU to next largest IB_MTU_* constant and correctly report link state. Cc: Steve Wise Reported-by: Jeff Squyres Signed-off-by: Chien Tung Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 993c1d4..a680c42 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -667,15 +667,32 @@ static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *prop */ static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct net_device *netdev = nesvnic->netdev; + memset(props, 0, sizeof(*props)); - props->max_mtu = IB_MTU_2048; - props->active_mtu = IB_MTU_2048; + props->max_mtu = IB_MTU_4096; + + if (netdev->mtu >= 4096) + props->active_mtu = IB_MTU_4096; + else if (netdev->mtu >= 2048) + props->active_mtu = IB_MTU_2048; + else if (netdev->mtu >= 1024) + props->active_mtu = IB_MTU_1024; + else if (netdev->mtu >= 512) + props->active_mtu = IB_MTU_512; + else + props->active_mtu = IB_MTU_256; + props->lid = 1; props->lmc = 0; props->sm_lid = 0; props->sm_sl = 0; - props->state = IB_PORT_ACTIVE; + if (nesvnic->linkup) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_DOWN; props->phys_state = 0; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; -- cgit v0.10.2 From e6cc0fd1e31cfe48e207de78742ccdf301369bf3 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 7 Sep 2009 21:54:38 -0700 Subject: MAINTAINERS: InfiniBand/RDMA mailing list transition to vger InfiniBand/RDMA development discussion is moving from general@lists.openfabrics.org to linux-rdma@vger.kernel.org. Signed-off-by: Roland Dreier diff --git a/MAINTAINERS b/MAINTAINERS index 8dca9d8..989ff11 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -439,7 +439,7 @@ F: drivers/hwmon/ams/ AMSO1100 RNIC DRIVER M: Tom Tucker M: Steve Wise -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org S: Maintained F: drivers/infiniband/hw/amso1100/ @@ -1494,7 +1494,7 @@ F: drivers/net/cxgb3/ CXGB3 IWARP RNIC DRIVER (IW_CXGB3) M: Steve Wise -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org W: http://www.openfabrics.org S: Supported F: drivers/infiniband/hw/cxgb3/ @@ -1868,7 +1868,7 @@ F: fs/efs/ EHCA (IBM GX bus InfiniBand adapter) DRIVER M: Hoang-Nam Nguyen M: Christoph Raisch -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/ehca/ @@ -2552,7 +2552,7 @@ INFINIBAND SUBSYSTEM M: Roland Dreier M: Sean Hefty M: Hal Rosenstock -L: general@lists.openfabrics.org (moderated for non-subscribers) +L: linux-rdma@vger.kernel.org W: http://www.openib.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git S: Supported @@ -2729,7 +2729,7 @@ F: drivers/net/ipg.c IPATH DRIVER M: Ralph Campbell -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org T: git git://git.qlogic.com/ipath-linux-2.6 S: Supported F: drivers/infiniband/hw/ipath/ @@ -3485,7 +3485,7 @@ F: drivers/scsi/NCR_D700.* NETEFFECT IWARP RNIC DRIVER (IW_NES) M: Faisal Latif M: Chien Tung -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org W: http://www.neteffect.com S: Supported F: drivers/infiniband/hw/nes/ -- cgit v0.10.2 From cb58160e72244a23f4e0cf4d6f81ffbd131fcd51 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 9 Sep 2009 11:37:38 -0700 Subject: RDMA/iwcm: Reject the connection when the cm_id is destroyed If the cm_id of a connect request is destroyed prior to the ULP accepting or rejecting the connection, then the provider never cleans up the connection. The iwcm should explicitly reject these connections if the cm_id is destroyed. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 8f9509e..55d093a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -362,6 +362,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) * In either case, must tell the provider to reject. */ cm_id_priv->state = IW_CM_STATE_DESTROYING; + cm_id->device->iwcm->reject(cm_id, NULL, 0); break; case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_DESTROYING: -- cgit v0.10.2