From e54664c0958acf14ef3a65d1b78f4a54b437cdf7 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Sun, 29 Jul 2007 15:12:26 -0500 Subject: RDMA/cxgb3: Make the iw_cxgb3 module parameters writable Allow changing parameter values without having to reload the module. This is safe because these parameters are only looked at when a new connection is established. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 1cdfcd4..20ba372 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -63,37 +63,37 @@ static char *states[] = { }; static int ep_timeout_secs = 10; -module_param(ep_timeout_secs, int, 0444); +module_param(ep_timeout_secs, int, 0644); MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " "in seconds (default=10)"); static int mpa_rev = 1; -module_param(mpa_rev, int, 0444); +module_param(mpa_rev, int, 0644); MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " "1 is spec compliant. (default=1)"); static int markers_enabled = 0; -module_param(markers_enabled, int, 0444); +module_param(markers_enabled, int, 0644); MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); static int crc_enabled = 1; -module_param(crc_enabled, int, 0444); +module_param(crc_enabled, int, 0644); MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); static int rcv_win = 256 * 1024; -module_param(rcv_win, int, 0444); +module_param(rcv_win, int, 0644); MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256)"); static int snd_win = 32 * 1024; -module_param(snd_win, int, 0444); +module_param(snd_win, int, 0644); MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)"); static unsigned int nocong = 0; -module_param(nocong, uint, 0444); +module_param(nocong, uint, 0644); MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)"); static unsigned int cong_flavor = 1; -module_param(cong_flavor, uint, 0444); +module_param(cong_flavor, uint, 0644); MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)"); static void process_work(struct work_struct *work); -- cgit v0.10.2 From ce423ef50ee1b6b7db63c748034423aa0afce224 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:04 -0700 Subject: IPoIB: Make sure no receives are handled when stopping device The current IPoIB code might process receive completions from ipoib_drain_cq() when bringing down the interface. This could cause packets to be passed up the stack without the device's poll method being called. Avoid this by setting the status of any successful completions to IB_WC_WR_FLUSH_ERR. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 1094488..5a70e28 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -558,6 +558,14 @@ void ipoib_drain_cq(struct net_device *dev) do { n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; ++i) { + /* + * Convert any successful completions to flush + * errors to avoid passing packets up the + * stack after bringing the device down. + */ + if (priv->ibwc[i].status == IB_WC_SUCCESS) + priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; + if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ) ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) -- cgit v0.10.2 From 65d470b3ea52ee1402499d6fcb4632296452e5b1 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:04 -0700 Subject: IB: find_first_zero_bit() takes unsigned pointer Fix sparse warning drivers/infiniband/core/device.c:142:6: warning: incorrect type in argument 1 (different signedness) drivers/infiniband/core/device.c:142:6: expected unsigned long const *addr drivers/infiniband/core/device.c:142:6: got long *[assigned] inuse by making the local variable inuse unsigned. Does not affect generated code at all. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 2506c43..5ac5ffe 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -120,12 +120,12 @@ static struct ib_device *__ib_device_get_by_name(const char *name) static int alloc_name(char *name) { - long *inuse; + unsigned long *inuse; char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; - inuse = (long *) get_zeroed_page(GFP_KERNEL); + inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); if (!inuse) return -ENOMEM; -- cgit v0.10.2 From 36ce10d3e849c2f9efa3fd647058de398ee276eb Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Tue, 7 Aug 2007 11:18:52 +0300 Subject: mlx4_core: Use enum value GO_BIT_TIMEOUT_MSECS Rename GO_BIT_TIMEOUT to GO_BIT_TIMEOUT_MSECS for clarity, and actually use it as the go bit timeout (instead of having the define but then ignoring it and using a hard-coded 10 * HZ for the actual timeout). Signed-off-by: Dotan Barak Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index a9f3175..b540820 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -95,7 +95,7 @@ enum { }; enum { - GO_BIT_TIMEOUT = 10000 + GO_BIT_TIMEOUT_MSECS = 10000 }; struct mlx4_cmd_context { @@ -155,7 +155,7 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, end = jiffies; if (event) - end += HZ * 10; + end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); while (cmd_pending(dev)) { if (time_after_eq(jiffies, end)) -- cgit v0.10.2 From eaf559bf566f76887533c077d425adce847f06c8 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:05 -0700 Subject: mlx4_core: Don't free special QPs in QP number bitmap Special QPs are not allocated using the regular QP number bitmap, so when they are destroyed, their QP number should not be freed in the bitmap. Found by Dotan Barak of Mellanox. Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c index 19b48c7..278414b 100644 --- a/drivers/net/mlx4/qp.c +++ b/drivers/net/mlx4/qp.c @@ -240,7 +240,8 @@ void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp) mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn); mlx4_table_put(dev, &qp_table->qp_table, qp->qpn); - mlx4_bitmap_free(&qp_table->bitmap, qp->qpn); + if (qp->qpn < dev->caps.sqp_start + 8) + mlx4_bitmap_free(&qp_table->bitmap, qp->qpn); } EXPORT_SYMBOL_GPL(mlx4_qp_free); -- cgit v0.10.2 From 2242fa4f04d670f02efb43ec537d677edc220880 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:05 -0700 Subject: IB/mlx4: Use __set_data_seg() in mlx4_ib_post_recv() Use a __set_data_seg() helper in mlx4_ib_post_recv() too; in addition to making the code easier to read, this also allows gcc to generate better code -- on x86_64: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-8 (-8) function old new delta mlx4_ib_post_recv 359 351 -8 Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 85c51bd..31a480e 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1249,6 +1249,13 @@ static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) dseg->byte_count = cpu_to_be32(sg->length); } +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { @@ -1464,11 +1471,8 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, scat = get_recv_wqe(qp, ind); - for (i = 0; i < wr->num_sge; ++i) { - scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); - scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); - scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); - } + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); if (i < qp->rq.max_gs) { scat[i].byte_count = 0; -- cgit v0.10.2 From 1fea391039d1c4e876a164099bff475a02a29d96 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:05 -0700 Subject: IB/ehca: Include from ehca_classes.h ehca_classes.h uses struct mutex, so while seems to be pulled in indirectly by one of the headers it includes, the right thing is to include directly. Signed-off-by: Michael S. Tsirkin Acked-by: Stefan Roscher Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index b5e9603..0d69f0f 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -53,6 +53,7 @@ struct ehca_pd; struct ehca_av; #include +#include #include #include -- cgit v0.10.2 From 8a68bbe31d51cc75d2acf1c9f5ceff91f7662ea4 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 29 Aug 2007 08:36:22 -0500 Subject: IB/fmr_pool: Clean up some error messages in fmr_pool.c A number of printks in fmr_pool.c dont have newlines, eg: fmr_create failed for FMR 0<5>FS-Cache: Loaded Fix them up. Signed-off-by: Anton Blanchard Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index a06bcc6..d7f6452 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -152,7 +152,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) #ifdef DEBUG if (fmr->ref_count !=0) { - printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d", + printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n", fmr, fmr->ref_count); } #endif @@ -170,7 +170,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) ret = ib_unmap_fmr(&fmr_list); if (ret) - printk(KERN_WARNING PFX "ib_unmap_fmr returned %d", ret); + printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret); spin_lock_irq(&pool->pool_lock); list_splice(&unmap_list, &pool->free_list); @@ -235,13 +235,13 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { - printk(KERN_WARNING PFX "couldn't allocate device attr struct"); + printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); return ERR_PTR(-ENOMEM); } ret = ib_query_device(device, attr); if (ret) { - printk(KERN_WARNING PFX "couldn't query device: %d", ret); + printk(KERN_WARNING PFX "couldn't query device: %d\n", ret); kfree(attr); return ERR_PTR(ret); } @@ -255,7 +255,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, pool = kmalloc(sizeof *pool, GFP_KERNEL); if (!pool) { - printk(KERN_WARNING PFX "couldn't allocate pool struct"); + printk(KERN_WARNING PFX "couldn't allocate pool struct\n"); return ERR_PTR(-ENOMEM); } @@ -272,7 +272,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket, GFP_KERNEL); if (!pool->cache_bucket) { - printk(KERN_WARNING PFX "Failed to allocate cache in pool"); + printk(KERN_WARNING PFX "Failed to allocate cache in pool\n"); ret = -ENOMEM; goto out_free_pool; } @@ -296,7 +296,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, "ib_fmr(%s)", device->name); if (IS_ERR(pool->thread)) { - printk(KERN_WARNING PFX "couldn't start cleanup thread"); + printk(KERN_WARNING PFX "couldn't start cleanup thread\n"); ret = PTR_ERR(pool->thread); goto out_free_pool; } @@ -314,7 +314,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, GFP_KERNEL); if (!fmr) { printk(KERN_WARNING PFX "failed to allocate fmr " - "struct for FMR %d", i); + "struct for FMR %d\n", i); goto out_fail; } @@ -326,7 +326,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); if (IS_ERR(fmr->fmr)) { printk(KERN_WARNING PFX "fmr_create failed " - "for FMR %d", i); + "for FMR %d\n", i); kfree(fmr); goto out_fail; } @@ -381,7 +381,7 @@ void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) } if (i < pool->pool_size) - printk(KERN_WARNING PFX "pool still has %d regions registered", + printk(KERN_WARNING PFX "pool still has %d regions registered\n", pool->pool_size - i); kfree(pool->cache_bucket); @@ -518,7 +518,7 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) #ifdef DEBUG if (fmr->ref_count < 0) - printk(KERN_WARNING PFX "FMR %p has ref count %d < 0", + printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n", fmr, fmr->ref_count); #endif -- cgit v0.10.2 From 08fb105540f4649eaa25270ec3ada1d35406afcc Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 7 Aug 2007 16:08:28 +0300 Subject: mlx4_core: Enable MSI-X by default Recover from MSI-X errors by automatically falling back on regular interrupt, instead of asking the user to do this manually. This makes it possible to enable MSI-X by default, and will make it possible to get rid of the msi_x module option in the future. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 4dc9dc1..4b12694 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -61,7 +61,7 @@ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #ifdef CONFIG_PCI_MSI -static int msi_x; +static int msi_x = 1; module_param(msi_x, int, 0444); MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); @@ -599,13 +599,17 @@ static int __devinit mlx4_setup_hca(struct mlx4_dev *dev) err = mlx4_NOP(dev); if (err) { - mlx4_err(dev, "NOP command failed to generate interrupt " - "(IRQ %d), aborting.\n", - priv->eq_table.eq[MLX4_EQ_ASYNC].irq); - if (dev->flags & MLX4_FLAG_MSI_X) - mlx4_err(dev, "Try again with MSI-X disabled.\n"); - else + if (dev->flags & MLX4_FLAG_MSI_X) { + mlx4_warn(dev, "NOP command failed to generate MSI-X " + "interrupt IRQ %d).\n", + priv->eq_table.eq[MLX4_EQ_ASYNC].irq); + mlx4_warn(dev, "Trying again without MSI-X.\n"); + } else { + mlx4_err(dev, "NOP command failed to generate interrupt " + "(IRQ %d), aborting.\n", + priv->eq_table.eq[MLX4_EQ_ASYNC].irq); mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n"); + } goto err_cmd_poll; } @@ -803,8 +807,6 @@ static int __devinit mlx4_init_one(struct pci_dev *pdev, goto err_free_dev; } - mlx4_enable_msi_x(dev); - if (mlx4_cmd_init(dev)) { mlx4_err(dev, "Failed to init command interface, aborting.\n"); goto err_free_dev; @@ -814,7 +816,15 @@ static int __devinit mlx4_init_one(struct pci_dev *pdev, if (err) goto err_cmd; + mlx4_enable_msi_x(dev); + err = mlx4_setup_hca(dev); + if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X)) { + dev->flags &= ~MLX4_FLAG_MSI_X; + pci_disable_msix(pdev); + err = mlx4_setup_hca(dev); + } + if (err) goto err_close; @@ -838,15 +848,15 @@ err_cleanup: mlx4_cleanup_uar_table(dev); err_close: + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + mlx4_close_hca(dev); err_cmd: mlx4_cmd_cleanup(dev); err_free_dev: - if (dev->flags & MLX4_FLAG_MSI_X) - pci_disable_msix(pdev); - kfree(priv); err_release_bar2: -- cgit v0.10.2 From 017aadc4b505ad3ec2acc4e6ba96d63ae1c997a5 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 7 Aug 2007 16:10:34 +0300 Subject: IB/mthca: Enable MSI-X by default Recover from MSI-X errors by automatically falling back on regular interrupt, instead of asking the user to do this manually. This makes it possible to enable MSI-X by default, and will make it possible to get rid of the msi_x module option in the future. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 76fed75..04c1520 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -61,7 +61,7 @@ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #ifdef CONFIG_PCI_MSI -static int msi_x = 0; +static int msi_x = 1; module_param(msi_x, int, 0444); MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); @@ -833,14 +833,19 @@ static int mthca_setup_hca(struct mthca_dev *dev) err = mthca_NOP(dev, &status); if (err || status) { - mthca_err(dev, "NOP command failed to generate interrupt (IRQ %d), aborting.\n", - dev->mthca_flags & MTHCA_FLAG_MSI_X ? - dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector : - dev->pdev->irq); - if (dev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X)) - mthca_err(dev, "Try again with MSI/MSI-X disabled.\n"); - else + if (dev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X)) { + mthca_warn(dev, "NOP command failed to generate interrupt " + "(IRQ %d).\n", + dev->mthca_flags & MTHCA_FLAG_MSI_X ? + dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector : + dev->pdev->irq); + mthca_warn(dev, "Trying again with MSI/MSI-X disabled.\n"); + } else { + mthca_err(dev, "NOP command failed to generate interrupt " + "(IRQ %d), aborting.\n", + dev->pdev->irq); mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n"); + } goto err_cmd_poll; } @@ -1115,24 +1120,6 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) goto err_free_dev; } - if (msi_x && !mthca_enable_msi_x(mdev)) - mdev->mthca_flags |= MTHCA_FLAG_MSI_X; - else if (msi) { - static int warned; - - if (!warned) { - printk(KERN_WARNING PFX "WARNING: MSI support will be " - "removed from the ib_mthca driver in January 2008.\n"); - printk(KERN_WARNING " If you are using MSI and cannot " - "switch to MSI-X, please tell " - ".\n"); - ++warned; - } - - if (!pci_enable_msi(pdev)) - mdev->mthca_flags |= MTHCA_FLAG_MSI; - } - if (mthca_cmd_init(mdev)) { mthca_err(mdev, "Failed to init command interface, aborting.\n"); goto err_free_dev; @@ -1156,7 +1143,35 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); } + if (msi_x && !mthca_enable_msi_x(mdev)) + mdev->mthca_flags |= MTHCA_FLAG_MSI_X; + else if (msi) { + static int warned; + + if (!warned) { + printk(KERN_WARNING PFX "WARNING: MSI support will be " + "removed from the ib_mthca driver in January 2008.\n"); + printk(KERN_WARNING " If you are using MSI and cannot " + "switch to MSI-X, please tell " + ".\n"); + ++warned; + } + + if (!pci_enable_msi(pdev)) + mdev->mthca_flags |= MTHCA_FLAG_MSI; + } + err = mthca_setup_hca(mdev); + if (err == -EBUSY && (mdev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X))) { + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_disable_msix(pdev); + if (mdev->mthca_flags & MTHCA_FLAG_MSI) + pci_disable_msi(pdev); + mdev->mthca_flags &= ~(MTHCA_FLAG_MSI_X | MTHCA_FLAG_MSI); + + err = mthca_setup_hca(mdev); + } + if (err) goto err_close; @@ -1192,17 +1207,17 @@ err_cleanup: mthca_cleanup_uar_table(mdev); err_close: + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_disable_msix(pdev); + if (mdev->mthca_flags & MTHCA_FLAG_MSI) + pci_disable_msi(pdev); + mthca_close_hca(mdev); err_cmd: mthca_cmd_cleanup(mdev); err_free_dev: - if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) - pci_disable_msix(pdev); - if (mdev->mthca_flags & MTHCA_FLAG_MSI) - pci_disable_msi(pdev); - ib_dealloc_device(&mdev->ib_dev); err_free_res: -- cgit v0.10.2 From b3ac60fc243f2312d27ecded058ef96f52f25fe0 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 9 Oct 2007 19:59:06 -0700 Subject: IPoIB: Fix typo to end statement with ';' instead of ',' Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 563aeac..3c6e45d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -185,7 +185,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) goto out_free_cq; init_attr.send_cq = priv->cq; - init_attr.recv_cq = priv->cq, + init_attr.recv_cq = priv->cq; priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { -- cgit v0.10.2 From ca6de177acef8f2c7c3901ea583a263364ca7bbb Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 21 Aug 2007 18:46:10 +0300 Subject: IPoIB: Fix error path memory leak Clean up properly if ib_query_pkey() or ib_query_gid() fail. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 894b1dcd..5f948b9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1083,7 +1083,7 @@ static struct net_device *ipoib_add_port(const char *format, if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", hca->name, port, result); - goto alloc_mem_failed; + goto device_init_failed; } /* @@ -1099,7 +1099,7 @@ static struct net_device *ipoib_add_port(const char *format, if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); - goto alloc_mem_failed; + goto device_init_failed; } else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); -- cgit v0.10.2 From d7dc3ccbe4dd1d37950da0138079e61d5be06ca9 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:06 -0700 Subject: IB/mlx4: Fix up SRQ limit_watermark endianness mlx4_srq_query() returns a big-endian 16-bit value through an int *, which screws up sparse checking. Fix this so that a CPU-endian value is returned. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 408748f..e7e9a3d 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -251,7 +251,7 @@ int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) if (ret) return ret; - srq_attr->srq_limit = be16_to_cpu(limit_watermark); + srq_attr->srq_limit = limit_watermark; srq_attr->max_wr = srq->msrq.max - 1; srq_attr->max_sge = srq->msrq.max_gs; diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c index b061c86..31b255a 100644 --- a/drivers/net/mlx4/srq.c +++ b/drivers/net/mlx4/srq.c @@ -227,7 +227,7 @@ int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_waterm err = mlx4_QUERY_SRQ(dev, mailbox, srq->srqn); if (err) goto err_out; - *limit_watermark = srq_context->limit_watermark; + *limit_watermark = be16_to_cpu(srq_context->limit_watermark); err_out: mlx4_free_cmd_mailbox(dev, mailbox); -- cgit v0.10.2 From 1f794483025bc1e9e8e3321089a7f29ad3cc448f Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 29 Aug 2007 11:05:35 -0500 Subject: IB/ehca: Make output clearer by removing some debug messages ehca spits out a lot of debugging information. I had to look closely to see the "Port 1 is not active" message within all the debug: eHCA Infiniband Device Driver (Rel.: SVNEHCA_0022) eHCA scaling code enabled ehca D.001.DQDXYCB-P1-C9: PU0006 EHCA_ERR:ehca_define_sqp Port 1 is not active. ehca D.001.DQDXYCB-P1-C9: PU0006 EHCA_ERR:ehca_create_qp ehca_define_sqp() failed rc=ffffffffffffffff ib_mad: Couldn't create ib_mad QP1 ib_mad: Couldn't open ehca0 port 1 ehca D.001.DQDXYCB-P1-C9: PU0006 EHCA_ERR:ehca_alloc_fmr unsupported fmr_attr->page_shift=9 ehca D.001.DQDXYCB-P1-C9: PU0006 EHCA_ERR:ehca_alloc_fmr rc=ffffffffffffffea pd=c000000b4b5b2420 mr_access_flags=7 fmr_attr=c0000005afd37394 fmr_create failed for FMR 0 Remove a few debug statements so that things are clearer: eHCA Infiniband Device Driver (Rel.: SVNEHCA_0022) eHCA scaling code enabled ehca D.001.DQDXYCB-P1-C9: PU0006 EHCA_ERR:ehca_define_sqp Port 1 is not active. ib_mad: Couldn't create ib_mad QP1 ib_mad: Couldn't open ehca0 port 1 ehca D.001.DQDXYCB-P1-C9: PU0006 EHCA_ERR:ehca_alloc_fmr unsupported fmr_attr->page_shift=9 fmr_create failed for FMR 0 Signed-off-by: Anton Blanchard Acked-by: Hoang-Nam Nguyen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index d97eda3..dc4c840 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -846,10 +846,6 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, alloc_fmr_exit1: ehca_mr_delete(e_fmr); alloc_fmr_exit0: - if (IS_ERR(ib_fmr)) - ehca_err(pd->device, "rc=%lx pd=%p mr_access_flags=%x " - "fmr_attr=%p", PTR_ERR(ib_fmr), pd, - mr_access_flags, fmr_attr); return ib_fmr; } /* end ehca_alloc_fmr() */ diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 84d435a..88f0745 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -714,8 +714,6 @@ static struct ehca_qp *internal_create_qp( if (qp_type == IB_QPT_GSI) { h_ret = ehca_define_sqp(shca, my_qp, init_attr); if (h_ret != H_SUCCESS) { - ehca_err(pd->device, "ehca_define_sqp() failed rc=%lx", - h_ret); ret = ehca2ib_return_code(h_ret); goto create_qp_exit4; } -- cgit v0.10.2 From 339e2640a9f403f7b7acb2ea67f3568b8ac3eebf Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 29 Aug 2007 12:43:01 -0500 Subject: IB/ehca: Export module parameters in sysfs At the moment the ehca module parameters are not exported in sysfs. Export them with 0444 permissions. Signed-off-by: Anton Blanchard Acked-by: Hoang-Nam Nguyen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 99036b6..db041df 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -65,16 +65,16 @@ int ehca_static_rate = -1; int ehca_scaling_code = 0; int ehca_mr_largepage = 0; -module_param_named(open_aqp1, ehca_open_aqp1, int, 0); -module_param_named(debug_level, ehca_debug_level, int, 0); -module_param_named(hw_level, ehca_hw_level, int, 0); -module_param_named(nr_ports, ehca_nr_ports, int, 0); -module_param_named(use_hp_mr, ehca_use_hp_mr, int, 0); -module_param_named(port_act_time, ehca_port_act_time, int, 0); -module_param_named(poll_all_eqs, ehca_poll_all_eqs, int, 0); -module_param_named(static_rate, ehca_static_rate, int, 0); -module_param_named(scaling_code, ehca_scaling_code, int, 0); -module_param_named(mr_largepage, ehca_mr_largepage, int, 0); +module_param_named(open_aqp1, ehca_open_aqp1, int, S_IRUGO); +module_param_named(debug_level, ehca_debug_level, int, S_IRUGO); +module_param_named(hw_level, ehca_hw_level, int, S_IRUGO); +module_param_named(nr_ports, ehca_nr_ports, int, S_IRUGO); +module_param_named(use_hp_mr, ehca_use_hp_mr, int, S_IRUGO); +module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO); +module_param_named(poll_all_eqs, ehca_poll_all_eqs, int, S_IRUGO); +module_param_named(static_rate, ehca_static_rate, int, S_IRUGO); +module_param_named(scaling_code, ehca_scaling_code, int, S_IRUGO); +module_param_named(mr_largepage, ehca_mr_largepage, int, S_IRUGO); MODULE_PARM_DESC(open_aqp1, "AQP1 on startup (0: no (default), 1: yes)"); -- cgit v0.10.2 From 3c10c7c929b30e8813d15960cb97f95a0b2ba615 Mon Sep 17 00:00:00 2001 From: Ali Ayoub Date: Sun, 9 Sep 2007 14:55:11 +0300 Subject: IB/sa: Error handling thinko fix ib_create_send_mad() returns an error code pointer on error, not NULL. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index d271bd7..312c8ff 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -531,7 +531,7 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, gfp_mask); - if (!query->mad_buf) { + if (IS_ERR(query->mad_buf)) { kref_put(&query->sm_ah->ref, free_sm_ah); return -ENOMEM; } -- cgit v0.10.2 From a855b1a7423ac83c76638f156d79c854b0feb94d Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Fri, 10 Aug 2007 13:54:33 -0700 Subject: IB/mthca: Use PCI-X/PCI-Express read control interfaces These driver changes incorporate the proposed PCI-X / PCI-Express read byte count interface. Reading and setting those values doesn't take place "manually", instead wrapping functions are called to allow quirks for some PCI bridges. Signed-off by: Peter Oruba Based on work by Stephen Hemminger Cc: Roland Dreier Cc: Michael S. Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 04c1520..60de6f9 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -137,40 +137,23 @@ static const char mthca_version[] __devinitdata = static int mthca_tune_pci(struct mthca_dev *mdev) { - int cap; - u16 val; - if (!tune_pci) return 0; /* First try to max out Read Byte Count */ - cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX); - if (cap) { - if (pci_read_config_word(mdev->pdev, cap + PCI_X_CMD, &val)) { - mthca_err(mdev, "Couldn't read PCI-X command register, " - "aborting.\n"); - return -ENODEV; - } - val = (val & ~PCI_X_CMD_MAX_READ) | (3 << 2); - if (pci_write_config_word(mdev->pdev, cap + PCI_X_CMD, val)) { - mthca_err(mdev, "Couldn't write PCI-X command register, " - "aborting.\n"); + if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) { + if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) { + mthca_err(mdev, "Couldn't set PCI-X max read count, " + "aborting.\n"); return -ENODEV; } } else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) mthca_info(mdev, "No PCI-X capability, not setting RBC.\n"); - cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP); - if (cap) { - if (pci_read_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, &val)) { - mthca_err(mdev, "Couldn't read PCI Express device control " - "register, aborting.\n"); - return -ENODEV; - } - val = (val & ~PCI_EXP_DEVCTL_READRQ) | (5 << 12); - if (pci_write_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, val)) { - mthca_err(mdev, "Couldn't write PCI Express device control " - "register, aborting.\n"); + if (pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP)) { + if (pcie_set_readrq(mdev->pdev, 4096)) { + mthca_err(mdev, "Couldn't write PCI Express read request, " + "aborting.\n"); return -ENODEV; } } else if (mdev->mthca_flags & MTHCA_FLAG_PCIE) -- cgit v0.10.2 From 441633b968a5be0ef9be7c37ae24c35eda5b730d Mon Sep 17 00:00:00 2001 From: Stefan Roscher Date: Tue, 11 Sep 2007 15:26:33 +0200 Subject: IB/ehca: Small QP userspace support Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 88f0745..87b32ab 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -273,6 +273,7 @@ static inline void queue2resp(struct ipzu_queue_resp *resp, resp->queue_length = queue->queue_length; resp->pagesize = queue->pagesize; resp->toggle_state = queue->toggle_state; + resp->offset = queue->offset; } /* @@ -598,8 +599,7 @@ static struct ehca_qp *internal_create_qp( parms.squeue.max_sge = max_send_sge; parms.rqueue.max_sge = max_recv_sge; - if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap) - && !(context && udata)) { /* no small QP support in userspace ATM */ + if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) { if (HAS_SQ(my_qp)) ehca_determine_small_queue( &parms.squeue, max_send_sge, is_llqp); @@ -739,8 +739,7 @@ static struct ehca_qp *internal_create_qp( resp.ext_type = my_qp->ext_type; resp.qkey = my_qp->qkey; resp.real_qp_num = my_qp->real_qp_num; - resp.ipz_rqueue.offset = my_qp->ipz_rqueue.offset; - resp.ipz_squeue.offset = my_qp->ipz_squeue.offset; + if (HAS_SQ(my_qp)) queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue); if (HAS_RQ(my_qp)) diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c index 29bd476..661f8db 100644 --- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c +++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c @@ -158,6 +158,7 @@ static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd) queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9))); queue->small_page = page; + queue->offset = bit << (order + 9); return 1; out: -- cgit v0.10.2 From 5281a4b8a0c6bac0c070913ec25868faa06a3115 Mon Sep 17 00:00:00 2001 From: Stefan Roscher Date: Tue, 11 Sep 2007 15:29:39 +0200 Subject: IB/ehca: Support more than 4k QPs for userspace and kernelspace Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index 81aff36..a6f17e4 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -166,7 +166,6 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, write_lock_irqsave(&ehca_cq_idr_lock, flags); ret = idr_get_new(&ehca_cq_idr, my_cq, &my_cq->token); write_unlock_irqrestore(&ehca_cq_idr_lock, flags); - } while (ret == -EAGAIN); if (ret) { @@ -176,6 +175,12 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, goto create_cq_exit1; } + if (my_cq->token > 0x1FFFFFF) { + cq = ERR_PTR(-ENOMEM); + ehca_err(device, "Invalid number of cq. device=%p", device); + goto create_cq_exit2; + } + /* * CQs maximum depth is 4GB-64, but we need additional 20 as buffer * for receiving errors CQEs. diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index db041df..9916907 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -380,7 +380,7 @@ int ehca_init_device(struct ehca_shca *shca) strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX); shca->ib_device.owner = THIS_MODULE; - shca->ib_device.uverbs_abi_ver = 7; + shca->ib_device.uverbs_abi_ver = 8; shca->ib_device.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 87b32ab..bfae1c2 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -557,7 +557,6 @@ static struct ehca_qp *internal_create_qp( write_lock_irqsave(&ehca_qp_idr_lock, flags); ret = idr_get_new(&ehca_qp_idr, my_qp, &my_qp->token); write_unlock_irqrestore(&ehca_qp_idr_lock, flags); - } while (ret == -EAGAIN); if (ret) { @@ -566,11 +565,17 @@ static struct ehca_qp *internal_create_qp( goto create_qp_exit0; } + if (my_qp->token > 0x1FFFFFF) { + ret = -EINVAL; + ehca_err(pd->device, "Invalid number of qp"); + goto create_qp_exit1; + } + parms.servicetype = ibqptype2servicetype(qp_type); if (parms.servicetype < 0) { ret = -EINVAL; ehca_err(pd->device, "Invalid qp_type=%x", qp_type); - goto create_qp_exit0; + goto create_qp_exit1; } if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c index 4bc687f..3340f49 100644 --- a/drivers/infiniband/hw/ehca/ehca_uverbs.c +++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c @@ -164,7 +164,7 @@ static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq, int ret; switch (rsrc_type) { - case 1: /* galpa fw handle */ + case 0: /* galpa fw handle */ ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number); ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa); if (unlikely(ret)) { @@ -175,7 +175,7 @@ static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq, } break; - case 2: /* cq queue_addr */ + case 1: /* cq queue_addr */ ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number); ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue); if (unlikely(ret)) { @@ -201,7 +201,7 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, int ret; switch (rsrc_type) { - case 1: /* galpa fw handle */ + case 0: /* galpa fw handle */ ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num); ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa); if (unlikely(ret)) { @@ -212,7 +212,7 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, } break; - case 2: /* qp rqueue_addr */ + case 1: /* qp rqueue_addr */ ehca_dbg(qp->ib_qp.device, "qp_num=%x rqueue", qp->ib_qp.qp_num); ret = ehca_mmap_queue(vma, &qp->ipz_rqueue, @@ -225,7 +225,7 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, } break; - case 3: /* qp squeue_addr */ + case 2: /* qp squeue_addr */ ehca_dbg(qp->ib_qp.device, "qp_num=%x squeue", qp->ib_qp.qp_num); ret = ehca_mmap_queue(vma, &qp->ipz_squeue, @@ -249,10 +249,10 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { - u64 fileoffset = vma->vm_pgoff << PAGE_SHIFT; - u32 idr_handle = fileoffset >> 32; - u32 q_type = (fileoffset >> 28) & 0xF; /* CQ, QP,... */ - u32 rsrc_type = (fileoffset >> 24) & 0xF; /* sq,rq,cmnd_window */ + u64 fileoffset = vma->vm_pgoff; + u32 idr_handle = fileoffset & 0x1FFFFFF; + u32 q_type = (fileoffset >> 27) & 0x1; /* CQ, QP,... */ + u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */ u32 cur_pid = current->tgid; u32 ret; struct ehca_cq *cq; @@ -261,7 +261,7 @@ int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) struct ib_uobject *uobject; switch (q_type) { - case 1: /* CQ */ + case 0: /* CQ */ read_lock(&ehca_cq_idr_lock); cq = idr_find(&ehca_cq_idr, idr_handle); read_unlock(&ehca_cq_idr_lock); @@ -289,7 +289,7 @@ int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) } break; - case 2: /* QP */ + case 1: /* QP */ read_lock(&ehca_qp_idr_lock); qp = idr_find(&ehca_qp_idr, idr_handle); read_unlock(&ehca_qp_idr_lock); -- cgit v0.10.2 From e390d3b52f791fcea26312ba4982cda82052727b Mon Sep 17 00:00:00 2001 From: Hoang-Nam Nguyen Date: Tue, 11 Sep 2007 15:31:06 +0200 Subject: IB/ehca: Use remap_4k_pfn() to map firmware contexts to user space Use Paul's new remap_4k_pfn() function to map our 4K firmware contexts into user space on 64K-page machines without exposing neighboring firmware contexts. Return the context's offset within a 64K page to user space so it can determine the proper virtual address. For details about remap_4k_pfn(), see commit 721151d0 or http://patchwork.ozlabs.org/linuxppc/patch?id=10281 Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 0d69f0f..0942a17 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -338,6 +338,8 @@ struct ehca_create_cq_resp { u32 cq_number; u32 token; struct ipzu_queue_resp ipz_queue; + u32 fw_handle_ofs; + u32 dummy; }; struct ehca_create_qp_resp { @@ -348,7 +350,7 @@ struct ehca_create_qp_resp { u32 qkey; /* qp_num assigned by ehca: sqp0/1 may have got different numbers */ u32 real_qp_num; - u32 dummy; /* padding for 8 byte alignment */ + u32 fw_handle_ofs; struct ipzu_queue_resp ipz_squeue; struct ipzu_queue_resp ipz_rqueue; }; diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index a6f17e4..d68603d 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -281,6 +281,8 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, resp.ipz_queue.queue_length = ipz_queue->queue_length; resp.ipz_queue.pagesize = ipz_queue->pagesize; resp.ipz_queue.toggle_state = ipz_queue->toggle_state; + resp.fw_handle_ofs = (u32) + (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1)); if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { ehca_err(device, "Copy to udata failed."); goto create_cq_exit4; diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index bfae1c2..e89fdca 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -749,6 +749,8 @@ static struct ehca_qp *internal_create_qp( queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue); if (HAS_RQ(my_qp)) queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue); + resp.fw_handle_ofs = (u32) + (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1)); if (ib_copy_to_udata(udata, &resp, sizeof resp)) { ehca_err(pd->device, "Copy to udata failed"); diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c index 3340f49..84a16bc 100644 --- a/drivers/infiniband/hw/ehca/ehca_uverbs.c +++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c @@ -109,7 +109,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, u64 vsize, physical; vsize = vma->vm_end - vma->vm_start; - if (vsize != EHCA_PAGESIZE) { + if (vsize < EHCA_PAGESIZE) { ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start); return -EINVAL; } @@ -118,8 +118,8 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ehca_gen_dbg("vsize=%lx physical=%lx", vsize, physical); /* VM_IO | VM_RESERVED are set by remap_pfn_range() */ - ret = remap_pfn_range(vma, vma->vm_start, physical >> PAGE_SHIFT, - vsize, vma->vm_page_prot); + ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, + vma->vm_page_prot); if (unlikely(ret)) { ehca_gen_err("remap_pfn_range() failed ret=%x", ret); return -ENOMEM; -- cgit v0.10.2 From 2863ad4bddf366790a733cfd71f2f480afdf36fc Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 11 Sep 2007 15:31:49 +0200 Subject: IB/ehca: Refactor hvcall tracing Change hvcall trace output towards better readability: reg numbers instead of argument numbers, return code as signed decimal instead of unsigned hex. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c index 24f4541..a762cc7 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.c +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -84,6 +84,10 @@ #define H_MP_SHUTDOWN EHCA_BMASK_IBM(48, 48) #define H_MP_RESET_QKEY_CTR EHCA_BMASK_IBM(49, 49) +#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx" +#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx" +#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx" + static DEFINE_SPINLOCK(hcall_lock); static u32 get_longbusy_msecs(int longbusy_rc) @@ -118,8 +122,7 @@ static long ehca_plpar_hcall_norets(unsigned long opcode, long ret; int i, sleep_msecs; - ehca_gen_dbg("opcode=%lx arg1=%lx arg2=%lx arg3=%lx arg4=%lx " - "arg5=%lx arg6=%lx arg7=%lx", + ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT, opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7); for (i = 0; i < 5; i++) { @@ -133,16 +136,13 @@ static long ehca_plpar_hcall_norets(unsigned long opcode, } if (ret < H_SUCCESS) - ehca_gen_err("opcode=%lx ret=%lx" - " arg1=%lx arg2=%lx arg3=%lx arg4=%lx" - " arg5=%lx arg6=%lx arg7=%lx ", - opcode, ret, - arg1, arg2, arg3, arg4, arg5, - arg6, arg7); - - ehca_gen_dbg("opcode=%lx ret=%lx", opcode, ret); - return ret; + ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT, + opcode, ret, arg1, arg2, arg3, + arg4, arg5, arg6, arg7); + else + ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret); + return ret; } return H_BUSY; @@ -164,10 +164,8 @@ static long ehca_plpar_hcall9(unsigned long opcode, int i, sleep_msecs, lock_is_set = 0; unsigned long flags = 0; - ehca_gen_dbg("opcode=%lx arg1=%lx arg2=%lx arg3=%lx arg4=%lx " - "arg5=%lx arg6=%lx arg7=%lx arg8=%lx arg9=%lx", - opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7, - arg8, arg9); + ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode, + arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); for (i = 0; i < 5; i++) { if ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5)) { @@ -188,26 +186,19 @@ static long ehca_plpar_hcall9(unsigned long opcode, continue; } - if (ret < H_SUCCESS) - ehca_gen_err("opcode=%lx ret=%lx" - " arg1=%lx arg2=%lx arg3=%lx arg4=%lx" - " arg5=%lx arg6=%lx arg7=%lx arg8=%lx" - " arg9=%lx" - " out1=%lx out2=%lx out3=%lx out4=%lx" - " out5=%lx out6=%lx out7=%lx out8=%lx" - " out9=%lx", - opcode, ret, - arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, - outs[0], outs[1], outs[2], outs[3], + if (ret < H_SUCCESS) { + ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, + opcode, arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT, + ret, outs[0], outs[1], outs[2], outs[3], + outs[4], outs[5], outs[6], outs[7], + outs[8]); + } else + ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT, + ret, outs[0], outs[1], outs[2], outs[3], outs[4], outs[5], outs[6], outs[7], outs[8]); - - ehca_gen_dbg("opcode=%lx ret=%lx out1=%lx out2=%lx out3=%lx " - "out4=%lx out5=%lx out6=%lx out7=%lx out8=%lx " - "out9=%lx", - opcode, ret, outs[0], outs[1], outs[2], outs[3], - outs[4], outs[5], outs[6], outs[7], outs[8]); return ret; } -- cgit v0.10.2 From e37221928bf685d63ba5319746eafe463d61e330 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 11 Sep 2007 15:32:22 +0200 Subject: IB/ehca: Print return codes as signed decimal integers ...because -12 is easier to read than FFFFFFF4. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index d68603d..79c25f5 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -190,7 +190,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, if (h_ret != H_SUCCESS) { ehca_err(device, "hipz_h_alloc_resource_cq() failed " - "h_ret=%lx device=%p", h_ret, device); + "h_ret=%li device=%p", h_ret, device); cq = ERR_PTR(ehca2ib_return_code(h_ret)); goto create_cq_exit2; } @@ -198,7 +198,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages, EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0); if (!ipz_rc) { - ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%x device=%p", + ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p", ipz_rc, device); cq = ERR_PTR(-EINVAL); goto create_cq_exit3; @@ -226,7 +226,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, if (h_ret < H_SUCCESS) { ehca_err(device, "hipz_h_register_rpage_cq() failed " - "ehca_cq=%p cq_num=%x h_ret=%lx counter=%i " + "ehca_cq=%p cq_num=%x h_ret=%li counter=%i " "act_pages=%i", my_cq, my_cq->cq_number, h_ret, counter, param.act_pages); cq = ERR_PTR(-EINVAL); @@ -238,7 +238,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, if ((h_ret != H_SUCCESS) || vpage) { ehca_err(device, "Registration of pages not " "complete ehca_cq=%p cq_num=%x " - "h_ret=%lx", my_cq, my_cq->cq_number, + "h_ret=%li", my_cq, my_cq->cq_number, h_ret); cq = ERR_PTR(-EAGAIN); goto create_cq_exit4; @@ -246,7 +246,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, } else { if (h_ret != H_PAGE_REGISTERED) { ehca_err(device, "Registration of page failed " - "ehca_cq=%p cq_num=%x h_ret=%lx" + "ehca_cq=%p cq_num=%x h_ret=%li" "counter=%i act_pages=%i", my_cq, my_cq->cq_number, h_ret, counter, param.act_pages); @@ -298,7 +298,7 @@ create_cq_exit3: h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1); if (h_ret != H_SUCCESS) ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p " - "cq_num=%x h_ret=%lx", my_cq, my_cq->cq_number, h_ret); + "cq_num=%x h_ret=%li", my_cq, my_cq->cq_number, h_ret); create_cq_exit2: write_lock_irqsave(&ehca_cq_idr_lock, flags); @@ -362,7 +362,7 @@ int ehca_destroy_cq(struct ib_cq *cq) cq_num); } if (h_ret != H_SUCCESS) { - ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lx " + ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%li " "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num); return ehca2ib_return_code(h_ret); } diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c index cf22472..3436c49 100644 --- a/drivers/infiniband/hw/ehca/ehca_hca.c +++ b/drivers/infiniband/hw/ehca/ehca_hca.c @@ -352,7 +352,7 @@ int ehca_modify_port(struct ib_device *ibdev, hret = hipz_h_modify_port(shca->ipz_hca_handle, port, cap, props->init_type, port_modify_mask); if (hret != H_SUCCESS) { - ehca_err(&shca->ib_device, "Modify port failed hret=%lx", + ehca_err(&shca->ib_device, "Modify port failed h_ret=%li", hret); ret = -EINVAL; } diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 9916907..0e3ffee 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -273,7 +273,7 @@ int ehca_sense_attributes(struct ehca_shca *shca) h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock); if (h_ret != H_SUCCESS) { - ehca_gen_err("Cannot query device properties. h_ret=%lx", + ehca_gen_err("Cannot query device properties. h_ret=%li", h_ret); ret = -EPERM; goto sense_attributes1; @@ -332,7 +332,7 @@ int ehca_sense_attributes(struct ehca_shca *shca) port = (struct hipz_query_port *)rblock; h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port); if (h_ret != H_SUCCESS) { - ehca_gen_err("Cannot query port properties. h_ret=%lx", + ehca_gen_err("Cannot query port properties. h_ret=%li", h_ret); ret = -EPERM; goto sense_attributes1; @@ -526,13 +526,13 @@ static int ehca_destroy_aqp1(struct ehca_sport *sport) ret = ib_destroy_qp(sport->ibqp_aqp1); if (ret) { - ehca_gen_err("Cannot destroy AQP1 QP. ret=%x", ret); + ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret); return ret; } ret = ib_destroy_cq(sport->ibcq_aqp1); if (ret) - ehca_gen_err("Cannot destroy AQP1 CQ. ret=%x", ret); + ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret); return ret; } @@ -728,7 +728,7 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev, ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr); if (ret) { - ehca_err(&shca->ib_device, "Cannot create internal MR ret=%x", + ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i", ret); goto probe5; } @@ -736,7 +736,7 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev, ret = ib_register_device(&shca->ib_device); if (ret) { ehca_err(&shca->ib_device, - "ib_register_device() failed ret=%x", ret); + "ib_register_device() failed ret=%i", ret); goto probe6; } @@ -777,7 +777,7 @@ probe8: ret = ehca_destroy_aqp1(&shca->sport[0]); if (ret) ehca_err(&shca->ib_device, - "Cannot destroy AQP1 for port 1. ret=%x", ret); + "Cannot destroy AQP1 for port 1. ret=%i", ret); probe7: ib_unregister_device(&shca->ib_device); @@ -826,7 +826,7 @@ static int __devexit ehca_remove(struct ibmebus_dev *dev) if (ret) ehca_err(&shca->ib_device, "Cannot destroy AQP1 for port %x " - "ret=%x", ret, i); + "ret=%i", ret, i); } } @@ -835,20 +835,20 @@ static int __devexit ehca_remove(struct ibmebus_dev *dev) ret = ehca_dereg_internal_maxmr(shca); if (ret) ehca_err(&shca->ib_device, - "Cannot destroy internal MR. ret=%x", ret); + "Cannot destroy internal MR. ret=%i", ret); ret = ehca_dealloc_pd(&shca->pd->ib_pd); if (ret) ehca_err(&shca->ib_device, - "Cannot destroy internal PD. ret=%x", ret); + "Cannot destroy internal PD. ret=%i", ret); ret = ehca_destroy_eq(shca, &shca->eq); if (ret) - ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%x", ret); + ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret); ret = ehca_destroy_eq(shca, &shca->neq); if (ret) - ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%x", ret); + ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret); ib_dealloc_device(&shca->ib_device); diff --git a/drivers/infiniband/hw/ehca/ehca_mcast.c b/drivers/infiniband/hw/ehca/ehca_mcast.c index 32a8706..e3ef026 100644 --- a/drivers/infiniband/hw/ehca/ehca_mcast.c +++ b/drivers/infiniband/hw/ehca/ehca_mcast.c @@ -88,7 +88,7 @@ int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (h_ret != H_SUCCESS) ehca_err(ibqp->device, "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed " - "h_ret=%lx", my_qp, ibqp->qp_num, h_ret); + "h_ret=%li", my_qp, ibqp->qp_num, h_ret); return ehca2ib_return_code(h_ret); } @@ -125,7 +125,7 @@ int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (h_ret != H_SUCCESS) ehca_err(ibqp->device, "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed " - "h_ret=%lx", my_qp, ibqp->qp_num, h_ret); + "h_ret=%li", my_qp, ibqp->qp_num, h_ret); return ehca2ib_return_code(h_ret); } diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index dc4c840..e925e84 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -159,7 +159,7 @@ struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags) get_dma_mr_exit0: if (IS_ERR(ib_mr)) - ehca_err(&shca->ib_device, "rc=%lx pd=%p mr_access_flags=%x ", + ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x", PTR_ERR(ib_mr), pd, mr_access_flags); return ib_mr; } /* end ehca_get_dma_mr() */ @@ -271,7 +271,7 @@ reg_phys_mr_exit1: ehca_mr_delete(e_mr); reg_phys_mr_exit0: if (IS_ERR(ib_mr)) - ehca_err(pd->device, "rc=%lx pd=%p phys_buf_array=%p " + ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p " "num_phys_buf=%x mr_access_flags=%x iova_start=%p", PTR_ERR(ib_mr), pd, phys_buf_array, num_phys_buf, mr_access_flags, iova_start); @@ -403,8 +403,7 @@ reg_user_mr_exit1: ehca_mr_delete(e_mr); reg_user_mr_exit0: if (IS_ERR(ib_mr)) - ehca_err(pd->device, "rc=%lx pd=%p mr_access_flags=%x" - " udata=%p", + ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p", PTR_ERR(ib_mr), pd, mr_access_flags, udata); return ib_mr; } /* end ehca_reg_user_mr() */ @@ -565,7 +564,7 @@ rereg_phys_mr_exit1: spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); rereg_phys_mr_exit0: if (ret) - ehca_err(mr->device, "ret=%x mr=%p mr_rereg_mask=%x pd=%p " + ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p " "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x " "iova_start=%p", ret, mr, mr_rereg_mask, pd, phys_buf_array, @@ -607,7 +606,7 @@ int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout); if (h_ret != H_SUCCESS) { - ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lx mr=%p " + ehca_err(mr->device, "hipz_mr_query failed, h_ret=%li mr=%p " "hca_hndl=%lx mr_hndl=%lx lkey=%x", h_ret, mr, shca->ipz_hca_handle.handle, e_mr->ipz_mr_handle.handle, mr->lkey); @@ -625,7 +624,7 @@ query_mr_exit1: spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); query_mr_exit0: if (ret) - ehca_err(mr->device, "ret=%x mr=%p mr_attr=%p", + ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p", ret, mr, mr_attr); return ret; } /* end ehca_query_mr() */ @@ -667,7 +666,7 @@ int ehca_dereg_mr(struct ib_mr *mr) /* TODO: BUSY: MR still has bound window(s) */ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); if (h_ret != H_SUCCESS) { - ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lx shca=%p " + ehca_err(mr->device, "hipz_free_mr failed, h_ret=%li shca=%p " "e_mr=%p hca_hndl=%lx mr_hndl=%lx mr->lkey=%x", h_ret, shca, e_mr, shca->ipz_hca_handle.handle, e_mr->ipz_mr_handle.handle, mr->lkey); @@ -683,7 +682,7 @@ int ehca_dereg_mr(struct ib_mr *mr) dereg_mr_exit0: if (ret) - ehca_err(mr->device, "ret=%x mr=%p", ret, mr); + ehca_err(mr->device, "ret=%i mr=%p", ret, mr); return ret; } /* end ehca_dereg_mr() */ @@ -708,7 +707,7 @@ struct ib_mw *ehca_alloc_mw(struct ib_pd *pd) h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw, e_pd->fw_pd, &hipzout); if (h_ret != H_SUCCESS) { - ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lx " + ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%li " "shca=%p hca_hndl=%lx mw=%p", h_ret, shca, shca->ipz_hca_handle.handle, e_mw); ib_mw = ERR_PTR(ehca2ib_return_code(h_ret)); @@ -723,7 +722,7 @@ alloc_mw_exit1: ehca_mw_delete(e_mw); alloc_mw_exit0: if (IS_ERR(ib_mw)) - ehca_err(pd->device, "rc=%lx pd=%p", PTR_ERR(ib_mw), pd); + ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd); return ib_mw; } /* end ehca_alloc_mw() */ @@ -750,7 +749,7 @@ int ehca_dealloc_mw(struct ib_mw *mw) h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw); if (h_ret != H_SUCCESS) { - ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lx shca=%p " + ehca_err(mw->device, "hipz_free_mw failed, h_ret=%li shca=%p " "mw=%p rkey=%x hca_hndl=%lx mw_hndl=%lx", h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle, e_mw->ipz_mw_handle.handle); @@ -912,7 +911,7 @@ int ehca_map_phys_fmr(struct ib_fmr *fmr, map_phys_fmr_exit0: if (ret) - ehca_err(fmr->device, "ret=%x fmr=%p page_list=%p list_len=%x " + ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x " "iova=%lx", ret, fmr, page_list, list_len, iova); return ret; } /* end ehca_map_phys_fmr() */ @@ -975,7 +974,7 @@ int ehca_unmap_fmr(struct list_head *fmr_list) unmap_fmr_exit0: if (ret) - ehca_gen_err("ret=%x fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x", + ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x", ret, fmr_list, num_fmr, unmap_fmr_cnt); return ret; } /* end ehca_unmap_fmr() */ @@ -999,7 +998,7 @@ int ehca_dealloc_fmr(struct ib_fmr *fmr) h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr); if (h_ret != H_SUCCESS) { - ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lx e_fmr=%p " + ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%li e_fmr=%p " "hca_hndl=%lx fmr_hndl=%lx fmr->lkey=%x", h_ret, e_fmr, shca->ipz_hca_handle.handle, e_fmr->ipz_mr_handle.handle, fmr->lkey); @@ -1012,7 +1011,7 @@ int ehca_dealloc_fmr(struct ib_fmr *fmr) free_fmr_exit0: if (ret) - ehca_err(&shca->ib_device, "ret=%x fmr=%p", ret, fmr); + ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr); return ret; } /* end ehca_dealloc_fmr() */ @@ -1042,7 +1041,7 @@ int ehca_reg_mr(struct ehca_shca *shca, (u64)iova_start, size, hipz_acl, e_pd->fw_pd, &hipzout); if (h_ret != H_SUCCESS) { - ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lx " + ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%li " "hca_hndl=%lx", h_ret, shca->ipz_hca_handle.handle); ret = ehca2ib_return_code(h_ret); goto ehca_reg_mr_exit0; @@ -1068,9 +1067,9 @@ int ehca_reg_mr(struct ehca_shca *shca, ehca_reg_mr_exit1: h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); if (h_ret != H_SUCCESS) { - ehca_err(&shca->ib_device, "h_ret=%lx shca=%p e_mr=%p " + ehca_err(&shca->ib_device, "h_ret=%li shca=%p e_mr=%p " "iova_start=%p size=%lx acl=%x e_pd=%p lkey=%x " - "pginfo=%p num_kpages=%lx num_hwpages=%lx ret=%x", + "pginfo=%p num_kpages=%lx num_hwpages=%lx ret=%i", h_ret, shca, e_mr, iova_start, size, acl, e_pd, hipzout.lkey, pginfo, pginfo->num_kpages, pginfo->num_hwpages, ret); @@ -1079,7 +1078,7 @@ ehca_reg_mr_exit1: } ehca_reg_mr_exit0: if (ret) - ehca_err(&shca->ib_device, "ret=%x shca=%p e_mr=%p " + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p " "iova_start=%p size=%lx acl=%x e_pd=%p pginfo=%p " "num_kpages=%lx num_hwpages=%lx", ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo, @@ -1123,7 +1122,7 @@ int ehca_reg_mr_rpages(struct ehca_shca *shca, ret = ehca_set_pagebuf(pginfo, rnum, kpage); if (ret) { ehca_err(&shca->ib_device, "ehca_set_pagebuf " - "bad rc, ret=%x rnum=%x kpage=%p", + "bad rc, ret=%i rnum=%x kpage=%p", ret, rnum, kpage); goto ehca_reg_mr_rpages_exit1; } @@ -1151,7 +1150,7 @@ int ehca_reg_mr_rpages(struct ehca_shca *shca, */ if (h_ret != H_SUCCESS) { ehca_err(&shca->ib_device, "last " - "hipz_reg_rpage_mr failed, h_ret=%lx " + "hipz_reg_rpage_mr failed, h_ret=%li " "e_mr=%p i=%x hca_hndl=%lx mr_hndl=%lx" " lkey=%x", h_ret, e_mr, i, shca->ipz_hca_handle.handle, @@ -1163,7 +1162,7 @@ int ehca_reg_mr_rpages(struct ehca_shca *shca, ret = 0; } else if (h_ret != H_PAGE_REGISTERED) { ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, " - "h_ret=%lx e_mr=%p i=%x lkey=%x hca_hndl=%lx " + "h_ret=%li e_mr=%p i=%x lkey=%x hca_hndl=%lx " "mr_hndl=%lx", h_ret, e_mr, i, e_mr->ib.ib_mr.lkey, shca->ipz_hca_handle.handle, @@ -1179,7 +1178,7 @@ ehca_reg_mr_rpages_exit1: ehca_free_fw_ctrlblock(kpage); ehca_reg_mr_rpages_exit0: if (ret) - ehca_err(&shca->ib_device, "ret=%x shca=%p e_mr=%p pginfo=%p " + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p " "num_kpages=%lx num_hwpages=%lx", ret, shca, e_mr, pginfo, pginfo->num_kpages, pginfo->num_hwpages); return ret; @@ -1240,7 +1239,7 @@ inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca, * (MW bound or MR is shared) */ ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed " - "(Rereg1), h_ret=%lx e_mr=%p", h_ret, e_mr); + "(Rereg1), h_ret=%li e_mr=%p", h_ret, e_mr); *pginfo = pginfo_save; ret = -EAGAIN; } else if ((u64 *)hipzout.vaddr != iova_start) { @@ -1269,7 +1268,7 @@ ehca_rereg_mr_rereg1_exit1: ehca_free_fw_ctrlblock(kpage); ehca_rereg_mr_rereg1_exit0: if ( ret && (ret != -EAGAIN) ) - ehca_err(&shca->ib_device, "ret=%x lkey=%x rkey=%x " + ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x " "pginfo=%p num_kpages=%lx num_hwpages=%lx", ret, *lkey, *rkey, pginfo, pginfo->num_kpages, pginfo->num_hwpages); @@ -1330,7 +1329,7 @@ int ehca_rereg_mr(struct ehca_shca *shca, h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); if (h_ret != H_SUCCESS) { ehca_err(&shca->ib_device, "hipz_free_mr failed, " - "h_ret=%lx e_mr=%p hca_hndl=%lx mr_hndl=%lx " + "h_ret=%li e_mr=%p hca_hndl=%lx mr_hndl=%lx " "mr->lkey=%x", h_ret, e_mr, shca->ipz_hca_handle.handle, e_mr->ipz_mr_handle.handle, @@ -1362,7 +1361,7 @@ int ehca_rereg_mr(struct ehca_shca *shca, ehca_rereg_mr_exit0: if (ret) - ehca_err(&shca->ib_device, "ret=%x shca=%p e_mr=%p " + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p " "iova_start=%p size=%lx acl=%x e_pd=%p pginfo=%p " "num_kpages=%lx lkey=%x rkey=%x rereg_1_hcall=%x " "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size, @@ -1406,7 +1405,7 @@ int ehca_unmap_one_fmr(struct ehca_shca *shca, * FMRs are not shared and no MW bound to FMRs */ ehca_err(&shca->ib_device, "hipz_reregister_pmr failed " - "(Rereg1), h_ret=%lx e_fmr=%p hca_hndl=%lx " + "(Rereg1), h_ret=%li e_fmr=%p hca_hndl=%lx " "mr_hndl=%lx lkey=%x lkey_out=%x", h_ret, e_fmr, shca->ipz_hca_handle.handle, e_fmr->ipz_mr_handle.handle, @@ -1418,7 +1417,7 @@ int ehca_unmap_one_fmr(struct ehca_shca *shca, h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr); if (h_ret != H_SUCCESS) { ehca_err(&shca->ib_device, "hipz_free_mr failed, " - "h_ret=%lx e_fmr=%p hca_hndl=%lx mr_hndl=%lx " + "h_ret=%li e_fmr=%p hca_hndl=%lx mr_hndl=%lx " "lkey=%x", h_ret, e_fmr, shca->ipz_hca_handle.handle, e_fmr->ipz_mr_handle.handle, @@ -1453,7 +1452,7 @@ int ehca_unmap_one_fmr(struct ehca_shca *shca, ehca_unmap_one_fmr_exit0: if (ret) - ehca_err(&shca->ib_device, "ret=%x tmp_lkey=%x tmp_rkey=%x " + ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x " "fmr_max_pages=%x", ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages); return ret; @@ -1482,7 +1481,7 @@ int ehca_reg_smr(struct ehca_shca *shca, (u64)iova_start, hipz_acl, e_pd->fw_pd, &hipzout); if (h_ret != H_SUCCESS) { - ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lx " + ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%li " "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x " "e_pd=%p hca_hndl=%lx mr_hndl=%lx lkey=%x", h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd, @@ -1506,7 +1505,7 @@ int ehca_reg_smr(struct ehca_shca *shca, ehca_reg_smr_exit0: if (ret) - ehca_err(&shca->ib_device, "ret=%x shca=%p e_origmr=%p " + ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p " "e_newmr=%p iova_start=%p acl=%x e_pd=%p", ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd); return ret; @@ -1581,7 +1580,7 @@ ehca_reg_internal_maxmr_exit1: ehca_mr_delete(e_mr); ehca_reg_internal_maxmr_exit0: if (ret) - ehca_err(&shca->ib_device, "ret=%x shca=%p e_pd=%p e_maxmr=%p", + ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p", ret, shca, e_pd, e_maxmr); return ret; } /* end ehca_reg_internal_maxmr() */ @@ -1608,7 +1607,7 @@ int ehca_reg_maxmr(struct ehca_shca *shca, (u64)iova_start, hipz_acl, e_pd->fw_pd, &hipzout); if (h_ret != H_SUCCESS) { - ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lx " + ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%li " "e_origmr=%p hca_hndl=%lx mr_hndl=%lx lkey=%x", h_ret, e_origmr, shca->ipz_hca_handle.handle, e_origmr->ipz_mr_handle.handle, @@ -1649,7 +1648,7 @@ int ehca_dereg_internal_maxmr(struct ehca_shca *shca) ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr); if (ret) { ehca_err(&shca->ib_device, "dereg internal max-MR failed, " - "ret=%x e_maxmr=%p shca=%p lkey=%x", + "ret=%i e_maxmr=%p shca=%p lkey=%x", ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey); shca->maxmr = e_maxmr; goto ehca_dereg_internal_maxmr_exit0; @@ -1659,7 +1658,7 @@ int ehca_dereg_internal_maxmr(struct ehca_shca *shca) ehca_dereg_internal_maxmr_exit0: if (ret) - ehca_err(&shca->ib_device, "ret=%x shca=%p shca->maxmr=%p", + ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p", ret, shca, shca->maxmr); return ret; } /* end ehca_dereg_internal_maxmr() */ diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index e89fdca..88d7dd9 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -310,7 +310,7 @@ static inline int init_qp_queue(struct ehca_shca *shca, } if (!ipz_rc) { - ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%x", + ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i", ipz_rc); return -EBUSY; } @@ -334,7 +334,7 @@ static inline int init_qp_queue(struct ehca_shca *shca, if (cnt == (nr_q_pages - 1)) { /* last page! */ if (h_ret != expected_hret) { ehca_err(ib_dev, "hipz_qp_register_rpage() " - "h_ret= %lx ", h_ret); + "h_ret=%li", h_ret); ret = ehca2ib_return_code(h_ret); goto init_qp_queue1; } @@ -348,7 +348,7 @@ static inline int init_qp_queue(struct ehca_shca *shca, } else { if (h_ret != H_PAGE_REGISTERED) { ehca_err(ib_dev, "hipz_qp_register_rpage() " - "h_ret= %lx ", h_ret); + "h_ret=%li", h_ret); ret = ehca2ib_return_code(h_ret); goto init_qp_queue1; } @@ -617,7 +617,7 @@ static struct ehca_qp *internal_create_qp( h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms); if (h_ret != H_SUCCESS) { - ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lx", + ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%li", h_ret); ret = ehca2ib_return_code(h_ret); goto create_qp_exit1; @@ -671,7 +671,7 @@ static struct ehca_qp *internal_create_qp( &parms.squeue, swqe_size); if (ret) { ehca_err(pd->device, "Couldn't initialize squeue " - "and pages ret=%x", ret); + "and pages ret=%i", ret); goto create_qp_exit2; } } @@ -682,7 +682,7 @@ static struct ehca_qp *internal_create_qp( H_SUCCESS, &parms.rqueue, rwqe_size); if (ret) { ehca_err(pd->device, "Couldn't initialize rqueue " - "and pages ret=%x", ret); + "and pages ret=%i", ret); goto create_qp_exit3; } } @@ -728,7 +728,7 @@ static struct ehca_qp *internal_create_qp( ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp); if (ret) { ehca_err(pd->device, - "Couldn't assign qp to send_cq ret=%x", ret); + "Couldn't assign qp to send_cq ret=%i", ret); goto create_qp_exit4; } } @@ -845,7 +845,7 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd, mqpcb, my_qp->galpas.kernel); if (hret != H_SUCCESS) { ehca_err(pd->device, "Could not modify SRQ to INIT" - "ehca_qp=%p qp_num=%x hret=%lx", + "ehca_qp=%p qp_num=%x h_ret=%li", my_qp, my_qp->real_qp_num, hret); goto create_srq2; } @@ -859,7 +859,7 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd, mqpcb, my_qp->galpas.kernel); if (hret != H_SUCCESS) { ehca_err(pd->device, "Could not enable SRQ" - "ehca_qp=%p qp_num=%x hret=%lx", + "ehca_qp=%p qp_num=%x h_ret=%li", my_qp, my_qp->real_qp_num, hret); goto create_srq2; } @@ -873,7 +873,7 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd, mqpcb, my_qp->galpas.kernel); if (hret != H_SUCCESS) { ehca_err(pd->device, "Could not modify SRQ to RTR" - "ehca_qp=%p qp_num=%x hret=%lx", + "ehca_qp=%p qp_num=%x h_ret=%li", my_qp, my_qp->real_qp_num, hret); goto create_srq2; } @@ -911,7 +911,7 @@ static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca, &bad_send_wqe_p, NULL, 2); if (h_ret != H_SUCCESS) { ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed" - " ehca_qp=%p qp_num=%x h_ret=%lx", + " ehca_qp=%p qp_num=%x h_ret=%li", my_qp, qp_num, h_ret); return ehca2ib_return_code(h_ret); } @@ -989,7 +989,7 @@ static int internal_modify_qp(struct ib_qp *ibqp, mqpcb, my_qp->galpas.kernel); if (h_ret != H_SUCCESS) { ehca_err(ibqp->device, "hipz_h_query_qp() failed " - "ehca_qp=%p qp_num=%x h_ret=%lx", + "ehca_qp=%p qp_num=%x h_ret=%li", my_qp, ibqp->qp_num, h_ret); ret = ehca2ib_return_code(h_ret); goto modify_qp_exit1; @@ -1025,7 +1025,7 @@ static int internal_modify_qp(struct ib_qp *ibqp, ibqp, &smiqp_attr, smiqp_attr_mask, 1); if (smirc) { ehca_err(ibqp->device, "SMI RESET -> INIT failed. " - "ehca_modify_qp() rc=%x", smirc); + "ehca_modify_qp() rc=%i", smirc); ret = H_PARAMETER; goto modify_qp_exit1; } @@ -1127,7 +1127,7 @@ static int internal_modify_qp(struct ib_qp *ibqp, ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt); if (ret) { ehca_err(ibqp->device, "prepare_sqe_rts() failed " - "ehca_qp=%p qp_num=%x ret=%x", + "ehca_qp=%p qp_num=%x ret=%i", my_qp, ibqp->qp_num, ret); goto modify_qp_exit2; } @@ -1352,7 +1352,7 @@ static int internal_modify_qp(struct ib_qp *ibqp, if (h_ret != H_SUCCESS) { ret = ehca2ib_return_code(h_ret); - ehca_err(ibqp->device, "hipz_h_modify_qp() failed rc=%lx " + ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%li " "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num); goto modify_qp_exit2; } @@ -1385,7 +1385,7 @@ static int internal_modify_qp(struct ib_qp *ibqp, ret = ehca2ib_return_code(h_ret); ehca_err(ibqp->device, "ENABLE in context of " "RESET_2_INIT failed! Maybe you didn't get " - "a LID h_ret=%lx ehca_qp=%p qp_num=%x", + "a LID h_ret=%li ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num); goto modify_qp_exit2; } @@ -1473,7 +1473,7 @@ int ehca_query_qp(struct ib_qp *qp, if (h_ret != H_SUCCESS) { ret = ehca2ib_return_code(h_ret); ehca_err(qp->device, "hipz_h_query_qp() failed " - "ehca_qp=%p qp_num=%x h_ret=%lx", + "ehca_qp=%p qp_num=%x h_ret=%li", my_qp, qp->qp_num, h_ret); goto query_qp_exit1; } @@ -1648,7 +1648,7 @@ int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (h_ret != H_SUCCESS) { ret = ehca2ib_return_code(h_ret); - ehca_err(ibsrq->device, "hipz_h_modify_qp() failed rc=%lx " + ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%li " "ehca_qp=%p qp_num=%x", h_ret, my_qp, my_qp->real_qp_num); } @@ -1691,7 +1691,7 @@ int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) if (h_ret != H_SUCCESS) { ret = ehca2ib_return_code(h_ret); ehca_err(srq->device, "hipz_h_query_qp() failed " - "ehca_qp=%p qp_num=%x h_ret=%lx", + "ehca_qp=%p qp_num=%x h_ret=%li", my_qp, my_qp->real_qp_num, h_ret); goto query_srq_exit1; } @@ -1741,7 +1741,7 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num); if (ret) { ehca_err(dev, "Couldn't unassign qp from " - "send_cq ret=%x qp_num=%x cq_num=%x", ret, + "send_cq ret=%i qp_num=%x cq_num=%x", ret, qp_num, my_qp->send_cq->cq_number); return ret; } @@ -1753,7 +1753,7 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); if (h_ret != H_SUCCESS) { - ehca_err(dev, "hipz_h_destroy_qp() failed rc=%lx " + ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%li " "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num); return ehca2ib_return_code(h_ret); } diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index 94eed70..ea91360 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -526,7 +526,7 @@ poll_cq_one_read_cqe: if (!cqe) { ret = -EAGAIN; ehca_dbg(cq->device, "Completion queue is empty ehca_cq=%p " - "cq_num=%x ret=%x", my_cq, my_cq->cq_number, ret); + "cq_num=%x ret=%i", my_cq, my_cq->cq_number, ret); goto poll_cq_one_exit0; } diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c index 9f16e9c..f0792e5 100644 --- a/drivers/infiniband/hw/ehca/ehca_sqp.c +++ b/drivers/infiniband/hw/ehca/ehca_sqp.c @@ -82,7 +82,7 @@ u64 ehca_define_sqp(struct ehca_shca *shca, if (ret != H_SUCCESS) { ehca_err(&shca->ib_device, - "Can't define AQP1 for port %x. rc=%lx", + "Can't define AQP1 for port %x. h_ret=%li", port, ret); return ret; } diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c index 84a16bc..5234d6c 100644 --- a/drivers/infiniband/hw/ehca/ehca_uverbs.c +++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c @@ -121,7 +121,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, vma->vm_page_prot); if (unlikely(ret)) { - ehca_gen_err("remap_pfn_range() failed ret=%x", ret); + ehca_gen_err("remap_pfn_range() failed ret=%i", ret); return -ENOMEM; } @@ -146,7 +146,7 @@ static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue, page = virt_to_page(virt_addr); ret = vm_insert_page(vma, start, page); if (unlikely(ret)) { - ehca_gen_err("vm_insert_page() failed rc=%x", ret); + ehca_gen_err("vm_insert_page() failed rc=%i", ret); return ret; } start += PAGE_SIZE; @@ -169,7 +169,7 @@ static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq, ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa); if (unlikely(ret)) { ehca_err(cq->ib_cq.device, - "ehca_mmap_fw() failed rc=%x cq_num=%x", + "ehca_mmap_fw() failed rc=%i cq_num=%x", ret, cq->cq_number); return ret; } @@ -180,7 +180,7 @@ static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq, ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue); if (unlikely(ret)) { ehca_err(cq->ib_cq.device, - "ehca_mmap_queue() failed rc=%x cq_num=%x", + "ehca_mmap_queue() failed rc=%i cq_num=%x", ret, cq->cq_number); return ret; } @@ -206,7 +206,7 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa); if (unlikely(ret)) { ehca_err(qp->ib_qp.device, - "remap_pfn_range() failed ret=%x qp_num=%x", + "remap_pfn_range() failed ret=%i qp_num=%x", ret, qp->ib_qp.qp_num); return -ENOMEM; } @@ -219,7 +219,7 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, &qp->mm_count_rqueue); if (unlikely(ret)) { ehca_err(qp->ib_qp.device, - "ehca_mmap_queue(rq) failed rc=%x qp_num=%x", + "ehca_mmap_queue(rq) failed rc=%i qp_num=%x", ret, qp->ib_qp.qp_num); return ret; } @@ -232,7 +232,7 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, &qp->mm_count_squeue); if (unlikely(ret)) { ehca_err(qp->ib_qp.device, - "ehca_mmap_queue(sq) failed rc=%x qp_num=%x", + "ehca_mmap_queue(sq) failed rc=%i qp_num=%x", ret, qp->ib_qp.qp_num); return ret; } @@ -283,7 +283,7 @@ int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) ret = ehca_mmap_cq(vma, cq, rsrc_type); if (unlikely(ret)) { ehca_err(cq->ib_cq.device, - "ehca_mmap_cq() failed rc=%x cq_num=%x", + "ehca_mmap_cq() failed rc=%i cq_num=%x", ret, cq->cq_number); return ret; } @@ -313,7 +313,7 @@ int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) ret = ehca_mmap_qp(vma, qp, rsrc_type); if (unlikely(ret)) { ehca_err(qp->ib_qp.device, - "ehca_mmap_qp() failed rc=%x qp_num=%x", + "ehca_mmap_qp() failed rc=%i qp_num=%x", ret, qp->ib_qp.qp_num); return ret; } diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c index a762cc7..3d68f65 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.c +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -238,7 +238,7 @@ u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, *eq_ist = (u32)outs[5]; if (ret == H_NOT_ENOUGH_RESOURCES) - ehca_gen_err("Not enough resource - ret=%lx ", ret); + ehca_gen_err("Not enough resource - ret=%li ", ret); return ret; } @@ -276,7 +276,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, hcp_galpas_ctor(&cq->galpas, outs[5], outs[6]); if (ret == H_NOT_ENOUGH_RESOURCES) - ehca_gen_err("Not enough resources. ret=%lx", ret); + ehca_gen_err("Not enough resources. ret=%li", ret); return ret; } @@ -351,7 +351,7 @@ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, hcp_galpas_ctor(&parms->galpas, outs[6], outs[6]); if (ret == H_NOT_ENOUGH_RESOURCES) - ehca_gen_err("Not enough resources. ret=%lx", ret); + ehca_gen_err("Not enough resources. ret=%li", ret); return ret; } @@ -546,7 +546,7 @@ u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle, 0, 0, 0, 0, 0); if (ret == H_NOT_ENOUGH_RESOURCES) - ehca_gen_err("Insufficient resources ret=%lx", ret); + ehca_gen_err("Insufficient resources ret=%li", ret); return ret; } @@ -582,7 +582,7 @@ u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle, qp->ipz_qp_handle.handle, /* r6 */ 0, 0, 0, 0, 0, 0); if (ret == H_HARDWARE) - ehca_gen_err("HCA not operational. ret=%lx", ret); + ehca_gen_err("HCA not operational. ret=%li", ret); ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, adapter_handle.handle, /* r4 */ @@ -590,7 +590,7 @@ u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle, 0, 0, 0, 0, 0); if (ret == H_RESOURCE) - ehca_gen_err("Resource still in use. ret=%lx", ret); + ehca_gen_err("Resource still in use. ret=%li", ret); return ret; } @@ -625,7 +625,7 @@ u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle, *bma_qp_nr = (u32)outs[1]; if (ret == H_ALIAS_EXIST) - ehca_gen_err("AQP1 already exists. ret=%lx", ret); + ehca_gen_err("AQP1 already exists. ret=%li", ret); return ret; } @@ -647,7 +647,7 @@ u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle, 0, 0); if (ret == H_NOT_ENOUGH_RESOURCES) - ehca_gen_err("Not enough resources. ret=%lx", ret); + ehca_gen_err("Not enough resources. ret=%li", ret); return ret; } @@ -686,7 +686,7 @@ u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle, 0, 0, 0, 0); if (ret == H_RESOURCE) - ehca_gen_err("H_FREE_RESOURCE failed ret=%lx ", ret); + ehca_gen_err("H_FREE_RESOURCE failed ret=%li ", ret); return ret; } @@ -708,7 +708,7 @@ u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle, 0, 0, 0, 0, 0); if (ret == H_RESOURCE) - ehca_gen_err("Resource in use. ret=%lx ", ret); + ehca_gen_err("Resource in use. ret=%li ", ret); return ret; } -- cgit v0.10.2 From 86dce445e01a50339f8f86c466c64a863e5fd18a Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 11 Sep 2007 15:32:50 +0200 Subject: IB/ehca: ehca_gen_warn() should always print Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h index 57c77a7..f9b264b 100644 --- a/drivers/infiniband/hw/ehca/ehca_tools.h +++ b/drivers/infiniband/hw/ehca/ehca_tools.h @@ -98,15 +98,12 @@ extern int ehca_debug_level; } while (0) #define ehca_gen_warn(format, arg...) \ - do { \ - if (unlikely(ehca_debug_level)) \ - printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, ## arg); \ - } while (0) + printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \ + get_paca()->paca_index, __FUNCTION__, ## arg) #define ehca_gen_err(format, arg...) \ printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, ## arg) + get_paca()->paca_index, __FUNCTION__, ## arg) /** * ehca_dmp - printk a memory block, whose length is n*8 bytes. -- cgit v0.10.2 From b708fba3c2942a175c3cb04a7bb4c89f907b497b Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 11 Sep 2007 15:33:40 +0200 Subject: IB/ehca: Add check for max #SGE to create_qp() Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 88d7dd9..f95403c 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -513,7 +513,7 @@ static struct ehca_qp *internal_create_qp( } else if (init_attr->cap.max_send_wr > 255) { ehca_err(pd->device, "Invalid Number of " - "ax_send_wr=%x for UD QP_TYPE=%x", + "max_send_wr=%x for UD QP_TYPE=%x", init_attr->cap.max_send_wr, qp_type); return ERR_PTR(-EINVAL); } @@ -524,6 +524,18 @@ static struct ehca_qp *internal_create_qp( return ERR_PTR(-EINVAL); break; } + } else { + int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI + || qp_type == IB_QPT_GSI) ? 250 : 252; + + if (init_attr->cap.max_send_sge > max_sge + || init_attr->cap.max_recv_sge > max_sge) { + ehca_err(pd->device, "Invalid number of SGEs requested " + "send_sge=%x recv_sge=%x max_sge=%x", + init_attr->cap.max_send_sge, + init_attr->cap.max_recv_sge, max_sge); + return ERR_PTR(-EINVAL); + } } if (pd->uobject && udata) -- cgit v0.10.2 From e90d0b3daede2bae2e78f8bf88c19182961cd19d Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 11 Sep 2007 15:34:04 +0200 Subject: IB/ehca: Path migration support Fix some modify_qp() issues related to path migration. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index a925ea5..7093986 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -294,8 +294,8 @@ static void parse_identifier(struct ehca_shca *shca, u64 eqe) case 0x11: /* unaffiliated access error */ ehca_err(&shca->ib_device, "Unaffiliated access error."); break; - case 0x12: /* path migrating error */ - ehca_err(&shca->ib_device, "Path migration error."); + case 0x12: /* path migrating */ + ehca_err(&shca->ib_device, "Path migrating."); break; case 0x13: /* interface trace stopped */ ehca_err(&shca->ib_device, "Interface trace stopped."); diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index f95403c..b10c7df 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -1165,6 +1165,13 @@ static int internal_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_PKEY_INDEX) { + if (attr->pkey_index >= 16) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid pkey_index=%x. " + "ehca_qp=%p qp_num=%x max_pkey_index=f", + attr->pkey_index, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } mqpcb->prim_p_key_idx = attr->pkey_index; update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1); } @@ -1273,50 +1280,78 @@ static int internal_modify_qp(struct ib_qp *ibqp, int ehca_mult = ib_rate_to_mult( shca->sport[my_qp->init_attr.port_num].rate); + if (attr->alt_port_num < 1 + || attr->alt_port_num > shca->num_ports) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid alt_port=%x. " + "ehca_qp=%p qp_num=%x num_ports=%x", + attr->alt_port_num, my_qp, ibqp->qp_num, + shca->num_ports); + goto modify_qp_exit2; + } + mqpcb->alt_phys_port = attr->alt_port_num; + + if (attr->alt_pkey_index >= 16) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. " + "ehca_qp=%p qp_num=%x max_pkey_index=f", + attr->pkey_index, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + mqpcb->alt_p_key_idx = attr->alt_pkey_index; + + mqpcb->timeout_al = attr->alt_timeout; mqpcb->dlid_al = attr->alt_ah_attr.dlid; - update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1); mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits; - update_mask |= - EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1); mqpcb->service_level_al = attr->alt_ah_attr.sl; - update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1); - if (ah_mult < ehca_mult) - mqpcb->max_static_rate = (ah_mult > 0) ? - ((ehca_mult - 1) / ah_mult) : 0; + if (ah_mult > 0 && ah_mult < ehca_mult) + mqpcb->max_static_rate_al = (ehca_mult - 1) / ah_mult; else mqpcb->max_static_rate_al = 0; - update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1); + /* OpenIB doesn't support alternate retry counts - copy them */ + mqpcb->retry_count_al = mqpcb->retry_count; + mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count; + + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1) + | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1) + | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1); + + /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1); /* * only if GRH is TRUE we might consider SOURCE_GID_IDX * and DEST_GID otherwise phype will return H_ATTR_PARM!!! */ if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) { - mqpcb->send_grh_flag_al = 1 << 31; - update_mask |= - EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1); - mqpcb->source_gid_idx_al = - attr->alt_ah_attr.grh.sgid_index; - update_mask |= - EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1); + mqpcb->send_grh_flag_al = 1; for (cnt = 0; cnt < 16; cnt++) mqpcb->dest_gid_al.byte[cnt] = attr->alt_ah_attr.grh.dgid.raw[cnt]; - - update_mask |= - EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1); + mqpcb->source_gid_idx_al = + attr->alt_ah_attr.grh.sgid_index; mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label; - update_mask |= - EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1); mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit; - update_mask |= - EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1); mqpcb->traffic_class_al = attr->alt_ah_attr.grh.traffic_class; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) | EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1); } } @@ -1338,7 +1373,14 @@ static int internal_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_PATH_MIG_STATE) { - mqpcb->path_migration_state = attr->path_mig_state; + if (attr->path_mig_state != IB_MIG_REARM + && attr->path_mig_state != IB_MIG_MIGRATED) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid mig_state=%x", + attr->path_mig_state); + goto modify_qp_exit2; + } + mqpcb->path_migration_state = attr->path_mig_state + 1; update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1); } @@ -1506,7 +1548,7 @@ int ehca_query_qp(struct ib_qp *qp, qp_attr->qkey = qpcb->qkey; qp_attr->path_mtu = qpcb->path_mtu; - qp_attr->path_mig_state = qpcb->path_migration_state; + qp_attr->path_mig_state = qpcb->path_migration_state - 1; qp_attr->rq_psn = qpcb->receive_psn; qp_attr->sq_psn = qpcb->send_psn; qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field; -- cgit v0.10.2 From 0b5de96858e516311f2d3ca45073c2afd2eb5d94 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 11 Sep 2007 15:34:35 +0200 Subject: IB/ehca: Serialize MR alloc and MR free hvCalls Some firmware levels exhibit a race condition between H_ALLOC_RESOURCE(MR) and H_FREE_RESOURCE(MR). Work around this problem by locking these hvCalls against each other. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c index 3d68f65..c16a213 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.c +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -120,15 +120,28 @@ static long ehca_plpar_hcall_norets(unsigned long opcode, unsigned long arg7) { long ret; - int i, sleep_msecs; + int i, sleep_msecs, do_lock; + unsigned long flags; ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT, opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7); + /* lock H_FREE_RESOURCE(MR) against itself and H_ALLOC_RESOURCE(MR) */ + if ((opcode == H_FREE_RESOURCE) && (arg7 == 5)) { + arg7 = 0; /* better not upset firmware */ + do_lock = 1; + } + for (i = 0; i < 5; i++) { + if (do_lock) + spin_lock_irqsave(&hcall_lock, flags); + ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7); + if (do_lock) + spin_unlock_irqrestore(&hcall_lock, flags); + if (H_IS_LONG_BUSY(ret)) { sleep_msecs = get_longbusy_msecs(ret); msleep_interruptible(sleep_msecs); @@ -161,23 +174,24 @@ static long ehca_plpar_hcall9(unsigned long opcode, unsigned long arg9) { long ret; - int i, sleep_msecs, lock_is_set = 0; + int i, sleep_msecs, do_lock; unsigned long flags = 0; ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); + /* lock H_ALLOC_RESOURCE(MR) against itself and H_FREE_RESOURCE(MR) */ + do_lock = ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5)); + for (i = 0; i < 5; i++) { - if ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5)) { + if (do_lock) spin_lock_irqsave(&hcall_lock, flags); - lock_is_set = 1; - } ret = plpar_hcall9(opcode, outs, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); - if (lock_is_set) + if (do_lock) spin_unlock_irqrestore(&hcall_lock, flags); if (H_IS_LONG_BUSY(ret)) { @@ -807,7 +821,7 @@ u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle, return ehca_plpar_hcall_norets(H_FREE_RESOURCE, adapter_handle.handle, /* r4 */ mr->ipz_mr_handle.handle, /* r5 */ - 0, 0, 0, 0, 0); + 0, 0, 0, 0, 5); } u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle, -- cgit v0.10.2 From 5110e4de4995db1c28457b08ed1e291f9b38f2e7 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Wed, 12 Sep 2007 16:44:11 +0200 Subject: IB/ehca: Replace get_paca()->paca_index by the more portable raw_smp_processor_id() We can use raw_smp_processor_id() here because the processor ID is only used for debug output and therefore our use is preemption-unsafe. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h index f9b264b..4a8346a 100644 --- a/drivers/infiniband/hw/ehca/ehca_tools.h +++ b/drivers/infiniband/hw/ehca/ehca_tools.h @@ -73,37 +73,37 @@ extern int ehca_debug_level; if (unlikely(ehca_debug_level)) \ dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \ "PU%04x EHCA_DBG:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, \ + raw_smp_processor_id(), __FUNCTION__, \ ## arg); \ } while (0) #define ehca_info(ib_dev, format, arg...) \ dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, ## arg) + raw_smp_processor_id(), __FUNCTION__, ## arg) #define ehca_warn(ib_dev, format, arg...) \ dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, ## arg) + raw_smp_processor_id(), __FUNCTION__, ## arg) #define ehca_err(ib_dev, format, arg...) \ dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, ## arg) + raw_smp_processor_id(), __FUNCTION__, ## arg) /* use this one only if no ib_dev available */ #define ehca_gen_dbg(format, arg...) \ do { \ if (unlikely(ehca_debug_level)) \ printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, ## arg); \ + raw_smp_processor_id(), __FUNCTION__, ## arg); \ } while (0) #define ehca_gen_warn(format, arg...) \ printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, ## arg) + raw_smp_processor_id(), __FUNCTION__, ## arg) #define ehca_gen_err(format, arg...) \ printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \ - get_paca()->paca_index, __FUNCTION__, ## arg) + raw_smp_processor_id(), __FUNCTION__, ## arg) /** * ehca_dmp - printk a memory block, whose length is n*8 bytes. -- cgit v0.10.2 From 39089e77741a53874eb8a29e4516bbafcc29298a Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 11 Sep 2007 15:35:32 +0200 Subject: IB/ehca: Bump version number and change its format Nobody needed the SVNEHCA_ prefix anyway. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 0e3ffee..403467f 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -49,10 +49,12 @@ #include "ehca_tools.h" #include "hcp_if.h" +#define HCAD_VERSION "0024" + MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Christoph Raisch "); MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver"); -MODULE_VERSION("SVNEHCA_0023"); +MODULE_VERSION(HCAD_VERSION); int ehca_open_aqp1 = 0; int ehca_debug_level = 0; @@ -909,7 +911,7 @@ int __init ehca_module_init(void) int ret; printk(KERN_INFO "eHCA Infiniband Device Driver " - "(Rel.: SVNEHCA_0023)\n"); + "(Version " HCAD_VERSION ")\n"); ret = ehca_create_comp_pool(); if (ret) { -- cgit v0.10.2 From 08c283ac262d7ab21c5733ff469ff88985381ca9 Mon Sep 17 00:00:00 2001 From: Hoang-Nam Nguyen Date: Thu, 13 Sep 2007 18:14:58 +0200 Subject: IB/ehca: Fix large page HW cap defines Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 0942a17..d670696 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -100,10 +100,10 @@ struct ehca_sport { struct ehca_sma_attr saved_attr; }; -#define HCA_CAP_MR_PGSIZE_4K 1 -#define HCA_CAP_MR_PGSIZE_64K 2 -#define HCA_CAP_MR_PGSIZE_1M 4 -#define HCA_CAP_MR_PGSIZE_16M 8 +#define HCA_CAP_MR_PGSIZE_4K 0x80000000 +#define HCA_CAP_MR_PGSIZE_64K 0x40000000 +#define HCA_CAP_MR_PGSIZE_1M 0x20000000 +#define HCA_CAP_MR_PGSIZE_16M 0x10000000 struct ehca_shca { struct ib_device ib_device; -- cgit v0.10.2 From 81668838c4583b19276b16382e0c61e21ef5adf0 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 2 Aug 2007 12:21:31 -0700 Subject: IPoIB: Specify Traffic Class with path record queries for QoS support To support QoS within and between subnets, modify IPoIB to request specific Traffic Class values with path record queries, using the value associated with the IPoIB broadcast group. Signed-off-by: Sean Hefty [ See some comments I made on this at v1 and v2 of the posts ] Reviewed-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 285c143..fc16bce 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -113,7 +113,27 @@ struct ipoib_pseudoheader { u8 hwaddr[INFINIBAND_ALEN]; }; -struct ipoib_mcast; +/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ +struct ipoib_mcast { + struct ib_sa_mcmember_rec mcmember; + struct ib_sa_multicast *mc; + struct ipoib_ah *ah; + + struct rb_node rb_node; + struct list_head list; + + unsigned long created; + unsigned long backoff; + + unsigned long flags; + unsigned char logcount; + + struct list_head neigh_list; + + struct sk_buff_head pkt_queue; + + struct net_device *dev; +}; struct ipoib_rx_buf { struct sk_buff *skb; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 5f948b9..900335a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -468,9 +468,10 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) INIT_LIST_HEAD(&path->neigh_list); memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); - path->pathrec.sgid = priv->local_gid; - path->pathrec.pkey = cpu_to_be16(priv->pkey); - path->pathrec.numb_path = 1; + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } @@ -491,6 +492,7 @@ static int path_rec_start(struct net_device *dev, IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index aae3670..94a5709 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -57,28 +57,6 @@ MODULE_PARM_DESC(mcast_debug_level, static DEFINE_MUTEX(mcast_mutex); -/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ -struct ipoib_mcast { - struct ib_sa_mcmember_rec mcmember; - struct ib_sa_multicast *mc; - struct ipoib_ah *ah; - - struct rb_node rb_node; - struct list_head list; - - unsigned long created; - unsigned long backoff; - - unsigned long flags; - unsigned char logcount; - - struct list_head neigh_list; - - struct sk_buff_head pkt_queue; - - struct net_device *dev; -}; - struct ipoib_mcast_iter { struct net_device *dev; union ib_gid mgid; -- cgit v0.10.2 From 733d65fe33b3002a6f2694c0fd8bd760dc13141f Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 8 Aug 2007 15:41:28 -0700 Subject: IB/sa: Add new QoS fields to path record The QoS annex defines new fields for path records. Add them to the ib_sa for consumers that want to use them. Signed-off-by: Sean Hefty Reviewed-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 312c8ff..cf474ec 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -123,14 +123,10 @@ static u32 tid; .field_name = "sa_path_rec:" #field static const struct ib_field path_rec_table[] = { - { RESERVED, + { PATH_REC_FIELD(service_id), .offset_words = 0, .offset_bits = 0, - .size_bits = 32 }, - { RESERVED, - .offset_words = 1, - .offset_bits = 0, - .size_bits = 32 }, + .size_bits = 64 }, { PATH_REC_FIELD(dgid), .offset_words = 2, .offset_bits = 0, @@ -179,7 +175,7 @@ static const struct ib_field path_rec_table[] = { .offset_words = 12, .offset_bits = 16, .size_bits = 16 }, - { RESERVED, + { PATH_REC_FIELD(qos_class), .offset_words = 13, .offset_bits = 0, .size_bits = 12 }, diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 5e26b2f..942692b 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -109,8 +109,8 @@ enum ib_sa_selector { * Reserved rows are indicated with comments to help maintainability. */ -/* reserved: 0 */ -/* reserved: 1 */ +#define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\ + IB_SA_COMP_MASK( 1)) #define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2) #define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3) #define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4) @@ -123,7 +123,7 @@ enum ib_sa_selector { #define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11) #define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12) #define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13) -/* reserved: 14 */ +#define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14) #define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15) #define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16) #define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17) @@ -134,8 +134,7 @@ enum ib_sa_selector { #define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22) struct ib_sa_path_rec { - /* reserved */ - /* reserved */ + __be64 service_id; union ib_gid dgid; union ib_gid sgid; __be16 dlid; @@ -148,7 +147,7 @@ struct ib_sa_path_rec { int reversible; u8 numb_path; __be16 pkey; - /* reserved */ + __be16 qos_class; u8 sl; u8 mtu_selector; u8 mtu; -- cgit v0.10.2 From a81c994d5eef87ed77cb30d8343d6be296528b3f Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 8 Aug 2007 15:51:06 -0700 Subject: RDMA/cma: Add ability to specify type of service Provide support to specify a type of service for a communication identifier. A new function call is used when dealing with IPv4 addresses. For IPv6 addresses, the ToS is specified through the traffic class field in the sockaddr_in6 structure. Signed-off-by: Sean Hefty [ The comments Eitan Zahavi and myself have made over the v1 post at were fully addressed. ] Reviewed-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9ffb998..19c9172 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -138,6 +138,7 @@ struct rdma_id_private { u32 qkey; u32 qp_num; u8 srq; + u8 tos; }; struct cma_multicast { @@ -1474,6 +1475,15 @@ err: } EXPORT_SYMBOL(rdma_listen); +void rdma_set_service_type(struct rdma_cm_id *id, int tos) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->tos = (u8) tos; +} +EXPORT_SYMBOL(rdma_set_service_type); + static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { @@ -1498,23 +1508,37 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { - struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr; + struct rdma_addr *addr = &id_priv->id.route.addr; struct ib_sa_path_rec path_rec; + ib_sa_comp_mask comp_mask; + struct sockaddr_in6 *sin6; memset(&path_rec, 0, sizeof path_rec); - ib_addr_get_sgid(addr, &path_rec.sgid); - ib_addr_get_dgid(addr, &path_rec.dgid); - path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr)); + ib_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); + ib_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; + path_rec.service_id = cma_get_service_id(id_priv->id.ps, &addr->dst_addr); + + comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; + + if (addr->src_addr.sa_family == AF_INET) { + path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); + comp_mask |= IB_SA_PATH_REC_QOS_CLASS; + } else { + sin6 = (struct sockaddr_in6 *) &addr->src_addr; + path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, - id_priv->id.port_num, &path_rec, - IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | - IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | - IB_SA_PATH_REC_REVERSIBLE, - timeout_ms, GFP_KERNEL, - cma_query_handler, work, &id_priv->query); + id_priv->id.port_num, &path_rec, + comp_mask, timeout_ms, + GFP_KERNEL, cma_query_handler, + work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 2d6a770..010f876 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -314,4 +314,18 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, */ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); +/** + * rdma_set_service_type - Set the type of service associated with a + * connection identifier. + * @id: Communication identifier to associated with service type. + * @tos: Type of service. + * + * The type of service is interpretted as a differentiated service + * field (RFC 2474). The service type should be specified before + * performing route resolution, as existing communication on the + * connection identifier may be unaffected. The type of service + * requested may not be supported by the network to all destinations. + */ +void rdma_set_service_type(struct rdma_cm_id *id, int tos); + #endif /* RDMA_CM_H */ -- cgit v0.10.2 From 7ce86409adcd2fda652b628173a66e905950ece1 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 8 Aug 2007 15:51:13 -0700 Subject: RDMA/ucma: Allow user space to set service type Export the ability to set the type of service to user space. Model the interface after setsockopt. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 53b4c94..90d675a 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -792,6 +792,78 @@ out: return ret; } +static int ucma_set_option_id(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret = 0; + + switch (optname) { + case RDMA_OPTION_ID_TOS: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static int ucma_set_option_level(struct ucma_context *ctx, int level, + int optname, void *optval, size_t optlen) +{ + int ret; + + switch (level) { + case RDMA_OPTION_ID: + ret = ucma_set_option_id(ctx, optname, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_set_option cmd; + struct ucma_context *ctx; + void *optval; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + optval = kmalloc(cmd.optlen, GFP_KERNEL); + if (!optval) { + ret = -ENOMEM; + goto out1; + } + + if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval, + cmd.optlen)) { + ret = -EFAULT; + goto out2; + } + + ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, + cmd.optlen); +out2: + kfree(optval); +out1: + ucma_put_ctx(ctx); + return ret; +} + static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { @@ -936,7 +1008,7 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, [RDMA_USER_CM_CMD_GET_OPTION] = NULL, - [RDMA_USER_CM_CMD_SET_OPTION] = NULL, + [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h index f632b0c..9749c1b 100644 --- a/include/rdma/rdma_user_cm.h +++ b/include/rdma/rdma_user_cm.h @@ -212,4 +212,22 @@ struct rdma_ucm_event_resp { } param; }; +/* Option levels */ +enum { + RDMA_OPTION_ID = 0 +}; + +/* Option details */ +enum { + RDMA_OPTION_ID_TOS = 0 +}; + +struct rdma_ucm_set_option { + __u64 optval; + __u32 id; + __u32 level; + __u32 optname; + __u32 optlen; +}; + #endif /* RDMA_USER_CM_H */ -- cgit v0.10.2 From 247e020ee5e2a7bf46f2d7a3d4490a670a712a40 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 8 Aug 2007 15:51:18 -0700 Subject: IB/srp: Add QoS support through service ID Provide the target service ID when performing a path record query to support optional QoS capability. QoS requires support from the SA. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index f6a0514..9ccc638 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -285,6 +285,7 @@ static int srp_lookup_path(struct srp_target_port *target) target->srp_host->dev->dev, target->srp_host->port, &target->path, + IB_SA_PATH_REC_SERVICE_ID | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | @@ -1692,6 +1693,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) goto out; } target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16)); + target->path.service_id = target->service_id; kfree(p); break; -- cgit v0.10.2 From c8d8beea0383e47c9d65d45f0ca95626ec435fcd Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Thu, 13 Sep 2007 18:15:28 +0200 Subject: IB/umem: Add hugetlb flag to struct ib_umem During ib_umem_get(), determine whether all pages from the memory region are hugetlb pages and report this in the "hugetlb" member. Low-level drivers can use this information if they need it. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 664d2faa..2f54e29 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "uverbs.h" @@ -75,6 +76,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, { struct ib_umem *umem; struct page **page_list; + struct vm_area_struct **vma_list; struct ib_umem_chunk *chunk; unsigned long locked; unsigned long lock_limit; @@ -104,6 +106,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, */ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); + /* We assume the memory is from hugetlb until proved otherwise */ + umem->hugetlb = 1; + INIT_LIST_HEAD(&umem->chunk_list); page_list = (struct page **) __get_free_page(GFP_KERNEL); @@ -112,6 +117,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, return ERR_PTR(-ENOMEM); } + /* + * if we can't alloc the vma_list, it's not so bad; + * just assume the memory is not hugetlb memory + */ + vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); + if (!vma_list) + umem->hugetlb = 0; + npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; down_write(¤t->mm->mmap_sem); @@ -131,7 +144,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, ret = get_user_pages(current, current->mm, cur_base, min_t(int, npages, PAGE_SIZE / sizeof (struct page *)), - 1, !umem->writable, page_list, NULL); + 1, !umem->writable, page_list, vma_list); if (ret < 0) goto out; @@ -152,6 +165,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK); for (i = 0; i < chunk->nents; ++i) { + if (vma_list && + !is_vm_hugetlb_page(vma_list[i + off])) + umem->hugetlb = 0; chunk->page_list[i].page = page_list[i + off]; chunk->page_list[i].offset = 0; chunk->page_list[i].length = PAGE_SIZE; @@ -186,6 +202,8 @@ out: current->mm->locked_vm = locked; up_write(¤t->mm->mmap_sem); + if (vma_list) + free_page((unsigned long) vma_list); free_page((unsigned long) page_list); return ret < 0 ? ERR_PTR(ret) : umem; diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index c533d6c..2229842 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -45,6 +45,7 @@ struct ib_umem { int offset; int page_size; int writable; + int hugetlb; struct list_head chunk_list; struct work_struct work; struct mm_struct *mm; -- cgit v0.10.2 From 3a31c41901b6bd3937ec36e0e4a930849e270df6 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Thu, 13 Sep 2007 18:16:20 +0200 Subject: IB/ehca: Only use MR large pages for hugetlb regions ...because, on virtualized hardware like System p, we can't be sure that the physical pages behind them are contiguous otherwise. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index e925e84..da88738 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -51,6 +51,7 @@ #define NUM_CHUNKS(length, chunk_size) \ (((length) + (chunk_size - 1)) / (chunk_size)) + /* max number of rpages (per hcall register_rpages) */ #define MAX_RPAGES 512 @@ -64,6 +65,11 @@ enum ehca_mr_pgsize { EHCA_MR_PGSIZE16M = 0x1000000L }; +#define EHCA_MR_PGSHIFT4K 12 +#define EHCA_MR_PGSHIFT64K 16 +#define EHCA_MR_PGSHIFT1M 20 +#define EHCA_MR_PGSHIFT16M 24 + static u32 ehca_encode_hwpage_size(u32 pgsize) { u32 idx = 0; @@ -347,17 +353,16 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, /* select proper hw_pgsize */ if (ehca_mr_largepage && (shca->hca_cap_mr_pgsize & HCA_CAP_MR_PGSIZE_16M)) { - if (length <= EHCA_MR_PGSIZE4K - && PAGE_SIZE == EHCA_MR_PGSIZE4K) - hwpage_size = EHCA_MR_PGSIZE4K; - else if (length <= EHCA_MR_PGSIZE64K) - hwpage_size = EHCA_MR_PGSIZE64K; - else if (length <= EHCA_MR_PGSIZE1M) - hwpage_size = EHCA_MR_PGSIZE1M; - else - hwpage_size = EHCA_MR_PGSIZE16M; + int page_shift = PAGE_SHIFT; + if (e_mr->umem->hugetlb) { + /* determine page_shift, clamp between 4K and 16M */ + page_shift = (fls64(length - 1) + 3) & ~3; + page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K), + EHCA_MR_PGSHIFT16M); + } + hwpage_size = 1UL << page_shift; } else - hwpage_size = EHCA_MR_PGSIZE4K; + hwpage_size = EHCA_MR_PGSIZE4K; /* ehca1 only supports 4k */ ehca_dbg(pd->device, "hwpage_size=%lx", hwpage_size); reg_user_mr_fallback: -- cgit v0.10.2 From 935ef2d7a2910ca810fce57511f4d06a0cf00b5e Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 12 Sep 2007 05:00:25 -0500 Subject: RDMA/cma: Use neigh_event_send() to start neighbour discovery Calling arp_send() to initiate neighbour discovery (ND) doesn't do the full ND protocol. Namely, it doesn't handle retransmitting the arp request if it is dropped. The function neigh_event_send() does all this. Without doing full ND, RDMA address resolution fails in the presence of dropped ARP broadcast packets. Signed-off-by: Steve Wise Acked-by: Sean Hefty Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c5c33d3..5381c80 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -161,8 +161,7 @@ static void addr_send_arp(struct sockaddr_in *dst_in) if (ip_route_output_key(&rt, &fl)) return; - arp_send(ARPOP_REQUEST, ETH_P_ARP, rt->rt_gateway, rt->idev->dev, - rt->rt_src, NULL, rt->idev->dev->dev_addr, NULL); + neigh_event_send(rt->u.dst.neighbour, NULL); ip_rt_put(rt); } -- cgit v0.10.2 From ec2a1344ad348a789b1d9d9b32cccbef33161574 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:13 -0700 Subject: IB/iser: Remove unnecessary includes is not needed because everyplace it appears, also appears. is not needed because nothing seems to be using device IO anyway. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 9ea5b9a..a6f2303 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -34,8 +34,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 36cdf77..e05690e 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -36,8 +36,6 @@ #include #include #include -#include -#include #include #include "iscsi_iser.h" diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index d42ec01..654a4dc 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -32,7 +32,6 @@ * * $Id: iser_verbs.c 7051 2006-05-10 12:29:11Z ogerlitz $ */ -#include #include #include #include -- cgit v0.10.2 From ea98054fefa28b646c740baffe83c2962f8228f6 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:13 -0700 Subject: mlx4_core: Change capability decoding: SRC->XRC The SRC ("scalable RC") transport has been renamed to XRC ("extended RC"), to avoid having an abbreviation that is so easily confused with an abbreviation for "source." Update the HCA capability decoding output to use the new name. Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index c45cbe4..6471d33 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -76,7 +76,7 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags) [ 0] = "RC transport", [ 1] = "UC transport", [ 2] = "UD transport", - [ 3] = "SRC transport", + [ 3] = "XRC transport", [ 4] = "reliable multicast", [ 5] = "FCoIB support", [ 6] = "SRQ support", -- cgit v0.10.2 From 9faa559c01311281f26544291322252327b65922 Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Thu, 23 Aug 2007 04:58:30 +0530 Subject: IB/ehca: Misc cpuinit section annotations and #ifdef cleanups * Replace {un}register_cpu_notifier with {un}register_hotcpu_notifier thereby losing a couple of #ifdef HOTPLUG_CPU pairs. * Move comp_pool_callback_nb declaration to below that of callback function so that initialization of .notifier_call and .priority can occur at build time itself and not runtime. * Mark the notifier_block (and callback function, and another static function used by it) as __cpuinit{data} for the sake of consistency and remove enclosing #ifdef. (This may increase size for modular build of this module, however, because these are no longer dropped unconditionally now.) Signed-off-by: Satyam Sharma Acked-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index 7093986..3f617b2 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -69,9 +69,6 @@ static void queue_comp_task(struct ehca_cq *__cq); static struct ehca_comp_pool *pool; -#ifdef CONFIG_HOTPLUG_CPU -static struct notifier_block comp_pool_callback_nb; -#endif static inline void comp_event_callback(struct ehca_cq *cq) { @@ -760,9 +757,7 @@ static void destroy_comp_task(struct ehca_comp_pool *pool, kthread_stop(task); } -#ifdef CONFIG_HOTPLUG_CPU -static void take_over_work(struct ehca_comp_pool *pool, - int cpu) +static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu) { struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); LIST_HEAD(list); @@ -785,9 +780,9 @@ static void take_over_work(struct ehca_comp_pool *pool, } -static int comp_pool_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int __cpuinit comp_pool_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct ehca_cpu_comp_task *cct; @@ -833,7 +828,11 @@ static int comp_pool_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#endif + +static struct notifier_block comp_pool_callback_nb __cpuinitdata = { + .notifier_call = comp_pool_callback, + .priority = 0, +}; int ehca_create_comp_pool(void) { @@ -864,11 +863,7 @@ int ehca_create_comp_pool(void) } } -#ifdef CONFIG_HOTPLUG_CPU - comp_pool_callback_nb.notifier_call = comp_pool_callback; - comp_pool_callback_nb.priority = 0; - register_cpu_notifier(&comp_pool_callback_nb); -#endif + register_hotcpu_notifier(&comp_pool_callback_nb); printk(KERN_INFO "eHCA scaling code enabled\n"); @@ -882,9 +877,7 @@ void ehca_destroy_comp_pool(void) if (!ehca_scaling_code) return; -#ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&comp_pool_callback_nb); -#endif + unregister_hotcpu_notifier(&comp_pool_callback_nb); for (i = 0; i < NR_CPUS; i++) { if (cpu_online(i)) -- cgit v0.10.2 From 57cb61d587e990d556385d367589ff61f6c2c0f2 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 20 Sep 2007 16:33:44 -0700 Subject: IB/core: Fix handling of multicast response failures I was looking at the code for multicast.c and noticed that ib_sa_join_multicast() calls queue_join() which puts the request at the front of the group->pending_list. If this is a second request, it seems like it would interfere with process_join_error() since group->last_join won't point to the member at the head of the pending_list. The sequence would thus be: 1. ib_sa_join_multicast() puts member1 on head of pending_list and starts work thread 2. mcast_work_handler() calls send_join() which sets group->last_join to member1 3. ib_sa_join_multicast() puts member2 on head of pending_list 4. join operation for member1 receives failures response from SA. 5. join_handler() is called with error status 6. process_join_error() fails to process member1 since it doesn't match the first entry in the group->pending_list. The impact is that the failed join request is tossed. The second request is processed, and after it completes, the original request ends up being retried. This change also results in join requests being processed in FIFO order. Signed-off-by: Ralph Campbell Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 15b4c4d..1bc1fe6 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -196,7 +196,7 @@ static void queue_join(struct mcast_member *member) unsigned long flags; spin_lock_irqsave(&group->lock, flags); - list_add(&member->list, &group->pending_list); + list_add_tail(&member->list, &group->pending_list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; atomic_inc(&group->refcount); -- cgit v0.10.2 From cd9281d873c91a01af0cb96ff0f75e9905e54403 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 18 Sep 2007 09:14:18 +0200 Subject: IB/mlx4: Display misc device information under /sys/class/infiniband/ display the following device information under /sys/class/infiniband/mlx4_X: board_id, fw_ver, hw_rev, hca_type. This patch makes this information available to userspace utilities such as ibstat and ibv_devinfo. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index dde8fe9..d9fc822 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -476,9 +476,48 @@ out: return err; } +static ssize_t show_hca(struct class_device *cdev, char *buf) +{ + struct mlx4_ib_dev *dev = container_of(cdev, struct mlx4_ib_dev, ib_dev.class_dev); + return sprintf(buf, "MT%d\n", dev->dev->pdev->device); +} + +static ssize_t show_fw_ver(struct class_device *cdev, char *buf) +{ + struct mlx4_ib_dev *dev = container_of(cdev, struct mlx4_ib_dev, ib_dev.class_dev); + return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), + (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->dev->caps.fw_ver & 0xffff); +} + +static ssize_t show_rev(struct class_device *cdev, char *buf) +{ + struct mlx4_ib_dev *dev = container_of(cdev, struct mlx4_ib_dev, ib_dev.class_dev); + return sprintf(buf, "%x\n", dev->dev->rev_id); +} + +static ssize_t show_board(struct class_device *cdev, char *buf) +{ + struct mlx4_ib_dev *dev = container_of(cdev, struct mlx4_ib_dev, ib_dev.class_dev); + return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); +} + +static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct class_device_attribute *mlx4_class_attributes[] = { + &class_device_attr_hw_rev, + &class_device_attr_fw_ver, + &class_device_attr_hca_type, + &class_device_attr_board_id +}; + static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; + int i; ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { @@ -580,6 +619,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_mad_init(ibdev)) goto err_reg; + for (i = 0; i < ARRAY_SIZE(mlx4_class_attributes); ++i) { + if (class_device_create_file(&ibdev->ib_dev.class_dev, + mlx4_class_attributes[i])) + goto err_reg; + } + return ibdev; err_reg: diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 4b12694..9e590e1 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -524,8 +524,8 @@ static int __devinit mlx4_init_hca(struct mlx4_dev *dev) } priv->eq_table.inta_pin = adapter.inta_pin; - priv->rev_id = adapter.revision_id; - memcpy(priv->board_id, adapter.board_id, sizeof priv->board_id); + dev->rev_id = adapter.revision_id; + memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id); return 0; diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index be304a7..b9f8397 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -56,10 +56,6 @@ enum { }; enum { - MLX4_BOARD_ID_LEN = 64 -}; - -enum { MLX4_MGM_ENTRY_SIZE = 0x40, MLX4_QP_PER_MGM = 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2), MLX4_MTT_ENTRY_PER_SEG = 8 @@ -277,9 +273,6 @@ struct mlx4_priv { struct mlx4_uar driver_uar; void __iomem *kar; - - u32 rev_id; - char board_id[MLX4_BOARD_ID_LEN]; }; static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index cfb78fb..a93520c 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -49,6 +49,10 @@ enum { }; enum { + MLX4_BOARD_ID_LEN = 64 +}; + +enum { MLX4_DEV_CAP_FLAG_RC = 1 << 0, MLX4_DEV_CAP_FLAG_UC = 1 << 1, MLX4_DEV_CAP_FLAG_UD = 1 << 2, @@ -272,6 +276,8 @@ struct mlx4_dev { unsigned long flags; struct mlx4_caps caps; struct radix_tree_root qp_table_tree; + u32 rev_id; + char board_id[MLX4_BOARD_ID_LEN]; }; struct mlx4_init_port_param { -- cgit v0.10.2 From 03f72a51cb1a0ba530e3308e3de84399a75b41ec Mon Sep 17 00:00:00 2001 From: Hoang-Nam Nguyen Date: Fri, 28 Sep 2007 17:16:27 +0200 Subject: IB/ehca: Fix mem leak of firmware ctrlblock in ehca_create_srq() Signed-off-by: Hoang-Nam Nguyen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index b10c7df..2591651 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -890,6 +890,8 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd, goto create_srq2; } + ehca_free_fw_ctrlblock(mqpcb); + return &my_qp->ib_srq; create_srq2: -- cgit v0.10.2 From a66072237500f31cec19fa688210150de9c9f957 Mon Sep 17 00:00:00 2001 From: Hoang-Nam Nguyen Date: Fri, 28 Sep 2007 17:18:47 +0200 Subject: IB/ehca: Adjust 64-bit alignment of create QP response for userspace Signed-off-by: Hoang-Nam Nguyen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index d670696..0f7a55d 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -351,6 +351,7 @@ struct ehca_create_qp_resp { /* qp_num assigned by ehca: sqp0/1 may have got different numbers */ u32 real_qp_num; u32 fw_handle_ofs; + u32 dummy; struct ipzu_queue_resp ipz_squeue; struct ipzu_queue_resp ipz_rqueue; }; -- cgit v0.10.2 From c01759cee91379cc3cb551bfd7c76f1b51f91ca2 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Fri, 28 Sep 2007 17:20:05 +0200 Subject: IB/ehca: Return srq_attr->max_sge in ehca_query_srq() Totally forgot this. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 2591651..e2bd62b 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -1753,6 +1753,7 @@ int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) } srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1; + srq_attr->max_sge = qpcb->actual_nr_sges_in_rq_wqe; srq_attr->srq_limit = EHCA_BMASK_GET( MQPCB_CURR_SRQ_LIMIT, qpcb->curr_srq_limit); -- cgit v0.10.2 From 2be8e3ee8efd6f99ce454115c29d09750915021a Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:15 -0700 Subject: IB/umad: Add P_Key index support Add support for setting the P_Key index of sent MADs and getting the P_Key index of received MADs. This requires a change to the layout of the ABI structure struct ib_user_mad_hdr, so to avoid breaking compatibility, we default to the old (unchanged) ABI and add a new ioctl IB_USER_MAD_ENABLE_PKEY that allows applications that are aware of the new ABI to opt into using it. We plan on switching to the new ABI by default in a year or so, and this patch adds a warning that is printed when an application uses the old ABI, to push people towards converting to the new ABI. Signed-off-by: Roland Dreier Reviewed-by: Sean Hefty Reviewed-by: Hal Rosenstock diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.txt index 8ec54b9..744687d 100644 --- a/Documentation/infiniband/user_mad.txt +++ b/Documentation/infiniband/user_mad.txt @@ -99,6 +99,20 @@ Transaction IDs request/response pairs. The upper 32 bits are reserved for use by the kernel and will be overwritten before a MAD is sent. +P_Key Index Handling + + The old ib_umad interface did not allow setting the P_Key index for + MADs that are sent and did not provide a way for obtaining the P_Key + index of received MADs. A new layout for struct ib_user_mad_hdr + with a pkey_index member has been defined; however, to preserve + binary compatibility with older applications, this new layout will + not be used unless the IB_USER_MAD_ENABLE_PKEY ioctl is called + before a file descriptor is used for anything else. + + In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented + to 6, the new layout of struct ib_user_mad_hdr will be used by + default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed. + Setting IsSM Capability Bit To set the IsSM capability bit for a port, simply open the diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index d97ded2..aee2913 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -118,6 +118,8 @@ struct ib_umad_file { wait_queue_head_t recv_wait; struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; int agents_dead; + u8 use_pkey_index; + u8 already_used; }; struct ib_umad_packet { @@ -147,6 +149,12 @@ static void ib_umad_release_dev(struct kref *ref) kfree(dev); } +static int hdr_size(struct ib_umad_file *file) +{ + return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : + sizeof (struct ib_user_mad_hdr_old); +} + /* caller must hold port->mutex at least for reading */ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) { @@ -221,13 +229,13 @@ static void recv_handler(struct ib_mad_agent *agent, packet->length = mad_recv_wc->mad_len; packet->recv_wc = mad_recv_wc; - packet->mad.hdr.status = 0; - packet->mad.hdr.length = sizeof (struct ib_user_mad) + - mad_recv_wc->mad_len; - packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); - packet->mad.hdr.sl = mad_recv_wc->wc->sl; - packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; + packet->mad.hdr.status = 0; + packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; + packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); + packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.sl = mad_recv_wc->wc->sl; + packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; + packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH); if (packet->mad.hdr.grh_present) { struct ib_ah_attr ah_attr; @@ -253,8 +261,8 @@ err1: ib_free_recv_mad(mad_recv_wc); } -static ssize_t copy_recv_mad(char __user *buf, struct ib_umad_packet *packet, - size_t count) +static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, + struct ib_umad_packet *packet, size_t count) { struct ib_mad_recv_buf *recv_buf; int left, seg_payload, offset, max_seg_payload; @@ -262,15 +270,15 @@ static ssize_t copy_recv_mad(char __user *buf, struct ib_umad_packet *packet, /* We need enough room to copy the first (or only) MAD segment. */ recv_buf = &packet->recv_wc->recv_buf; if ((packet->length <= sizeof (*recv_buf->mad) && - count < sizeof (packet->mad) + packet->length) || + count < hdr_size(file) + packet->length) || (packet->length > sizeof (*recv_buf->mad) && - count < sizeof (packet->mad) + sizeof (*recv_buf->mad))) + count < hdr_size(file) + sizeof (*recv_buf->mad))) return -EINVAL; - if (copy_to_user(buf, &packet->mad, sizeof (packet->mad))) + if (copy_to_user(buf, &packet->mad, hdr_size(file))) return -EFAULT; - buf += sizeof (packet->mad); + buf += hdr_size(file); seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad)); if (copy_to_user(buf, recv_buf->mad, seg_payload)) return -EFAULT; @@ -280,7 +288,7 @@ static ssize_t copy_recv_mad(char __user *buf, struct ib_umad_packet *packet, * Multipacket RMPP MAD message. Copy remainder of message. * Note that last segment may have a shorter payload. */ - if (count < sizeof (packet->mad) + packet->length) { + if (count < hdr_size(file) + packet->length) { /* * The buffer is too small, return the first RMPP segment, * which includes the RMPP message length. @@ -300,18 +308,23 @@ static ssize_t copy_recv_mad(char __user *buf, struct ib_umad_packet *packet, return -EFAULT; } } - return sizeof (packet->mad) + packet->length; + return hdr_size(file) + packet->length; } -static ssize_t copy_send_mad(char __user *buf, struct ib_umad_packet *packet, - size_t count) +static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf, + struct ib_umad_packet *packet, size_t count) { - ssize_t size = sizeof (packet->mad) + packet->length; + ssize_t size = hdr_size(file) + packet->length; if (count < size) return -EINVAL; - if (copy_to_user(buf, &packet->mad, size)) + if (copy_to_user(buf, &packet->mad, hdr_size(file))) + return -EFAULT; + + buf += hdr_size(file); + + if (copy_to_user(buf, packet->mad.data, packet->length)) return -EFAULT; return size; @@ -324,7 +337,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, struct ib_umad_packet *packet; ssize_t ret; - if (count < sizeof (struct ib_user_mad)) + if (count < hdr_size(file)) return -EINVAL; spin_lock_irq(&file->recv_lock); @@ -348,9 +361,9 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, spin_unlock_irq(&file->recv_lock); if (packet->recv_wc) - ret = copy_recv_mad(buf, packet, count); + ret = copy_recv_mad(file, buf, packet, count); else - ret = copy_send_mad(buf, packet, count); + ret = copy_send_mad(file, buf, packet, count); if (ret < 0) { /* Requeue packet */ @@ -442,15 +455,14 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; - if (count < sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR) + if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) return -EINVAL; packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); if (!packet) return -ENOMEM; - if (copy_from_user(&packet->mad, buf, - sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR)) { + if (copy_from_user(&packet->mad, buf, hdr_size(file))) { ret = -EFAULT; goto err; } @@ -461,6 +473,13 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err; } + buf += hdr_size(file); + + if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) { + ret = -EFAULT; + goto err; + } + down_read(&file->port->mutex); agent = __get_agent(file, packet->mad.hdr.id); @@ -500,11 +519,11 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, IB_MGMT_RMPP_FLAG_ACTIVE; } - data_len = count - sizeof (struct ib_user_mad) - hdr_len; + data_len = count - hdr_size(file) - hdr_len; packet->msg = ib_create_send_mad(agent, be32_to_cpu(packet->mad.hdr.qpn), - 0, rmpp_active, hdr_len, - data_len, GFP_KERNEL); + packet->mad.hdr.pkey_index, rmpp_active, + hdr_len, data_len, GFP_KERNEL); if (IS_ERR(packet->msg)) { ret = PTR_ERR(packet->msg); goto err_ah; @@ -517,7 +536,6 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, /* Copy MAD header. Any RMPP header is already in place. */ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR); - buf += sizeof (struct ib_user_mad); if (!rmpp_active) { if (copy_from_user(packet->msg->mad + copy_offset, @@ -646,6 +664,16 @@ found: goto out; } + if (!file->already_used) { + file->already_used = 1; + if (!file->use_pkey_index) { + printk(KERN_WARNING "user_mad: process %s did not enable " + "P_Key index support.\n", current->comm); + printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt " + "has info on the new ABI.\n"); + } + } + file->agent[agent_id] = agent; ret = 0; @@ -682,6 +710,20 @@ out: return ret; } +static long ib_umad_enable_pkey(struct ib_umad_file *file) +{ + int ret = 0; + + down_write(&file->port->mutex); + if (file->already_used) + ret = -EINVAL; + else + file->use_pkey_index = 1; + up_write(&file->port->mutex); + + return ret; +} + static long ib_umad_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -690,6 +732,8 @@ static long ib_umad_ioctl(struct file *filp, unsigned int cmd, return ib_umad_reg_agent(filp->private_data, arg); case IB_USER_MAD_UNREGISTER_AGENT: return ib_umad_unreg_agent(filp->private_data, arg); + case IB_USER_MAD_ENABLE_PKEY: + return ib_umad_enable_pkey(filp->private_data); default: return -ENOIOCTLCMD; } diff --git a/include/rdma/ib_user_mad.h b/include/rdma/ib_user_mad.h index d66b15e..2a32043 100644 --- a/include/rdma/ib_user_mad.h +++ b/include/rdma/ib_user_mad.h @@ -52,7 +52,50 @@ */ /** + * ib_user_mad_hdr_old - Old version of MAD packet header without pkey_index + * @id - ID of agent MAD received with/to be sent with + * @status - 0 on successful receive, ETIMEDOUT if no response + * received (transaction ID in data[] will be set to TID of original + * request) (ignored on send) + * @timeout_ms - Milliseconds to wait for response (unset on receive) + * @retries - Number of automatic retries to attempt + * @qpn - Remote QP number received from/to be sent to + * @qkey - Remote Q_Key to be sent with (unset on receive) + * @lid - Remote lid received from/to be sent to + * @sl - Service level received with/to be sent with + * @path_bits - Local path bits received with/to be sent with + * @grh_present - If set, GRH was received/should be sent + * @gid_index - Local GID index to send with (unset on receive) + * @hop_limit - Hop limit in GRH + * @traffic_class - Traffic class in GRH + * @gid - Remote GID in GRH + * @flow_label - Flow label in GRH + */ +struct ib_user_mad_hdr_old { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; +}; + +/** * ib_user_mad_hdr - MAD packet header + * This layout allows specifying/receiving the P_Key index. To use + * this capability, an application must call the + * IB_USER_MAD_ENABLE_PKEY ioctl on the user MAD file handle before + * any other actions with the file handle. * @id - ID of agent MAD received with/to be sent with * @status - 0 on successful receive, ETIMEDOUT if no response * received (transaction ID in data[] will be set to TID of original @@ -70,6 +113,7 @@ * @traffic_class - Traffic class in GRH * @gid - Remote GID in GRH * @flow_label - Flow label in GRH + * @pkey_index - P_Key index */ struct ib_user_mad_hdr { __u32 id; @@ -88,6 +132,8 @@ struct ib_user_mad_hdr { __u8 traffic_class; __u8 gid[16]; __be32 flow_label; + __u16 pkey_index; + __u8 reserved[6]; }; /** @@ -134,4 +180,6 @@ struct ib_user_mad_reg_req { #define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32) +#define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) + #endif /* IB_USER_MAD_H */ -- cgit v0.10.2 From a394f83bdfec10b09d8cb111e622556b2e6fd0de Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:15 -0700 Subject: IB/umad: Fix bit ordering and 32-on-64 problems on big endian systems The declaration of struct ib_user_mad_reg_req.method_mask[] exported to userspace was an array of __u32, but the kernel internally treated it as a bitmap made up of longs. This makes a difference for 64-bit big-endian kernels, where numbering the bits in an array of__u32 gives: |31.....0|63....31|95....64|127...96| while numbering the bits in an array of longs gives: |63..............0|127............64| 64-bit userspace can handle this by just treating method_mask[] as an array of longs, but 32-bit userspace is really stuck: the meaning of the bits in method_mask[] depends on whether the kernel is 32-bit or 64-bit, and there's no sane way for userspace to know that. Fix this by updating to make it clear that method_mask[] is an array of longs, and using a compat_ioctl method to convert to an array of 64-bit longs to handle the 32-on-64 problem. This fixes the interface description to match existing behavior (so working binaries continue to work) in almost all situations, and gives consistent semantics in the case of 32-bit userspace that can run on either a 32-bit or 64-bit kernel, so that the same binary can work for both 32-on-32 and 32-on-64 systems. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index aee2913..b53eac4 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -607,7 +608,8 @@ static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wa return mask; } -static int ib_umad_reg_agent(struct ib_umad_file *file, unsigned long arg) +static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, + int compat_method_mask) { struct ib_user_mad_reg_req ureq; struct ib_mad_reg_req req; @@ -622,7 +624,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, unsigned long arg) goto out; } - if (copy_from_user(&ureq, (void __user *) arg, sizeof ureq)) { + if (copy_from_user(&ureq, arg, sizeof ureq)) { ret = -EFAULT; goto out; } @@ -643,8 +645,18 @@ found: if (ureq.mgmt_class) { req.mgmt_class = ureq.mgmt_class; req.mgmt_class_version = ureq.mgmt_class_version; - memcpy(req.method_mask, ureq.method_mask, sizeof req.method_mask); - memcpy(req.oui, ureq.oui, sizeof req.oui); + memcpy(req.oui, ureq.oui, sizeof req.oui); + + if (compat_method_mask) { + u32 *umm = (u32 *) ureq.method_mask; + int i; + + for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i) + req.method_mask[i] = + umm[i * 2] | ((u64) umm[i * 2 + 1] << 32); + } else + memcpy(req.method_mask, ureq.method_mask, + sizeof req.method_mask); } agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, @@ -682,13 +694,13 @@ out: return ret; } -static int ib_umad_unreg_agent(struct ib_umad_file *file, unsigned long arg) +static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) { struct ib_mad_agent *agent = NULL; u32 id; int ret = 0; - if (get_user(id, (u32 __user *) arg)) + if (get_user(id, arg)) return -EFAULT; down_write(&file->port->mutex); @@ -729,9 +741,9 @@ static long ib_umad_ioctl(struct file *filp, unsigned int cmd, { switch (cmd) { case IB_USER_MAD_REGISTER_AGENT: - return ib_umad_reg_agent(filp->private_data, arg); + return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0); case IB_USER_MAD_UNREGISTER_AGENT: - return ib_umad_unreg_agent(filp->private_data, arg); + return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); default: @@ -739,6 +751,23 @@ static long ib_umad_ioctl(struct file *filp, unsigned int cmd, } } +#ifdef CONFIG_COMPAT +static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case IB_USER_MAD_REGISTER_AGENT: + return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1); + case IB_USER_MAD_UNREGISTER_AGENT: + return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); + case IB_USER_MAD_ENABLE_PKEY: + return ib_umad_enable_pkey(filp->private_data); + default: + return -ENOIOCTLCMD; + } +} +#endif + static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; @@ -826,7 +855,9 @@ static const struct file_operations umad_fops = { .write = ib_umad_write, .poll = ib_umad_poll, .unlocked_ioctl = ib_umad_ioctl, - .compat_ioctl = ib_umad_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ib_umad_compat_ioctl, +#endif .open = ib_umad_open, .release = ib_umad_close }; diff --git a/include/rdma/ib_user_mad.h b/include/rdma/ib_user_mad.h index 2a32043..29d2c72 100644 --- a/include/rdma/ib_user_mad.h +++ b/include/rdma/ib_user_mad.h @@ -147,6 +147,26 @@ struct ib_user_mad { __u64 data[0]; }; +/* + * Earlier versions of this interface definition declared the + * method_mask[] member as an array of __u32 but treated it as a + * bitmap made up of longs in the kernel. This ambiguity meant that + * 32-bit big-endian applications that can run on both 32-bit and + * 64-bit kernels had no consistent ABI to rely on, and 64-bit + * big-endian applications that treated method_mask as being made up + * of 32-bit words would have their bitmap misinterpreted. + * + * To clear up this confusion, we change the declaration of + * method_mask[] to use unsigned long and handle the conversion from + * 32-bit userspace to 64-bit kernel for big-endian systems in the + * compat_ioctl method. Unfortunately, to keep the structure layout + * the same, we need the method_mask[] array to be aligned only to 4 + * bytes even when long is 64 bits, which forces us into this ugly + * typedef. + */ +typedef unsigned long __attribute__((aligned(4))) packed_ulong; +#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long))) + /** * ib_user_mad_reg_req - MAD registration request * @id - Set by the kernel; used to identify agent in future requests. @@ -165,7 +185,7 @@ struct ib_user_mad { */ struct ib_user_mad_reg_req { __u32 id; - __u32 method_mask[4]; + packed_ulong method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK]; __u8 qpn; __u8 mgmt_class; __u8 mgmt_class_version; -- cgit v0.10.2 From 04d29b0ede242000b24cfc34cc78fbd164c47e1a Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:15 -0700 Subject: IB/uverbs: Make ib_uverbs_release_event_file() static ib_uverbs_release_event_file() is only used in uverbs_main.c, so make it static to that file. Also move the definition before the first use, so a forward declaration is not needed. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index c33546f..c75eb6c 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -148,7 +148,6 @@ void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, int is_async, int *fd); -void ib_uverbs_release_event_file(struct kref *ref); struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd); void ib_uverbs_release_ucq(struct ib_uverbs_file *file, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 14d7ccd..7c2ac39 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -125,6 +125,14 @@ static void ib_uverbs_release_dev(struct kref *ref) complete(&dev->comp); } +static void ib_uverbs_release_event_file(struct kref *ref) +{ + struct ib_uverbs_event_file *file = + container_of(ref, struct ib_uverbs_event_file, ref); + + kfree(file); +} + void ib_uverbs_release_ucq(struct ib_uverbs_file *file, struct ib_uverbs_event_file *ev_file, struct ib_ucq_object *uobj) @@ -331,14 +339,6 @@ static unsigned int ib_uverbs_event_poll(struct file *filp, return pollflags; } -void ib_uverbs_release_event_file(struct kref *ref) -{ - struct ib_uverbs_event_file *file = - container_of(ref, struct ib_uverbs_event_file, ref); - - kfree(file); -} - static int ib_uverbs_event_fasync(int fd, struct file *filp, int on) { struct ib_uverbs_event_file *file = filp->private_data; -- cgit v0.10.2 From 5b0bf5e25efea77103b0ac7c8057cd56c778ef41 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 1 Aug 2007 12:28:20 +0300 Subject: mlx4_core: Support ICM tables in coherent memory Enable having ICM tables in coherent memory, and use coherent memory for the dMPT table. This will allow writing MPT entries for MRs both via the SW2HW_MPT command and also directly by the driver for FMR remapping without needing to flush or worry about cacheline boundaries. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c index b7a4aa8..250e248 100644 --- a/drivers/net/mlx4/icm.c +++ b/drivers/net/mlx4/icm.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -50,19 +51,41 @@ enum { MLX4_TABLE_CHUNK_SIZE = 1 << 18 }; -void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm) +static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk) { - struct mlx4_icm_chunk *chunk, *tmp; int i; - list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { - if (chunk->nsg > 0) - pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, - PCI_DMA_BIDIRECTIONAL); + if (chunk->nsg > 0) + pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + for (i = 0; i < chunk->npages; ++i) + __free_pages(chunk->mem[i].page, + get_order(chunk->mem[i].length)); +} - for (i = 0; i < chunk->npages; ++i) - __free_pages(chunk->mem[i].page, - get_order(chunk->mem[i].length)); +static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk) +{ + int i; + + for (i = 0; i < chunk->npages; ++i) + dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + lowmem_page_address(chunk->mem[i].page), + sg_dma_address(&chunk->mem[i])); +} + +void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent) +{ + struct mlx4_icm_chunk *chunk, *tmp; + + if (!icm) + return; + + list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { + if (coherent) + mlx4_free_icm_coherent(dev, chunk); + else + mlx4_free_icm_pages(dev, chunk); kfree(chunk); } @@ -70,16 +93,45 @@ void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm) kfree(icm); } +static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask) +{ + mem->page = alloc_pages(gfp_mask, order); + if (!mem->page) + return -ENOMEM; + + mem->length = PAGE_SIZE << order; + mem->offset = 0; + return 0; +} + +static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, + int order, gfp_t gfp_mask) +{ + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, + &sg_dma_address(mem), gfp_mask); + if (!buf) + return -ENOMEM; + + sg_set_buf(mem, buf, PAGE_SIZE << order); + BUG_ON(mem->offset); + sg_dma_len(mem) = PAGE_SIZE << order; + return 0; +} + struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, - gfp_t gfp_mask) + gfp_t gfp_mask, int coherent) { struct mlx4_icm *icm; struct mlx4_icm_chunk *chunk = NULL; int cur_order; + int ret; + + /* We use sg_set_buf for coherent allocs, which assumes low memory */ + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); if (!icm) - return icm; + return NULL; icm->refcount = 0; INIT_LIST_HEAD(&icm->chunk_list); @@ -101,12 +153,20 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, while (1 << cur_order > npages) --cur_order; - chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order); - if (chunk->mem[chunk->npages].page) { - chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order; - chunk->mem[chunk->npages].offset = 0; + if (coherent) + ret = mlx4_alloc_icm_coherent(&dev->pdev->dev, + &chunk->mem[chunk->npages], + cur_order, gfp_mask); + else + ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages], + cur_order, gfp_mask); + + if (!ret) { + ++chunk->npages; - if (++chunk->npages == MLX4_ICM_CHUNK_LEN) { + if (coherent) + ++chunk->nsg; + else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -125,7 +185,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, } } - if (chunk) { + if (!coherent && chunk) { chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -137,7 +197,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, return icm; fail: - mlx4_free_icm(dev, icm); + mlx4_free_icm(dev, icm, coherent); return NULL; } @@ -202,7 +262,7 @@ int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT, (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | - __GFP_NOWARN); + __GFP_NOWARN, table->coherent); if (!table->icm[i]) { ret = -ENOMEM; goto out; @@ -210,7 +270,7 @@ int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) if (mlx4_MAP_ICM(dev, table->icm[i], table->virt + (u64) i * MLX4_TABLE_CHUNK_SIZE)) { - mlx4_free_icm(dev, table->icm[i]); + mlx4_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; ret = -ENOMEM; goto out; @@ -234,7 +294,7 @@ void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) if (--table->icm[i]->refcount == 0) { mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); - mlx4_free_icm(dev, table->icm[i]); + mlx4_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; } @@ -309,7 +369,7 @@ void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, u64 virt, int obj_size, int nobj, int reserved, - int use_lowmem) + int use_lowmem, int use_coherent) { int obj_per_chunk; int num_icm; @@ -327,6 +387,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, table->num_obj = nobj; table->obj_size = obj_size; table->lowmem = use_lowmem; + table->coherent = use_coherent; mutex_init(&table->mutex); for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) { @@ -336,11 +397,11 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT, (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | - __GFP_NOWARN); + __GFP_NOWARN, use_coherent); if (!table->icm[i]) goto err; if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) { - mlx4_free_icm(dev, table->icm[i]); + mlx4_free_icm(dev, table->icm[i], use_coherent); table->icm[i] = NULL; goto err; } @@ -359,7 +420,7 @@ err: if (table->icm[i]) { mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE, MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); - mlx4_free_icm(dev, table->icm[i]); + mlx4_free_icm(dev, table->icm[i], use_coherent); } return -ENOMEM; @@ -373,7 +434,7 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table) if (table->icm[i]) { mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); - mlx4_free_icm(dev, table->icm[i]); + mlx4_free_icm(dev, table->icm[i], table->coherent); } kfree(table->icm); diff --git a/drivers/net/mlx4/icm.h b/drivers/net/mlx4/icm.h index bea223d..a77db6d 100644 --- a/drivers/net/mlx4/icm.h +++ b/drivers/net/mlx4/icm.h @@ -67,8 +67,9 @@ struct mlx4_icm_iter { struct mlx4_dev; -struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask); -void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm); +struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, + gfp_t gfp_mask, int coherent); +void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); @@ -78,7 +79,7 @@ void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, int start, int end); int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, u64 virt, int obj_size, int nobj, int reserved, - int use_lowmem); + int use_lowmem, int use_coherent); void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table); int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 9e590e1..07c2847 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -168,7 +168,7 @@ static int __devinit mlx4_load_fw(struct mlx4_dev *dev) int err; priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages, - GFP_HIGHUSER | __GFP_NOWARN); + GFP_HIGHUSER | __GFP_NOWARN, 0); if (!priv->fw.fw_icm) { mlx4_err(dev, "Couldn't allocate FW area, aborting.\n"); return -ENOMEM; @@ -192,7 +192,7 @@ err_unmap_fa: mlx4_UNMAP_FA(dev); err_free: - mlx4_free_icm(dev, priv->fw.fw_icm); + mlx4_free_icm(dev, priv->fw.fw_icm, 0); return err; } @@ -207,7 +207,7 @@ static int __devinit mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, ((u64) (MLX4_CMPT_TYPE_QP * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_qps, - dev->caps.reserved_qps, 0); + dev->caps.reserved_qps, 0, 0); if (err) goto err; @@ -216,7 +216,7 @@ static int __devinit mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, ((u64) (MLX4_CMPT_TYPE_SRQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_srqs, - dev->caps.reserved_srqs, 0); + dev->caps.reserved_srqs, 0, 0); if (err) goto err_qp; @@ -225,7 +225,7 @@ static int __devinit mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, ((u64) (MLX4_CMPT_TYPE_CQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_cqs, - dev->caps.reserved_cqs, 0); + dev->caps.reserved_cqs, 0, 0); if (err) goto err_srq; @@ -236,7 +236,7 @@ static int __devinit mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, cmpt_entry_sz, roundup_pow_of_two(MLX4_NUM_EQ + dev->caps.reserved_eqs), - MLX4_NUM_EQ + dev->caps.reserved_eqs, 0); + MLX4_NUM_EQ + dev->caps.reserved_eqs, 0, 0); if (err) goto err_cq; @@ -275,7 +275,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, (unsigned long long) aux_pages << 2); priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages, - GFP_HIGHUSER | __GFP_NOWARN); + GFP_HIGHUSER | __GFP_NOWARN, 0); if (!priv->fw.aux_icm) { mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n"); return -ENOMEM; @@ -303,7 +303,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->mtt_base, dev->caps.mtt_entry_sz, dev->caps.num_mtt_segs, - dev->caps.reserved_mtts, 1); + dev->caps.reserved_mtts, 1, 0); if (err) { mlx4_err(dev, "Failed to map MTT context memory, aborting.\n"); goto err_unmap_eq; @@ -313,7 +313,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->dmpt_base, dev_cap->dmpt_entry_sz, dev->caps.num_mpts, - dev->caps.reserved_mrws, 1); + dev->caps.reserved_mrws, 1, 1); if (err) { mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n"); goto err_unmap_mtt; @@ -323,7 +323,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->qpc_base, dev_cap->qpc_entry_sz, dev->caps.num_qps, - dev->caps.reserved_qps, 0); + dev->caps.reserved_qps, 0, 0); if (err) { mlx4_err(dev, "Failed to map QP context memory, aborting.\n"); goto err_unmap_dmpt; @@ -333,7 +333,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->auxc_base, dev_cap->aux_entry_sz, dev->caps.num_qps, - dev->caps.reserved_qps, 0); + dev->caps.reserved_qps, 0, 0); if (err) { mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n"); goto err_unmap_qp; @@ -343,7 +343,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->altc_base, dev_cap->altc_entry_sz, dev->caps.num_qps, - dev->caps.reserved_qps, 0); + dev->caps.reserved_qps, 0, 0); if (err) { mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n"); goto err_unmap_auxc; @@ -353,7 +353,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->rdmarc_base, dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift, dev->caps.num_qps, - dev->caps.reserved_qps, 0); + dev->caps.reserved_qps, 0, 0); if (err) { mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n"); goto err_unmap_altc; @@ -363,7 +363,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->cqc_base, dev_cap->cqc_entry_sz, dev->caps.num_cqs, - dev->caps.reserved_cqs, 0); + dev->caps.reserved_cqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map CQ context memory, aborting.\n"); goto err_unmap_rdmarc; @@ -373,7 +373,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->srqc_base, dev_cap->srq_entry_sz, dev->caps.num_srqs, - dev->caps.reserved_srqs, 0); + dev->caps.reserved_srqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n"); goto err_unmap_cq; @@ -388,7 +388,7 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, init_hca->mc_base, MLX4_MGM_ENTRY_SIZE, dev->caps.num_mgms + dev->caps.num_amgms, dev->caps.num_mgms + dev->caps.num_amgms, - 0); + 0, 0); if (err) { mlx4_err(dev, "Failed to map MCG context memory, aborting.\n"); goto err_unmap_srq; @@ -433,7 +433,7 @@ err_unmap_aux: mlx4_UNMAP_ICM_AUX(dev); err_free_aux: - mlx4_free_icm(dev, priv->fw.aux_icm); + mlx4_free_icm(dev, priv->fw.aux_icm, 0); return err; } @@ -458,7 +458,7 @@ static void mlx4_free_icms(struct mlx4_dev *dev) mlx4_unmap_eq_icm(dev); mlx4_UNMAP_ICM_AUX(dev); - mlx4_free_icm(dev, priv->fw.aux_icm); + mlx4_free_icm(dev, priv->fw.aux_icm, 0); } static void mlx4_close_hca(struct mlx4_dev *dev) @@ -466,7 +466,7 @@ static void mlx4_close_hca(struct mlx4_dev *dev) mlx4_CLOSE_HCA(dev, 0); mlx4_free_icms(dev); mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm); + mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); } static int __devinit mlx4_init_hca(struct mlx4_dev *dev) @@ -537,7 +537,7 @@ err_free_icm: err_stop_fw: mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, priv->fw.fw_icm); + mlx4_free_icm(dev, priv->fw.fw_icm, 0); return err; } diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index b9f8397..2bad045 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -129,6 +129,7 @@ struct mlx4_icm_table { int num_obj; int obj_size; int lowmem; + int coherent; struct mutex mutex; struct mlx4_icm **icm; }; -- cgit v0.10.2 From cf78237d7b0e683d172e40b1e85b26ca49a3cdba Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:16 -0700 Subject: mlx4_core: Reserve the correct number of MTT segments Taking ilog2(dev->caps.reserved_mtts) to find out the order to pass to the MTT buddy allocator will do the wrong thing if reserved_mtts is ever not a power of 2. Be safe and use fls(dev->caps.reserved_mtts - 1). Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 5b87183..60a6ee2 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -444,7 +444,7 @@ int __devinit mlx4_init_mr_table(struct mlx4_dev *dev) goto err_buddy; if (dev->caps.reserved_mtts) { - if (mlx4_alloc_mtt_range(dev, ilog2(dev->caps.reserved_mtts)) == -1) { + if (mlx4_alloc_mtt_range(dev, fls(dev->caps.reserved_mtts - 1)) == -1) { mlx4_warn(dev, "MTT table of order %d is too small.\n", mr_table->mtt_buddy.max_order); err = -ENOMEM; -- cgit v0.10.2 From 121964ec38d3e17b5ea6183b3c0571df2f4b0eb6 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:16 -0700 Subject: mlx4_core: Fix meaning of dev->caps.reserved_mtts Everything that uses caps.reserved_mtts expects it to be a count of MTT segments, not MTT entries. So convert the value that the FW gives us to a count of segments. Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 07c2847..ed7e8d7 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -149,7 +149,8 @@ static int __devinit mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev dev->caps.max_cqes = dev_cap->max_cq_sz - 1; dev->caps.reserved_cqs = dev_cap->reserved_cqs; dev->caps.reserved_eqs = dev_cap->reserved_eqs; - dev->caps.reserved_mtts = dev_cap->reserved_mtts; + dev->caps.reserved_mtts = DIV_ROUND_UP(dev_cap->reserved_mtts, + MLX4_MTT_ENTRY_PER_SEG); dev->caps.reserved_mrws = dev_cap->reserved_mrws; dev->caps.reserved_uars = dev_cap->reserved_uars; dev->caps.reserved_pds = dev_cap->reserved_pds; -- cgit v0.10.2 From d7bb58fb1c0e7264a7261c7d0304121ef9402e94 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 1 Aug 2007 12:28:53 +0300 Subject: mlx4_core: Write MTTs from CPU instead with of WRITE_MTT FW command Write MTT entries directly to ICM from the driver (eliminating use of WRITE_MTT command). This reduces the number of FW commands needed to register an MR by at least a factor of 2 and speeds up memory registration significantly. This code will also be used to implement FMRs. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 85ae906..734ec2b 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -96,11 +96,10 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, pages[i++] = sg_dma_address(&chunk->page_list[j]) + umem->page_size * k; /* - * Be friendly to WRITE_MTT firmware - * command, and pass it chunks of - * appropriate size. + * Be friendly to mlx4_write_mtt() and + * pass it chunks of appropriate size. */ - if (i == PAGE_SIZE / sizeof (u64) - 2) { + if (i == PAGE_SIZE / sizeof (u64)) { err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); if (err) diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c index 250e248..4b3c109 100644 --- a/drivers/net/mlx4/icm.c +++ b/drivers/net/mlx4/icm.c @@ -301,9 +301,9 @@ void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) mutex_unlock(&table->mutex); } -void *mlx4_table_find(struct mlx4_icm_table *table, int obj) +void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle) { - int idx, offset, i; + int idx, offset, dma_offset, i; struct mlx4_icm_chunk *chunk; struct mlx4_icm *icm; struct page *page = NULL; @@ -313,15 +313,26 @@ void *mlx4_table_find(struct mlx4_icm_table *table, int obj) mutex_lock(&table->mutex); - idx = obj & (table->num_obj - 1); - icm = table->icm[idx / (MLX4_TABLE_CHUNK_SIZE / table->obj_size)]; - offset = idx % (MLX4_TABLE_CHUNK_SIZE / table->obj_size); + idx = (obj & (table->num_obj - 1)) * table->obj_size; + icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE]; + dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE; if (!icm) goto out; list_for_each_entry(chunk, &icm->chunk_list, list) { for (i = 0; i < chunk->npages; ++i) { + if (dma_handle && dma_offset >= 0) { + if (sg_dma_len(&chunk->mem[i]) > dma_offset) + *dma_handle = sg_dma_address(&chunk->mem[i]) + + dma_offset; + dma_offset -= sg_dma_len(&chunk->mem[i]); + } + /* + * DMA mapping can merge pages but not split them, + * so if we found the page, dma_handle has already + * been assigned to. + */ if (chunk->mem[i].length > offset) { page = chunk->mem[i].page; goto out; diff --git a/drivers/net/mlx4/icm.h b/drivers/net/mlx4/icm.h index a77db6d..6c44edf 100644 --- a/drivers/net/mlx4/icm.h +++ b/drivers/net/mlx4/icm.h @@ -83,7 +83,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table); int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); -void *mlx4_table_find(struct mlx4_icm_table *table, int obj); +void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle); int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, int start, int end); void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index ed7e8d7..478b3ba 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -300,6 +300,17 @@ static int __devinit mlx4_init_icm(struct mlx4_dev *dev, goto err_unmap_cmpt; } + /* + * Reserved MTT entries must be aligned up to a cacheline + * boundary, since the FW will write to them, while the driver + * writes to all other MTT entries. (The variable + * dev->caps.mtt_entry_sz below is really the MTT segment + * size, not the raw entry size) + */ + dev->caps.reserved_mtts = + ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz, + dma_get_cache_alignment()) / dev->caps.mtt_entry_sz; + err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table, init_hca->mtt_base, dev->caps.mtt_entry_sz, diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 60a6ee2..3cc98c6 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -349,58 +349,57 @@ err_table: } EXPORT_SYMBOL_GPL(mlx4_mr_enable); -static int mlx4_WRITE_MTT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, - int num_mtt) +static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) { - return mlx4_cmd(dev, mailbox->dma, num_mtt, 0, MLX4_CMD_WRITE_MTT, - MLX4_CMD_TIME_CLASS_B); + struct mlx4_priv *priv = mlx4_priv(dev); + __be64 *mtts; + dma_addr_t dma_handle; + int i; + int s = start_index * sizeof (u64); + + /* All MTTs must fit in the same page */ + if (start_index / (PAGE_SIZE / sizeof (u64)) != + (start_index + npages - 1) / (PAGE_SIZE / sizeof (u64))) + return -EINVAL; + + if (start_index & (MLX4_MTT_ENTRY_PER_SEG - 1)) + return -EINVAL; + + mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->first_seg + + s / dev->caps.mtt_entry_sz, &dma_handle); + if (!mtts) + return -ENOMEM; + + for (i = 0; i < npages; ++i) + mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); + + dma_sync_single(&dev->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE); + + return 0; } int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int start_index, int npages, u64 *page_list) { - struct mlx4_cmd_mailbox *mailbox; - __be64 *mtt_entry; - int i; - int err = 0; + int chunk; + int err; if (mtt->order < 0) return -EINVAL; - mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - - mtt_entry = mailbox->buf; - while (npages > 0) { - mtt_entry[0] = cpu_to_be64(mlx4_mtt_addr(dev, mtt) + start_index * 8); - mtt_entry[1] = 0; - - for (i = 0; i < npages && i < MLX4_MAILBOX_SIZE / 8 - 2; ++i) - mtt_entry[i + 2] = cpu_to_be64(page_list[i] | - MLX4_MTT_FLAG_PRESENT); - - /* - * If we have an odd number of entries to write, add - * one more dummy entry for firmware efficiency. - */ - if (i & 1) - mtt_entry[i + 2] = 0; - - err = mlx4_WRITE_MTT(dev, mailbox, (i + 1) & ~1); + chunk = min_t(int, PAGE_SIZE / sizeof(u64), npages); + err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list); if (err) - goto out; + return err; - npages -= i; - start_index += i; - page_list += i; + npages -= chunk; + start_index += chunk; + page_list += chunk; } -out: - mlx4_free_cmd_mailbox(dev, mailbox); - - return err; + return 0; } EXPORT_SYMBOL_GPL(mlx4_write_mtt); -- cgit v0.10.2 From 8ad11fb6b0739e704953e2b0aed453bf7d75d4f6 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 1 Aug 2007 12:29:05 +0300 Subject: IB/mlx4: Implement FMRs Implement FMRs for mlx4. This is an adaptation of code from mthca. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d9fc822..d8287d9 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -607,6 +607,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; + ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; + ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; + ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; + ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + if (init_node_data(ibdev)) goto err_map; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 705ff2f..2869765 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -93,6 +93,11 @@ struct mlx4_ib_mr { struct ib_umem *umem; }; +struct mlx4_ib_fmr { + struct ib_fmr ibfmr; + struct mlx4_fmr mfmr; +}; + struct mlx4_ib_wq { u64 *wrid; spinlock_t lock; @@ -199,6 +204,10 @@ static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx4_ib_mr, ibmr); } +static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); +} static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx4_ib_qp, ibqp); @@ -284,6 +293,13 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, + u64 iova); +int mlx4_ib_unmap_fmr(struct list_head *fmr_list); +int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); + static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { return !!(ah->av.g_slid & 0x80); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 734ec2b..7dc91a3 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -181,3 +181,96 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr) return 0; } + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_fmr *fmr; + int err = -ENOMEM; + + fmr = kmalloc(sizeof *fmr, GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), + fmr_attr->max_pages, fmr_attr->max_maps, + fmr_attr->page_shift, &fmr->mfmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + if (err) + goto err_mr; + + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key; + + return &fmr->ibfmr; + +err_mr: + mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + +err_free: + kfree(fmr); + + return ERR_PTR(err); +} + +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device); + + return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova, + &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); +} + +int mlx4_ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *ibfmr; + int err; + struct mlx4_dev *mdev = NULL; + + list_for_each_entry(ibfmr, fmr_list, list) { + if (mdev && to_mdev(ibfmr->device)->dev != mdev) + return -EINVAL; + mdev = to_mdev(ibfmr->device)->dev; + } + + if (!mdev) + return 0; + + list_for_each_entry(ibfmr, fmr_list, list) { + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + + mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); + } + + /* + * Make sure all MPT status updates are visible before issuing + * SYNC_TPT firmware command. + */ + wmb(); + + err = mlx4_SYNC_TPT(mdev); + if (err) + printk(KERN_WARNING "mlx4_ib: SYNC_TPT error %d when " + "unmapping FMRs\n", err); + + return 0; +} + +int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ibfmr->device); + int err; + + err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); + + if (!err) + kfree(ifmr); + + return err; +} diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 3cc98c6..4bc39e6 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -68,6 +68,9 @@ struct mlx4_mpt_entry { #define MLX4_MTT_FLAG_PRESENT 1 +#define MLX4_MPT_STATUS_SW 0xF0 +#define MLX4_MPT_STATUS_HW 0x00 + static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order) { int o; @@ -469,3 +472,165 @@ void mlx4_cleanup_mr_table(struct mlx4_dev *dev) mlx4_buddy_cleanup(&mr_table->mtt_buddy); mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); } + +static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova) +{ + int i, page_mask; + + if (npages > fmr->max_pages) + return -EINVAL; + + page_mask = (1 << fmr->page_shift) - 1; + + /* We are getting page lists, so va must be page aligned. */ + if (iova & page_mask) + return -EINVAL; + + /* Trust the user not to pass misaligned data in page_list */ + if (0) + for (i = 0; i < npages; ++i) { + if (page_list[i] & ~page_mask) + return -EINVAL; + } + + if (fmr->maps >= fmr->max_maps) + return -EINVAL; + + return 0; +} + +int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova, u32 *lkey, u32 *rkey) +{ + u32 key; + int i, err; + + err = mlx4_check_fmr(fmr, page_list, npages, iova); + if (err) + return err; + + ++fmr->maps; + + key = key_to_hw_index(fmr->mr.key); + key += dev->caps.num_mpts; + *lkey = *rkey = fmr->mr.key = hw_index_to_key(key); + + *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; + + /* Make sure MPT status is visible before writing MTT entries */ + wmb(); + + for (i = 0; i < npages; ++i) + fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); + + dma_sync_single(&dev->pdev->dev, fmr->dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); + + fmr->mpt->key = cpu_to_be32(key); + fmr->mpt->lkey = cpu_to_be32(key); + fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift)); + fmr->mpt->start = cpu_to_be64(iova); + + /* Make MTT entries are visible before setting MPT status */ + wmb(); + + *(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW; + + /* Make sure MPT status is visible before consumer can use FMR */ + wmb(); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr); + +int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, + int max_maps, u8 page_shift, struct mlx4_fmr *fmr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 mtt_seg; + int err = -ENOMEM; + + if (page_shift < 12 || page_shift >= 32) + return -EINVAL; + + /* All MTTs must fit in the same page */ + if (max_pages * sizeof *fmr->mtts > PAGE_SIZE) + return -EINVAL; + + fmr->page_shift = page_shift; + fmr->max_pages = max_pages; + fmr->max_maps = max_maps; + fmr->maps = 0; + + err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages, + page_shift, &fmr->mr); + if (err) + return err; + + mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz; + + fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table, + fmr->mr.mtt.first_seg, + &fmr->dma_handle); + if (!fmr->mtts) { + err = -ENOMEM; + goto err_free; + } + + fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table, + key_to_hw_index(fmr->mr.key), NULL); + if (!fmr->mpt) { + err = -ENOMEM; + goto err_free; + } + + return 0; + +err_free: + mlx4_mr_free(dev, &fmr->mr); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_alloc); + +int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr) +{ + return mlx4_mr_enable(dev, &fmr->mr); +} +EXPORT_SYMBOL_GPL(mlx4_fmr_enable); + +void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, + u32 *lkey, u32 *rkey) +{ + u32 key; + + if (!fmr->maps) + return; + + key = key_to_hw_index(fmr->mr.key); + key &= dev->caps.num_mpts - 1; + *lkey = *rkey = fmr->mr.key = hw_index_to_key(key); + + fmr->maps = 0; + + *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_unmap); + +int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr) +{ + if (fmr->maps) + return -EBUSY; + + fmr->mr.enabled = 0; + mlx4_mr_free(dev, &fmr->mr); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_free); + +int mlx4_SYNC_TPT(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000); +} +EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a93520c..222815d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -214,6 +214,17 @@ struct mlx4_mr { int enabled; }; +struct mlx4_fmr { + struct mlx4_mr mr; + struct mlx4_mpt_entry *mpt; + __be64 *mtts; + dma_addr_t dma_handle; + int max_pages; + int max_maps; + int maps; + u8 page_shift; +}; + struct mlx4_uar { unsigned long pfn; int index; @@ -337,4 +348,14 @@ int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); +int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova, u32 *lkey, u32 *rkey); +int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, + int max_maps, u8 page_shift, struct mlx4_fmr *fmr); +int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr); +void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, + u32 *lkey, u32 *rkey); +int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); +int mlx4_SYNC_TPT(struct mlx4_dev *dev); + #endif /* MLX4_DEVICE_H */ -- cgit v0.10.2 From e57ac0c297b9e1074dcb6b6ee5cd1d6407a7df14 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 2 Oct 2007 09:40:13 +0200 Subject: mlx4_core: Increase max number of QPs per multicast group to 56 Increase the number of QPs allowed per multicast group from 8 to 56. This allows for one QP per core on 16-core systems, which are now quite common, and allows some space for future growth. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 2bad045..53a1cdd 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -56,7 +56,7 @@ enum { }; enum { - MLX4_MGM_ENTRY_SIZE = 0x40, + MLX4_MGM_ENTRY_SIZE = 0x100, MLX4_QP_PER_MGM = 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2), MLX4_MTT_ENTRY_PER_SEG = 8 }; -- cgit v0.10.2 From 1a1eb6a646f52dc62e3f9ceac4aab9c27e781186 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:17 -0700 Subject: IB/mthca: Increase max number of QPs per multicast group to 56 Increase the number of QPs allowed per multicast group from 8 to 56. This allows for one QP per core on 16-core systems, which are now quite common, and allows some space for future growth. This is basically the same patch that Jack Morgenstein just supplied for mlx4. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 9bae3cc..15aa32e 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -83,7 +83,7 @@ enum { MTHCA_QP_CONTEXT_SIZE = 0x200, MTHCA_RDB_ENTRY_SIZE = 0x20, MTHCA_AV_SIZE = 0x20, - MTHCA_MGM_ENTRY_SIZE = 0x40, + MTHCA_MGM_ENTRY_SIZE = 0x100, /* Arbel FW gives us these, but we need them for Tavor */ MTHCA_MPT_ENTRY_SIZE = 0x40, -- cgit v0.10.2 From de98b693e9857e183679cd2f49b3c30d2bc57629 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 1 Aug 2007 13:49:53 -0700 Subject: IB/cm: Modify interface to send MRAs in response to duplicate messages The IB CM provides a message received acknowledged (MRA) message that can be sent to indicate that a REQ or REP message has been received, but will require more time to process than the timeout specified by those messages. In many cases, the application may not know how long it will take to respond to a CM message, but the majority of the time, it will usually respond before a retry has been sent. Rather than sending an MRA in response to all messages just to handle the case where a longer timeout is needed, it is more efficient to queue the MRA for sending in case a duplicate message is received. This avoids sending an MRA when it is not needed, but limits the number of times that a REQ or REP will be resent. It also provides for a simpler implementation than generating the MRA based on a timer event. (That is, trying to send the MRA after receiving the first REQ or REP if a response has not been generated, so that it is received at the remote side before a duplicate REQ or REP has been received) Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 4df269f..2e39236 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -2219,6 +2219,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; + enum ib_cm_state cm_state; + enum ib_cm_lap_state lap_state; + enum cm_msg_response msg_response; void *data; unsigned long flags; int ret; @@ -2235,48 +2238,40 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); switch(cm_id_priv->id.state) { case IB_CM_REQ_RCVD: - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto error1; - - cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REQ, service_timeout, - private_data, private_data_len); - ret = ib_post_send_mad(msg, NULL); - if (ret) - goto error2; - cm_id->state = IB_CM_MRA_REQ_SENT; + cm_state = IB_CM_MRA_REQ_SENT; + lap_state = cm_id->lap_state; + msg_response = CM_MSG_RESPONSE_REQ; break; case IB_CM_REP_RCVD: - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto error1; - - cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REP, service_timeout, - private_data, private_data_len); - ret = ib_post_send_mad(msg, NULL); - if (ret) - goto error2; - cm_id->state = IB_CM_MRA_REP_SENT; + cm_state = IB_CM_MRA_REP_SENT; + lap_state = cm_id->lap_state; + msg_response = CM_MSG_RESPONSE_REP; break; case IB_CM_ESTABLISHED: + cm_state = cm_id->state; + lap_state = IB_CM_MRA_LAP_SENT; + msg_response = CM_MSG_RESPONSE_OTHER; + break; + default: + ret = -EINVAL; + goto error1; + } + + if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto error1; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_OTHER, service_timeout, + msg_response, service_timeout, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) goto error2; - cm_id->lap_state = IB_CM_MRA_LAP_SENT; - break; - default: - ret = -EINVAL; - goto error1; } + + cm_id->state = cm_state; + cm_id->lap_state = lap_state; cm_id_priv->service_timeout = service_timeout; cm_set_private_data(cm_id_priv, data, private_data_len); spin_unlock_irqrestore(&cm_id_priv->lock, flags); diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index 12243e8..a627c86 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -477,12 +477,15 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len); +#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */ + /** * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection * message. * @cm_id: Connection identifier associated with the connection message. - * @service_timeout: The maximum time required for the sender to reply to - * to the connection message. + * @service_timeout: The lower 5-bits specify the maximum time required for + * the sender to reply to to the connection message. The upper 3-bits + * specify additional control flags. * @private_data: Optional user-defined private data sent with the * message receipt acknowledgement. * @private_data_len: Size of the private data buffer, in bytes. -- cgit v0.10.2 From dcb3f974da827c964cb8d419fbb4350cdc08a559 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 1 Aug 2007 14:47:16 -0700 Subject: RDMA/cma: Queue IB CM MRAs to avoid unnecessary remote retries Automatically queue MRA message to decrease the number of retries sent by the remote side during connection establishment. This also has the effect of increasing the overall connection timeout without using a longer retry time in the case of dropped packets. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 19c9172..7253952 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -52,6 +52,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 +#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -1090,6 +1091,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); conn_id = cma_new_conn_id(&listen_id->id, ib_event); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); -- cgit v0.10.2 From 76d7cc0345a037e8eea426f8abc710abd22946dd Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:17 -0700 Subject: IB/mthca: Use mmiowb() to avoid firmware commands getting jumbled up Firmware commands are sent to the HCA by writing multiple words to a command register block. Access to this block of registers is serialized with a mutex. However, on large SGI systems, problems were seen with multiple CPUs issuing FW commands at the same time, because the writes to the register block may be reordered within the system interconnect and reach the HCA in a different order than they were issued (even with the mutex). Fix this by adding an mmiowb() before dropping the mutex. Tested-by: Arthur Kepner Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index acc9589..6966f94 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -290,6 +290,12 @@ static int mthca_cmd_post(struct mthca_dev *dev, err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier, op_modifier, op, token, event); + /* + * Make sure that our HCR writes don't get mixed in with + * writes from another CPU starting a FW command. + */ + mmiowb(); + mutex_unlock(&dev->cmd.hcr_mutex); return err; } -- cgit v0.10.2 From 2e61c646edfa013203e3428762f8d6a72e10bdea Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:18 -0700 Subject: mlx4_core: Use mmiowb() to avoid firmware commands getting jumbled up Firmware commands are sent to the HCA by writing multiple words to a command register block. Access to this block of registers is serialized with a mutex. However, on large SGI systems writes to the register block may be reordered within the system interconnect and reach the HCA in a different order than they were issued (even with the mutex). Fix this by adding an mmiowb() before dropping the mutex. This bug was observed with real workloads with the similar FW command code in the mthca driver, and adding the mmiowb() as in commit 66547550 ("IB/mthca: Use mmiowb() to avoid firmware commands getting jumbled up") was confirmed to fix the problems, so we should add the same fix to mlx4. Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index b540820..db49051 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -184,6 +184,13 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, (event ? (1 << HCR_E_BIT) : 0) | (op_modifier << HCR_OPMOD_SHIFT) | op), hcr + 6); + + /* + * Make sure that our HCR writes don't get mixed in with + * writes from another CPU starting a FW command. + */ + mmiowb(); + cmd->toggle = cmd->toggle ^ 1; ret = 0; -- cgit v0.10.2 From ede6bc04f3a07a9c93f02c92cdc281d254398321 Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Sun, 7 Oct 2007 09:30:48 +0200 Subject: IPoIB/cm: Clean up initialization of QP attr in ipoib_cm_create_tx_qp() Make the way QP is being created in ipoib_cm_create_tx_qp() consistent with ipoib_cm_create_rx_qp(). Signed-off-by: Dotan Barak Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 08b4676..23addb3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -810,14 +810,16 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_qp_init_attr attr = {}; - attr.recv_cq = priv->cq; - attr.srq = priv->cm.srq; - attr.cap.max_send_wr = ipoib_sendq_size; - attr.cap.max_send_sge = 1; - attr.sq_sig_type = IB_SIGNAL_ALL_WR; - attr.qp_type = IB_QPT_RC; - attr.send_cq = cq; + struct ib_qp_init_attr attr = { + .send_cq = cq, + .recv_cq = priv->cq, + .srq = priv->cm.srq, + .cap.max_send_wr = ipoib_sendq_size, + .cap.max_send_sge = 1, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + }; + return ib_create_qp(priv->pd, &attr); } -- cgit v0.10.2 From 76dea3bc2644e99cce1d98d0bbd3124314e5b50a Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 Oct 2007 19:59:18 -0700 Subject: IB/ehca: Fix clipping of device limits to INT_MAX Doing min_t(int, foo, INT_MAX) doesn't work correctly, because if foo is bigger than INT_MAX, then when treated as a signed integer, it will become negative and hence such an expression is just an elaborate NOP. Fix such cases in ehca to do min_t(unsigned, foo, INT_MAX) instead. This fixes negative reported values for max_cqe, max_pd and max_ah: Before: max_cqe: -64 max_pd: -1 max_ah: -1 After: max_cqe: 2147483647 max_pd: 2147483647 max_ah: 2147483647 Based on a bug report and fix from Anton Blanchard . Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c index 3436c49..4aa3ffa 100644 --- a/drivers/infiniband/hw/ehca/ehca_hca.c +++ b/drivers/infiniband/hw/ehca/ehca_hca.c @@ -82,17 +82,17 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) props->vendor_id = rblock->vendor_id >> 8; props->vendor_part_id = rblock->vendor_part_id >> 16; props->hw_ver = rblock->hw_ver; - props->max_qp = min_t(int, rblock->max_qp, INT_MAX); - props->max_qp_wr = min_t(int, rblock->max_wqes_wq, INT_MAX); - props->max_sge = min_t(int, rblock->max_sge, INT_MAX); - props->max_sge_rd = min_t(int, rblock->max_sge_rd, INT_MAX); - props->max_cq = min_t(int, rblock->max_cq, INT_MAX); - props->max_cqe = min_t(int, rblock->max_cqe, INT_MAX); - props->max_mr = min_t(int, rblock->max_mr, INT_MAX); - props->max_mw = min_t(int, rblock->max_mw, INT_MAX); - props->max_pd = min_t(int, rblock->max_pd, INT_MAX); - props->max_ah = min_t(int, rblock->max_ah, INT_MAX); - props->max_fmr = min_t(int, rblock->max_mr, INT_MAX); + props->max_qp = min_t(unsigned, rblock->max_qp, INT_MAX); + props->max_qp_wr = min_t(unsigned, rblock->max_wqes_wq, INT_MAX); + props->max_sge = min_t(unsigned, rblock->max_sge, INT_MAX); + props->max_sge_rd = min_t(unsigned, rblock->max_sge_rd, INT_MAX); + props->max_cq = min_t(unsigned, rblock->max_cq, INT_MAX); + props->max_cqe = min_t(unsigned, rblock->max_cqe, INT_MAX); + props->max_mr = min_t(unsigned, rblock->max_mr, INT_MAX); + props->max_mw = min_t(unsigned, rblock->max_mw, INT_MAX); + props->max_pd = min_t(unsigned, rblock->max_pd, INT_MAX); + props->max_ah = min_t(unsigned, rblock->max_ah, INT_MAX); + props->max_fmr = min_t(unsigned, rblock->max_mr, INT_MAX); if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) { props->max_srq = props->max_qp; @@ -104,15 +104,15 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) props->local_ca_ack_delay = rblock->local_ca_ack_delay; props->max_raw_ipv6_qp - = min_t(int, rblock->max_raw_ipv6_qp, INT_MAX); + = min_t(unsigned, rblock->max_raw_ipv6_qp, INT_MAX); props->max_raw_ethy_qp - = min_t(int, rblock->max_raw_ethy_qp, INT_MAX); + = min_t(unsigned, rblock->max_raw_ethy_qp, INT_MAX); props->max_mcast_grp - = min_t(int, rblock->max_mcast_grp, INT_MAX); + = min_t(unsigned, rblock->max_mcast_grp, INT_MAX); props->max_mcast_qp_attach - = min_t(int, rblock->max_mcast_qp_attach, INT_MAX); + = min_t(unsigned, rblock->max_mcast_qp_attach, INT_MAX); props->max_total_mcast_qp_attach - = min_t(int, rblock->max_total_mcast_qp_attach, INT_MAX); + = min_t(unsigned, rblock->max_total_mcast_qp_attach, INT_MAX); /* translate device capabilities */ props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID | -- cgit v0.10.2 From 327a338d4fd018d33e7cacde46c0d82622b4bda8 Mon Sep 17 00:00:00 2001 From: Arthur Jones Date: Thu, 2 Aug 2007 14:46:29 -0700 Subject: IB/ipath: iba6110 rev4 GPIO counters support On iba6110 rev4, support for three more IB counters were added. The LocalLinkIntegrityError counter, the ExcessiveBufferOverrunErrors counter and support for error counting of flow control packets on an invalid VL. These counters trigger GPIO interrupts and the sw keeps track of the counts. Since we also use GPIO interrupts to signal packet reception, we need to turn off the fast interrupts, or we risk losing a GPIO interrupt. Signed-off-by: Arthur Jones Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index 650745d..e1c5998 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -1559,6 +1559,14 @@ static int ipath_ht_early_init(struct ipath_devdata *dd) ipath_dev_err(dd, "Unsupported InfiniPath serial " "number %.16s!\n", dd->ipath_serial); + if (dd->ipath_minrev >= 4) { + /* Rev4+ reports extra errors via internal GPIO pins */ + dd->ipath_flags |= IPATH_GPIO_ERRINTRS; + dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + return 0; } diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index b29fe7e..11b3614 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -1085,8 +1085,8 @@ irqreturn_t ipath_intr(int irq, void *data) * GPIO_2 indicates (on some HT4xx boards) that a packet * has arrived for Port 0. Checking for this * is controlled by flag IPATH_GPIO_INTR. - * GPIO_3..5 on IBA6120 Rev2 chips indicate errors - * that we need to count. Checking for this + * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate + * errors that we need to count. Checking for this * is controlled by flag IPATH_GPIO_ERRINTRS. */ u32 gpiostatus; -- cgit v0.10.2 From 210d6ca3db058cd1d6e6fd235ee3e25d6ac221cd Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 24 Jul 2007 13:55:39 -0700 Subject: IB/ipath: Performance optimization for CPU differences Different processors have different ordering restrictions for write combining. By taking advantage of this, we can eliminate some write barriers when writing to the send buffers. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index cf25cda..4137c77 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -446,19 +446,21 @@ static ssize_t ipath_diagpkt_write(struct file *fp, dd->ipath_unit, plen - 1, pbufn); if (dp.pbc_wd == 0) - /* Legacy operation, use computed pbc_wd */ dp.pbc_wd = plen; - - /* we have to flush after the PBC for correctness on some cpus - * or WC buffer can be written out of order */ writeq(dp.pbc_wd, piobuf); - ipath_flush_wc(); - /* copy all by the trigger word, then flush, so it's written + /* + * Copy all by the trigger word, then flush, so it's written * to chip before trigger word, then write trigger word, then - * flush again, so packet is sent. */ - __iowrite32_copy(piobuf + 2, tmpbuf, clen - 1); - ipath_flush_wc(); - __raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); + * flush again, so packet is sent. + */ + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, tmpbuf, clen - 1); + ipath_flush_wc(); + __raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); + } else + __iowrite32_copy(piobuf + 2, tmpbuf, clen); + ipath_flush_wc(); ret = sizeof(dp); diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index 5b6ac9a..a324c6f 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -1273,6 +1273,8 @@ static void ipath_pe_tidtemplate(struct ipath_devdata *dd) static int ipath_pe_early_init(struct ipath_devdata *dd) { dd->ipath_flags |= IPATH_4BYTE_TID; + if (ipath_unordered_wc()) + dd->ipath_flags |= IPATH_PIO_FLUSH_WC; /* * For openfabrics, we need to be able to handle an IB header of diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 7a7966f..d983f92 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -724,6 +724,8 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv); #define IPATH_LINKACTIVE 0x200 /* link current state is unknown */ #define IPATH_LINKUNK 0x400 + /* Write combining flush needed for PIO */ +#define IPATH_PIO_FLUSH_WC 0x1000 /* no IB cable, or no device on IB cable */ #define IPATH_NOCABLE 0x4000 /* Supports port zero per packet receive interrupts via diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 16aa61f..559d4a6 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -631,7 +631,7 @@ static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) #endif static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, - u32 length) + u32 length, unsigned flush_wc) { u32 extra = 0; u32 data = 0; @@ -757,11 +757,14 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, } /* Update address before sending packet. */ update_sge(ss, length); - /* must flush early everything before trigger word */ - ipath_flush_wc(); - __raw_writel(last, piobuf); - /* be sure trigger word is written */ - ipath_flush_wc(); + if (flush_wc) { + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __raw_writel(last, piobuf); } /** @@ -776,6 +779,7 @@ int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, u32 *hdr, u32 len, struct ipath_sge_state *ss) { u32 __iomem *piobuf; + unsigned flush_wc; u32 plen; int ret; @@ -799,47 +803,55 @@ int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, * or WC buffer can be written out of order. */ writeq(plen, piobuf); - ipath_flush_wc(); piobuf += 2; + + flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC; if (len == 0) { /* * If there is just the header portion, must flush before * writing last word of header for correctness, and after * the last header word (trigger word). */ - __iowrite32_copy(piobuf, hdr, hdrwords - 1); - ipath_flush_wc(); - __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); - ipath_flush_wc(); - ret = 0; - goto bail; + if (flush_wc) { + ipath_flush_wc(); + __iowrite32_copy(piobuf, hdr, hdrwords - 1); + ipath_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, hdr, hdrwords); + goto done; } + if (flush_wc) + ipath_flush_wc(); __iowrite32_copy(piobuf, hdr, hdrwords); piobuf += hdrwords; /* The common case is aligned and contained in one segment. */ if (likely(ss->num_sge == 1 && len <= ss->sge.length && !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { - u32 w; + u32 dwords; u32 *addr = (u32 *) ss->sge.vaddr; /* Update address before sending packet. */ update_sge(ss, len); /* Need to round up for the last dword in the packet. */ - w = (len + 3) >> 2; - __iowrite32_copy(piobuf, addr, w - 1); - /* must flush early everything before trigger word */ - ipath_flush_wc(); - __raw_writel(addr[w - 1], piobuf + w - 1); - /* be sure trigger word is written */ - ipath_flush_wc(); - ret = 0; - goto bail; + dwords = (len + 3) >> 2; + if (flush_wc) { + __iowrite32_copy(piobuf, addr, dwords - 1); + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(addr[dwords - 1], piobuf + dwords - 1); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, addr, dwords); + goto done; } - copy_io(piobuf, ss, len); + copy_io(piobuf, ss, len, flush_wc); +done: ret = 0; - bail: return ret; } -- cgit v0.10.2 From 4ee97180ac76deb5a715ac45b7d7516e6ee82ae7 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 25 Jul 2007 11:08:28 -0700 Subject: IB/ipath: Change UD to queue work requests like RC & UC The code to post UD sends tried to process work requests at the time ib_post_send() is called without using a WQE queue. This was fine as long as HW resources were available for sending a packet. This patch changes UD to be handled more like RC and UC and shares more code. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 1324b35..a8c4a6b 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -338,6 +338,7 @@ static void ipath_reset_qp(struct ipath_qp *qp) qp->s_busy = 0; qp->s_flags &= IPATH_S_SIGNAL_REQ_WR; qp->s_hdrwords = 0; + qp->s_wqe = NULL; qp->s_psn = 0; qp->r_psn = 0; qp->r_msn = 0; @@ -751,6 +752,9 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, switch (init_attr->qp_type) { case IB_QPT_UC: case IB_QPT_RC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: sz = sizeof(struct ipath_sge) * init_attr->cap.max_send_sge + sizeof(struct ipath_swqe); @@ -759,10 +763,6 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, ret = ERR_PTR(-ENOMEM); goto bail; } - /* FALLTHROUGH */ - case IB_QPT_UD: - case IB_QPT_SMI: - case IB_QPT_GSI: sz = sizeof(*qp); if (init_attr->srq) { struct ipath_srq *srq = to_isrq(init_attr->srq); @@ -805,8 +805,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); init_waitqueue_head(&qp->wait); - tasklet_init(&qp->s_task, ipath_do_ruc_send, - (unsigned long)qp); + tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp); INIT_LIST_HEAD(&qp->piowait); INIT_LIST_HEAD(&qp->timerwait); qp->state = IB_QPS_RESET; diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index 46744ea..53259da 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -81,9 +81,8 @@ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) * Note that we are in the responder's side of the QP context. * Note the QP s_lock must be held. */ -static int ipath_make_rc_ack(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p) +static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, u32 pmtu) { struct ipath_ack_entry *e; u32 hwords; @@ -192,8 +191,7 @@ static int ipath_make_rc_ack(struct ipath_qp *qp, } qp->s_hdrwords = hwords; qp->s_cur_size = len; - *bth0p = bth0 | (1 << 22); /* Set M bit */ - *bth2p = bth2; + ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2); return 1; bail: @@ -203,32 +201,39 @@ bail: /** * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) * @qp: a pointer to the QP - * @ohdr: a pointer to the IB header being constructed - * @pmtu: the path MTU - * @bth0p: pointer to the BTH opcode word - * @bth2p: pointer to the BTH PSN word * * Return 1 if constructed; otherwise, return 0. - * Note the QP s_lock must be held and interrupts disabled. */ -int ipath_make_rc_req(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p) +int ipath_make_rc_req(struct ipath_qp *qp) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; struct ipath_sge_state *ss; struct ipath_swqe *wqe; u32 hwords; u32 len; u32 bth0; u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); char newreq; + unsigned long flags; + int ret = 0; + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + + /* + * The lock is needed to synchronize between the sending tasklet, + * the receive interrupt handler, and timeout resends. + */ + spin_lock_irqsave(&qp->s_lock, flags); /* Sending responses has higher priority over sending requests. */ if ((qp->r_head_ack_queue != qp->s_tail_ack_queue || (qp->s_flags & IPATH_S_ACK_PENDING) || qp->s_ack_state != OP(ACKNOWLEDGE)) && - ipath_make_rc_ack(qp, ohdr, pmtu, bth0p, bth2p)) + ipath_make_rc_ack(dev, qp, ohdr, pmtu)) goto done; if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) || @@ -560,13 +565,12 @@ int ipath_make_rc_req(struct ipath_qp *qp, qp->s_hdrwords = hwords; qp->s_cur_sge = ss; qp->s_cur_size = len; - *bth0p = bth0 | (qp->s_state << 24); - *bth2p = bth2; + ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2); done: - return 1; - + ret = 1; bail: - return 0; + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; } /** @@ -627,7 +631,7 @@ static void send_rc_ack(struct ipath_qp *qp) /* * If we can send the ACK, clear the ACK state. */ - if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) { + if (ipath_verbs_send(qp, &hdr, hwords, NULL, 0) == 0) { dev->n_unicast_xmit++; goto done; } @@ -757,7 +761,9 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) wc->vendor_err = 0; wc->byte_len = 0; wc->qp = &qp->ibqp; + wc->imm_data = 0; wc->src_qp = qp->remote_qpn; + wc->wc_flags = 0; wc->pkey_index = 0; wc->slid = qp->remote_ah_attr.dlid; wc->sl = qp->remote_ah_attr.sl; @@ -1041,7 +1047,9 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, wc.vendor_err = 0; wc.byte_len = 0; wc.qp = &qp->ibqp; + wc.imm_data = 0; wc.src_qp = qp->remote_qpn; + wc.wc_flags = 0; wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; @@ -1454,6 +1462,19 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, goto send_ack; } /* + * Try to send a simple ACK to work around a Mellanox bug + * which doesn't accept a RDMA read response or atomic + * response as an ACK for earlier SENDs or RDMA writes. + */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue && + !(qp->s_flags & IPATH_S_ACK_PENDING) && + qp->s_ack_state == OP(ACKNOWLEDGE)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; + goto send_ack; + } + /* * Resend the RDMA read or atomic op which * ACKs this duplicate request. */ diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index c69c252..4b6b7ee 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -31,6 +31,8 @@ * SOFTWARE. */ +#include + #include "ipath_verbs.h" #include "ipath_kernel.h" @@ -106,27 +108,30 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp) spin_unlock_irqrestore(&dev->pending_lock, flags); } -static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe) +/** + * ipath_init_sge - Validate a RWQE and fill in the SGE state + * @qp: the QP + * + * Return 1 if OK. + */ +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss) { - int user = to_ipd(qp->ibqp.pd)->user; int i, j, ret; struct ib_wc wc; - qp->r_len = 0; + *lengthp = 0; for (i = j = 0; i < wqe->num_sge; i++) { if (wqe->sg_list[i].length == 0) continue; /* Check LKEY */ - if ((user && wqe->sg_list[i].lkey == 0) || - !ipath_lkey_ok(qp, &qp->r_sg_list[j], &wqe->sg_list[i], - IB_ACCESS_LOCAL_WRITE)) + if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) goto bad_lkey; - qp->r_len += wqe->sg_list[i].length; + *lengthp += wqe->sg_list[i].length; j++; } - qp->r_sge.sge = qp->r_sg_list[0]; - qp->r_sge.sg_list = qp->r_sg_list + 1; - qp->r_sge.num_sge = j; + ss->num_sge = j; ret = 1; goto bail; @@ -172,6 +177,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) u32 tail; int ret; + qp->r_sge.sg_list = qp->r_sg_list; + if (qp->ibqp.srq) { srq = to_isrq(qp->ibqp.srq); handler = srq->ibsrq.event_handler; @@ -199,7 +206,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) wqe = get_rwqe_ptr(rq, tail); if (++tail >= rq->size) tail = 0; - } while (!wr_id_only && !init_sge(qp, wqe)); + } while (!wr_id_only && !ipath_init_sge(qp, wqe, &qp->r_len, + &qp->r_sge)); qp->r_wr_id = wqe->wr_id; wq->tail = tail; @@ -239,9 +247,9 @@ bail: /** * ipath_ruc_loopback - handle UC and RC lookback requests - * @sqp: the loopback QP + * @sqp: the sending QP * - * This is called from ipath_do_uc_send() or ipath_do_rc_send() to + * This is called from ipath_do_send() to * forward a WQE addressed to the same HCA. * Note that although we are single threaded due to the tasklet, we still * have to protect against post_send(). We don't have to worry about @@ -450,40 +458,18 @@ again: wc.byte_len = wqe->length; wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - /* XXX do we know which pkey matched? Only needed for GSI. */ wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; wc.dlid_path_bits = 0; + wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - - if (!(sqp->s_flags & IPATH_S_SIGNAL_REQ_WR) || - (wqe->wr.send_flags & IB_SEND_SIGNALED)) { - wc.wr_id = wqe->wr.wr_id; - wc.status = IB_WC_SUCCESS; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; - wc.byte_len = wqe->length; - wc.qp = &sqp->ibqp; - wc.src_qp = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; - ipath_cq_enter(to_icq(sqp->ibqp.send_cq), &wc, 0); - } - - /* Update s_last now that we are finished with the SWQE */ - spin_lock_irqsave(&sqp->s_lock, flags); - if (++sqp->s_last >= sqp->s_size) - sqp->s_last = 0; - spin_unlock_irqrestore(&sqp->s_lock, flags); + ipath_send_complete(sqp, wqe, IB_WC_SUCCESS); goto again; done: @@ -491,13 +477,11 @@ done: wake_up(&qp->wait); } -static int want_buffer(struct ipath_devdata *dd) +static void want_buffer(struct ipath_devdata *dd) { set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); - - return 0; } /** @@ -507,14 +491,11 @@ static int want_buffer(struct ipath_devdata *dd) * * Called when we run out of PIO buffers. */ -static void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev) +static void ipath_no_bufs_available(struct ipath_qp *qp, + struct ipath_ibdev *dev) { unsigned long flags; - spin_lock_irqsave(&dev->pending_lock, flags); - if (list_empty(&qp->piowait)) - list_add_tail(&qp->piowait, &dev->piowait); - spin_unlock_irqrestore(&dev->pending_lock, flags); /* * Note that as soon as want_buffer() is called and * possibly before it returns, ipath_ib_piobufavail() @@ -524,101 +505,14 @@ static void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev * We leave the busy flag set so that another post send doesn't * try to put the same QP on the piowait list again. */ + spin_lock_irqsave(&dev->pending_lock, flags); + list_add_tail(&qp->piowait, &dev->piowait); + spin_unlock_irqrestore(&dev->pending_lock, flags); want_buffer(dev->dd); dev->n_piowait++; } /** - * ipath_post_ruc_send - post RC and UC sends - * @qp: the QP to post on - * @wr: the work request to send - */ -int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr) -{ - struct ipath_swqe *wqe; - unsigned long flags; - u32 next; - int i, j; - int acc; - int ret; - - /* - * Don't allow RDMA reads or atomic operations on UC or - * undefined operations. - * Make sure buffer is large enough to hold the result for atomics. - */ - if (qp->ibqp.qp_type == IB_QPT_UC) { - if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) { - ret = -EINVAL; - goto bail; - } - } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) { - ret = -EINVAL; - goto bail; - } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && - (wr->num_sge == 0 || - wr->sg_list[0].length < sizeof(u64) || - wr->sg_list[0].addr & (sizeof(u64) - 1))) { - ret = -EINVAL; - goto bail; - } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) { - ret = -EINVAL; - goto bail; - } - /* IB spec says that num_sge == 0 is OK. */ - if (wr->num_sge > qp->s_max_sge) { - ret = -ENOMEM; - goto bail; - } - spin_lock_irqsave(&qp->s_lock, flags); - next = qp->s_head + 1; - if (next >= qp->s_size) - next = 0; - if (next == qp->s_last) { - spin_unlock_irqrestore(&qp->s_lock, flags); - ret = -EINVAL; - goto bail; - } - - wqe = get_swqe_ptr(qp, qp->s_head); - wqe->wr = *wr; - wqe->ssn = qp->s_ssn++; - wqe->sg_list[0].mr = NULL; - wqe->sg_list[0].vaddr = NULL; - wqe->sg_list[0].length = 0; - wqe->sg_list[0].sge_length = 0; - wqe->length = 0; - acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0; - for (i = 0, j = 0; i < wr->num_sge; i++) { - if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) { - spin_unlock_irqrestore(&qp->s_lock, flags); - ret = -EINVAL; - goto bail; - } - if (wr->sg_list[i].length == 0) - continue; - if (!ipath_lkey_ok(qp, &wqe->sg_list[j], &wr->sg_list[i], - acc)) { - spin_unlock_irqrestore(&qp->s_lock, flags); - ret = -EINVAL; - goto bail; - } - wqe->length += wr->sg_list[i].length; - j++; - } - wqe->wr.num_sge = j; - qp->s_head = next; - spin_unlock_irqrestore(&qp->s_lock, flags); - - ipath_do_ruc_send((unsigned long) qp); - - ret = 0; - -bail: - return ret; -} - -/** * ipath_make_grh - construct a GRH header * @dev: a pointer to the ipath device * @hdr: a pointer to the GRH header being constructed @@ -648,39 +542,66 @@ u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, return sizeof(struct ib_grh) / sizeof(u32); } +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2) +{ + u16 lrh0; + u32 nwords; + u32 extra_bytes; + + /* Construct the header. */ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = IPATH_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + } + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid); + bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22)); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); +} + /** - * ipath_do_ruc_send - perform a send on an RC or UC QP + * ipath_do_send - perform a send on a QP * @data: contains a pointer to the QP * * Process entries in the send work queue until credit or queue is * exhausted. Only allow one CPU to send a packet per QP (tasklet). - * Otherwise, after we drop the QP s_lock, two threads could send - * packets out of order. + * Otherwise, two threads could send packets out of order. */ -void ipath_do_ruc_send(unsigned long data) +void ipath_do_send(unsigned long data) { struct ipath_qp *qp = (struct ipath_qp *)data; struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - unsigned long flags; - u16 lrh0; - u32 nwords; - u32 extra_bytes; - u32 bth0; - u32 bth2; - u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); - struct ipath_other_headers *ohdr; + int (*make_req)(struct ipath_qp *qp); if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy)) goto bail; - if (unlikely(qp->remote_ah_attr.dlid == dev->dd->ipath_lid)) { + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + qp->remote_ah_attr.dlid == dev->dd->ipath_lid) { ipath_ruc_loopback(qp); goto clear; } - ohdr = &qp->s_hdr.u.oth; - if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) - ohdr = &qp->s_hdr.u.l.oth; + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = ipath_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = ipath_make_uc_req; + else + make_req = ipath_make_ud_req; again: /* Check for a constructed packet to be sent. */ @@ -689,9 +610,8 @@ again: * If no PIO bufs are available, return. An interrupt will * call ipath_ib_piobufavail() when one is available. */ - if (ipath_verbs_send(dev->dd, qp->s_hdrwords, - (u32 *) &qp->s_hdr, qp->s_cur_size, - qp->s_cur_sge)) { + if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) { ipath_no_bufs_available(qp, dev); goto bail; } @@ -700,54 +620,42 @@ again: qp->s_hdrwords = 0; } - /* - * The lock is needed to synchronize between setting - * qp->s_ack_state, resend timer, and post_send(). - */ - spin_lock_irqsave(&qp->s_lock, flags); - - if (!((qp->ibqp.qp_type == IB_QPT_RC) ? - ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2) : - ipath_make_uc_req(qp, ohdr, pmtu, &bth0, &bth2))) { - /* - * Clear the busy bit before unlocking to avoid races with - * adding new work queue items and then failing to process - * them. - */ - clear_bit(IPATH_S_BUSY, &qp->s_busy); - spin_unlock_irqrestore(&qp->s_lock, flags); - goto bail; - } + if (make_req(qp)) + goto again; +clear: + clear_bit(IPATH_S_BUSY, &qp->s_busy); +bail:; +} - spin_unlock_irqrestore(&qp->s_lock, flags); +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status) +{ + u32 last = qp->s_last; - /* Construct the header. */ - extra_bytes = (4 - qp->s_cur_size) & 3; - nwords = (qp->s_cur_size + extra_bytes) >> 2; - lrh0 = IPATH_LRH_BTH; - if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { - qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, - &qp->remote_ah_attr.grh, - qp->s_hdrwords, nwords); - lrh0 = IPATH_LRH_GRH; - } - lrh0 |= qp->remote_ah_attr.sl << 4; - qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); - qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); - qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + - SIZE_OF_CRC); - qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid); - bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index); - bth0 |= extra_bytes << 20; - ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); - ohdr->bth[2] = cpu_to_be32(bth2); + if (++last == qp->s_size) + last = 0; + qp->s_last = last; - /* Check for more work to do. */ - goto again; + /* See ch. 11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; -clear: - clear_bit(IPATH_S_BUSY, &qp->s_busy); -bail: - return; + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.vendor_err = 0; + wc.byte_len = wqe->length; + wc.imm_data = 0; + wc.qp = &qp->ibqp; + wc.src_qp = 0; + wc.wc_flags = 0; + wc.pkey_index = 0; + wc.slid = 0; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } } diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c index 8380fbc..767beb9 100644 --- a/drivers/infiniband/hw/ipath/ipath_uc.c +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -37,72 +37,40 @@ /* cut down ridiculously long IB macro names */ #define OP(x) IB_OPCODE_UC_##x -static void complete_last_send(struct ipath_qp *qp, struct ipath_swqe *wqe, - struct ib_wc *wc) -{ - if (++qp->s_last == qp->s_size) - qp->s_last = 0; - if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || - (wqe->wr.send_flags & IB_SEND_SIGNALED)) { - wc->wr_id = wqe->wr.wr_id; - wc->status = IB_WC_SUCCESS; - wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc->vendor_err = 0; - wc->byte_len = wqe->length; - wc->qp = &qp->ibqp; - wc->src_qp = qp->remote_qpn; - wc->pkey_index = 0; - wc->slid = qp->remote_ah_attr.dlid; - wc->sl = qp->remote_ah_attr.sl; - wc->dlid_path_bits = 0; - wc->port_num = 0; - ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 0); - } -} - /** * ipath_make_uc_req - construct a request packet (SEND, RDMA write) * @qp: a pointer to the QP - * @ohdr: a pointer to the IB header being constructed - * @pmtu: the path MTU - * @bth0p: pointer to the BTH opcode word - * @bth2p: pointer to the BTH PSN word * * Return 1 if constructed; otherwise, return 0. - * Note the QP s_lock must be held and interrupts disabled. */ -int ipath_make_uc_req(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p) +int ipath_make_uc_req(struct ipath_qp *qp) { + struct ipath_other_headers *ohdr; struct ipath_swqe *wqe; u32 hwords; u32 bth0; u32 len; - struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int ret = 0; if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) goto done; + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; bth0 = 1 << 22; /* Set M bit */ /* Get the next send request. */ - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; switch (qp->s_state) { default: - /* - * Signal the completion of the last send - * (if there is one). - */ - if (qp->s_last != qp->s_tail) { - complete_last_send(qp, wqe, &wc); - wqe = get_swqe_ptr(qp, qp->s_last); - } - /* Check if send work queue is empty. */ - if (qp->s_tail == qp->s_head) + if (qp->s_cur == qp->s_head) goto done; /* * Start a new request. @@ -131,6 +99,9 @@ int ipath_make_uc_req(struct ipath_qp *qp, } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; break; case IB_WR_RDMA_WRITE: @@ -157,13 +128,14 @@ int ipath_make_uc_req(struct ipath_qp *qp, if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; break; default: goto done; } - if (++qp->s_tail >= qp->s_size) - qp->s_tail = 0; break; case OP(SEND_FIRST): @@ -185,6 +157,9 @@ int ipath_make_uc_req(struct ipath_qp *qp, } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; break; case OP(RDMA_WRITE_FIRST): @@ -207,18 +182,22 @@ int ipath_make_uc_req(struct ipath_qp *qp, if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; break; } qp->s_len -= len; qp->s_hdrwords = hwords; qp->s_cur_sge = &qp->s_sge; qp->s_cur_size = len; - *bth0p = bth0 | (qp->s_state << 24); - *bth2p = qp->s_next_psn++ & IPATH_PSN_MASK; - return 1; + ipath_make_ruc_header(to_idev(qp->ibqp.device), + qp, ohdr, bth0 | (qp->s_state << 24), + qp->s_next_psn++ & IPATH_PSN_MASK); + ret = 1; done: - return 0; + return ret; } /** diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index f9a3338..34c4a0a 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -36,68 +36,17 @@ #include "ipath_verbs.h" #include "ipath_kernel.h" -static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, - u32 *lengthp, struct ipath_sge_state *ss) -{ - int user = to_ipd(qp->ibqp.pd)->user; - int i, j, ret; - struct ib_wc wc; - - *lengthp = 0; - for (i = j = 0; i < wqe->num_sge; i++) { - if (wqe->sg_list[i].length == 0) - continue; - /* Check LKEY */ - if ((user && wqe->sg_list[i].lkey == 0) || - !ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) - goto bad_lkey; - *lengthp += wqe->sg_list[i].length; - j++; - } - ss->num_sge = j; - ret = 1; - goto bail; - -bad_lkey: - wc.wr_id = wqe->wr_id; - wc.status = IB_WC_LOC_PROT_ERR; - wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.imm_data = 0; - wc.qp = &qp->ibqp; - wc.src_qp = 0; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; - /* Signal solicited completion event. */ - ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); - ret = 0; -bail: - return ret; -} - /** * ipath_ud_loopback - handle send on loopback QPs - * @sqp: the QP - * @ss: the SGE state - * @length: the length of the data to send - * @wr: the work request - * @wc: the work completion entry + * @sqp: the sending QP + * @swqe: the send work request * - * This is called from ipath_post_ud_send() to forward a WQE addressed + * This is called from ipath_make_ud_req() to forward a WQE addressed * to the same HCA. * Note that the receive interrupt handler may be calling ipath_ud_rcv() * while this is being called. */ -static void ipath_ud_loopback(struct ipath_qp *sqp, - struct ipath_sge_state *ss, - u32 length, struct ib_send_wr *wr, - struct ib_wc *wc) +static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) { struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); struct ipath_qp *qp; @@ -110,12 +59,18 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_rwq *wq; struct ipath_rwqe *wqe; void (*handler)(struct ib_event *, void *); + struct ib_wc wc; u32 tail; u32 rlen; + u32 length; - qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn); - if (!qp) - return; + qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn); + if (!qp) { + dev->n_pkt_drops++; + goto send_comp; + } + + rsge.sg_list = NULL; /* * Check that the qkey matches (except for QP0, see 9.6.1.4.1). @@ -123,39 +78,34 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, * qkey from the QP context instead of the WR (see 10.2.5). */ if (unlikely(qp->ibqp.qp_num && - ((int) wr->wr.ud.remote_qkey < 0 - ? qp->qkey : wr->wr.ud.remote_qkey) != qp->qkey)) { + ((int) swqe->wr.wr.ud.remote_qkey < 0 ? + sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) { /* XXX OK to lose a count once in a while. */ dev->qkey_violations++; dev->n_pkt_drops++; - goto done; + goto drop; } /* * A GRH is expected to preceed the data even if not * present on the wire. */ - wc->byte_len = length + sizeof(struct ib_grh); + length = swqe->length; + wc.byte_len = length + sizeof(struct ib_grh); - if (wr->opcode == IB_WR_SEND_WITH_IMM) { - wc->wc_flags = IB_WC_WITH_IMM; - wc->imm_data = wr->imm_data; + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.imm_data = swqe->wr.imm_data; } else { - wc->wc_flags = 0; - wc->imm_data = 0; + wc.wc_flags = 0; + wc.imm_data = 0; } - if (wr->num_sge > 1) { - rsge.sg_list = kmalloc((wr->num_sge - 1) * - sizeof(struct ipath_sge), - GFP_ATOMIC); - } else - rsge.sg_list = NULL; - /* - * Get the next work request entry to find where to put the data. - * Note that it is safe to drop the lock after changing rq->tail - * since ipath_post_receive() won't fill the empty slot. + * This would be a lot simpler if we could call ipath_get_rwqe() + * but that uses state that the receive interrupt handler uses + * so we would need to lock out receive interrupts while doing + * local loopback. */ if (qp->ibqp.srq) { srq = to_isrq(qp->ibqp.srq); @@ -167,32 +117,53 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, rq = &qp->r_rq; } + if (rq->max_sge > 1) { + /* + * XXX We could use GFP_KERNEL if ipath_do_send() + * was always called from the tasklet instead of + * from ipath_post_send(). + */ + rsge.sg_list = kmalloc((rq->max_sge - 1) * + sizeof(struct ipath_sge), + GFP_ATOMIC); + if (!rsge.sg_list) { + dev->n_pkt_drops++; + goto drop; + } + } + + /* + * Get the next work request entry to find where to put the data. + * Note that it is safe to drop the lock after changing rq->tail + * since ipath_post_receive() won't fill the empty slot. + */ spin_lock_irqsave(&rq->lock, flags); wq = rq->wq; tail = wq->tail; - while (1) { - if (unlikely(tail == wq->head)) { - spin_unlock_irqrestore(&rq->lock, flags); - dev->n_pkt_drops++; - goto bail_sge; - } - /* Make sure entry is read after head index is read. */ - smp_rmb(); - wqe = get_rwqe_ptr(rq, tail); - if (++tail >= rq->size) - tail = 0; - if (init_sge(qp, wqe, &rlen, &rsge)) - break; - wq->tail = tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + wqe = get_rwqe_ptr(rq, tail); + if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; } /* Silently drop packets which are too big. */ - if (wc->byte_len > rlen) { + if (wc.byte_len > rlen) { spin_unlock_irqrestore(&rq->lock, flags); dev->n_pkt_drops++; - goto bail_sge; + goto drop; } + if (++tail >= rq->size) + tail = 0; wq->tail = tail; - wc->wr_id = wqe->wr_id; + wc.wr_id = wqe->wr_id; if (handler) { u32 n; @@ -221,13 +192,13 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, } else spin_unlock_irqrestore(&rq->lock, flags); - ah_attr = &to_iah(wr->wr.ud.ah)->attr; + ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr; if (ah_attr->ah_flags & IB_AH_GRH) { ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh)); - wc->wc_flags |= IB_WC_GRH; + wc.wc_flags |= IB_WC_GRH; } else ipath_skip_sge(&rsge, sizeof(struct ib_grh)); - sge = &ss->sge; + sge = swqe->sg_list; while (length) { u32 len = sge->length; @@ -241,8 +212,8 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, sge->length -= len; sge->sge_length -= len; if (sge->sge_length == 0) { - if (--ss->num_sge) - *sge = *ss->sg_list++; + if (--swqe->wr.num_sge) + sge++; } else if (sge->length == 0 && sge->mr != NULL) { if (++sge->n >= IPATH_SEGSZ) { if (++sge->m >= sge->mr->mapsz) @@ -256,123 +227,60 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, } length -= len; } - wc->status = IB_WC_SUCCESS; - wc->opcode = IB_WC_RECV; - wc->vendor_err = 0; - wc->qp = &qp->ibqp; - wc->src_qp = sqp->ibqp.qp_num; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; /* XXX do we know which pkey matched? Only needed for GSI. */ - wc->pkey_index = 0; - wc->slid = dev->dd->ipath_lid | + wc.pkey_index = 0; + wc.slid = dev->dd->ipath_lid | (ah_attr->src_path_bits & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1)); - wc->sl = ah_attr->sl; - wc->dlid_path_bits = + wc.sl = ah_attr->sl; + wc.dlid_path_bits = ah_attr->dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ - ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc, - wr->send_flags & IB_SEND_SOLICITED); - -bail_sge: + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); +drop: kfree(rsge.sg_list); -done: if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); +send_comp: + ipath_send_complete(sqp, swqe, IB_WC_SUCCESS); } /** - * ipath_post_ud_send - post a UD send on QP + * ipath_make_ud_req - construct a UD request packet * @qp: the QP - * @wr: the work request * - * Note that we actually send the data as it is posted instead of putting - * the request into a ring buffer. If we wanted to use a ring buffer, - * we would need to save a reference to the destination address in the SWQE. + * Return 1 if constructed; otherwise, return 0. */ -int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) +int ipath_make_ud_req(struct ipath_qp *qp) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ipath_other_headers *ohdr; struct ib_ah_attr *ah_attr; - struct ipath_sge_state ss; - struct ipath_sge *sg_list; - struct ib_wc wc; - u32 hwords; + struct ipath_swqe *wqe; u32 nwords; - u32 len; u32 extra_bytes; u32 bth0; u16 lrh0; u16 lid; - int i; - int ret; + int ret = 0; - if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { - ret = 0; + if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK))) goto bail; - } - if (wr->wr.ud.ah->pd != qp->ibqp.pd) { - ret = -EPERM; + if (qp->s_cur == qp->s_head) goto bail; - } - /* IB spec says that num_sge == 0 is OK. */ - if (wr->num_sge > qp->s_max_sge) { - ret = -EINVAL; - goto bail; - } - - if (wr->num_sge > 1) { - sg_list = kmalloc((qp->s_max_sge - 1) * sizeof(*sg_list), - GFP_ATOMIC); - if (!sg_list) { - ret = -ENOMEM; - goto bail; - } - } else - sg_list = NULL; - - /* Check the buffer to send. */ - ss.sg_list = sg_list; - ss.sge.mr = NULL; - ss.sge.vaddr = NULL; - ss.sge.length = 0; - ss.sge.sge_length = 0; - ss.num_sge = 0; - len = 0; - for (i = 0; i < wr->num_sge; i++) { - /* Check LKEY */ - if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) { - ret = -EINVAL; - goto bail; - } - - if (wr->sg_list[i].length == 0) - continue; - if (!ipath_lkey_ok(qp, ss.num_sge ? - sg_list + ss.num_sge - 1 : &ss.sge, - &wr->sg_list[i], 0)) { - ret = -EINVAL; - goto bail; - } - len += wr->sg_list[i].length; - ss.num_sge++; - } - /* Check for invalid packet size. */ - if (len > dev->dd->ipath_ibmtu) { - ret = -EINVAL; - goto bail; - } - extra_bytes = (4 - len) & 3; - nwords = (len + extra_bytes) >> 2; + wqe = get_swqe_ptr(qp, qp->s_cur); /* Construct the header. */ - ah_attr = &to_iah(wr->wr.ud.ah)->attr; - if (ah_attr->dlid == 0) { - ret = -EINVAL; - goto bail; - } + ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) { if (ah_attr->dlid != IPATH_PERMISSIVE_LID) dev->n_multicast_xmit++; @@ -383,64 +291,53 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) lid = ah_attr->dlid & ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); if (unlikely(lid == dev->dd->ipath_lid)) { - /* - * Pass in an uninitialized ib_wc to save stack - * space. - */ - ipath_ud_loopback(qp, &ss, len, wr, &wc); + ipath_ud_loopback(qp, wqe); goto done; } } + + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + qp->s_hdrwords++; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + if (ah_attr->ah_flags & IB_AH_GRH) { /* Header size in 32-bit words. */ - hwords = 17; + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); lrh0 = IPATH_LRH_GRH; ohdr = &qp->s_hdr.u.l.oth; - qp->s_hdr.u.l.grh.version_tclass_flow = - cpu_to_be32((6 << 28) | - (ah_attr->grh.traffic_class << 20) | - ah_attr->grh.flow_label); - qp->s_hdr.u.l.grh.paylen = - cpu_to_be16(((wr->opcode == - IB_WR_SEND_WITH_IMM ? 6 : 5) + - nwords + SIZE_OF_CRC) << 2); - /* next_hdr is defined by C8-7 in ch. 8.4.1 */ - qp->s_hdr.u.l.grh.next_hdr = 0x1B; - qp->s_hdr.u.l.grh.hop_limit = ah_attr->grh.hop_limit; - /* The SGID is 32-bit aligned. */ - qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = - dev->gid_prefix; - qp->s_hdr.u.l.grh.sgid.global.interface_id = - dev->dd->ipath_guid; - qp->s_hdr.u.l.grh.dgid = ah_attr->grh.dgid; /* * Don't worry about sending to locally attached multicast * QPs. It is unspecified by the spec. what happens. */ } else { /* Header size in 32-bit words. */ - hwords = 7; lrh0 = IPATH_LRH_BTH; ohdr = &qp->s_hdr.u.oth; } - if (wr->opcode == IB_WR_SEND_WITH_IMM) { - ohdr->u.ud.imm_data = wr->imm_data; - wc.imm_data = wr->imm_data; - hwords += 1; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + ohdr->u.ud.imm_data = wqe->wr.imm_data; bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; - } else if (wr->opcode == IB_WR_SEND) { - wc.imm_data = 0; + } else bth0 = IB_OPCODE_UD_SEND_ONLY << 24; - } else { - ret = -EINVAL; - goto bail; - } lrh0 |= ah_attr->sl << 4; if (qp->ibqp.qp_type == IB_QPT_SMI) lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ - qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + + SIZE_OF_CRC); lid = dev->dd->ipath_lid; if (lid) { lid |= ah_attr->src_path_bits & @@ -448,7 +345,7 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) qp->s_hdr.lrh[3] = cpu_to_be16(lid); } else qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE; - if (wr->send_flags & IB_SEND_SOLICITED) + if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; bth0 |= extra_bytes << 20; bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY : @@ -460,38 +357,20 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && ah_attr->dlid != IPATH_PERMISSIVE_LID ? __constant_cpu_to_be32(IPATH_MULTICAST_QPN) : - cpu_to_be32(wr->wr.ud.remote_qpn); - /* XXX Could lose a PSN count but not worth locking */ + cpu_to_be32(wqe->wr.wr.ud.remote_qpn); ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wr->wr.ud.remote_qkey < 0 ? - qp->qkey : wr->wr.ud.remote_qkey); + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ? + qp->qkey : wqe->wr.wr.ud.remote_qkey); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); - if (ipath_verbs_send(dev->dd, hwords, (u32 *) &qp->s_hdr, - len, &ss)) - dev->n_no_piobuf++; done: - /* Queue the completion status entry. */ - if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || - (wr->send_flags & IB_SEND_SIGNALED)) { - wc.wr_id = wr->wr_id; - wc.status = IB_WC_SUCCESS; - wc.vendor_err = 0; - wc.opcode = IB_WC_SEND; - wc.byte_len = len; - wc.qp = &qp->ibqp; - wc.src_qp = 0; - wc.wc_flags = 0; - /* XXX initialize other fields? */ - ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); - } - kfree(sg_list); - - ret = 0; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + ret = 1; bail: return ret; @@ -673,6 +552,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, */ wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 : dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 559d4a6..3cc82b6 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -231,6 +231,103 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) } /** + * ipath_post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) +{ + struct ipath_swqe *wqe; + u32 next; + int i; + int j; + int acc; + int ret; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Check that state is OK to post send. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) + goto bail_inval; + + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) + goto bail_inval; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + goto bail_inval; + } else if (qp->ibqp.qp_type == IB_QPT_UD) { + /* Check UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + goto bail_inval; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + goto bail_inval; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + goto bail_inval; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + goto bail_inval; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + goto bail_inval; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) + goto bail_inval; + + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->ssn = qp->s_ssn++; + wqe->length = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0, j = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = ipath_lkey_ok(qp, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval; + } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu) + goto bail_inval; + qp->s_head = next; + + ret = 0; + goto bail; + +bail_inval: + ret = -EINVAL; +bail: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** * ipath_post_send - post a send on a QP * @ibqp: the QP to post the send on * @wr: the list of work requests to post @@ -244,35 +341,17 @@ static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ipath_qp *qp = to_iqp(ibqp); int err = 0; - /* Check that state is OK to post send. */ - if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) { - *bad_wr = wr; - err = -EINVAL; - goto bail; - } - for (; wr; wr = wr->next) { - switch (qp->ibqp.qp_type) { - case IB_QPT_UC: - case IB_QPT_RC: - err = ipath_post_ruc_send(qp, wr); - break; - - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_UD: - err = ipath_post_ud_send(qp, wr); - break; - - default: - err = -EINVAL; - } + err = ipath_post_one_send(qp, wr); if (err) { *bad_wr = wr; - break; + goto bail; } } + /* Try to do the send work in the caller's context. */ + ipath_do_send((unsigned long) qp); + bail: return err; } @@ -641,11 +720,11 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, u32 len = ss->sge.length; u32 off; - BUG_ON(len == 0); if (len > length) len = length; if (len > ss->sge.sge_length) len = ss->sge.sge_length; + BUG_ON(len == 0); /* If the source address is not aligned, try to align it. */ off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); if (off) { @@ -767,30 +846,15 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, __raw_writel(last, piobuf); } -/** - * ipath_verbs_send - send a packet - * @dd: the infinipath device - * @hdrwords: the number of words in the header - * @hdr: the packet header - * @len: the length of the packet in bytes - * @ss: the SGE to send - */ -int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, - u32 *hdr, u32 len, struct ipath_sge_state *ss) +static int ipath_verbs_send_pio(struct ipath_qp *qp, u32 *hdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) { + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; u32 __iomem *piobuf; unsigned flush_wc; - u32 plen; int ret; - /* +1 is for the qword padding of pbc */ - plen = hdrwords + ((len + 3) >> 2) + 1; - if (unlikely((plen << 2) > dd->ipath_ibmaxlen)) { - ret = -EINVAL; - goto bail; - } - - /* Get a PIO buffer to use. */ piobuf = ipath_getpiobuf(dd, NULL); if (unlikely(piobuf == NULL)) { ret = -EBUSY; @@ -831,13 +895,10 @@ int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, /* The common case is aligned and contained in one segment. */ if (likely(ss->num_sge == 1 && len <= ss->sge.length && !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { - u32 dwords; u32 *addr = (u32 *) ss->sge.vaddr; /* Update address before sending packet. */ update_sge(ss, len); - /* Need to round up for the last dword in the packet. */ - dwords = (len + 3) >> 2; if (flush_wc) { __iowrite32_copy(piobuf, addr, dwords - 1); /* must flush early everything before trigger word */ @@ -851,11 +912,37 @@ int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, } copy_io(piobuf, ss, len, flush_wc); done: + if (qp->s_wqe) + ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); ret = 0; bail: return ret; } +/** + * ipath_verbs_send - send a packet + * @qp: the QP to send on + * @hdr: the packet header + * @hdrwords: the number of words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + */ +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len) +{ + u32 plen; + int ret; + u32 dwords = (len + 3) >> 2; + + /* +1 is for the qword padding of pbc */ + plen = hdrwords + dwords + 1; + + ret = ipath_verbs_send_pio(qp, (u32 *) hdr, hdrwords, + ss, len, plen, dwords); + + return ret; +} + int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, u64 *rwords, u64 *spkts, u64 *rpkts, u64 *xmit_wait) @@ -864,7 +951,6 @@ int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, if (!(dd->ipath_flags & IPATH_INITTED)) { /* no hardware, freeze, etc. */ - ipath_dbg("unit %u not usable\n", dd->ipath_unit); ret = -EINVAL; goto bail; } @@ -890,48 +976,44 @@ bail: int ipath_get_counters(struct ipath_devdata *dd, struct ipath_verbs_counters *cntrs) { + struct ipath_cregs const *crp = dd->ipath_cregs; int ret; if (!(dd->ipath_flags & IPATH_INITTED)) { /* no hardware, freeze, etc. */ - ipath_dbg("unit %u not usable\n", dd->ipath_unit); ret = -EINVAL; goto bail; } cntrs->symbol_error_counter = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_ibsymbolerrcnt); + ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt); cntrs->link_error_recovery_counter = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt); + ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt); /* * The link downed counter counts when the other side downs the * connection. We add in the number of times we downed the link * due to local link integrity errors to compensate. */ cntrs->link_downed_counter = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkdowncnt); + ipath_snap_cntr(dd, crp->cr_iblinkdowncnt); cntrs->port_rcv_errors = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_rxdroppktcnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvovflcnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_portovflcnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_err_rlencnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_invalidrlencnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt) + + ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) + + ipath_snap_cntr(dd, crp->cr_rcvovflcnt) + + ipath_snap_cntr(dd, crp->cr_portovflcnt) + + ipath_snap_cntr(dd, crp->cr_err_rlencnt) + + ipath_snap_cntr(dd, crp->cr_invalidrlencnt) + + ipath_snap_cntr(dd, crp->cr_errlinkcnt) + + ipath_snap_cntr(dd, crp->cr_erricrccnt) + + ipath_snap_cntr(dd, crp->cr_errvcrccnt) + + ipath_snap_cntr(dd, crp->cr_errlpcrccnt) + + ipath_snap_cntr(dd, crp->cr_badformatcnt) + dd->ipath_rxfc_unsupvl_errs; cntrs->port_rcv_remphys_errors = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt); - cntrs->port_xmit_discards = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_unsupvlcnt); - cntrs->port_xmit_data = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); - cntrs->port_rcv_data = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); - cntrs->port_xmit_packets = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); - cntrs->port_rcv_packets = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + ipath_snap_cntr(dd, crp->cr_rcvebpcnt); + cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt); + cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt); + cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt); + cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt); + cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt); cntrs->local_link_integrity_errors = (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? dd->ipath_lli_errs : dd->ipath_lli_errors; @@ -1045,8 +1127,9 @@ static int ipath_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_devdata *dd = dev->dd; enum ib_mtu mtu; - u16 lid = dev->dd->ipath_lid; + u16 lid = dd->ipath_lid; u64 ibcstat; memset(props, 0, sizeof(*props)); @@ -1054,16 +1137,16 @@ static int ipath_query_port(struct ib_device *ibdev, props->lmc = dev->mkeyprot_resv_lmc & 7; props->sm_lid = dev->sm_lid; props->sm_sl = dev->sm_sl; - ibcstat = dev->dd->ipath_lastibcstat; + ibcstat = dd->ipath_lastibcstat; props->state = ((ibcstat >> 4) & 0x3) + 1; /* See phys_state_show() */ props->phys_state = ipath_cvt_physportstate[ - dev->dd->ipath_lastibcstat & 0xf]; + dd->ipath_lastibcstat & 0xf]; props->port_cap_flags = dev->port_cap_flags; props->gid_tbl_len = 1; props->max_msg_sz = 0x80000000; - props->pkey_tbl_len = ipath_get_npkeys(dev->dd); - props->bad_pkey_cntr = ipath_get_cr_errpkey(dev->dd) - + props->pkey_tbl_len = ipath_get_npkeys(dd); + props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) - dev->z_pkey_violations; props->qkey_viol_cntr = dev->qkey_violations; props->active_width = IB_WIDTH_4X; @@ -1073,12 +1156,12 @@ static int ipath_query_port(struct ib_device *ibdev, props->init_type_reply = 0; /* - * Note: the chips support a maximum MTU of 4096, but the driver + * Note: the chip supports a maximum MTU of 4096, but the driver * hasn't implemented this feature yet, so set the maximum value * to 2048. */ props->max_mtu = IB_MTU_2048; - switch (dev->dd->ipath_ibmtu) { + switch (dd->ipath_ibmtu) { case 4096: mtu = IB_MTU_4096; break; @@ -1427,9 +1510,7 @@ static int disable_timer(struct ipath_devdata *dd) { /* Disable GPIO bit 2 interrupt */ if (dd->ipath_flags & IPATH_GPIO_INTR) { - u64 val; /* Disable GPIO bit 2 interrupt */ - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask); dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, dd->ipath_gpio_mask); diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index 1a24c6a..619ad72 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -42,6 +42,8 @@ #include #include +#include "ipath_kernel.h" + #define IPATH_MAX_RDMA_ATOMIC 4 #define QPN_MAX (1 << 24) @@ -59,6 +61,7 @@ */ #define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) +/* AETH NAK opcode values */ #define IB_RNR_NAK 0x20 #define IB_NAK_PSN_ERROR 0x60 #define IB_NAK_INVALID_REQUEST 0x61 @@ -66,6 +69,7 @@ #define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 #define IB_NAK_INVALID_RD_REQUEST 0x64 +/* Flags for checking QP state (see ib_ipath_state_ops[]) */ #define IPATH_POST_SEND_OK 0x01 #define IPATH_POST_RECV_OK 0x02 #define IPATH_PROCESS_RECV_OK 0x04 @@ -239,7 +243,7 @@ struct ipath_mregion { */ struct ipath_sge { struct ipath_mregion *mr; - void *vaddr; /* current pointer into the segment */ + void *vaddr; /* kernel virtual address of segment */ u32 sge_length; /* length of the SGE */ u32 length; /* remaining length of the segment */ u16 m; /* current index: mr->map[m] */ @@ -407,6 +411,7 @@ struct ipath_qp { u32 s_ssn; /* SSN of tail entry */ u32 s_lsn; /* limit sequence number (credit) */ struct ipath_swqe *s_wq; /* send work queue */ + struct ipath_swqe *s_wqe; struct ipath_rq r_rq; /* receive work queue */ struct ipath_sge r_sg_list[0]; /* verified SGEs */ }; @@ -683,8 +688,8 @@ void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc); void ipath_get_credit(struct ipath_qp *qp, u32 aeth); -int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, - u32 *hdr, u32 len, struct ipath_sge_state *ss); +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len); void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); @@ -692,8 +697,6 @@ void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length); void ipath_skip_sge(struct ipath_sge_state *ss, u32 length); -int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr); - void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int has_grh, void *data, u32 tlen, struct ipath_qp *qp); @@ -733,6 +736,8 @@ int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); int ipath_destroy_srq(struct ib_srq *ibsrq); +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); + int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector, @@ -782,18 +787,28 @@ int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void ipath_insert_rnr_queue(struct ipath_qp *qp); +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss); + int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only); u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, struct ib_global_route *grh, u32 hwords, u32 nwords); -void ipath_do_ruc_send(unsigned long data); +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2); + +void ipath_do_send(unsigned long data); + +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status); + +int ipath_make_rc_req(struct ipath_qp *qp); -int ipath_make_rc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p); +int ipath_make_uc_req(struct ipath_qp *qp); -int ipath_make_uc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p); +int ipath_make_ud_req(struct ipath_qp *qp); int ipath_register_ib_device(struct ipath_devdata *); -- cgit v0.10.2 From 9bec3992312b8bb3aee71bd3b57d106a0a649479 Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Fri, 1 Jun 2007 13:01:47 -0700 Subject: IB/ipath: Verify host bus bandwidth to chip will not limit performance There have been a number of issues where host bandwidth via HT or PCIe to the InfiniPath chip has been limited in some fashion (BIOS, configuration, etc.), resulting in user confusion. This check gives a clear warning that something is wrong and needs to be resolved. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 6ccba36..5248f57 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -280,6 +281,89 @@ void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd) { } +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void ipath_verify_pioperf(struct ipath_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = ipath_getpiobuf(dd, &pbnum); + if (!piobuf) { + dev_info(&dd->pcidev->dev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) { + dev_info(&dd->pcidev->dev, + "Couldn't get memory for checking PIO perf," + " skipping\n"); + goto done; + } + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + writeq(0, piobuf); /* length 0, no dwords actually sent */ + ipath_flush_wc(); + + /* + * this is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values + */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + __iowrite32_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + ipath_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is " + "only %u MiB/sec\n", + lcnt / (u32) emsecs); + else + ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + ipath_disarm_piobufs(dd, pbnum, 1); +} + static int __devinit ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -515,6 +599,8 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ret = 0; } + ipath_verify_pioperf(dd); + ipath_device_create_group(&pdev->dev, dd); ipathfs_add_device(dd); ipath_user_add(dd); -- cgit v0.10.2 From 1793b4771d258c93ed86a6356c95cac418781fdd Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 7 Aug 2007 18:09:34 -0700 Subject: IB/ipath: Remove unneeded code for ipathfs The ipathfs file system is used to export binary data verses ASCII data such as through /sys. This patch removes some unneeded files since the data is available through other /sys files. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 2e689b9..262c25d 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -130,175 +130,6 @@ static const struct file_operations atomic_counters_ops = { .read = atomic_counters_read, }; -static ssize_t atomic_node_info_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - u32 nodeinfo[10]; - struct ipath_devdata *dd; - u64 guid; - - dd = file->f_path.dentry->d_inode->i_private; - - guid = be64_to_cpu(dd->ipath_guid); - - nodeinfo[0] = /* BaseVersion is SMA */ - /* ClassVersion is SMA */ - (1 << 8) /* NodeType */ - | (1 << 0); /* NumPorts */ - nodeinfo[1] = (u32) (guid >> 32); - nodeinfo[2] = (u32) (guid & 0xffffffff); - /* PortGUID == SystemImageGUID for us */ - nodeinfo[3] = nodeinfo[1]; - /* PortGUID == SystemImageGUID for us */ - nodeinfo[4] = nodeinfo[2]; - /* PortGUID == NodeGUID for us */ - nodeinfo[5] = nodeinfo[3]; - /* PortGUID == NodeGUID for us */ - nodeinfo[6] = nodeinfo[4]; - nodeinfo[7] = (4 << 16) /* we support 4 pkeys */ - | (dd->ipath_deviceid << 0); - /* our chip version as 16 bits major, 16 bits minor */ - nodeinfo[8] = dd->ipath_minrev | (dd->ipath_majrev << 16); - nodeinfo[9] = (dd->ipath_unit << 24) | (dd->ipath_vendorid << 0); - - return simple_read_from_buffer(buf, count, ppos, nodeinfo, - sizeof nodeinfo); -} - -static const struct file_operations atomic_node_info_ops = { - .read = atomic_node_info_read, -}; - -static ssize_t atomic_port_info_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - u32 portinfo[13]; - u32 tmp, tmp2; - struct ipath_devdata *dd; - - dd = file->f_path.dentry->d_inode->i_private; - - /* so we only initialize non-zero fields. */ - memset(portinfo, 0, sizeof portinfo); - - /* - * Notimpl yet M_Key (64) - * Notimpl yet GID (64) - */ - - portinfo[4] = (dd->ipath_lid << 16); - - /* - * Notimpl yet SMLID. - * CapabilityMask is 0, we don't support any of these - * DiagCode is 0; we don't store any diag info for now Notimpl yet - * M_KeyLeasePeriod (we don't support M_Key) - */ - - /* LocalPortNum is whichever port number they ask for */ - portinfo[7] = (dd->ipath_unit << 24) - /* LinkWidthEnabled */ - | (2 << 16) - /* LinkWidthSupported (really 2, but not IB valid) */ - | (3 << 8) - /* LinkWidthActive */ - | (2 << 0); - tmp = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK; - tmp2 = 5; - if (tmp == IPATH_IBSTATE_INIT) - tmp = 2; - else if (tmp == IPATH_IBSTATE_ARM) - tmp = 3; - else if (tmp == IPATH_IBSTATE_ACTIVE) - tmp = 4; - else { - tmp = 0; /* down */ - tmp2 = tmp & 0xf; - } - - portinfo[8] = (1 << 28) /* LinkSpeedSupported */ - | (tmp << 24) /* PortState */ - | (tmp2 << 20) /* PortPhysicalState */ - | (2 << 16) - - /* LinkDownDefaultState */ - /* M_KeyProtectBits == 0 */ - /* NotImpl yet LMC == 0 (we can support all values) */ - | (1 << 4) /* LinkSpeedActive */ - | (1 << 0); /* LinkSpeedEnabled */ - switch (dd->ipath_ibmtu) { - case 4096: - tmp = 5; - break; - case 2048: - tmp = 4; - break; - case 1024: - tmp = 3; - break; - case 512: - tmp = 2; - break; - case 256: - tmp = 1; - break; - default: /* oops, something is wrong */ - ipath_dbg("Problem, ipath_ibmtu 0x%x not a valid IB MTU, " - "treat as 2048\n", dd->ipath_ibmtu); - tmp = 4; - break; - } - portinfo[9] = (tmp << 28) - /* NeighborMTU */ - /* Notimpl MasterSMSL */ - | (1 << 20) - - /* VLCap */ - /* Notimpl InitType (actually, an SMA decision) */ - /* VLHighLimit is 0 (only one VL) */ - ; /* VLArbitrationHighCap is 0 (only one VL) */ - /* - * Note: the chips support a maximum MTU of 4096, but the driver - * hasn't implemented this feature yet, so set the maximum - * to 2048. - */ - portinfo[10] = /* VLArbitrationLowCap is 0 (only one VL) */ - /* InitTypeReply is SMA decision */ - (4 << 16) /* MTUCap 2048 */ - | (7 << 13) /* VLStallCount */ - | (0x1f << 8) /* HOQLife */ - | (1 << 4) - - /* OperationalVLs 0 */ - /* PartitionEnforcementInbound */ - /* PartitionEnforcementOutbound not enforced */ - /* FilterRawinbound not enforced */ - ; /* FilterRawOutbound not enforced */ - /* M_KeyViolations are not counted by hardware, SMA can count */ - tmp = ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey); - /* P_KeyViolations are counted by hardware. */ - portinfo[11] = ((tmp & 0xffff) << 0); - portinfo[12] = - /* Q_KeyViolations are not counted by hardware */ - (1 << 8) - - /* GUIDCap */ - /* SubnetTimeOut handled by SMA */ - /* RespTimeValue handled by SMA */ - ; - /* LocalPhyErrors are programmed to max */ - portinfo[12] |= (0xf << 20) - | (0xf << 16) /* OverRunErrors are programmed to max */ - ; - - return simple_read_from_buffer(buf, count, ppos, portinfo, - sizeof portinfo); -} - -static const struct file_operations atomic_port_info_ops = { - .read = atomic_port_info_read, -}; - static ssize_t flash_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -427,22 +258,6 @@ static int create_device_files(struct super_block *sb, goto bail; } - ret = create_file("node_info", S_IFREG|S_IRUGO, dir, &tmp, - &atomic_node_info_ops, dd); - if (ret) { - printk(KERN_ERR "create_file(%s/node_info) " - "failed: %d\n", unit, ret); - goto bail; - } - - ret = create_file("port_info", S_IFREG|S_IRUGO, dir, &tmp, - &atomic_port_info_ops, dd); - if (ret) { - printk(KERN_ERR "create_file(%s/port_info) " - "failed: %d\n", unit, ret); - goto bail; - } - ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, &flash_ops, dd); if (ret) { @@ -508,8 +323,6 @@ static int remove_device_files(struct super_block *sb, } remove_file(dir, "flash"); - remove_file(dir, "port_info"); - remove_file(dir, "node_info"); remove_file(dir, "atomic_counters"); d_delete(dir); ret = simple_rmdir(root->d_inode, dir); -- cgit v0.10.2 From 9ef8617af77136e743e5dd4b081a61797888a977 Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Thu, 9 Aug 2007 15:18:48 -0700 Subject: IB/ipath: Correctly describe workaround for TID write chip bug This is a comment change, only, correcting the comment to match the implemented workaround, rather than the original workaround, and clarifying why it's needed. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index a324c6f..d43f0b3 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -1143,11 +1143,14 @@ static void ipath_pe_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr, pa |= 2 << 29; } - /* workaround chip bug 9437 by writing each TID twice - * and holding a spinlock around the writes, so they don't - * intermix with other TID (eager or expected) writes - * Unfortunately, this call can be done from interrupt level - * for the port 0 eager TIDs, so we have to use irqsave + /* + * Workaround chip bug 9437 by writing the scratch register + * before and after the TID, and with an io write barrier. + * We use a spinlock around the writes, so they can't intermix + * with other TID (eager or expected) writes (the chip bug + * is triggered by back to back TID writes). Unfortunately, this + * call can be done from interrupt level for the port 0 eager TIDs, + * so we have to use irqsave locks. */ spin_lock_irqsave(&dd->ipath_tid_lock, flags); ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeeddeaf); -- cgit v0.10.2 From 55046698faf6357ff7c53593dbfd43a9a3f681a7 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 17 Aug 2007 11:28:48 -0700 Subject: IB/ipath: UC RDMA WRITE with IMMEDIATE doesn't send the immediate This patch fixes a bug in the receive processing for UC RDMA WRITE with immediate which caused the last packet to be dropped. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c index 767beb9..2dd8de2 100644 --- a/drivers/infiniband/hw/ipath/ipath_uc.c +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -464,6 +464,16 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): rdma_last_imm: + if (header_in_data) { + wc.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* Get the number of bytes the message was padded by. */ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; /* Check for invalid length. */ @@ -484,16 +494,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, dev->n_pkt_drops++; goto done; } - if (header_in_data) { - wc.imm_data = *(__be32 *) data; - data += sizeof(__be32); - } else { - /* Immediate data comes after BTH */ - wc.imm_data = ohdr->u.imm_data; - } - hdrsize += 4; - wc.wc_flags = IB_WC_WITH_IMM; - wc.byte_len = 0; + wc.byte_len = qp->r_len; goto last_imm; case OP(RDMA_WRITE_LAST): -- cgit v0.10.2 From d29cc6efb9731a415957b8d0ff16e31729ed6837 Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Fri, 17 Aug 2007 14:42:14 -0700 Subject: IB/ipath: Future proof eeprom checksum code (contents reading) In an earlier change, the amount of data read from the flash was mistakenly limited to the size known to the current driver. This causes problems when the length is increased, and written with the new longer version; the checksum would fail because not enough data was read. Always read the full 128 byte length to prevent this. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c index b4503e9..bcfa3cc 100644 --- a/drivers/infiniband/hw/ipath/ipath_eeprom.c +++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c @@ -596,7 +596,11 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) goto bail; } - len = offsetof(struct ipath_flash, if_future); + /* + * read full flash, not just currently used part, since it may have + * been written with a newer definition + * */ + len = sizeof(struct ipath_flash); buf = vmalloc(len); if (!buf) { ipath_dev_err(dd, "Couldn't allocate memory to read %u " @@ -737,8 +741,10 @@ int ipath_update_eeprom_log(struct ipath_devdata *dd) /* * The quick-check above determined that there is something worthy * of logging, so get current contents and do a more detailed idea. + * read full flash, not just currently used part, since it may have + * been written with a newer definition */ - len = offsetof(struct ipath_flash, if_future); + len = sizeof(struct ipath_flash); buf = vmalloc(len); ret = 1; if (!buf) { -- cgit v0.10.2 From 036be09ca55ee8512c05742f4f6b88911d012a90 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 20 Aug 2007 16:52:35 -0700 Subject: IB/ipath: Remove redundant code This patch removes some redundant initialization code. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 5248f57..44784d0 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -382,8 +382,6 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit); - read_bars(dd, pdev, &bar0, &bar1); - ret = pci_enable_device(pdev); if (ret) { /* This can happen iff: @@ -529,9 +527,6 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, goto bail_regions; } - dd->ipath_deviceid = ent->device; /* save for later use */ - dd->ipath_vendorid = ent->vendor; - dd->ipath_pcirev = pdev->revision; #if defined(__powerpc__) -- cgit v0.10.2 From c9cf7db2bca9180f5888eebc23dc607666a9685b Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Sat, 25 Aug 2007 16:48:29 -0700 Subject: IB/ipath: Generate flush CQE when QP is in error state Follow the IB spec. (C10-96) for post send which states that a flushed completion event should be generated for work requests posted when a QP is in the error state. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 3cc82b6..495194b 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -230,6 +230,18 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) } } +static void ipath_flush_wqe(struct ipath_qp *qp, struct ib_send_wr *wr) +{ + struct ib_wc wc; + + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wr->wr_id; + wc.status = IB_WC_WR_FLUSH_ERR; + wc.opcode = ib_ipath_wc_opcode[wr->opcode]; + wc.qp = &qp->ibqp; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1); +} + /** * ipath_post_one_send - post one RC, UC, or UD send work request * @qp: the QP to post on @@ -248,8 +260,14 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) spin_lock_irqsave(&qp->s_lock, flags); /* Check that state is OK to post send. */ - if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) - goto bail_inval; + if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) { + if (qp->state != IB_QPS_SQE && qp->state != IB_QPS_ERR) + goto bail_inval; + /* C10-96 says generate a flushed completion entry. */ + ipath_flush_wqe(qp, wr); + ret = 0; + goto bail; + } /* IB spec says that num_sge == 0 is OK. */ if (wr->num_sge > qp->s_max_sge) -- cgit v0.10.2 From d42b01b584b6f55f70c56f6a3dabc26f4982d30d Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Sat, 25 Aug 2007 16:45:03 -0700 Subject: IB/ipath: Implement IB_EVENT_QP_LAST_WQE_REACHED This patch implements the IB_EVENT_QP_LAST_WQE_REACHED event which is needed by ib_ipoib to destroy the QP when used in connected mode. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index a8c4a6b..6a41fdb 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -377,13 +377,15 @@ static void ipath_reset_qp(struct ipath_qp *qp) * @err: the receive completion error to signal if a RWQE is active * * Flushes both send and receive work queues. + * Returns true if last WQE event should be generated. * The QP s_lock should be held and interrupts disabled. */ -void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ib_wc wc; + int ret = 0; ipath_dbg("QP%d/%d in error state\n", qp->ibqp.qp_num, qp->remote_qpn); @@ -454,7 +456,10 @@ void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) wq->tail = tail; spin_unlock(&qp->r_rq.lock); - } + } else if (qp->ibqp.event_handler) + ret = 1; + + return ret; } /** @@ -473,6 +478,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct ipath_qp *qp = to_iqp(ibqp); enum ib_qp_state cur_state, new_state; unsigned long flags; + int lastwqe = 0; int ret; spin_lock_irqsave(&qp->s_lock, flags); @@ -532,7 +538,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, break; case IB_QPS_ERR: - ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); break; default: @@ -591,6 +597,14 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp->state = new_state; spin_unlock_irqrestore(&qp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } ret = 0; goto bail; diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index 53259da..5c29b2b 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -1497,11 +1497,21 @@ send_ack: static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) { unsigned long flags; + int lastwqe; spin_lock_irqsave(&qp->s_lock, flags); qp->state = IB_QPS_ERR; - ipath_error_qp(qp, err); + lastwqe = ipath_error_qp(qp, err); spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } } static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n) diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index 619ad72..a197229 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -672,7 +672,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, int ipath_destroy_qp(struct ib_qp *ibqp); -void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err); +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err); int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); -- cgit v0.10.2 From 6cff2faaf135b602c914710f3414630c3fc1ee83 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 7 Sep 2007 16:54:01 -0700 Subject: IB/ipath: Optimize completion queue entry insertion and polling The code to add an entry to the completion queue stored the QPN which is needed for the user level verbs view of the completion queue entry but the kernel struct ib_wc contains a pointer to the QP instead of a QPN. When the kernel polled for a completion queue entry, the QPN was lookup up and the QP pointer recovered. This patch stores the CQE differently based on whether the CQ is a kernel CQ or a user CQ thus avoiding the QPN to QP lookup overhead. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c index a6f04d2..645ed71 100644 --- a/drivers/infiniband/hw/ipath/ipath_cq.c +++ b/drivers/infiniband/hw/ipath/ipath_cq.c @@ -76,22 +76,25 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) } return; } - wc->queue[head].wr_id = entry->wr_id; - wc->queue[head].status = entry->status; - wc->queue[head].opcode = entry->opcode; - wc->queue[head].vendor_err = entry->vendor_err; - wc->queue[head].byte_len = entry->byte_len; - wc->queue[head].imm_data = (__u32 __force)entry->imm_data; - wc->queue[head].qp_num = entry->qp->qp_num; - wc->queue[head].src_qp = entry->src_qp; - wc->queue[head].wc_flags = entry->wc_flags; - wc->queue[head].pkey_index = entry->pkey_index; - wc->queue[head].slid = entry->slid; - wc->queue[head].sl = entry->sl; - wc->queue[head].dlid_path_bits = entry->dlid_path_bits; - wc->queue[head].port_num = entry->port_num; - /* Make sure queue entry is written before the head index. */ - smp_wmb(); + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].imm_data = (__u32 __force)entry->imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else + wc->kqueue[head] = *entry; wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || @@ -130,6 +133,12 @@ int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) int npolled; u32 tail; + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) { + npolled = -EINVAL; + goto bail; + } + spin_lock_irqsave(&cq->lock, flags); wc = cq->queue; @@ -137,31 +146,10 @@ int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) if (tail > (u32) cq->ibcq.cqe) tail = (u32) cq->ibcq.cqe; for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { - struct ipath_qp *qp; - if (tail == wc->head) break; - /* Make sure entry is read after head index is read. */ - smp_rmb(); - qp = ipath_lookup_qpn(&to_idev(cq->ibcq.device)->qp_table, - wc->queue[tail].qp_num); - entry->qp = &qp->ibqp; - if (atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); - - entry->wr_id = wc->queue[tail].wr_id; - entry->status = wc->queue[tail].status; - entry->opcode = wc->queue[tail].opcode; - entry->vendor_err = wc->queue[tail].vendor_err; - entry->byte_len = wc->queue[tail].byte_len; - entry->imm_data = wc->queue[tail].imm_data; - entry->src_qp = wc->queue[tail].src_qp; - entry->wc_flags = wc->queue[tail].wc_flags; - entry->pkey_index = wc->queue[tail].pkey_index; - entry->slid = wc->queue[tail].slid; - entry->sl = wc->queue[tail].sl; - entry->dlid_path_bits = wc->queue[tail].dlid_path_bits; - entry->port_num = wc->queue[tail].port_num; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; if (tail >= cq->ibcq.cqe) tail = 0; else @@ -171,6 +159,7 @@ int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) spin_unlock_irqrestore(&cq->lock, flags); +bail: return npolled; } @@ -215,6 +204,7 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vec struct ipath_cq *cq; struct ipath_cq_wc *wc; struct ib_cq *ret; + u32 sz; if (entries < 1 || entries > ib_ipath_max_cqes) { ret = ERR_PTR(-EINVAL); @@ -235,7 +225,12 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vec * We need to use vmalloc() in order to support mmap and large * numbers of entries. */ - wc = vmalloc_user(sizeof(*wc) + sizeof(struct ib_wc) * entries); + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = vmalloc_user(sz); if (!wc) { ret = ERR_PTR(-ENOMEM); goto bail_cq; @@ -247,9 +242,8 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vec */ if (udata && udata->outlen >= sizeof(__u64)) { int err; - u32 s = sizeof *wc + sizeof(struct ib_wc) * entries; - cq->ip = ipath_create_mmap_info(dev, s, context, wc); + cq->ip = ipath_create_mmap_info(dev, sz, context, wc); if (!cq->ip) { ret = ERR_PTR(-ENOMEM); goto bail_wc; @@ -380,6 +374,7 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) struct ipath_cq_wc *wc; u32 head, tail, n; int ret; + u32 sz; if (cqe < 1 || cqe > ib_ipath_max_cqes) { ret = -EINVAL; @@ -389,7 +384,12 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) /* * Need to use vmalloc() if we want to support large #s of entries. */ - wc = vmalloc_user(sizeof(*wc) + sizeof(struct ib_wc) * cqe); + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = vmalloc_user(sz); if (!wc) { ret = -ENOMEM; goto bail; @@ -430,7 +430,10 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) goto bail; } for (n = 0; tail != head; n++) { - wc->queue[n] = old_wc->queue[tail]; + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; if (tail == (u32) cq->ibcq.cqe) tail = 0; else @@ -447,9 +450,8 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) if (cq->ip) { struct ipath_ibdev *dev = to_idev(ibcq->device); struct ipath_mmap_info *ip = cq->ip; - u32 s = sizeof *wc + sizeof(struct ib_wc) * cqe; - ipath_update_mmap_info(dev, ip, s, wc); + ipath_update_mmap_info(dev, ip, sz, wc); spin_lock_irq(&dev->pending_lock); if (list_empty(&ip->pending_mmaps)) list_add(&ip->pending_mmaps, &dev->pending_mmaps); diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index a197229..9be9bf9 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -191,7 +191,11 @@ struct ipath_mmap_info { struct ipath_cq_wc { u32 head; /* index of next entry to fill */ u32 tail; /* index of next ib_poll_cq() entry */ - struct ib_uverbs_wc queue[1]; /* this is actually size ibcq.cqe + 1 */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; }; /* -- cgit v0.10.2 From 15cba26f42c13ca30cbb4388f132ac0ddf4df538 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 12 Sep 2007 15:00:58 -0700 Subject: IB/ipath: Add ability to set the LMC via the sysfs debugging interface This patch adds the ability to set the LMC via a sysfs file as if the SM sent a SubnSet(PortInfo) MAD. It is useful for debugging when no SM is running. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c index 16238cd..e1ad7cf 100644 --- a/drivers/infiniband/hw/ipath/ipath_sysfs.c +++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c @@ -163,6 +163,42 @@ static ssize_t show_boardversion(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion); } +static ssize_t show_lmc(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc); +} + +static ssize_t store_lmc(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lmc = 0; + int ret; + + ret = ipath_parse_ushort(buf, &lmc); + if (ret < 0) + goto invalid; + + if (lmc > 7) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_lid(dd, dd->ipath_lid, lmc); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc); +bail: + return ret; +} + static ssize_t show_lid(struct device *dev, struct device_attribute *attr, char *buf) @@ -190,7 +226,7 @@ static ssize_t store_lid(struct device *dev, goto invalid; } - ipath_set_lid(dd, lid, 0); + ipath_set_lid(dd, lid, dd->ipath_lmc); goto bail; invalid: @@ -648,6 +684,7 @@ static struct attribute_group driver_attr_group = { }; static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid); +static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc); static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid); static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state); static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid); @@ -667,6 +704,7 @@ static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL); static struct attribute *dev_attributes[] = { &dev_attr_guid.attr, + &dev_attr_lmc.attr, &dev_attr_lid.attr, &dev_attr_link_state.attr, &dev_attr_mlid.attr, -- cgit v0.10.2 From 542869a17eee2edf389273f40f757aa4e662b3da Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 13 Sep 2007 11:42:52 -0700 Subject: IB/ipath: Remove duplicate copy of LMC The LMC value was being saved by the SMA in two places. This patch cleans it up so only one copy is kept. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index d61c030..8f15216 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -245,7 +245,7 @@ static int recv_subn_get_portinfo(struct ib_smp *smp, /* Only return the mkey if the protection field allows it. */ if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey || - (dev->mkeyprot_resv_lmc >> 6) == 0) + dev->mkeyprot == 0) pip->mkey = dev->mkey; pip->gid_prefix = dev->gid_prefix; lid = dev->dd->ipath_lid; @@ -264,7 +264,7 @@ static int recv_subn_get_portinfo(struct ib_smp *smp, pip->portphysstate_linkdown = (ipath_cvt_physportstate[ibcstat & 0xf] << 4) | (get_linkdowndefaultstate(dev->dd) ? 1 : 2); - pip->mkeyprot_resv_lmc = dev->mkeyprot_resv_lmc; + pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dev->dd->ipath_lmc; pip->linkspeedactive_enabled = 0x11; /* 2.5Gbps, 2.5Gbps */ switch (dev->dd->ipath_ibmtu) { case 4096: @@ -401,6 +401,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, struct ib_port_info *pip = (struct ib_port_info *)smp->data; struct ib_event event; struct ipath_ibdev *dev; + struct ipath_devdata *dd; u32 flags; char clientrereg = 0; u16 lid, smlid; @@ -415,6 +416,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, goto err; dev = to_idev(ibdev); + dd = dev->dd; event.device = ibdev; event.element.port_num = port; @@ -423,11 +425,12 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); lid = be16_to_cpu(pip->lid); - if (lid != dev->dd->ipath_lid) { + if (dd->ipath_lid != lid || + dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) { /* Must be a valid unicast LID address. */ if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) goto err; - ipath_set_lid(dev->dd, lid, pip->mkeyprot_resv_lmc & 7); + ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7); event.event = IB_EVENT_LID_CHANGE; ib_dispatch_event(&event); } @@ -461,18 +464,18 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, case 0: /* NOP */ break; case 1: /* SLEEP */ - if (set_linkdowndefaultstate(dev->dd, 1)) + if (set_linkdowndefaultstate(dd, 1)) goto err; break; case 2: /* POLL */ - if (set_linkdowndefaultstate(dev->dd, 0)) + if (set_linkdowndefaultstate(dd, 0)) goto err; break; default: goto err; } - dev->mkeyprot_resv_lmc = pip->mkeyprot_resv_lmc; + dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6; dev->vl_high_limit = pip->vl_high_limit; switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) { @@ -495,7 +498,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, /* XXX We have already partially updated our state! */ goto err; } - ipath_set_mtu(dev->dd, mtu); + ipath_set_mtu(dd, mtu); dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF; @@ -511,16 +514,16 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, * later. */ if (pip->pkey_violations == 0) - dev->z_pkey_violations = ipath_get_cr_errpkey(dev->dd); + dev->z_pkey_violations = ipath_get_cr_errpkey(dd); if (pip->qkey_violations == 0) dev->qkey_violations = 0; ore = pip->localphyerrors_overrunerrors; - if (set_phyerrthreshold(dev->dd, (ore >> 4) & 0xF)) + if (set_phyerrthreshold(dd, (ore >> 4) & 0xF)) goto err; - if (set_overrunthreshold(dev->dd, (ore & 0xF))) + if (set_overrunthreshold(dd, (ore & 0xF))) goto err; dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; @@ -538,7 +541,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, * is down or is being set to down. */ state = pip->linkspeed_portstate & 0xF; - flags = dev->dd->ipath_flags; + flags = dd->ipath_flags; lstate = (pip->portphysstate_linkdown >> 4) & 0xF; if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) goto err; @@ -554,7 +557,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, /* FALLTHROUGH */ case IB_PORT_DOWN: if (lstate == 0) - if (get_linkdowndefaultstate(dev->dd)) + if (get_linkdowndefaultstate(dd)) lstate = IPATH_IB_LINKDOWN_SLEEP; else lstate = IPATH_IB_LINKDOWN; @@ -566,7 +569,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, lstate = IPATH_IB_LINKDOWN_DISABLE; else goto err; - ipath_set_linkstate(dev->dd, lstate); + ipath_set_linkstate(dd, lstate); if (flags & IPATH_LINKACTIVE) { event.event = IB_EVENT_PORT_ERR; ib_dispatch_event(&event); @@ -575,7 +578,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, case IB_PORT_ARMED: if (!(flags & (IPATH_LINKINIT | IPATH_LINKACTIVE))) break; - ipath_set_linkstate(dev->dd, IPATH_IB_LINKARM); + ipath_set_linkstate(dd, IPATH_IB_LINKARM); if (flags & IPATH_LINKACTIVE) { event.event = IB_EVENT_PORT_ERR; ib_dispatch_event(&event); @@ -584,7 +587,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, case IB_PORT_ACTIVE: if (!(flags & IPATH_LINKARMED)) break; - ipath_set_linkstate(dev->dd, IPATH_IB_LINKACTIVE); + ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); event.event = IB_EVENT_PORT_ACTIVE; ib_dispatch_event(&event); break; @@ -1350,7 +1353,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, if (dev->mkey_lease_timeout && jiffies >= dev->mkey_lease_timeout) { /* Clear timeout and mkey protection field. */ dev->mkey_lease_timeout = 0; - dev->mkeyprot_resv_lmc &= 0x3F; + dev->mkeyprot = 0; } /* @@ -1361,7 +1364,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, dev->mkey != smp->mkey && (smp->method == IB_MGMT_METHOD_SET || (smp->method == IB_MGMT_METHOD_GET && - (dev->mkeyprot_resv_lmc >> 7) != 0))) { + dev->mkeyprot >= 2))) { if (dev->mkey_violations != 0xFFFF) ++dev->mkey_violations; if (dev->mkey_lease_timeout || diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 34c4a0a..16a2a93 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -236,10 +236,10 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) wc.pkey_index = 0; wc.slid = dev->dd->ipath_lid | (ah_attr->src_path_bits & - ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1)); + ((1 << dev->dd->ipath_lmc) - 1)); wc.sl = ah_attr->sl; wc.dlid_path_bits = - ah_attr->dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1); wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, @@ -289,7 +289,7 @@ int ipath_make_ud_req(struct ipath_qp *qp) } else { dev->n_unicast_xmit++; lid = ah_attr->dlid & - ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + ~((1 << dev->dd->ipath_lmc) - 1); if (unlikely(lid == dev->dd->ipath_lid)) { ipath_ud_loopback(qp, wqe); goto done; @@ -341,7 +341,7 @@ int ipath_make_ud_req(struct ipath_qp *qp) lid = dev->dd->ipath_lid; if (lid) { lid |= ah_attr->src_path_bits & - ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + ((1 << dev->dd->ipath_lmc) - 1); qp->s_hdr.lrh[3] = cpu_to_be16(lid); } else qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE; @@ -551,7 +551,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, * Save the LMC lower bits if the destination LID is a unicast LID. */ wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 : - dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + dlid & ((1 << dev->dd->ipath_lmc) - 1); wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 495194b..13aba3d 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -513,7 +513,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, /* Check for a valid destination LID (see ch. 7.11.1). */ lid = be16_to_cpu(hdr->lrh[1]); if (lid < IPATH_MULTICAST_LID_BASE) { - lid &= ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + lid &= ~((1 << dev->dd->ipath_lmc) - 1); if (unlikely(lid != dev->dd->ipath_lid)) { dev->rcv_errors++; goto bail; @@ -1152,7 +1152,7 @@ static int ipath_query_port(struct ib_device *ibdev, memset(props, 0, sizeof(*props)); props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE); - props->lmc = dev->mkeyprot_resv_lmc & 7; + props->lmc = dd->ipath_lmc; props->sm_lid = dev->sm_lid; props->sm_sl = dev->sm_sl; ibcstat = dd->ipath_lastibcstat; diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index 9be9bf9..6ccb54f 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -501,7 +501,7 @@ struct ipath_ibdev { int ib_unit; /* This is the device number */ u16 sm_lid; /* in host order */ u8 sm_sl; - u8 mkeyprot_resv_lmc; + u8 mkeyprot; /* non-zero when timer is set */ unsigned long mkey_lease_timeout; -- cgit v0.10.2 From 70c51da2c4f84317bb13a2b564600afdcebd686f Mon Sep 17 00:00:00 2001 From: Arthur Jones Date: Fri, 14 Sep 2007 12:22:49 -0700 Subject: IB/ipath: Use counters in ipath_poll and cleanup interrupts in ipath_close ipath_poll() suffered from a couple subtle bugs. Under the right conditions we could leave recv interrupts enabled on an ipath user context on close, thereby taking potentially unwanted interrupts on the next open -- this is fixed by unconditionally turning off recv interrupts on close. Also, we now use counters rather than set/clear bits which allows us to make sure we catch all interrupts at the cost of changing the semantics slightly (it's now give me all events since the last time I called poll() rather than give me all events since I called _this_ poll routine). We also added some memory barriers which may help ensure we get all notifications in a timely manner. Signed-off-by: Arthur Jones Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 33ab0d6..016e7c4 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1341,6 +1341,19 @@ bail: return ret; } +static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd) +{ + unsigned pollflag = 0; + + if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) && + pd->port_hdrqfull != pd->port_hdrqfull_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + } + + return pollflag; +} + static unsigned int ipath_poll_urgent(struct ipath_portdata *pd, struct file *fp, struct poll_table_struct *pt) @@ -1350,22 +1363,20 @@ static unsigned int ipath_poll_urgent(struct ipath_portdata *pd, dd = pd->port_dd; - if (test_bit(IPATH_PORT_WAITING_OVERFLOW, &pd->int_flag)) { - pollflag |= POLLERR; - clear_bit(IPATH_PORT_WAITING_OVERFLOW, &pd->int_flag); - } + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); - if (test_bit(IPATH_PORT_WAITING_URG, &pd->int_flag)) { + if (pd->port_urgent != pd->port_urgent_poll) { pollflag |= POLLIN | POLLRDNORM; - clear_bit(IPATH_PORT_WAITING_URG, &pd->int_flag); + pd->port_urgent_poll = pd->port_urgent; } if (!pollflag) { + /* this saves a spin_lock/unlock in interrupt handler... */ set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag); - if (pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) - set_bit(IPATH_PORT_WAITING_OVERFLOW, - &pd->port_flag); - + /* flush waiting flag so don't miss an event... */ + wmb(); poll_wait(fp, &pd->port_wait, pt); } @@ -1376,31 +1387,27 @@ static unsigned int ipath_poll_next(struct ipath_portdata *pd, struct file *fp, struct poll_table_struct *pt) { - u32 head, tail; + u32 head; + u32 tail; unsigned pollflag = 0; struct ipath_devdata *dd; dd = pd->port_dd; + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); + head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port); tail = *(volatile u64 *)pd->port_rcvhdrtail_kvaddr; - if (test_bit(IPATH_PORT_WAITING_OVERFLOW, &pd->int_flag)) { - pollflag |= POLLERR; - clear_bit(IPATH_PORT_WAITING_OVERFLOW, &pd->int_flag); - } - - if (tail != head || - test_bit(IPATH_PORT_WAITING_RCV, &pd->int_flag)) { + if (head != tail) pollflag |= POLLIN | POLLRDNORM; - clear_bit(IPATH_PORT_WAITING_RCV, &pd->int_flag); - } - - if (!pollflag) { + else { + /* this saves a spin_lock/unlock in interrupt handler */ set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); - if (pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) - set_bit(IPATH_PORT_WAITING_OVERFLOW, - &pd->port_flag); + /* flush waiting flag so we don't miss an event */ + wmb(); set_bit(pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT, &dd->ipath_rcvctrl); @@ -1917,6 +1924,12 @@ static int ipath_do_user_init(struct file *fp, ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n", pd->port_port, head32); pd->port_tidcursor = 0; /* start at beginning after open */ + + /* initialize poll variables... */ + pd->port_urgent = 0; + pd->port_urgent_poll = 0; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + /* * now enable the port; the tail registers will be written to memory * by the chip as soon as it sees the write to @@ -2039,9 +2052,11 @@ static int ipath_close(struct inode *in, struct file *fp) if (dd->ipath_kregbase) { int i; - /* atomically clear receive enable port. */ + /* atomically clear receive enable port and intr avail. */ clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port, &dd->ipath_rcvctrl); + clear_bit(pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT, + &dd->ipath_rcvctrl); ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); /* and read back from chip to be sure that nothing diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 11b3614..61eac8c 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -688,17 +688,9 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) chkerrpkts = 1; dd->ipath_lastrcvhdrqtails[i] = tl; pd->port_hdrqfull++; - if (test_bit(IPATH_PORT_WAITING_OVERFLOW, - &pd->port_flag)) { - clear_bit( - IPATH_PORT_WAITING_OVERFLOW, - &pd->port_flag); - set_bit( - IPATH_PORT_WAITING_OVERFLOW, - &pd->int_flag); - wake_up_interruptible( - &pd->port_wait); - } + /* flush hdrqfull so that poll() sees it */ + wmb(); + wake_up_interruptible(&pd->port_wait); } } } @@ -960,6 +952,8 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat) int i; int rcvdint = 0; + /* test_bit below needs this... */ + rmb(); portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) & dd->ipath_i_rcvavail_mask) | ((istat >> INFINIPATH_I_RCVURG_SHIFT) & @@ -967,22 +961,15 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat) for (i = 1; i < dd->ipath_cfgports; i++) { struct ipath_portdata *pd = dd->ipath_pd[i]; if (portr & (1 << i) && pd && pd->port_cnt) { - if (test_bit(IPATH_PORT_WAITING_RCV, - &pd->port_flag)) { - clear_bit(IPATH_PORT_WAITING_RCV, - &pd->port_flag); - set_bit(IPATH_PORT_WAITING_RCV, - &pd->int_flag); + if (test_and_clear_bit(IPATH_PORT_WAITING_RCV, + &pd->port_flag)) { clear_bit(i + INFINIPATH_R_INTRAVAIL_SHIFT, &dd->ipath_rcvctrl); wake_up_interruptible(&pd->port_wait); rcvdint = 1; - } else if (test_bit(IPATH_PORT_WAITING_URG, - &pd->port_flag)) { - clear_bit(IPATH_PORT_WAITING_URG, - &pd->port_flag); - set_bit(IPATH_PORT_WAITING_URG, - &pd->int_flag); + } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG, + &pd->port_flag)) { + pd->port_urgent++; wake_up_interruptible(&pd->port_wait); } } diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index d983f92..872fb36 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -139,6 +139,12 @@ struct ipath_portdata { u32 port_pionowait; /* total number of rcvhdrqfull errors */ u32 port_hdrqfull; + /* saved total number of rcvhdrqfull errors for poll edge trigger */ + u32 port_hdrqfull_poll; + /* total number of polled urgent packets */ + u32 port_urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 port_urgent_poll; /* pid of process using this port */ pid_t port_pid; /* same size as task_struct .comm[] */ @@ -757,8 +763,6 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv); #define IPATH_PORT_MASTER_UNINIT 4 /* waiting for an urgent packet to arrive */ #define IPATH_PORT_WAITING_URG 5 - /* waiting for a header overflow */ -#define IPATH_PORT_WAITING_OVERFLOW 6 /* free up any allocated data at closes */ void ipath_free_data(struct ipath_portdata *dd); -- cgit v0.10.2 From 4bec0b9155d6757847b754e21b55ecafcecef839 Mon Sep 17 00:00:00 2001 From: Arthur Jones Date: Tue, 18 Sep 2007 14:24:23 -0700 Subject: IB/ipath: iba6110 rev4 no longer needs recv header overrun workaround iba6110 rev3 and earlier had a chip bug where the chip could overrun the recv header queue. rev4 fixed this chip bug so userspace no longer needs to workaround it. Now we only set the workaround flag for older chip versions. Signed-off-by: Arthur Jones Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index e1c5998..d4940be 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -1599,8 +1599,10 @@ static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase) { struct ipath_base_info *kinfo = kbase; - kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT | - IPATH_RUNTIME_RCVHDR_COPY; + kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT; + + if (pd->port_dd->ipath_minrev < 4) + kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY; return 0; } -- cgit v0.10.2 From 20bed343142bfaf08765e35aaefa56dc5cc287db Mon Sep 17 00:00:00 2001 From: Arthur Jones Date: Tue, 18 Sep 2007 14:44:45 -0700 Subject: IB/ipath: Indicate a couple of chip bugs to userspace A couple of chip bugs in the iba6110 and in the iba6120 are not in more recent chips. This first bug swaps two of the pioavail register locations. In the second bug, the chip can sometimes forget to dma the pio avail register to memory. We indicate the presence of these bugs with runtime flags and we indicate the presence of the flags by bumping the SWMINOR. Signed-off-by: Arthur Jones Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h index 6ad822c..851df8a 100644 --- a/drivers/infiniband/hw/ipath/ipath_common.h +++ b/drivers/infiniband/hw/ipath/ipath_common.h @@ -189,6 +189,8 @@ typedef enum _ipath_ureg { #define IPATH_RUNTIME_RCVHDR_COPY 0x8 #define IPATH_RUNTIME_MASTER 0x10 /* 0x20 and 0x40 are no longer used, but are reserved for ABI compatibility */ +#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400 +#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800 /* * This structure is returned by ipath_userinit() immediately after @@ -350,7 +352,7 @@ struct ipath_base_info { * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define IPATH_USER_SWMINOR 5 +#define IPATH_USER_SWMINOR 6 #define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR) diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index d4940be..df42a1e 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -1599,7 +1599,8 @@ static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase) { struct ipath_base_info *kinfo = kbase; - kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT; + kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT | + IPATH_RUNTIME_PIO_REGSWAPPED; if (pd->port_dd->ipath_minrev < 4) kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY; diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index d43f0b3..0103d6f 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -1348,7 +1348,8 @@ static int ipath_pe_get_base_info(struct ipath_portdata *pd, void *kbase) dd = pd->port_dd; done: - kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE; + kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE | + IPATH_RUNTIME_FORCE_PIOAVAIL | IPATH_RUNTIME_PIO_REGSWAPPED; return 0; } -- cgit v0.10.2 From aa7c79abd154ed9aba4c19b861d439ef6af35d3a Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Thu, 16 Aug 2007 18:10:43 -0700 Subject: IB/ipath: Fix QHT7040 serial number check Remove all the OEM and bringup boards, and complain and fail initialization if one is found. QHT7040 with GPIO rework (128ywwuuuu) is OK, older 112ywwuuuu is no longer supported). The check that had been added was failing both the 112 and 128 series. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index df42a1e..ddbebe4 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -631,56 +631,35 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, { char *n = NULL; u8 boardrev = dd->ipath_boardrev; - int ret; + int ret = 0; switch (boardrev) { - case 4: /* Ponderosa is one of the bringup boards */ - n = "Ponderosa"; - break; case 5: /* * original production board; two production levels, with * different serial number ranges. See ipath_ht_early_init() for * case where we enable IPATH_GPIO_INTR for later serial # range. + * Original 112* serial number is no longer supported. */ n = "InfiniPath_QHT7040"; break; - case 6: - n = "OEM_Board_3"; - break; case 7: /* small form factor production board */ n = "InfiniPath_QHT7140"; break; - case 8: - n = "LS/X-1"; - break; - case 9: /* Comstock bringup test board */ - n = "Comstock"; - break; - case 10: - n = "OEM_Board_2"; - break; - case 11: - n = "InfiniPath_HT-470"; /* obsoleted */ - break; - case 12: - n = "OEM_Board_4"; - break; default: /* don't know, just print the number */ ipath_dev_err(dd, "Don't yet know about board " "with ID %u\n", boardrev); snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u", boardrev); + ret = 1; break; } if (n) snprintf(name, namelen, "%s", n); - if (dd->ipath_boardrev != 6 && dd->ipath_boardrev != 7 && - dd->ipath_boardrev != 11) { + if (ret) { ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name); - ret = 1; goto bail; } if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || @@ -1554,10 +1533,17 @@ static int ipath_ht_early_init(struct ipath_devdata *dd) * can use GPIO interrupts. They have serial #'s starting * with 128, rather than 112. */ - dd->ipath_flags |= IPATH_GPIO_INTR; - } else - ipath_dev_err(dd, "Unsupported InfiniPath serial " - "number %.16s!\n", dd->ipath_serial); + if (dd->ipath_serial[0] == '1' && + dd->ipath_serial[1] == '2' && + dd->ipath_serial[2] == '8') + dd->ipath_flags |= IPATH_GPIO_INTR; + else { + ipath_dev_err(dd, "Unsupported InfiniPath board " + "(serial number %.16s)!\n", + dd->ipath_serial); + return 1; + } + } if (dd->ipath_minrev >= 4) { /* Rev4+ reports extra errors via internal GPIO pins */ -- cgit v0.10.2 From 192594d5230f447ef2df8de9d7902ac90d11c118 Mon Sep 17 00:00:00 2001 From: Michael Albaugh Date: Tue, 2 Oct 2007 13:26:45 -0700 Subject: IB/ipath: Maintain active time on all chips There is a count of "active hours" maintained in EEPROM, to aid troubleshooting. The definition of "active" is based on traffic exceeding a threshold in any given 5-second polling interval. As originally written, the check was inadvertently bypassed for chips whose counters were 64-bits wide, and only applied to chips with 32-bit wide counters. This patch moves the test for amount of traffic "out" to a more common location, rather than depending on a side-effect of the software emulation of 64-bit counts on chips whose hardware is only 32-bits wide. Signed-off-by: Michael Albaugh Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c index bae4f56..f027141 100644 --- a/drivers/infiniband/hw/ipath/ipath_stats.c +++ b/drivers/infiniband/hw/ipath/ipath_stats.c @@ -55,7 +55,6 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg) u64 val64; unsigned long t0, t1; u64 ret; - unsigned long flags; t0 = jiffies; /* If fast increment counters are only 32 bits, snapshot them, @@ -92,18 +91,12 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg) if (creg == dd->ipath_cregs->cr_wordsendcnt) { if (val != dd->ipath_lastsword) { dd->ipath_sword += val - dd->ipath_lastsword; - spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); - dd->ipath_traffic_wds += val - dd->ipath_lastsword; - spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); dd->ipath_lastsword = val; } val64 = dd->ipath_sword; } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) { if (val != dd->ipath_lastrword) { dd->ipath_rword += val - dd->ipath_lastrword; - spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); - dd->ipath_traffic_wds += val - dd->ipath_lastrword; - spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); dd->ipath_lastrword = val; } val64 = dd->ipath_rword; @@ -247,6 +240,7 @@ void ipath_get_faststats(unsigned long opaque) u32 val; static unsigned cnt; unsigned long flags; + u64 traffic_wds; /* * don't access the chip while running diags, or memory diags can @@ -262,12 +256,13 @@ void ipath_get_faststats(unsigned long opaque) * exceeding a threshold, so we need to check the word-counts * even if they are 64-bit. */ - ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); - ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); - if (dd->ipath_traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD) + traffic_wds -= dd->ipath_traffic_wds; + dd->ipath_traffic_wds += traffic_wds; + if (traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD) atomic_add(5, &dd->ipath_active_time); /* S/B #define */ - dd->ipath_traffic_wds = 0; spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); if (dd->ipath_flags & IPATH_32BITCOUNTERS) { -- cgit v0.10.2 From 6a733cdc71b7aa8107caa57f2a16629aa731242a Mon Sep 17 00:00:00 2001 From: Michael Albaugh Date: Wed, 3 Oct 2007 10:47:38 -0700 Subject: IB/ipath: Better handling of unexpected GPIO interrupts The General Purpose I/O pins can be configured to cause interrupts. At the end of the interrupt code dealing with all known causes, a message is output if any bits remain un-handled. Since this is a "can't happen" scenario, it should only be triggered by bugs elsewhere. It is harmless, and potentially beneficial, to limit the damage by masking any such unexpected interrupts. This patch adds disabling of interrupts from any pins that should not have been allowed to interrupt, in addition to emitting a message. Signed-off-by: Michael Albaugh Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 61eac8c..801a20d 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -1124,10 +1124,8 @@ irqreturn_t ipath_intr(int irq, void *data) /* * Some unexpected bits remain. If they could have * caused the interrupt, complain and clear. - * MEA: this is almost certainly non-ideal. - * we should look into auto-disable of unexpected - * GPIO interrupts, possibly on a "three strikes" - * basis. + * To avoid repetition of this condition, also clear + * the mask. It is almost certainly due to error. */ const u32 mask = (u32) dd->ipath_gpio_mask; @@ -1135,6 +1133,10 @@ irqreturn_t ipath_intr(int irq, void *data) ipath_dbg("Unexpected GPIO IRQ bits %x\n", gpiostatus & mask); to_clear |= (gpiostatus & mask); + dd->ipath_gpio_mask &= ~(gpiostatus & mask); + ipath_write_kreg(dd, + dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); } } if (to_clear) { -- cgit v0.10.2 From 49739b3e24a10d819d3167a1c5b319d0b1186245 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 19 Sep 2007 16:47:31 -0700 Subject: IB/ipath: Fix IB_EVENT_PORT_ERR event The link state event calls were being generated when the SM told the SMA to change link states. This works for IB_EVENT_PORT_ACTIVE but not if the link goes down and stays down. The fix is to generate event calls from the interrupt handler when the HW link state changes. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 44784d0..1f152de 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -2086,6 +2086,8 @@ void ipath_shutdown_device(struct ipath_devdata *dd) INFINIPATH_IBCC_LINKINITCMD_SHIFT); ipath_cancel_sends(dd, 0); + signal_ib_event(dd, IB_EVENT_PORT_ERR); + /* disable IBC */ dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; ipath_write_kreg(dd, dd->ipath_kregs->kr_control, diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 801a20d..6a5dd5c 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -275,6 +275,16 @@ static char *ib_linkstate(u32 linkstate) return ret; } +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev) +{ + struct ib_event event; + + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = 1; + event.event = ev; + ib_dispatch_event(&event); +} + static void handle_e_ibstatuschanged(struct ipath_devdata *dd, ipath_err_t errs, int noprint) { @@ -373,6 +383,8 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd, dd->ipath_ibpollcnt = 0; /* some state other than 2 or 3 */ ipath_stats.sps_iblink++; if (ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) { + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); dd->ipath_flags |= IPATH_LINKDOWN; dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT | IPATH_LINKACTIVE | @@ -405,7 +417,10 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd, *dd->ipath_statusp |= IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF; dd->ipath_f_setextled(dd, lstate, ltstate); + signal_ib_event(dd, IB_EVENT_PORT_ACTIVE); } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_INIT) { + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); /* * set INIT and DOWN. Down is checked by most of the other * code, but INIT is useful to know in a few places. @@ -418,6 +433,8 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd, | IPATH_STATUS_IB_READY); dd->ipath_f_setextled(dd, lstate, ltstate); } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ARM) { + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); dd->ipath_flags |= IPATH_LINKARMED; dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKDOWN | IPATH_LINKINIT | diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 872fb36..8786dd7 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -42,6 +42,7 @@ #include #include #include +#include #include "ipath_common.h" #include "ipath_debug.h" @@ -775,6 +776,7 @@ void ipath_get_eeprom_info(struct ipath_devdata *); int ipath_update_eeprom_log(struct ipath_devdata *dd); void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr); u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev); /* * Set LED override, only the two LSBs have "public" meaning, but diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 8f15216..0ae3a7c 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -570,26 +570,16 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, else goto err; ipath_set_linkstate(dd, lstate); - if (flags & IPATH_LINKACTIVE) { - event.event = IB_EVENT_PORT_ERR; - ib_dispatch_event(&event); - } break; case IB_PORT_ARMED: if (!(flags & (IPATH_LINKINIT | IPATH_LINKACTIVE))) break; ipath_set_linkstate(dd, IPATH_IB_LINKARM); - if (flags & IPATH_LINKACTIVE) { - event.event = IB_EVENT_PORT_ERR; - ib_dispatch_event(&event); - } break; case IB_PORT_ACTIVE: if (!(flags & IPATH_LINKARMED)) break; ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); - event.event = IB_EVENT_PORT_ACTIVE; - ib_dispatch_event(&event); break; default: /* XXX We have already partially updated our state! */ diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 13aba3d..74f77e7 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -948,6 +948,7 @@ bail: int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, u32 hdrwords, struct ipath_sge_state *ss, u32 len) { + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; u32 plen; int ret; u32 dwords = (len + 3) >> 2; @@ -955,8 +956,15 @@ int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, /* +1 is for the qword padding of pbc */ plen = hdrwords + dwords + 1; - ret = ipath_verbs_send_pio(qp, (u32 *) hdr, hdrwords, - ss, len, plen, dwords); + /* Drop non-VL15 packets if we are not in the active state */ + if (!(dd->ipath_flags & IPATH_LINKACTIVE) && + qp->ibqp.qp_type != IB_QPT_SMI) { + if (qp->s_wqe) + ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + ret = 0; + } else + ret = ipath_verbs_send_pio(qp, (u32 *) hdr, hdrwords, + ss, len, plen, dwords); return ret; } -- cgit v0.10.2 From bda94e32b39c0e60d43b34a175363601b6f12ca4 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 5 Oct 2007 16:03:21 -0700 Subject: IB/ipath: Remove redundant link state checks This patch removes some redundant checks when the SMA changes the link state since the same checks are made in the lower level function that sets the state. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 0ae3a7c..3d1432d 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -402,7 +402,6 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, struct ib_event event; struct ipath_ibdev *dev; struct ipath_devdata *dd; - u32 flags; char clientrereg = 0; u16 lid, smlid; u8 lwe; @@ -541,7 +540,6 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, * is down or is being set to down. */ state = pip->linkspeed_portstate & 0xF; - flags = dd->ipath_flags; lstate = (pip->portphysstate_linkdown >> 4) & 0xF; if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) goto err; @@ -572,13 +570,9 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, ipath_set_linkstate(dd, lstate); break; case IB_PORT_ARMED: - if (!(flags & (IPATH_LINKINIT | IPATH_LINKACTIVE))) - break; ipath_set_linkstate(dd, IPATH_IB_LINKARM); break; case IB_PORT_ACTIVE: - if (!(flags & IPATH_LINKARMED)) - break; ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); break; default: -- cgit v0.10.2 From 3ac8c70f74ca67111c570f4ba828cc4b6fc229f4 Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Wed, 3 Oct 2007 12:47:00 -0700 Subject: IB/ipath: Minor fix to ordering of freeing and zeroing of tid pages. Fixed to be the same as everywhere else. copy and then zero the page * in the array first, and then pass the copy to the VM routines. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 016e7c4..5de3243 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -538,6 +538,9 @@ static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport, continue; cnt++; if (dd->ipath_pageshadow[porttid + tid]) { + struct page *p; + p = dd->ipath_pageshadow[porttid + tid]; + dd->ipath_pageshadow[porttid + tid] = NULL; ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n", pd->port_pid, tid); dd->ipath_f_put_tid(dd, &tidbase[tid], @@ -546,9 +549,7 @@ static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport, pci_unmap_page(dd->pcidev, dd->ipath_physshadow[porttid + tid], PAGE_SIZE, PCI_DMA_FROMDEVICE); - ipath_release_user_pages( - &dd->ipath_pageshadow[porttid + tid], 1); - dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_release_user_pages(&p, 1); ipath_stats.sps_pageunlocks++; } else ipath_dbg("Unused tid %u, ignoring\n", tid); -- cgit v0.10.2 From 55a98e955caab78a5959933a4a3a0136e2491d6c Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 10 Oct 2007 17:55:37 +0200 Subject: IB/mthca: Mark error paths as unlikely() in post_srq_recv functions Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index 88d219e..3f58c11 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -509,7 +509,7 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, for (nreq = 0; wr; wr = wr->next) { ind = srq->first_free; - if (ind < 0) { + if (unlikely(ind < 0)) { mthca_err(dev, "SRQ %06x full\n", srq->srqn); err = -ENOMEM; *bad_wr = wr; @@ -519,7 +519,7 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, wqe = get_wqe(srq, ind); next_ind = *wqe_to_link(wqe); - if (next_ind < 0) { + if (unlikely(next_ind < 0)) { mthca_err(dev, "SRQ %06x full\n", srq->srqn); err = -ENOMEM; *bad_wr = wr; @@ -623,7 +623,7 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, for (nreq = 0; wr; ++nreq, wr = wr->next) { ind = srq->first_free; - if (ind < 0) { + if (unlikely(ind < 0)) { mthca_err(dev, "SRQ %06x full\n", srq->srqn); err = -ENOMEM; *bad_wr = wr; @@ -633,7 +633,7 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, wqe = get_wqe(srq, ind); next_ind = *wqe_to_link(wqe); - if (next_ind < 0) { + if (unlikely(next_ind < 0)) { mthca_err(dev, "SRQ %06x full\n", srq->srqn); err = -ENOMEM; *bad_wr = wr; -- cgit v0.10.2 From 335a64a5a958002bc238c90de695e120c3c8c120 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 8 Oct 2007 10:13:00 +0200 Subject: IPoIB: Allow setting policy to ignore multicast groups The kernel IB stack allows (through the RDMA CM) userspace applications to join and use multicast groups from the IPoIB MGID range. This allows multicast traffic to be handled directly from userspace QPs, without going through the kernel stack, which gives better performance for some applications. However, to fully interoperate with IP multicast, such userspace applications need to participate in IGMP reports and queries, or else routers may not forward the multicast traffic to the system where the application is running. The simplest way to do this is to share the kernel IGMP implementation by using the IP_ADD_MEMBERSHIP option to join multicast groups that are being handled directly in userspace. However, in such cases, the actual multicast traffic should not also be handled by the IPoIB interface, because that would burn resources handling multicast packets that will just be discarded in the kernel. To handle this, this patch adds lookup on the database used for IB multicast group reference counting when IPoIB is joining multicast groups, and if a multicast group is already handled by user space, then the IPoIB kernel driver ignores the group. This is controlled by a per-interface policy flag. When the flag is set, IPoIB will not join and attach its QP to a multicast group which already has an entry in the database; when the flag is cleared, IPoIB will behave as before this change. For each IPoIB interface, the /sys/class/net/$intf/umcast attribute controls the policy flag. The default value is off/0. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index fc16bce..a198ce8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -86,6 +86,7 @@ enum { IPOIB_MCAST_STARTED = 8, IPOIB_FLAG_NETIF_STOPPED = 9, IPOIB_FLAG_ADMIN_CM = 10, + IPOIB_FLAG_UMCAST = 11, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -384,6 +385,7 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah) int ipoib_open(struct net_device *dev); int ipoib_add_pkey_attr(struct net_device *dev); +int ipoib_add_umcast_attr(struct net_device *dev); void ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_ah *address, u32 qpn); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 900335a..ff17fe3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1019,6 +1019,37 @@ static ssize_t show_pkey(struct device *dev, } static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static ssize_t show_umcast(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); +} + +static ssize_t set_umcast(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + + if (umcast_val > 0) { + set_bit(IPOIB_FLAG_UMCAST, &priv->flags); + ipoib_warn(priv, "ignoring multicast groups joined directly " + "by userspace\n"); + } else + clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); + + return count; +} +static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast); + +int ipoib_add_umcast_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_umcast); +} + static ssize_t create_child(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -1136,6 +1167,8 @@ static struct net_device *ipoib_add_port(const char *format, goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) goto sysfs_failed; if (device_create_file(&priv->dev->dev, &dev_attr_delete_child)) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 94a5709..62abfb6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -761,6 +761,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) struct ipoib_mcast *mcast, *tmcast; LIST_HEAD(remove_list); unsigned long flags; + struct ib_sa_mcmember_rec rec; ipoib_dbg_mcast(priv, "restarting multicast task\n"); @@ -794,6 +795,14 @@ void ipoib_mcast_restart_task(struct work_struct *work) if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { struct ipoib_mcast *nmcast; + /* ignore group which is directly joined by userspace */ + if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && + !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { + ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid " + IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid)); + continue; + } + /* Not found or send-only group, let's add a new entry */ ipoib_dbg_mcast(priv, "adding multicast entry for mgid " IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 6762988..293f5b8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -119,6 +119,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; if (device_create_file(&priv->dev->dev, &dev_attr_parent)) goto sysfs_failed; -- cgit v0.10.2 From 3d73c2884f45f9a297cbc956cea101405a9703f2 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 10 Oct 2007 15:43:54 -0700 Subject: mlx4_core: Fix section mismatches Commit ee49bd93 ("mlx4_core: Reset device when internal error is detected") introduced some section mismatch problems when CONFIG_HOTPLUG=n, because the error recovery code tears down and reinitializes the device after everything is loaded, which ends up calling into lots of code marked __devinit and __devexit from regular .text. Fix this by getting rid of these now-incorrect section markers. Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c index 39253d0..d4441fe 100644 --- a/drivers/net/mlx4/cq.c +++ b/drivers/net/mlx4/cq.c @@ -231,7 +231,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) } EXPORT_SYMBOL_GPL(mlx4_cq_free); -int __devinit mlx4_init_cq_table(struct mlx4_dev *dev) +int mlx4_init_cq_table(struct mlx4_dev *dev) { struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; int err; diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index 2095c84..9c36c20 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -300,8 +300,7 @@ static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, MLX4_CMD_TIME_CLASS_A); } -static void __devinit __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, - struct mlx4_eq *eq) +static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) { struct mlx4_priv *priv = mlx4_priv(dev); int index; @@ -323,8 +322,8 @@ static void __devinit __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4); } -static int __devinit mlx4_create_eq(struct mlx4_dev *dev, int nent, - u8 intr, struct mlx4_eq *eq) +static int mlx4_create_eq(struct mlx4_dev *dev, int nent, + u8 intr, struct mlx4_eq *eq) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; @@ -485,7 +484,7 @@ static void mlx4_free_irqs(struct mlx4_dev *dev) free_irq(eq_table->eq[i].irq, eq_table->eq + i); } -static int __devinit mlx4_map_clr_int(struct mlx4_dev *dev) +static int mlx4_map_clr_int(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -506,7 +505,7 @@ static void mlx4_unmap_clr_int(struct mlx4_dev *dev) iounmap(priv->clr_base); } -int __devinit mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt) +int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt) { struct mlx4_priv *priv = mlx4_priv(dev); int ret; @@ -548,7 +547,7 @@ void mlx4_unmap_eq_icm(struct mlx4_dev *dev) __free_page(priv->eq_table.icm_page); } -int __devinit mlx4_init_eq_table(struct mlx4_dev *dev) +int mlx4_init_eq_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 478b3ba..e029b8a 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -85,7 +85,7 @@ static struct mlx4_profile default_profile = { .num_mtt = 1 << 20, }; -static int __devinit mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) +static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; int i; @@ -256,10 +256,8 @@ err: return err; } -static int __devinit mlx4_init_icm(struct mlx4_dev *dev, - struct mlx4_dev_cap *dev_cap, - struct mlx4_init_hca_param *init_hca, - u64 icm_size) +static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *init_hca, u64 icm_size) { struct mlx4_priv *priv = mlx4_priv(dev); u64 aux_pages; @@ -481,7 +479,7 @@ static void mlx4_close_hca(struct mlx4_dev *dev) mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); } -static int __devinit mlx4_init_hca(struct mlx4_dev *dev) +static int mlx4_init_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_adapter adapter; @@ -554,7 +552,7 @@ err_stop_fw: return err; } -static int __devinit mlx4_setup_hca(struct mlx4_dev *dev) +static int mlx4_setup_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err; @@ -721,19 +719,12 @@ no_msi: priv->eq_table.eq[i].irq = dev->pdev->irq; } -static int __devinit mlx4_init_one(struct pci_dev *pdev, - const struct pci_device_id *id) +static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { - static int mlx4_version_printed; struct mlx4_priv *priv; struct mlx4_dev *dev; int err; - if (!mlx4_version_printed) { - printk(KERN_INFO "%s", mlx4_version); - ++mlx4_version_printed; - } - printk(KERN_INFO PFX "Initializing %s\n", pci_name(pdev)); @@ -883,7 +874,20 @@ err_disable_pdev: return err; } -static void __devexit mlx4_remove_one(struct pci_dev *pdev) +static int __devinit mlx4_init_one(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + static int mlx4_version_printed; + + if (!mlx4_version_printed) { + printk(KERN_INFO "%s", mlx4_version); + ++mlx4_version_printed; + } + + return mlx4_init_one(pdev, id); +} + +static void mlx4_remove_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); @@ -924,7 +928,7 @@ static void __devexit mlx4_remove_one(struct pci_dev *pdev) int mlx4_restart_one(struct pci_dev *pdev) { mlx4_remove_one(pdev); - return mlx4_init_one(pdev, NULL); + return __mlx4_init_one(pdev, NULL); } static struct pci_device_id mlx4_pci_table[] = { diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c index 672024a..a99e772 100644 --- a/drivers/net/mlx4/mcg.c +++ b/drivers/net/mlx4/mcg.c @@ -359,7 +359,7 @@ out: } EXPORT_SYMBOL_GPL(mlx4_multicast_detach); -int __devinit mlx4_init_mcg_table(struct mlx4_dev *dev) +int mlx4_init_mcg_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err; diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 4bc39e6..0c05a10 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -430,7 +430,7 @@ int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, } EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt); -int __devinit mlx4_init_mr_table(struct mlx4_dev *dev) +int mlx4_init_mr_table(struct mlx4_dev *dev) { struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; int err; diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c index 23dea1e..3a93c5f 100644 --- a/drivers/net/mlx4/pd.c +++ b/drivers/net/mlx4/pd.c @@ -57,7 +57,7 @@ void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn) } EXPORT_SYMBOL_GPL(mlx4_pd_free); -int __devinit mlx4_init_pd_table(struct mlx4_dev *dev) +int mlx4_init_pd_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c index 278414b..cc4b1be 100644 --- a/drivers/net/mlx4/qp.c +++ b/drivers/net/mlx4/qp.c @@ -251,7 +251,7 @@ static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn) MLX4_CMD_TIME_CLASS_B); } -int __devinit mlx4_init_qp_table(struct mlx4_dev *dev) +int mlx4_init_qp_table(struct mlx4_dev *dev) { struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; int err; diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c index 31b255a..d23f46d 100644 --- a/drivers/net/mlx4/srq.c +++ b/drivers/net/mlx4/srq.c @@ -235,7 +235,7 @@ err_out: } EXPORT_SYMBOL_GPL(mlx4_srq_query); -int __devinit mlx4_init_srq_table(struct mlx4_dev *dev) +int mlx4_init_srq_table(struct mlx4_dev *dev) { struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; int err; -- cgit v0.10.2