From 65b302ad31b02b0790417f4e65833af494cb35ce Mon Sep 17 00:00:00 2001 From: Christoph Jaeger Date: Mon, 21 Apr 2014 17:02:42 +0200 Subject: RDMA/cxgb4: Fix memory leaks in c4iw_alloc() error paths c4iw_alloc() bails out without freeing the storage that 'devp' points to. Picked up by Coverity - CID 1204241. Fixes: fa658a98a2 ("RDMA/cxgb4: Use the BAR2/WC path for kernel QPs and T5 devices") Signed-off-by: Christoph Jaeger Acked-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index f4fa50a..8914ea9 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -736,6 +736,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) pci_resource_len(devp->rdev.lldi.pdev, 2)); if (!devp->rdev.bar2_kva) { pr_err(MOD "Unable to ioremap BAR2\n"); + ib_dealloc_device(&devp->ibdev); return ERR_PTR(-EINVAL); } } else if (ocqp_supported(infop)) { @@ -747,6 +748,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) devp->rdev.lldi.vr->ocq.size); if (!devp->rdev.oc_mw_kva) { pr_err(MOD "Unable to ioremap onchip mem\n"); + ib_dealloc_device(&devp->ibdev); return ERR_PTR(-EINVAL); } } -- cgit v0.10.2 From 0cc65dd6918f529ae2c19be95b86dec19549b7ed Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Wed, 16 Apr 2014 09:37:49 +0800 Subject: RDMA/ocrdma: Convert to use simple_open() This removes an open-coded duplicate of simple_open(). Signed-off-by: Duan Jiong Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 6c54106..41a9aec 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -510,16 +510,9 @@ exit: return status; } -static int ocrdma_debugfs_open(struct inode *inode, struct file *file) -{ - if (inode->i_private) - file->private_data = inode->i_private; - return 0; -} - static const struct file_operations ocrdma_dbg_ops = { .owner = THIS_MODULE, - .open = ocrdma_debugfs_open, + .open = simple_open, .read = ocrdma_dbgfs_ops_read, }; -- cgit v0.10.2 From 11b8e22d4d0979d8201cbdf0b5fffdbe2d5bcedf Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 16 May 2014 12:42:46 -0500 Subject: RDMA/cxgb4: Fix vlan support RDMA connections over a vlan interface don't work due to import_ep() not using the correct egress device. - use the real device in import_ep() - use rdma_vlan_dev_real_dev() in get_real_dev(). Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 1f863a9..28114e6 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -47,6 +47,8 @@ #include #include +#include + #include "iw_cxgb4.h" static char *states[] = { @@ -341,10 +343,7 @@ static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) static struct net_device *get_real_dev(struct net_device *egress_dev) { - struct net_device *phys_dev = egress_dev; - if (egress_dev->priv_flags & IFF_802_1Q_VLAN) - phys_dev = vlan_dev_real_dev(egress_dev); - return phys_dev; + return rdma_vlan_dev_real_dev(egress_dev) ? : egress_dev; } static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev) @@ -1746,16 +1745,16 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, if (!ep->l2t) goto out; ep->mtu = dst_mtu(dst); - ep->tx_chan = cxgb4_port_chan(n->dev); - ep->smac_idx = (cxgb4_port_viid(n->dev) & 0x7F) << 1; + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; step = cdev->rdev.lldi.ntxq / cdev->rdev.lldi.nchan; - ep->txq_idx = cxgb4_port_idx(n->dev) * step; - ep->ctrlq_idx = cxgb4_port_idx(n->dev); + ep->txq_idx = cxgb4_port_idx(pdev) * step; + ep->ctrlq_idx = cxgb4_port_idx(pdev); step = cdev->rdev.lldi.nrxq / cdev->rdev.lldi.nchan; ep->rss_qid = cdev->rdev.lldi.rxq_ids[ - cxgb4_port_idx(n->dev) * step]; + cxgb4_port_idx(pdev) * step]; if (clear_mpa_v1) { ep->retry_with_mpa_v1 = 0; -- cgit v0.10.2 From 024ca90151f5e4296d30f72c13ff9a075e23c9ec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:03:49 +0200 Subject: IB/srp: Fix a sporadic crash triggered by cable pulling Avoid that the loops that iterate over the request ring can encounter a pointer to a SCSI command in req->scmnd that is no longer associated with that request. If the function srp_unmap_data() is invoked twice for a SCSI command that is not in flight then that would cause ib_fmr_pool_unmap() to be invoked with an invalid pointer as argument, resulting in a kernel oops. Reported-by: Sagi Grimberg Reference: http://thread.gmane.org/gmane.linux.drivers.rdma/19068/focus=19069 Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Cc: stable Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 66a908b..5b2bed8 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1594,6 +1594,12 @@ err_unmap: err_iu: srp_put_tx_iu(target, iu, SRP_IU_CMD); + /* + * Avoid that the loops that iterate over the request ring can + * encounter a dangling SCSI command pointer. + */ + req->scmnd = NULL; + spin_lock_irqsave(&target->lock, flags); list_add(&req->list, &target->free_reqs); -- cgit v0.10.2 From af24663bc8204695181cf3b92b7129efadd8d455 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:04:21 +0200 Subject: IB/srp: Fix kernel-doc warnings Avoid that the kernel-doc tool warns about missing argument descriptions for the ib_srp.[ch] source files. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 5b2bed8..4f8be37 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -813,6 +813,10 @@ static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, /** * srp_free_req() - Unmap data and add request to the free request list. + * @target: SRP target port. + * @req: Request to be freed. + * @scmnd: SCSI command associated with @req. + * @req_lim_delta: Amount to be added to @target->req_lim. */ static void srp_free_req(struct srp_target_port *target, struct srp_request *req, struct scsi_cmnd *scmnd, @@ -1455,6 +1459,7 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) /** * srp_tl_err_work() - handle a transport layer error + * @work: Work structure embedded in an SRP target port. * * Note: This function may get invoked before the rport has been created, * hence the target->rport test. @@ -2316,6 +2321,8 @@ static struct class srp_class = { /** * srp_conn_unique() - check whether the connection to a target is unique + * @host: SRP host. + * @target: SRP target port. */ static bool srp_conn_unique(struct srp_host *host, struct srp_target_port *target) -- cgit v0.10.2 From 62154b2e892807b8373a107c277f1fa8338f4333 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:04:45 +0200 Subject: IB/srp: Introduce an additional local variable This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 4f8be37..281f785 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -290,6 +290,7 @@ static int srp_new_cm_id(struct srp_target_port *target) static int srp_create_target_ib(struct srp_target_port *target) { + struct srp_device *dev = target->srp_host->srp_dev; struct ib_qp_init_attr *init_attr; struct ib_cq *recv_cq, *send_cq; struct ib_qp *qp; @@ -299,16 +300,14 @@ static int srp_create_target_ib(struct srp_target_port *target) if (!init_attr) return -ENOMEM; - recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_recv_completion, NULL, target, + recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, target, target->queue_size, target->comp_vector); if (IS_ERR(recv_cq)) { ret = PTR_ERR(recv_cq); goto err; } - send_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_send_completion, NULL, target, + send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, target, target->queue_size, target->comp_vector); if (IS_ERR(send_cq)) { ret = PTR_ERR(send_cq); @@ -327,7 +326,7 @@ static int srp_create_target_ib(struct srp_target_port *target) init_attr->send_cq = send_cq; init_attr->recv_cq = recv_cq; - qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr); + qp = ib_create_qp(dev->pd, init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_send_cq; -- cgit v0.10.2 From 76bc1e1ddd1ea8d4bc230fc0aa45c1d14c232531 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:05:21 +0200 Subject: IB/srp: Introduce srp_map_fmr() This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 281f785..be84c94 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1047,12 +1047,54 @@ static int srp_map_sg_entry(struct srp_map_state *state, return ret; } +static void srp_map_fmr(struct srp_map_state *state, + struct srp_target_port *target, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct scatterlist *sg; + int i, use_fmr; + + state->desc = req->indirect_desc; + state->pages = req->map_page; + state->next_fmr = req->fmr_list; + + use_fmr = dev->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; + + for_each_sg(scat, sg, count, i) { + if (srp_map_sg_entry(state, target, sg, i, use_fmr)) { + /* FMR mapping failed, so backtrack to the first + * unmapped entry and continue on without using FMR. + */ + dma_addr_t dma_addr; + unsigned int dma_len; + +backtrack: + sg = state->unmapped_sg; + i = state->unmapped_index; + + dma_addr = ib_sg_dma_address(ibdev, sg); + dma_len = ib_sg_dma_len(ibdev, sg); + dma_len -= (state->unmapped_addr - dma_addr); + dma_addr = state->unmapped_addr; + use_fmr = SRP_MAP_NO_FMR; + srp_map_desc(state, dma_addr, dma_len, target->rkey); + } + } + + if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(state, target)) + goto backtrack; + + req->nfmr = state->nfmr; +} + static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct scatterlist *scat, *sg; + struct scatterlist *scat; struct srp_cmd *cmd = req->cmd->buf; - int i, len, nents, count, use_fmr; + int len, nents, count; struct srp_device *dev; struct ib_device *ibdev; struct srp_map_state state; @@ -1111,35 +1153,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, target->indirect_size, DMA_TO_DEVICE); memset(&state, 0, sizeof(state)); - state.desc = req->indirect_desc; - state.pages = req->map_page; - state.next_fmr = req->fmr_list; - - use_fmr = dev->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; - - for_each_sg(scat, sg, count, i) { - if (srp_map_sg_entry(&state, target, sg, i, use_fmr)) { - /* FMR mapping failed, so backtrack to the first - * unmapped entry and continue on without using FMR. - */ - dma_addr_t dma_addr; - unsigned int dma_len; - -backtrack: - sg = state.unmapped_sg; - i = state.unmapped_index; - - dma_addr = ib_sg_dma_address(ibdev, sg); - dma_len = ib_sg_dma_len(ibdev, sg); - dma_len -= (state.unmapped_addr - dma_addr); - dma_addr = state.unmapped_addr; - use_fmr = SRP_MAP_NO_FMR; - srp_map_desc(&state, dma_addr, dma_len, target->rkey); - } - } - - if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(&state, target)) - goto backtrack; + srp_map_fmr(&state, target, req, scat, count); /* We've mapped the request, now pull as much of the indirect * descriptor table as we can into the command buffer. If this @@ -1147,7 +1161,6 @@ backtrack: * guaranteed to fit into the command, as the SCSI layer won't * give us more S/G entries than we allow. */ - req->nfmr = state.nfmr; if (state.ndesc == 1) { /* FMR mapping was able to collapse this to one entry, * so use a direct descriptor. -- cgit v0.10.2 From 539dde6fc5c30cfa76439de03ed3ba444d2579b3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:05:46 +0200 Subject: IB/srp: Introduce srp_finish_mapping() This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index be84c94..c2e0ad3 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -935,16 +935,6 @@ static int srp_map_finish_fmr(struct srp_map_state *state, struct ib_pool_fmr *fmr; u64 io_addr = 0; - if (!state->npages) - return 0; - - if (state->npages == 1) { - srp_map_desc(state, state->base_dma_addr, state->fmr_len, - target->rkey); - state->npages = state->fmr_len = 0; - return 0; - } - fmr = ib_fmr_pool_map_phys(dev->fmr_pool, state->pages, state->npages, io_addr); if (IS_ERR(fmr)) @@ -954,10 +944,32 @@ static int srp_map_finish_fmr(struct srp_map_state *state, state->nfmr++; srp_map_desc(state, 0, state->fmr_len, fmr->fmr->rkey); - state->npages = state->fmr_len = 0; + return 0; } +static int srp_finish_mapping(struct srp_map_state *state, + struct srp_target_port *target) +{ + int ret = 0; + + if (state->npages == 0) + return 0; + + if (state->npages == 1) + srp_map_desc(state, state->base_dma_addr, state->fmr_len, + target->rkey); + else + ret = srp_map_finish_fmr(state, target); + + if (ret == 0) { + state->npages = 0; + state->fmr_len = 0; + } + + return ret; +} + static void srp_map_update_start(struct srp_map_state *state, struct scatterlist *sg, int sg_index, dma_addr_t dma_addr) @@ -998,7 +1010,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, * avoided using FMR on such page fragments. */ if (dma_addr & ~dev->fmr_page_mask || dma_len > dev->fmr_max_size) { - ret = srp_map_finish_fmr(state, target); + ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -1017,7 +1029,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, while (dma_len) { if (state->npages == SRP_FMR_SIZE) { - ret = srp_map_finish_fmr(state, target); + ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -1040,7 +1052,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, */ ret = 0; if (len != dev->fmr_page_size) { - ret = srp_map_finish_fmr(state, target); + ret = srp_finish_mapping(state, target); if (!ret) srp_map_update_start(state, NULL, 0, 0); } @@ -1083,7 +1095,7 @@ backtrack: } } - if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(state, target)) + if (use_fmr == SRP_MAP_ALLOW_FMR && srp_finish_mapping(state, target)) goto backtrack; req->nfmr = state->nfmr; -- cgit v0.10.2 From b1b8854d1622b99b64cd98ed307ffd168c6d3ebd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:06:41 +0200 Subject: IB/srp: Introduce the 'register_always' kernel module parameter Add a kernel module parameter that enables memory registration also for SG-lists that can be processed without memory registration. This makes it easier for kernel developers to test the memory registration code. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index c2e0ad3..77ba965 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -66,6 +66,7 @@ static unsigned int srp_sg_tablesize; static unsigned int cmd_sg_entries; static unsigned int indirect_sg_entries; static bool allow_ext_sg; +static bool register_always; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); @@ -87,6 +88,10 @@ module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + static struct kernel_param_ops srp_tmo_ops; static int srp_reconnect_delay = 10; @@ -956,7 +961,7 @@ static int srp_finish_mapping(struct srp_map_state *state, if (state->npages == 0) return 0; - if (state->npages == 1) + if (state->npages == 1 && !register_always) srp_map_desc(state, state->base_dma_addr, state->fmr_len, target->rkey); else @@ -1138,7 +1143,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, fmt = SRP_DATA_DESC_DIRECT; len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); - if (count == 1) { + if (count == 1 && !register_always) { /* * The midlayer only generated a single gather/scatter * entry, or DMA mapping coalesced everything to a -- cgit v0.10.2 From d1b4289e16477fe13e95b88ffb7067c87b10ab6e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:07:20 +0200 Subject: IB/srp: One FMR pool per SRP connection Allocate one FMR pool per SRP connection instead of one SRP pool per HCA. This improves scalability of the SRP initiator. Only request the SCSI mid-layer to retry a SCSI command after a temporary mapping failure (-ENOMEM) but not after a permanent mapping failure. This avoids that SCSI commands are retried indefinitely if a permanent memory mapping failure occurs. Tell the SCSI mid-layer to reduce queue depth temporarily in the unlikely case where an application is queuing many requests with more than max_pages_per_fmr sg-list elements. For FMR pool allocation, base the max_pages_per_fmr parameter on the HCA memory registration limit. Only try to allocate an FMR pool if FMR is supported. Signed-off-by: Bart Van Assche Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 77ba965..80dfe17 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -293,12 +293,31 @@ static int srp_new_cm_id(struct srp_target_port *target) return 0; } +static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_fmr_pool_param fmr_param; + + memset(&fmr_param, 0, sizeof(fmr_param)); + fmr_param.pool_size = target->scsi_host->can_queue; + fmr_param.dirty_watermark = fmr_param.pool_size / 4; + fmr_param.cache = 1; + fmr_param.max_pages_per_fmr = dev->max_pages_per_fmr; + fmr_param.page_shift = ilog2(dev->fmr_page_size); + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + return ib_create_fmr_pool(dev->pd, &fmr_param); +} + static int srp_create_target_ib(struct srp_target_port *target) { struct srp_device *dev = target->srp_host->srp_dev; struct ib_qp_init_attr *init_attr; struct ib_cq *recv_cq, *send_cq; struct ib_qp *qp; + struct ib_fmr_pool *fmr_pool = NULL; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); @@ -341,6 +360,19 @@ static int srp_create_target_ib(struct srp_target_port *target) if (ret) goto err_qp; + if (dev->has_fmr) { + fmr_pool = srp_alloc_fmr_pool(target); + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FMR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + target->fmr_pool = fmr_pool; + } + if (target->qp) ib_destroy_qp(target->qp); if (target->recv_cq) @@ -377,6 +409,8 @@ static void srp_free_target_ib(struct srp_target_port *target) { int i; + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); ib_destroy_qp(target->qp); ib_destroy_cq(target->send_cq); ib_destroy_cq(target->recv_cq); @@ -623,8 +657,8 @@ static int srp_alloc_req_data(struct srp_target_port *target) req = &target->req_ring[i]; req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), GFP_KERNEL); - req->map_page = kmalloc(SRP_FMR_SIZE * sizeof(void *), - GFP_KERNEL); + req->map_page = kmalloc(srp_dev->max_pages_per_fmr * + sizeof(void *), GFP_KERNEL); req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); if (!req->fmr_list || !req->map_page || !req->indirect_desc) goto out; @@ -936,11 +970,10 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, static int srp_map_finish_fmr(struct srp_map_state *state, struct srp_target_port *target) { - struct srp_device *dev = target->srp_host->srp_dev; struct ib_pool_fmr *fmr; u64 io_addr = 0; - fmr = ib_fmr_pool_map_phys(dev->fmr_pool, state->pages, + fmr = ib_fmr_pool_map_phys(target->fmr_pool, state->pages, state->npages, io_addr); if (IS_ERR(fmr)) return PTR_ERR(fmr); @@ -1033,7 +1066,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, srp_map_update_start(state, sg, sg_index, dma_addr); while (dma_len) { - if (state->npages == SRP_FMR_SIZE) { + if (state->npages == dev->max_pages_per_fmr) { ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -1077,7 +1110,7 @@ static void srp_map_fmr(struct srp_map_state *state, state->pages = req->map_page; state->next_fmr = req->fmr_list; - use_fmr = dev->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; + use_fmr = target->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; for_each_sg(scat, sg, count, i) { if (srp_map_sg_entry(state, target, sg, i, use_fmr)) { @@ -1555,7 +1588,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) struct srp_cmd *cmd; struct ib_device *dev; unsigned long flags; - int len, result; + int len, ret; const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; /* @@ -1567,12 +1600,9 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) if (in_scsi_eh) mutex_lock(&rport->mutex); - result = srp_chkready(target->rport); - if (unlikely(result)) { - scmnd->result = result; - scmnd->scsi_done(scmnd); - goto unlock_rport; - } + scmnd->result = srp_chkready(target->rport); + if (unlikely(scmnd->result)) + goto err; spin_lock_irqsave(&target->lock, flags); iu = __srp_get_tx_iu(target, SRP_IU_CMD); @@ -1587,7 +1617,6 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len, DMA_TO_DEVICE); - scmnd->result = 0; scmnd->host_scribble = (void *) req; cmd = iu->buf; @@ -1604,7 +1633,15 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) len = srp_map_data(scmnd, target, req); if (len < 0) { shost_printk(KERN_ERR, target->scsi_host, - PFX "Failed to map data\n"); + PFX "Failed to map data (%d)\n", len); + /* + * If we ran out of memory descriptors (-ENOMEM) because an + * application is queuing many requests with more than + * max_pages_per_fmr sg-list elements, tell the SCSI mid-layer + * to reduce queue depth temporarily. + */ + scmnd->result = len == -ENOMEM ? + DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16; goto err_iu; } @@ -1616,11 +1653,13 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) goto err_unmap; } + ret = 0; + unlock_rport: if (in_scsi_eh) mutex_unlock(&rport->mutex); - return 0; + return ret; err_unmap: srp_unmap_data(scmnd, target, req); @@ -1640,10 +1679,15 @@ err_iu: err_unlock: spin_unlock_irqrestore(&target->lock, flags); - if (in_scsi_eh) - mutex_unlock(&rport->mutex); +err: + if (scmnd->result) { + scmnd->scsi_done(scmnd); + ret = 0; + } else { + ret = SCSI_MLQUEUE_HOST_BUSY; + } - return SCSI_MLQUEUE_HOST_BUSY; + goto unlock_rport; } /* @@ -2647,7 +2691,8 @@ static ssize_t srp_create_target(struct device *dev, container_of(dev, struct srp_host, dev); struct Scsi_Host *target_host; struct srp_target_port *target; - struct ib_device *ibdev = host->srp_dev->dev; + struct srp_device *srp_dev = host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; int ret; target_host = scsi_host_alloc(&srp_template, @@ -2692,8 +2737,8 @@ static ssize_t srp_create_target(struct device *dev, goto err; } - if (!host->srp_dev->fmr_pool && !target->allow_ext_sg && - target->cmd_sg_cnt < target->sg_tablesize) { + if (!srp_dev->has_fmr && !target->allow_ext_sg && + target->cmd_sg_cnt < target->sg_tablesize) { pr_warn("No FMR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); target->sg_tablesize = target->cmd_sg_cnt; } @@ -2832,9 +2877,9 @@ static void srp_add_one(struct ib_device *device) { struct srp_device *srp_dev; struct ib_device_attr *dev_attr; - struct ib_fmr_pool_param fmr_param; struct srp_host *host; - int max_pages_per_fmr, fmr_page_shift, s, e, p; + int fmr_page_shift, s, e, p; + u64 max_pages_per_fmr; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); if (!dev_attr) @@ -2849,6 +2894,9 @@ static void srp_add_one(struct ib_device *device) if (!srp_dev) goto free_attr; + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + /* * Use the smallest page size supported by the HCA, down to a * minimum of 4096 bytes. We're unlikely to build large sglists @@ -2857,7 +2905,15 @@ static void srp_add_one(struct ib_device *device) fmr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); srp_dev->fmr_page_size = 1 << fmr_page_shift; srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); - srp_dev->fmr_max_size = srp_dev->fmr_page_size * SRP_FMR_SIZE; + max_pages_per_fmr = dev_attr->max_mr_size; + do_div(max_pages_per_fmr, srp_dev->fmr_page_size); + srp_dev->max_pages_per_fmr = min_t(u64, SRP_FMR_SIZE, + max_pages_per_fmr); + srp_dev->fmr_max_size = srp_dev->fmr_page_size * + srp_dev->max_pages_per_fmr; + pr_debug("%s: fmr_page_shift = %d, dev_attr->max_mr_size = %#llx, max_pages_per_fmr = %d, fmr_max_size = %#x\n", + device->name, fmr_page_shift, dev_attr->max_mr_size, + srp_dev->max_pages_per_fmr, srp_dev->fmr_max_size); INIT_LIST_HEAD(&srp_dev->dev_list); @@ -2873,27 +2929,6 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->mr)) goto err_pd; - for (max_pages_per_fmr = SRP_FMR_SIZE; - max_pages_per_fmr >= SRP_FMR_MIN_SIZE; - max_pages_per_fmr /= 2, srp_dev->fmr_max_size /= 2) { - memset(&fmr_param, 0, sizeof fmr_param); - fmr_param.pool_size = SRP_FMR_POOL_SIZE; - fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; - fmr_param.cache = 1; - fmr_param.max_pages_per_fmr = max_pages_per_fmr; - fmr_param.page_shift = fmr_page_shift; - fmr_param.access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - - srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); - if (!IS_ERR(srp_dev->fmr_pool)) - break; - } - - if (IS_ERR(srp_dev->fmr_pool)) - srp_dev->fmr_pool = NULL; - if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; @@ -2956,8 +2991,6 @@ static void srp_remove_one(struct ib_device *device) kfree(host); } - if (srp_dev->fmr_pool) - ib_destroy_fmr_pool(srp_dev->fmr_pool); ib_dereg_mr(srp_dev->mr); ib_dealloc_pd(srp_dev->pd); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index aad27b7..2d99e52 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -67,9 +67,6 @@ enum { SRP_TAG_TSK_MGMT = 1U << 31, SRP_FMR_SIZE = 512, - SRP_FMR_MIN_SIZE = 128, - SRP_FMR_POOL_SIZE = 1024, - SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4, SRP_MAP_ALLOW_FMR = 0, SRP_MAP_NO_FMR = 1, @@ -91,10 +88,11 @@ struct srp_device { struct ib_device *dev; struct ib_pd *pd; struct ib_mr *mr; - struct ib_fmr_pool *fmr_pool; u64 fmr_page_mask; int fmr_page_size; int fmr_max_size; + int max_pages_per_fmr; + bool has_fmr; }; struct srp_host { @@ -131,6 +129,7 @@ struct srp_target_port { struct ib_cq *send_cq ____cacheline_aligned_in_smp; struct ib_cq *recv_cq; struct ib_qp *qp; + struct ib_fmr_pool *fmr_pool; u32 lkey; u32 rkey; enum srp_target_state state; -- cgit v0.10.2 From 52ede08f00ebfc21d50a1f3e5908a78655900519 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:07:45 +0200 Subject: IB/srp: Rename FMR-related variables The next patch will cause the renamed variables to be shared between the code for FMR and for FR memory registration. Make the names of these variables independent of the memory registration mode. This patch does not change any functionality. The start of this patch was the changes applied via the following shell command: sed -i.orig 's/SRP_FMR_SIZE/SRP_MAX_PAGES_PER_MR/g; \ s/fmr_page_mask/mr_page_mask/g;s/fmr_page_size/mr_page_size/g; \ s/fmr_page_shift/mr_page_shift/g;s/fmr_max_size/mr_max_size/g; \ s/max_pages_per_fmr/max_pages_per_mr/g;s/nfmr/nmdesc/g; \ s/fmr_len/dma_len/g' drivers/infiniband/ulp/srp/ib_srp.[ch] Signed-off-by: Bart Van Assche Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 80dfe17..c9b3b9e 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -302,8 +302,8 @@ static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) fmr_param.pool_size = target->scsi_host->can_queue; fmr_param.dirty_watermark = fmr_param.pool_size / 4; fmr_param.cache = 1; - fmr_param.max_pages_per_fmr = dev->max_pages_per_fmr; - fmr_param.page_shift = ilog2(dev->fmr_page_size); + fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; + fmr_param.page_shift = ilog2(dev->mr_page_size); fmr_param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ); @@ -657,7 +657,7 @@ static int srp_alloc_req_data(struct srp_target_port *target) req = &target->req_ring[i]; req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), GFP_KERNEL); - req->map_page = kmalloc(srp_dev->max_pages_per_fmr * + req->map_page = kmalloc(srp_dev->max_pages_per_mr * sizeof(void *), GFP_KERNEL); req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); if (!req->fmr_list || !req->map_page || !req->indirect_desc) @@ -810,7 +810,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, return; pfmr = req->fmr_list; - while (req->nfmr--) + while (req->nmdesc--) ib_fmr_pool_unmap(*pfmr++); ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), @@ -979,9 +979,9 @@ static int srp_map_finish_fmr(struct srp_map_state *state, return PTR_ERR(fmr); *state->next_fmr++ = fmr; - state->nfmr++; + state->nmdesc++; - srp_map_desc(state, 0, state->fmr_len, fmr->fmr->rkey); + srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey); return 0; } @@ -995,14 +995,14 @@ static int srp_finish_mapping(struct srp_map_state *state, return 0; if (state->npages == 1 && !register_always) - srp_map_desc(state, state->base_dma_addr, state->fmr_len, + srp_map_desc(state, state->base_dma_addr, state->dma_len, target->rkey); else ret = srp_map_finish_fmr(state, target); if (ret == 0) { state->npages = 0; - state->fmr_len = 0; + state->dma_len = 0; } return ret; @@ -1047,7 +1047,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, * that were never quite defined, but went away when the initiator * avoided using FMR on such page fragments. */ - if (dma_addr & ~dev->fmr_page_mask || dma_len > dev->fmr_max_size) { + if (dma_addr & ~dev->mr_page_mask || dma_len > dev->mr_max_size) { ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -1066,7 +1066,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, srp_map_update_start(state, sg, sg_index, dma_addr); while (dma_len) { - if (state->npages == dev->max_pages_per_fmr) { + if (state->npages == dev->max_pages_per_mr) { ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -1074,12 +1074,12 @@ static int srp_map_sg_entry(struct srp_map_state *state, srp_map_update_start(state, sg, sg_index, dma_addr); } - len = min_t(unsigned int, dma_len, dev->fmr_page_size); + len = min_t(unsigned int, dma_len, dev->mr_page_size); if (!state->npages) state->base_dma_addr = dma_addr; state->pages[state->npages++] = dma_addr; - state->fmr_len += len; + state->dma_len += len; dma_addr += len; dma_len -= len; } @@ -1089,7 +1089,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, * boundries. */ ret = 0; - if (len != dev->fmr_page_size) { + if (len != dev->mr_page_size) { ret = srp_finish_mapping(state, target); if (!ret) srp_map_update_start(state, NULL, 0, 0); @@ -1136,7 +1136,7 @@ backtrack: if (use_fmr == SRP_MAP_ALLOW_FMR && srp_finish_mapping(state, target)) goto backtrack; - req->nfmr = state->nfmr; + req->nmdesc = state->nmdesc; } static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, @@ -1189,7 +1189,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, buf->key = cpu_to_be32(target->rkey); buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); - req->nfmr = 0; + req->nmdesc = 0; goto map_complete; } @@ -1637,7 +1637,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) /* * If we ran out of memory descriptors (-ENOMEM) because an * application is queuing many requests with more than - * max_pages_per_fmr sg-list elements, tell the SCSI mid-layer + * max_pages_per_mr sg-list elements, tell the SCSI mid-layer * to reduce queue depth temporarily. */ scmnd->result = len == -ENOMEM ? @@ -2878,8 +2878,8 @@ static void srp_add_one(struct ib_device *device) struct srp_device *srp_dev; struct ib_device_attr *dev_attr; struct srp_host *host; - int fmr_page_shift, s, e, p; - u64 max_pages_per_fmr; + int mr_page_shift, s, e, p; + u64 max_pages_per_mr; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); if (!dev_attr) @@ -2902,18 +2902,18 @@ static void srp_add_one(struct ib_device *device) * minimum of 4096 bytes. We're unlikely to build large sglists * out of smaller entries. */ - fmr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); - srp_dev->fmr_page_size = 1 << fmr_page_shift; - srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); - max_pages_per_fmr = dev_attr->max_mr_size; - do_div(max_pages_per_fmr, srp_dev->fmr_page_size); - srp_dev->max_pages_per_fmr = min_t(u64, SRP_FMR_SIZE, - max_pages_per_fmr); - srp_dev->fmr_max_size = srp_dev->fmr_page_size * - srp_dev->max_pages_per_fmr; - pr_debug("%s: fmr_page_shift = %d, dev_attr->max_mr_size = %#llx, max_pages_per_fmr = %d, fmr_max_size = %#x\n", - device->name, fmr_page_shift, dev_attr->max_mr_size, - srp_dev->max_pages_per_fmr, srp_dev->fmr_max_size); + mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); + srp_dev->mr_page_size = 1 << mr_page_shift; + srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); + max_pages_per_mr = dev_attr->max_mr_size; + do_div(max_pages_per_mr, srp_dev->mr_page_size); + srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, + max_pages_per_mr); + srp_dev->mr_max_size = srp_dev->mr_page_size * + srp_dev->max_pages_per_mr; + pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, max_pages_per_mr = %d, mr_max_size = %#x\n", + device->name, mr_page_shift, dev_attr->max_mr_size, + srp_dev->max_pages_per_mr, srp_dev->mr_max_size); INIT_LIST_HEAD(&srp_dev->dev_list); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 2d99e52..eb130486 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -66,7 +66,7 @@ enum { SRP_TAG_NO_REQ = ~0U, SRP_TAG_TSK_MGMT = 1U << 31, - SRP_FMR_SIZE = 512, + SRP_MAX_PAGES_PER_MR = 512, SRP_MAP_ALLOW_FMR = 0, SRP_MAP_NO_FMR = 1, @@ -88,10 +88,10 @@ struct srp_device { struct ib_device *dev; struct ib_pd *pd; struct ib_mr *mr; - u64 fmr_page_mask; - int fmr_page_size; - int fmr_max_size; - int max_pages_per_fmr; + u64 mr_page_mask; + int mr_page_size; + int mr_max_size; + int max_pages_per_mr; bool has_fmr; }; @@ -114,7 +114,7 @@ struct srp_request { u64 *map_page; struct srp_direct_buf *indirect_desc; dma_addr_t indirect_dma_addr; - short nfmr; + short nmdesc; short index; }; @@ -201,10 +201,10 @@ struct srp_map_state { struct srp_direct_buf *desc; u64 *pages; dma_addr_t base_dma_addr; - u32 fmr_len; + u32 dma_len; u32 total_len; unsigned int npages; - unsigned int nfmr; + unsigned int nmdesc; unsigned int ndesc; struct scatterlist *unmapped_sg; int unmapped_index; -- cgit v0.10.2 From 5cfb17828d877a5541171087b9d746befdf2a126 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 15:08:34 +0200 Subject: IB/srp: Add fast registration support Certain HCA types (e.g. Connect-IB) and certain configurations (e.g. ConnectX VF) support fast registration but not FMR. Hence add fast registration support. In function srp_rport_reconnect(), move the the srp_finish_req() loop from after to before the srp_create_target_ib() call. This is needed to avoid that srp_finish_req() tries to queue any invalidation requests for rkeys associated with the old queue pair on the newly allocated queue pair. Invoking srp_finish_req() before the queue pair has been reallocated is safe since srp_claim_req() handles completions correctly that arrive after srp_finish_req() has been invoked. Signed-off-by: Bart Van Assche Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index c9b3b9e..b42f132 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -66,6 +66,7 @@ static unsigned int srp_sg_tablesize; static unsigned int cmd_sg_entries; static unsigned int indirect_sg_entries; static bool allow_ext_sg; +static bool prefer_fr; static bool register_always; static int topspin_workarounds = 1; @@ -88,6 +89,10 @@ module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); +module_param(prefer_fr, bool, 0444); +MODULE_PARM_DESC(prefer_fr, +"Whether to use fast registration if both FMR and fast registration are supported"); + module_param(register_always, bool, 0444); MODULE_PARM_DESC(register_always, "Use memory registration even for contiguous memory regions"); @@ -311,6 +316,132 @@ static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) return ib_create_fmr_pool(dev->pd, &fmr_param); } +/** + * srp_destroy_fr_pool() - free the resources owned by a pool + * @pool: Fast registration pool to be destroyed. + */ +static void srp_destroy_fr_pool(struct srp_fr_pool *pool) +{ + int i; + struct srp_fr_desc *d; + + if (!pool) + return; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + if (d->frpl) + ib_free_fast_reg_page_list(d->frpl); + if (d->mr) + ib_dereg_mr(d->mr); + } + kfree(pool); +} + +/** + * srp_create_fr_pool() - allocate and initialize a pool for fast registration + * @device: IB device to allocate fast registration descriptors for. + * @pd: Protection domain associated with the FR descriptors. + * @pool_size: Number of descriptors to allocate. + * @max_page_list_len: Maximum fast registration work request page list length. + */ +static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, + struct ib_pd *pd, int pool_size, + int max_page_list_len) +{ + struct srp_fr_pool *pool; + struct srp_fr_desc *d; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + int i, ret = -EINVAL; + + if (pool_size <= 0) + goto err; + ret = -ENOMEM; + pool = kzalloc(sizeof(struct srp_fr_pool) + + pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL); + if (!pool) + goto err; + pool->size = pool_size; + pool->max_page_list_len = max_page_list_len; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto destroy_pool; + } + d->mr = mr; + frpl = ib_alloc_fast_reg_page_list(device, max_page_list_len); + if (IS_ERR(frpl)) { + ret = PTR_ERR(frpl); + goto destroy_pool; + } + d->frpl = frpl; + list_add_tail(&d->entry, &pool->free_list); + } + +out: + return pool; + +destroy_pool: + srp_destroy_fr_pool(pool); + +err: + pool = ERR_PTR(ret); + goto out; +} + +/** + * srp_fr_pool_get() - obtain a descriptor suitable for fast registration + * @pool: Pool to obtain descriptor from. + */ +static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) +{ + struct srp_fr_desc *d = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + if (!list_empty(&pool->free_list)) { + d = list_first_entry(&pool->free_list, typeof(*d), entry); + list_del(&d->entry); + } + spin_unlock_irqrestore(&pool->lock, flags); + + return d; +} + +/** + * srp_fr_pool_put() - put an FR descriptor back in the free list + * @pool: Pool the descriptor was allocated from. + * @desc: Pointer to an array of fast registration descriptor pointers. + * @n: Number of descriptors to put back. + * + * Note: The caller must already have queued an invalidation request for + * desc->mr->rkey before calling this function. + */ +static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, + int n) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&pool->lock, flags); + for (i = 0; i < n; i++) + list_add(&desc[i]->entry, &pool->free_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + + return srp_create_fr_pool(dev->dev, dev->pd, + target->scsi_host->can_queue, + dev->max_pages_per_mr); +} + static int srp_create_target_ib(struct srp_target_port *target) { struct srp_device *dev = target->srp_host->srp_dev; @@ -318,6 +449,8 @@ static int srp_create_target_ib(struct srp_target_port *target) struct ib_cq *recv_cq, *send_cq; struct ib_qp *qp; struct ib_fmr_pool *fmr_pool = NULL; + struct srp_fr_pool *fr_pool = NULL; + const int m = 1 + dev->use_fast_reg; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); @@ -332,7 +465,7 @@ static int srp_create_target_ib(struct srp_target_port *target) } send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, target, - target->queue_size, target->comp_vector); + m * target->queue_size, target->comp_vector); if (IS_ERR(send_cq)) { ret = PTR_ERR(send_cq); goto err_recv_cq; @@ -341,11 +474,11 @@ static int srp_create_target_ib(struct srp_target_port *target) ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); init_attr->event_handler = srp_qp_event; - init_attr->cap.max_send_wr = target->queue_size; + init_attr->cap.max_send_wr = m * target->queue_size; init_attr->cap.max_recv_wr = target->queue_size; init_attr->cap.max_recv_sge = 1; init_attr->cap.max_send_sge = 1; - init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; init_attr->qp_type = IB_QPT_RC; init_attr->send_cq = send_cq; init_attr->recv_cq = recv_cq; @@ -360,7 +493,18 @@ static int srp_create_target_ib(struct srp_target_port *target) if (ret) goto err_qp; - if (dev->has_fmr) { + if (dev->use_fast_reg && dev->has_fr) { + fr_pool = srp_alloc_fr_pool(target); + if (IS_ERR(fr_pool)) { + ret = PTR_ERR(fr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + target->fr_pool = fr_pool; + } else if (!dev->use_fast_reg && dev->has_fmr) { fmr_pool = srp_alloc_fmr_pool(target); if (IS_ERR(fmr_pool)) { ret = PTR_ERR(fmr_pool); @@ -407,10 +551,16 @@ err: */ static void srp_free_target_ib(struct srp_target_port *target) { + struct srp_device *dev = target->srp_host->srp_dev; int i; - if (target->fmr_pool) - ib_destroy_fmr_pool(target->fmr_pool); + if (dev->use_fast_reg) { + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + } else { + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + } ib_destroy_qp(target->qp); ib_destroy_cq(target->send_cq); ib_destroy_cq(target->recv_cq); @@ -615,7 +765,8 @@ static void srp_disconnect_target(struct srp_target_port *target) static void srp_free_req_data(struct srp_target_port *target) { - struct ib_device *ibdev = target->srp_host->srp_dev->dev; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; struct srp_request *req; int i; @@ -624,7 +775,10 @@ static void srp_free_req_data(struct srp_target_port *target) for (i = 0; i < target->req_ring_size; ++i) { req = &target->req_ring[i]; - kfree(req->fmr_list); + if (dev->use_fast_reg) + kfree(req->fr_list); + else + kfree(req->fmr_list); kfree(req->map_page); if (req->indirect_dma_addr) { ib_dma_unmap_single(ibdev, req->indirect_dma_addr, @@ -643,6 +797,7 @@ static int srp_alloc_req_data(struct srp_target_port *target) struct srp_device *srp_dev = target->srp_host->srp_dev; struct ib_device *ibdev = srp_dev->dev; struct srp_request *req; + void *mr_list; dma_addr_t dma_addr; int i, ret = -ENOMEM; @@ -655,12 +810,20 @@ static int srp_alloc_req_data(struct srp_target_port *target) for (i = 0; i < target->req_ring_size; ++i) { req = &target->req_ring[i]; - req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), - GFP_KERNEL); + mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + GFP_KERNEL); + if (!mr_list) + goto out; + if (srp_dev->use_fast_reg) + req->fr_list = mr_list; + else + req->fmr_list = mr_list; req->map_page = kmalloc(srp_dev->max_pages_per_mr * sizeof(void *), GFP_KERNEL); + if (!req->map_page) + goto out; req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); - if (!req->fmr_list || !req->map_page || !req->indirect_desc) + if (!req->indirect_desc) goto out; dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, @@ -797,21 +960,56 @@ static int srp_connect_target(struct srp_target_port *target) } } +static int srp_inv_rkey(struct srp_target_port *target, u32 rkey) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .wr_id = LOCAL_INV_WR_ID_MASK, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = rkey, + }; + + return ib_post_send(target->qp, &wr, &bad_wr); +} + static void srp_unmap_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct ib_device *ibdev = target->srp_host->srp_dev->dev; - struct ib_pool_fmr **pfmr; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + int i, res; if (!scsi_sglist(scmnd) || (scmnd->sc_data_direction != DMA_TO_DEVICE && scmnd->sc_data_direction != DMA_FROM_DEVICE)) return; - pfmr = req->fmr_list; - while (req->nmdesc--) - ib_fmr_pool_unmap(*pfmr++); + if (dev->use_fast_reg) { + struct srp_fr_desc **pfr; + + for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { + res = srp_inv_rkey(target, (*pfr)->mr->rkey); + if (res < 0) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "Queueing INV WR for rkey %#x failed (%d)\n", + (*pfr)->mr->rkey, res); + queue_work(system_long_wq, + &target->tl_err_work); + } + } + if (req->nmdesc) + srp_fr_pool_put(target->fr_pool, req->fr_list, + req->nmdesc); + } else { + struct ib_pool_fmr **pfmr; + + for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) + ib_fmr_pool_unmap(*pfmr); + } ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), scmnd->sc_data_direction); @@ -924,21 +1122,19 @@ static int srp_rport_reconnect(struct srp_rport *rport) * callbacks will have finished before a new QP is allocated. */ ret = srp_new_cm_id(target); - /* - * Whether or not creating a new CM ID succeeded, create a new - * QP. This guarantees that all completion callback function - * invocations have finished before request resetting starts. - */ - if (ret == 0) - ret = srp_create_target_ib(target); - else - srp_create_target_ib(target); for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; srp_finish_req(target, req, NULL, DID_RESET << 16); } + /* + * Whether or not creating a new CM ID succeeded, create a new + * QP. This guarantees that all callback functions for the old QP have + * finished before any send requests are posted on the new QP. + */ + ret += srp_create_target_ib(target); + INIT_LIST_HEAD(&target->free_tx); for (i = 0; i < target->queue_size; ++i) list_add(&target->tx_ring[i]->list, &target->free_tx); @@ -986,6 +1182,47 @@ static int srp_map_finish_fmr(struct srp_map_state *state, return 0; } +static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_send_wr *bad_wr; + struct ib_send_wr wr; + struct srp_fr_desc *desc; + u32 rkey; + + desc = srp_fr_pool_get(target->fr_pool); + if (!desc) + return -ENOMEM; + + rkey = ib_inc_rkey(desc->mr->rkey); + ib_update_fast_reg_key(desc->mr, rkey); + + memcpy(desc->frpl->page_list, state->pages, + sizeof(state->pages[0]) * state->npages); + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_FAST_REG_MR; + wr.wr_id = FAST_REG_WR_ID_MASK; + wr.wr.fast_reg.iova_start = state->base_dma_addr; + wr.wr.fast_reg.page_list = desc->frpl; + wr.wr.fast_reg.page_list_len = state->npages; + wr.wr.fast_reg.page_shift = ilog2(dev->mr_page_size); + wr.wr.fast_reg.length = state->dma_len; + wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + wr.wr.fast_reg.rkey = desc->mr->lkey; + + *state->next_fr++ = desc; + state->nmdesc++; + + srp_map_desc(state, state->base_dma_addr, state->dma_len, + desc->mr->rkey); + + return ib_post_send(target->qp, &wr, &bad_wr); +} + static int srp_finish_mapping(struct srp_map_state *state, struct srp_target_port *target) { @@ -998,7 +1235,9 @@ static int srp_finish_mapping(struct srp_map_state *state, srp_map_desc(state, state->base_dma_addr, state->dma_len, target->rkey); else - ret = srp_map_finish_fmr(state, target); + ret = target->srp_host->srp_dev->use_fast_reg ? + srp_map_finish_fr(state, target) : + srp_map_finish_fmr(state, target); if (ret == 0) { state->npages = 0; @@ -1020,7 +1259,7 @@ static void srp_map_update_start(struct srp_map_state *state, static int srp_map_sg_entry(struct srp_map_state *state, struct srp_target_port *target, struct scatterlist *sg, int sg_index, - int use_fmr) + bool use_mr) { struct srp_device *dev = target->srp_host->srp_dev; struct ib_device *ibdev = dev->dev; @@ -1032,22 +1271,24 @@ static int srp_map_sg_entry(struct srp_map_state *state, if (!dma_len) return 0; - if (use_fmr == SRP_MAP_NO_FMR) { - /* Once we're in direct map mode for a request, we don't - * go back to FMR mode, so no need to update anything + if (!use_mr) { + /* + * Once we're in direct map mode for a request, we don't + * go back to FMR or FR mode, so no need to update anything * other than the descriptor. */ srp_map_desc(state, dma_addr, dma_len, target->rkey); return 0; } - /* If we start at an offset into the FMR page, don't merge into - * the current FMR. Finish it out, and use the kernel's MR for this - * sg entry. This is to avoid potential bugs on some SRP targets - * that were never quite defined, but went away when the initiator - * avoided using FMR on such page fragments. + /* + * Since not all RDMA HW drivers support non-zero page offsets for + * FMR, if we start at an offset into a page, don't merge into the + * current FMR mapping. Finish it out, and use the kernel's MR for + * this sg entry. */ - if (dma_addr & ~dev->mr_page_mask || dma_len > dev->mr_max_size) { + if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) || + dma_len > dev->mr_max_size) { ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -1057,16 +1298,18 @@ static int srp_map_sg_entry(struct srp_map_state *state, return 0; } - /* If this is the first sg to go into the FMR, save our position. - * We need to know the first unmapped entry, its index, and the - * first unmapped address within that entry to be able to restart - * mapping after an error. + /* + * If this is the first sg that will be mapped via FMR or via FR, save + * our position. We need to know the first unmapped entry, its index, + * and the first unmapped address within that entry to be able to + * restart mapping after an error. */ if (!state->unmapped_sg) srp_map_update_start(state, sg, sg_index, dma_addr); while (dma_len) { - if (state->npages == dev->max_pages_per_mr) { + unsigned offset = dma_addr & ~dev->mr_page_mask; + if (state->npages == dev->max_pages_per_mr || offset != 0) { ret = srp_finish_mapping(state, target); if (ret) return ret; @@ -1074,17 +1317,18 @@ static int srp_map_sg_entry(struct srp_map_state *state, srp_map_update_start(state, sg, sg_index, dma_addr); } - len = min_t(unsigned int, dma_len, dev->mr_page_size); + len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); if (!state->npages) state->base_dma_addr = dma_addr; - state->pages[state->npages++] = dma_addr; + state->pages[state->npages++] = dma_addr & dev->mr_page_mask; state->dma_len += len; dma_addr += len; dma_len -= len; } - /* If the last entry of the FMR wasn't a full page, then we need to + /* + * If the last entry of the MR wasn't a full page, then we need to * close it out and start a new one -- we can only merge at page * boundries. */ @@ -1097,25 +1341,32 @@ static int srp_map_sg_entry(struct srp_map_state *state, return ret; } -static void srp_map_fmr(struct srp_map_state *state, - struct srp_target_port *target, struct srp_request *req, - struct scatterlist *scat, int count) +static int srp_map_sg(struct srp_map_state *state, + struct srp_target_port *target, struct srp_request *req, + struct scatterlist *scat, int count) { struct srp_device *dev = target->srp_host->srp_dev; struct ib_device *ibdev = dev->dev; struct scatterlist *sg; - int i, use_fmr; + int i; + bool use_mr; state->desc = req->indirect_desc; state->pages = req->map_page; - state->next_fmr = req->fmr_list; - - use_fmr = target->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; + if (dev->use_fast_reg) { + state->next_fr = req->fr_list; + use_mr = !!target->fr_pool; + } else { + state->next_fmr = req->fmr_list; + use_mr = !!target->fmr_pool; + } for_each_sg(scat, sg, count, i) { - if (srp_map_sg_entry(state, target, sg, i, use_fmr)) { - /* FMR mapping failed, so backtrack to the first - * unmapped entry and continue on without using FMR. + if (srp_map_sg_entry(state, target, sg, i, use_mr)) { + /* + * Memory registration failed, so backtrack to the + * first unmapped entry and continue on without using + * memory registration. */ dma_addr_t dma_addr; unsigned int dma_len; @@ -1128,15 +1379,17 @@ backtrack: dma_len = ib_sg_dma_len(ibdev, sg); dma_len -= (state->unmapped_addr - dma_addr); dma_addr = state->unmapped_addr; - use_fmr = SRP_MAP_NO_FMR; + use_mr = false; srp_map_desc(state, dma_addr, dma_len, target->rkey); } } - if (use_fmr == SRP_MAP_ALLOW_FMR && srp_finish_mapping(state, target)) + if (use_mr && srp_finish_mapping(state, target)) goto backtrack; req->nmdesc = state->nmdesc; + + return 0; } static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, @@ -1193,9 +1446,9 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, goto map_complete; } - /* We have more than one scatter/gather entry, so build our indirect - * descriptor table, trying to merge as many entries with FMR as we - * can. + /* + * We have more than one scatter/gather entry, so build our indirect + * descriptor table, trying to merge as many entries as we can. */ indirect_hdr = (void *) cmd->add_data; @@ -1203,7 +1456,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, target->indirect_size, DMA_TO_DEVICE); memset(&state, 0, sizeof(state)); - srp_map_fmr(&state, target, req, scat, count); + srp_map_sg(&state, target, req, scat, count); /* We've mapped the request, now pull as much of the indirect * descriptor table as we can into the command buffer. If this @@ -1212,7 +1465,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, * give us more S/G entries than we allow. */ if (state.ndesc == 1) { - /* FMR mapping was able to collapse this to one entry, + /* + * Memory registration collapsed the sg-list into one entry, * so use a direct descriptor. */ struct srp_direct_buf *buf = (void *) cmd->add_data; @@ -1535,14 +1789,24 @@ static void srp_tl_err_work(struct work_struct *work) srp_start_tl_fail_timers(target->rport); } -static void srp_handle_qp_err(enum ib_wc_status wc_status, bool send_err, - struct srp_target_port *target) +static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status, + bool send_err, struct srp_target_port *target) { if (target->connected && !target->qp_in_error) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed %s status %d\n", - send_err ? "send" : "receive", - wc_status); + if (wr_id & LOCAL_INV_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "LOCAL_INV failed with status %d\n", + wc_status); + } else if (wr_id & FAST_REG_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "FAST_REG_MR failed status %d\n", + wc_status); + } else { + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %d for iu %p\n", + send_err ? "send" : "receive", + wc_status, (void *)(uintptr_t)wr_id); + } queue_work(system_long_wq, &target->tl_err_work); } target->qp_in_error = true; @@ -1558,7 +1822,7 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) if (likely(wc.status == IB_WC_SUCCESS)) { srp_handle_recv(target, &wc); } else { - srp_handle_qp_err(wc.status, false, target); + srp_handle_qp_err(wc.wr_id, wc.status, false, target); } } } @@ -1574,7 +1838,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) iu = (struct srp_iu *) (uintptr_t) wc.wr_id; list_add(&iu->list, &target->free_tx); } else { - srp_handle_qp_err(wc.status, true, target); + srp_handle_qp_err(wc.wr_id, wc.status, true, target); } } } @@ -2737,9 +3001,9 @@ static ssize_t srp_create_target(struct device *dev, goto err; } - if (!srp_dev->has_fmr && !target->allow_ext_sg && + if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg && target->cmd_sg_cnt < target->sg_tablesize) { - pr_warn("No FMR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); target->sg_tablesize = target->cmd_sg_cnt; } @@ -2896,6 +3160,13 @@ static void srp_add_one(struct ib_device *device) srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (dev_attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!srp_dev->has_fmr && !srp_dev->has_fr) + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); /* * Use the smallest page size supported by the HCA, down to a @@ -2909,10 +3180,16 @@ static void srp_add_one(struct ib_device *device) do_div(max_pages_per_mr, srp_dev->mr_page_size); srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, max_pages_per_mr); + if (srp_dev->use_fast_reg) { + srp_dev->max_pages_per_mr = + min_t(u32, srp_dev->max_pages_per_mr, + dev_attr->max_fast_reg_page_list_len); + } srp_dev->mr_max_size = srp_dev->mr_page_size * srp_dev->max_pages_per_mr; - pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, max_pages_per_mr = %d, mr_max_size = %#x\n", + pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", device->name, mr_page_shift, dev_attr->max_mr_size, + dev_attr->max_fast_reg_page_list_len, srp_dev->max_pages_per_mr, srp_dev->mr_max_size); INIT_LIST_HEAD(&srp_dev->dev_list); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index eb130486..e46ecb1 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -68,8 +68,8 @@ enum { SRP_MAX_PAGES_PER_MR = 512, - SRP_MAP_ALLOW_FMR = 0, - SRP_MAP_NO_FMR = 1, + LOCAL_INV_WR_ID_MASK = 1, + FAST_REG_WR_ID_MASK = 2, }; enum srp_target_state { @@ -83,6 +83,12 @@ enum srp_iu_type { SRP_IU_RSP, }; +/* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. + * @mr_max_size: Maximum size in bytes of a single FMR / FR registration + * request. + */ struct srp_device { struct list_head dev_list; struct ib_device *dev; @@ -93,6 +99,8 @@ struct srp_device { int mr_max_size; int max_pages_per_mr; bool has_fmr; + bool has_fr; + bool use_fast_reg; }; struct srp_host { @@ -110,7 +118,10 @@ struct srp_request { struct list_head list; struct scsi_cmnd *scmnd; struct srp_iu *cmd; - struct ib_pool_fmr **fmr_list; + union { + struct ib_pool_fmr **fmr_list; + struct srp_fr_desc **fr_list; + }; u64 *map_page; struct srp_direct_buf *indirect_desc; dma_addr_t indirect_dma_addr; @@ -129,7 +140,10 @@ struct srp_target_port { struct ib_cq *send_cq ____cacheline_aligned_in_smp; struct ib_cq *recv_cq; struct ib_qp *qp; - struct ib_fmr_pool *fmr_pool; + union { + struct ib_fmr_pool *fmr_pool; + struct srp_fr_pool *fr_pool; + }; u32 lkey; u32 rkey; enum srp_target_state state; @@ -196,8 +210,59 @@ struct srp_iu { enum dma_data_direction direction; }; +/** + * struct srp_fr_desc - fast registration work request arguments + * @entry: Entry in srp_fr_pool.free_list. + * @mr: Memory region. + * @frpl: Fast registration page list. + */ +struct srp_fr_desc { + struct list_head entry; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; +}; + +/** + * struct srp_fr_pool - pool of fast registration descriptors + * + * An entry is available for allocation if and only if it occurs in @free_list. + * + * @size: Number of descriptors in this pool. + * @max_page_list_len: Maximum fast registration work request page list length. + * @lock: Protects free_list. + * @free_list: List of free descriptors. + * @desc: Fast registration descriptor pool. + */ +struct srp_fr_pool { + int size; + int max_page_list_len; + spinlock_t lock; + struct list_head free_list; + struct srp_fr_desc desc[0]; +}; + +/** + * struct srp_map_state - per-request DMA memory mapping state + * @desc: Pointer to the element of the SRP buffer descriptor array + * that is being filled in. + * @pages: Array with DMA addresses of pages being considered for + * memory registration. + * @base_dma_addr: DMA address of the first page that has not yet been mapped. + * @dma_len: Number of bytes that will be registered with the next + * FMR or FR memory registration call. + * @total_len: Total number of bytes in the sg-list being mapped. + * @npages: Number of page addresses in the pages[] array. + * @nmdesc: Number of FMR or FR memory descriptors used for mapping. + * @ndesc: Number of SRP buffer descriptors that have been filled in. + * @unmapped_sg: First element of the sg-list that is mapped via FMR or FR. + * @unmapped_index: Index of the first element mapped via FMR or FR. + * @unmapped_addr: DMA address of the first element mapped via FMR or FR. + */ struct srp_map_state { - struct ib_pool_fmr **next_fmr; + union { + struct ib_pool_fmr **next_fmr; + struct srp_fr_desc **next_fr; + }; struct srp_direct_buf *desc; u64 *pages; dma_addr_t base_dma_addr; -- cgit v0.10.2 From b73c3adabdb1e2cb2f2c69bc3cbb9306aa3f9700 Mon Sep 17 00:00:00 2001 From: Ariel Nahum Date: Thu, 22 May 2014 11:00:18 +0300 Subject: IB/iser: Simplify connection management iSER relies on refcounting to manage iser connections establishment and teardown. Following commit 39ff05dbbbdb ("IB/iser: Enhance disconnection logic for multi-pathing"), iser connection maintain 3 references: - iscsi_endpoint (at creation stage) - cma_id (at connection request stage) - iscsi_conn (at bind stage) We can avoid taking explicit refcounts by correctly serializing iser teardown flows (graceful and non-graceful). Our approach is to trigger a scheduled work to handle ordered teardown by gracefully waiting for 2 cleanup stages to complete: 1. Cleanup of live pending tasks indicated by iscsi_conn_stop completion 2. Flush errors processing Each completed stage will notify a waiting worker thread when it is done to allow teardwon continuation. Since iSCSI connection establishment may trigger endpoint disconnect without a successful endpoint connect, we rely on the iscsi <-> iser binding (.conn_bind) to learn about the teardown policy we should take wrt cleanup stages. Since all cleanup worker threads are scheduled (release_wq) in .ep_disconnect it is safe to assume that when module_exit is called, all cleanup workers are already scheduled. Thus proper module unload shall flush all scheduled works before allowing safe exit, to guarantee no resources got left behind. Signed-off-by: Ariel Nahum Signed-off-by: Sagi Grimberg Reviewed-by: Roi Dayan Reviewed-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 25f195e..f217488 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -99,6 +99,7 @@ MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); module_param_named(pi_guard, iser_pi_guard, int, 0644); MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:CRC)"); +static struct workqueue_struct *release_wq; struct iser_global ig; void @@ -337,24 +338,6 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) return cls_conn; } -static void -iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn) -{ - struct iscsi_conn *conn = cls_conn->dd_data; - struct iser_conn *ib_conn = conn->dd_data; - - iscsi_conn_teardown(cls_conn); - /* - * Userspace will normally call the stop callback and - * already have freed the ib_conn, but if it goofed up then - * we free it here. - */ - if (ib_conn) { - ib_conn->iscsi_conn = NULL; - iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ - } -} - static int iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, struct iscsi_cls_conn *cls_conn, uint64_t transport_eph, @@ -392,29 +375,39 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, conn->dd_data = ib_conn; ib_conn->iscsi_conn = conn; - iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */ return 0; } +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *iscsi_conn; + struct iser_conn *ib_conn; + + iscsi_conn = cls_conn->dd_data; + ib_conn = iscsi_conn->dd_data; + reinit_completion(&ib_conn->stop_completion); + + return iscsi_conn_start(cls_conn); +} + static void iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) { struct iscsi_conn *conn = cls_conn->dd_data; struct iser_conn *ib_conn = conn->dd_data; + iser_dbg("stopping iscsi_conn: %p, ib_conn: %p\n", conn, ib_conn); + iscsi_conn_stop(cls_conn, flag); + /* * Userspace may have goofed up and not bound the connection or * might have only partially setup the connection. */ if (ib_conn) { - iscsi_conn_stop(cls_conn, flag); - /* - * There is no unbind event so the stop callback - * must release the ref from the bind. - */ - iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ + conn->dd_data = NULL; + complete(&ib_conn->stop_completion); } - conn->dd_data = NULL; } static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) @@ -652,19 +645,20 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) struct iser_conn *ib_conn; ib_conn = ep->dd_data; - if (ib_conn->iscsi_conn) - /* - * Must suspend xmit path if the ep is bound to the - * iscsi_conn, so we know we are not accessing the ib_conn - * when we free it. - * - * This may not be bound if the ep poll failed. - */ - iscsi_suspend_tx(ib_conn->iscsi_conn); - - - iser_info("ib conn %p state %d\n", ib_conn, ib_conn->state); + iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state); iser_conn_terminate(ib_conn); + + /* + * if iser_conn and iscsi_conn are bound, we must wait iscsi_conn_stop + * call and ISER_CONN_DOWN state before freeing the iser resources. + * otherwise we are safe to free resources immediately. + */ + if (ib_conn->iscsi_conn) { + INIT_WORK(&ib_conn->release_work, iser_release_work); + queue_work(release_wq, &ib_conn->release_work); + } else { + iser_conn_release(ib_conn); + } } static umode_t iser_attr_is_visible(int param_type, int param) @@ -748,13 +742,13 @@ static struct iscsi_transport iscsi_iser_transport = { /* connection management */ .create_conn = iscsi_iser_conn_create, .bind_conn = iscsi_iser_conn_bind, - .destroy_conn = iscsi_iser_conn_destroy, + .destroy_conn = iscsi_conn_teardown, .attr_is_visible = iser_attr_is_visible, .set_param = iscsi_iser_set_param, .get_conn_param = iscsi_conn_get_param, .get_ep_param = iscsi_iser_get_ep_param, .get_session_param = iscsi_session_get_param, - .start_conn = iscsi_conn_start, + .start_conn = iscsi_iser_conn_start, .stop_conn = iscsi_iser_conn_stop, /* iscsi host params */ .get_host_param = iscsi_host_get_param, @@ -801,6 +795,12 @@ static int __init iser_init(void) mutex_init(&ig.connlist_mutex); INIT_LIST_HEAD(&ig.connlist); + release_wq = alloc_workqueue("release workqueue", 0, 0); + if (!release_wq) { + iser_err("failed to allocate release workqueue\n"); + return -ENOMEM; + } + iscsi_iser_scsi_transport = iscsi_register_transport( &iscsi_iser_transport); if (!iscsi_iser_scsi_transport) { @@ -819,7 +819,24 @@ register_transport_failure: static void __exit iser_exit(void) { + struct iser_conn *ib_conn, *n; + int connlist_empty; + iser_dbg("Removing iSER datamover...\n"); + destroy_workqueue(release_wq); + + mutex_lock(&ig.connlist_mutex); + connlist_empty = list_empty(&ig.connlist); + mutex_unlock(&ig.connlist_mutex); + + if (!connlist_empty) { + iser_err("Error cleanup stage completed but we still have iser " + "connections, destroying them anyway.\n"); + list_for_each_entry_safe(ib_conn, n, &ig.connlist, conn_list) { + iser_conn_release(ib_conn); + } + } + iscsi_unregister_transport(&iscsi_iser_transport); kmem_cache_destroy(ig.desc_cache); } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 324129f..d309620 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -333,6 +333,8 @@ struct iser_conn { int post_recv_buf_count; /* posted rx count */ atomic_t post_send_buf_count; /* posted tx count */ char name[ISER_OBJECT_NAME_SIZE]; + struct work_struct release_work; + struct completion stop_completion; struct list_head conn_list; /* entry in ig conn list */ char *login_buf; @@ -417,12 +419,12 @@ void iscsi_iser_recv(struct iscsi_conn *conn, void iser_conn_init(struct iser_conn *ib_conn); -void iser_conn_get(struct iser_conn *ib_conn); - -int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed); +void iser_conn_release(struct iser_conn *ib_conn); void iser_conn_terminate(struct iser_conn *ib_conn); +void iser_release_work(struct work_struct *work); + void iser_rcv_completion(struct iser_rx_desc *desc, unsigned long dto_xfer_len, struct iser_conn *ib_conn); diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 32849f2..4c698e5 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -581,14 +581,30 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, return ret; } +void iser_release_work(struct work_struct *work) +{ + struct iser_conn *ib_conn; + + ib_conn = container_of(work, struct iser_conn, release_work); + + /* wait for .conn_stop callback */ + wait_for_completion(&ib_conn->stop_completion); + + /* wait for the qp`s post send and post receive buffers to empty */ + wait_event_interruptible(ib_conn->wait, + ib_conn->state == ISER_CONN_DOWN); + + iser_conn_release(ib_conn); +} + /** * Frees all conn objects and deallocs conn descriptor */ -static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) +void iser_conn_release(struct iser_conn *ib_conn) { struct iser_device *device = ib_conn->device; - BUG_ON(ib_conn->state != ISER_CONN_DOWN); + BUG_ON(ib_conn->state == ISER_CONN_UP); mutex_lock(&ig.connlist_mutex); list_del(&ib_conn->conn_list); @@ -600,27 +616,13 @@ static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) if (device != NULL) iser_device_try_release(device); /* if cma handler context, the caller actually destroy the id */ - if (ib_conn->cma_id != NULL && can_destroy_id) { + if (ib_conn->cma_id != NULL) { rdma_destroy_id(ib_conn->cma_id); ib_conn->cma_id = NULL; } iscsi_destroy_endpoint(ib_conn->ep); } -void iser_conn_get(struct iser_conn *ib_conn) -{ - atomic_inc(&ib_conn->refcount); -} - -int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id) -{ - if (atomic_dec_and_test(&ib_conn->refcount)) { - iser_conn_release(ib_conn, can_destroy_id); - return 1; - } - return 0; -} - /** * triggers start of the disconnect procedures and wait for them to be done */ @@ -638,24 +640,19 @@ void iser_conn_terminate(struct iser_conn *ib_conn) if (err) iser_err("Failed to disconnect, conn: 0x%p err %d\n", ib_conn,err); - - wait_event_interruptible(ib_conn->wait, - ib_conn->state == ISER_CONN_DOWN); - - iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */ } -static int iser_connect_error(struct rdma_cm_id *cma_id) +static void iser_connect_error(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; + ib_conn = (struct iser_conn *)cma_id->context; ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); - return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ } -static int iser_addr_handler(struct rdma_cm_id *cma_id) +static void iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; struct iser_conn *ib_conn; @@ -664,7 +661,8 @@ static int iser_addr_handler(struct rdma_cm_id *cma_id) device = iser_device_find_by_ib_device(cma_id); if (!device) { iser_err("device lookup/creation failed\n"); - return iser_connect_error(cma_id); + iser_connect_error(cma_id); + return; } ib_conn = (struct iser_conn *)cma_id->context; @@ -686,13 +684,12 @@ static int iser_addr_handler(struct rdma_cm_id *cma_id) ret = rdma_resolve_route(cma_id, 1000); if (ret) { iser_err("resolve route failed: %d\n", ret); - return iser_connect_error(cma_id); + iser_connect_error(cma_id); + return; } - - return 0; } -static int iser_route_handler(struct rdma_cm_id *cma_id) +static void iser_route_handler(struct rdma_cm_id *cma_id) { struct rdma_conn_param conn_param; int ret; @@ -720,9 +717,9 @@ static int iser_route_handler(struct rdma_cm_id *cma_id) goto failure; } - return 0; + return; failure: - return iser_connect_error(cma_id); + iser_connect_error(cma_id); } static void iser_connected_handler(struct rdma_cm_id *cma_id) @@ -739,10 +736,9 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) wake_up_interruptible(&ib_conn->wait); } -static int iser_disconnected_handler(struct rdma_cm_id *cma_id) +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; - int ret; ib_conn = (struct iser_conn *)cma_id->context; @@ -762,24 +758,19 @@ static int iser_disconnected_handler(struct rdma_cm_id *cma_id) ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); } - - ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ - return ret; } static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - int ret = 0; - iser_info("event %d status %d conn %p id %p\n", event->event, event->status, cma_id->context, cma_id); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: - ret = iser_addr_handler(cma_id); + iser_addr_handler(cma_id); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - ret = iser_route_handler(cma_id); + iser_route_handler(cma_id); break; case RDMA_CM_EVENT_ESTABLISHED: iser_connected_handler(cma_id); @@ -789,18 +780,18 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: - ret = iser_connect_error(cma_id); + iser_connect_error(cma_id); break; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: - ret = iser_disconnected_handler(cma_id); + iser_disconnected_handler(cma_id); break; default: iser_err("Unexpected RDMA CM event (%d)\n", event->event); break; } - return ret; + return 0; } void iser_conn_init(struct iser_conn *ib_conn) @@ -809,7 +800,7 @@ void iser_conn_init(struct iser_conn *ib_conn) init_waitqueue_head(&ib_conn->wait); ib_conn->post_recv_buf_count = 0; atomic_set(&ib_conn->post_send_buf_count, 0); - atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */ + init_completion(&ib_conn->stop_completion); INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); } @@ -837,7 +828,6 @@ int iser_connect(struct iser_conn *ib_conn, ib_conn->state = ISER_CONN_PENDING; - iser_conn_get(ib_conn); /* ref ib conn's cma id */ ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)ib_conn, RDMA_PS_TCP, IB_QPT_RC); @@ -874,9 +864,8 @@ id_failure: ib_conn->cma_id = NULL; addr_failure: ib_conn->state = ISER_CONN_DOWN; - iser_conn_put(ib_conn, 1); /* deref ib conn's cma id */ connect_failure: - iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */ + iser_conn_release(ib_conn); return err; } -- cgit v0.10.2 From 66d4e62d27875f3ae417e3b73396e290bb184b5c Mon Sep 17 00:00:00 2001 From: Ariel Nahum Date: Thu, 22 May 2014 11:00:19 +0300 Subject: IB/iser: Fix a possible race in iser connection states transition In some circumstances (multiple targets), RDMA_CM ESTABLISHED event and ep_disconnect may race. In this case, the iser connection state may transition to UP (after ep_disconnect transitioned it to TERMINATING), while the connection is being torn down. Upon RDMA_CM event ESTABLISHED we allow iser connection state to transition to UP only from PENDING. We also make sure to protect this state change (done under the connection lock). Signed-off-by: Ariel Nahum Signed-off-by: Sagi Grimberg Reviewed-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 4c698e5..ea01075 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -732,8 +732,8 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); ib_conn = (struct iser_conn *)cma_id->context; - ib_conn->state = ISER_CONN_UP; - wake_up_interruptible(&ib_conn->wait); + if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_PENDING, ISER_CONN_UP)) + wake_up_interruptible(&ib_conn->wait); } static void iser_disconnected_handler(struct rdma_cm_id *cma_id) -- cgit v0.10.2 From e7eeffa4a073c9d5a4b04a64c5efb1edcb89fb11 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 22 May 2014 11:00:20 +0300 Subject: IB/iser: Add missing newlines to logging messages Logging messages need terminating newlines to avoid possible message interleaving. Add them. Signed-off-by: Roi Dayan Signed-off-by: Joe Perches Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index f217488..eb79739 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -508,28 +508,28 @@ iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn, case ISCSI_PARAM_HDRDGST_EN: sscanf(buf, "%d", &value); if (value) { - iser_err("DataDigest wasn't negotiated to None"); + iser_err("DataDigest wasn't negotiated to None\n"); return -EPROTO; } break; case ISCSI_PARAM_DATADGST_EN: sscanf(buf, "%d", &value); if (value) { - iser_err("DataDigest wasn't negotiated to None"); + iser_err("DataDigest wasn't negotiated to None\n"); return -EPROTO; } break; case ISCSI_PARAM_IFMARKER_EN: sscanf(buf, "%d", &value); if (value) { - iser_err("IFMarker wasn't negotiated to No"); + iser_err("IFMarker wasn't negotiated to No\n"); return -EPROTO; } break; case ISCSI_PARAM_OFMARKER_EN: sscanf(buf, "%d", &value); if (value) { - iser_err("OFMarker wasn't negotiated to No"); + iser_err("OFMarker wasn't negotiated to No\n"); return -EPROTO; } break; -- cgit v0.10.2 From c7ca4b69d9ac8ec22f7c6b37a1ab9ac910c726cd Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 22 May 2014 11:00:21 +0300 Subject: IB/iser: Bump version to 1.4 Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index d309620..97cd385 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -69,7 +69,7 @@ #define DRV_NAME "iser" #define PFX DRV_NAME ": " -#define DRV_VER "1.3" +#define DRV_VER "1.4" #define iser_dbg(fmt, arg...) \ do { \ -- cgit v0.10.2 From 8524867b9c3d11e38de084f47e2128f43e74610a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 18 May 2014 18:32:38 +0300 Subject: mlx5_core: Fix signature handover operation for interleaved buffers When the data and protection are interleaved in the memory domain, no need to expand the mkey total length. At the moment no Linux user works (iSER initiator & target) in interleaved mode. This may change in the future as for SCSI pass-through devices there is no real point in target performing de-interleaving and re-interleaving of the protection data in the PT stage. Regardless, signature verbs support this mode. Signed-off-by: Sagi Grimberg Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index dc930ed..74ee4a4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2275,7 +2275,10 @@ static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, /* length of the protected region, data + protection */ region_len = wr->sg_list->length; - if (wr->wr.sig_handover.prot) + if (wr->wr.sig_handover.prot && + (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey || + wr->wr.sig_handover.prot->addr != wr->sg_list->addr || + wr->wr.sig_handover.prot->length != wr->sg_list->length)) region_len += wr->wr.sig_handover.prot->length; /** -- cgit v0.10.2 From 5c273b16771eaeb3957d365bb3695b92aff037cf Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 18 May 2014 18:32:39 +0300 Subject: mlx5_core: Simplify signature handover wqe for interleaved buffers No need for repetition format pattern in case the data and protection are already interleaved in the memory domain since the pattern already exists. A single key entry is sufficient and may save some extra fetch ops. Signed-off-by: Sagi Grimberg Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 74ee4a4..00b5563 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2131,9 +2131,13 @@ static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, int ret; int wqe_size; - if (!wr->wr.sig_handover.prot) { + if (!wr->wr.sig_handover.prot || + (data_key == wr->wr.sig_handover.prot->lkey && + data_va == wr->wr.sig_handover.prot->addr && + data_len == wr->wr.sig_handover.prot->length)) { /** * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. * So need construct: * ------------------ * | data_klm | @@ -2187,23 +2191,13 @@ static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, data_sentry->bcount = cpu_to_be16(block_size); data_sentry->key = cpu_to_be32(data_key); data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + prot_sentry->bcount = cpu_to_be16(prot_size); prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); - if (prot_key == data_key && prot_va == data_va) { - /** - * The data and protection are interleaved - * in a single memory region - **/ - prot_sentry->va = cpu_to_be64(data_va + block_size); - prot_sentry->stride = cpu_to_be16(block_size + prot_size); - data_sentry->stride = prot_sentry->stride; - } else { - /* The data and protection are two different buffers */ - prot_sentry->va = cpu_to_be64(prot_va); - data_sentry->stride = cpu_to_be16(block_size); - prot_sentry->stride = cpu_to_be16(prot_size); - } wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + sizeof(*prot_sentry), 64); } -- cgit v0.10.2 From c7f44fbda68a6b2d6ceb10e45c711750e779bace Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 18 May 2014 18:32:40 +0300 Subject: mlx5_core: Copy DIF fields only when input and output space values match Some DIF implementations (SCSI initiator/target) may want to use different input/output values for application tag and/or reference tag. So in case memory/wire domain values don't match HW must not copy them. Signed-off-by: Sagi Grimberg Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 00b5563..a89f704 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2078,6 +2078,7 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr, struct ib_sig_domain *wire = &sig_attrs->wire; int ret, selector; + memset(bsf, 0, sizeof(*bsf)); switch (sig_attrs->mem.sig_type) { case IB_SIG_TYPE_T10_DIF: if (sig_attrs->wire.sig_type != IB_SIG_TYPE_T10_DIF) @@ -2090,9 +2091,11 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr, /* Same block structure */ basic->bsf_size_sbs = 1 << 4; if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) - basic->wire.copy_byte_mask = 0xff; - else - basic->wire.copy_byte_mask = 0x3f; + basic->wire.copy_byte_mask |= 0xc0; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= 0x30; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= 0x0f; } else basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); -- cgit v0.10.2 From 096f7e72c604e983e14b84b84fc37593fc433585 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 22 May 2014 14:50:08 +0300 Subject: IB/mlx5: Fix error handling in reg_umr If ib_post_send fails when posting the UMR work request in reg_umr, the code doesn't release the temporary pas buffer allocated, and doesn't dma_unmap it. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 81392b2..ad58985 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -730,7 +730,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, struct mlx5_ib_mr *mr; struct ib_sge sg; int size = sizeof(u64) * npages; - int err; + int err = 0; int i; for (i = 0; i < 1; i++) { @@ -751,7 +751,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); if (!mr->pas) { err = -ENOMEM; - goto error; + goto free_mr; } mlx5_ib_populate_pas(dev, umem, page_shift, @@ -760,9 +760,8 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, DMA_TO_DEVICE); if (dma_mapping_error(ddev, mr->dma)) { - kfree(mr->pas); err = -ENOMEM; - goto error; + goto free_pas; } memset(&wr, 0, sizeof(wr)); @@ -778,26 +777,28 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, err = ib_post_send(umrc->qp, &wr, &bad); if (err) { mlx5_ib_warn(dev, "post send failed, err %d\n", err); - up(&umrc->sem); - goto error; + goto unmap_dma; } wait_for_completion(&mr->done); - up(&umrc->sem); + if (mr->status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } +unmap_dma: + up(&umrc->sem); dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); + +free_pas: kfree(mr->pas); - if (mr->status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed\n"); - err = -EFAULT; - goto error; +free_mr: + if (err) { + free_cached_mr(dev, mr); + return ERR_PTR(err); } return mr; - -error: - free_cached_mr(dev, mr); - return ERR_PTR(err); } static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, -- cgit v0.10.2 From 8605933a22796243982e7ed838deca5549c64c62 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 22 May 2014 14:50:09 +0300 Subject: IB/mlx5: Add MR to radix tree in reg_mr_callback For memory regions that are allocated using reg_umr, the suffix of mlx5_core_create_mkey isn't being called. Instead the creation is completed in a callback function (reg_mr_callback). This means that these MRs aren't being added to the MR radix tree. Add them in the callback. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ad58985..9d932a2 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -73,6 +73,8 @@ static void reg_mr_callback(int status, void *context) struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; unsigned long flags; + struct mlx5_mr_table *table = &dev->mdev.priv.mr_table; + int err; spin_lock_irqsave(&ent->lock, flags); ent->pending--; @@ -107,6 +109,13 @@ static void reg_mr_callback(int status, void *context) ent->cur++; ent->size++; spin_unlock_irqrestore(&ent->lock, flags); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), + &mr->mmr); + if (err) + pr_err("Error inserting to mr tree. 0x%x\n", -err); + write_unlock_irqrestore(&table->lock, flags); } static int add_keys(struct mlx5_ib_dev *dev, int c, int num) -- cgit v0.10.2 From b475598aec63f2efbc78f0ff1895d917d2370846 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 22 May 2014 14:50:10 +0300 Subject: mlx5_core: Store MR attributes in mlx5_mr_core during creation and after UMR The patch stores iova, pd and size during mr creation and after UMRs that modify them. It removes the unused access flags field. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9d932a2..f472ab2 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -794,6 +794,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, err = -EFAULT; } + mr->mmr.iova = virt_addr; + mr->mmr.size = len; + mr->mmr.pd = to_mpd(pd)->pdn; + unmap_dma: up(&umrc->sem); dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 4cc9276..ac52a0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -82,7 +82,11 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, return mlx5_cmd_status_to_err(&lout.hdr); } + mr->iova = be64_to_cpu(in->seg.start_addr); + mr->size = be64_to_cpu(in->seg.len); mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; + mr->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; + mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", be32_to_cpu(lout.mkey), key, mr->key); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 93cef63..2bce4aa 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -427,7 +427,6 @@ struct mlx5_core_mr { u64 size; u32 key; u32 pd; - u32 access; }; struct mlx5_core_srq { -- cgit v0.10.2 From 48fea837bb2709bda73cd4ae8bbd57cb277f7b90 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 22 May 2014 14:50:11 +0300 Subject: IB/mlx5: Set QP offsets and parameters for user QPs and not just for kernel QPs For user QPs, the creation process does not currently initialize the fields: * qp->rq.offset * qp->sq.offset * qp->sq.wqe_shift Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a89f704..d13ddf1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -574,6 +574,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, uar_index = uuarn_to_uar_index(&context->uuari, uuarn); mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + err = set_user_buf_size(dev, qp, &ucmd); if (err) goto err_uuar; -- cgit v0.10.2 From a74d24168d2df78e7a532567eb0e7538e6b09568 Mon Sep 17 00:00:00 2001 From: Shachar Raindel Date: Thu, 22 May 2014 14:50:12 +0300 Subject: IB/mlx5: Refactor UMR to have its own context struct Instead of having the UMR context part of each memory region, allocate a struct on the stack. This allows queuing multiple UMRs that access the same memory region. Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5054158..f2ccf1a 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -264,8 +264,6 @@ struct mlx5_ib_mr { __be64 *pas; dma_addr_t dma; int npages; - struct completion done; - enum ib_wc_status status; struct mlx5_ib_dev *dev; struct mlx5_create_mkey_mbox_out out; struct mlx5_core_sig_ctx *sig; @@ -277,6 +275,17 @@ struct mlx5_ib_fast_reg_page_list { dma_addr_t map; }; +struct mlx5_ib_umr_context { + enum ib_wc_status status; + struct completion done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->status = -1; + init_completion(&context->done); +} + struct umr_common { struct ib_pd *pd; struct ib_cq *cq; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index f472ab2..14ee4fd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -708,7 +708,7 @@ static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) { - struct mlx5_ib_mr *mr; + struct mlx5_ib_umr_context *context; struct ib_wc wc; int err; @@ -721,9 +721,9 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) if (err == 0) break; - mr = (struct mlx5_ib_mr *)(unsigned long)wc.wr_id; - mr->status = wc.status; - complete(&mr->done); + context = (struct mlx5_ib_umr_context *)wc.wr_id; + context->status = wc.status; + complete(&context->done); } ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); } @@ -735,6 +735,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, struct mlx5_ib_dev *dev = to_mdev(pd->device); struct device *ddev = dev->ib_dev.dma_device; struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; struct ib_send_wr wr, *bad; struct mlx5_ib_mr *mr; struct ib_sge sg; @@ -774,24 +775,21 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, } memset(&wr, 0, sizeof(wr)); - wr.wr_id = (u64)(unsigned long)mr; + wr.wr_id = (u64)(unsigned long)&umr_context; prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); - /* We serialize polls so one process does not kidnap another's - * completion. This is not a problem since wr is completed in - * around 1 usec - */ + mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); - init_completion(&mr->done); err = ib_post_send(umrc->qp, &wr, &bad); if (err) { mlx5_ib_warn(dev, "post send failed, err %d\n", err); goto unmap_dma; - } - wait_for_completion(&mr->done); - if (mr->status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed\n"); - err = -EFAULT; + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } } mr->mmr.iova = virt_addr; @@ -940,24 +938,26 @@ error: static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; struct ib_send_wr wr, *bad; int err; memset(&wr, 0, sizeof(wr)); - wr.wr_id = (u64)(unsigned long)mr; + wr.wr_id = (u64)(unsigned long)&umr_context; prep_umr_unreg_wqe(dev, &wr, mr->mmr.key); + mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); - init_completion(&mr->done); err = ib_post_send(umrc->qp, &wr, &bad); if (err) { up(&umrc->sem); mlx5_ib_dbg(dev, "err %d\n", err); goto error; + } else { + wait_for_completion(&umr_context.done); + up(&umrc->sem); } - wait_for_completion(&mr->done); - up(&umrc->sem); - if (mr->status != IB_WC_SUCCESS) { + if (umr_context.status != IB_WC_SUCCESS) { mlx5_ib_warn(dev, "unreg umr failed\n"); err = -EFAULT; goto error; -- cgit v0.10.2 From a8237b32a3faab155a5dc8f886452147ce73da3e Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Mon, 5 May 2014 19:33:21 +0200 Subject: IB/mlx5: add missing padding at end of struct mlx5_ib_create_cq The i386 ABI disagrees with most other ABIs regarding alignment of data type larger than 4 bytes: on most ABIs a padding must be added at end of the structures, while it is not required on i386. So for most ABI struct mlx5_ib_create_cq get padded to be aligned on a 8 bytes multiple, while for i386, such padding is not added. The tool pahole can be used to find such implicit padding: $ pahole --anon_include \ --nested_anon_include \ --recursive \ --class_name mlx5_ib_create_cq \ drivers/infiniband/hw/mlx5/mlx5_ib.o Then, structure layout can be compared between i386 and x86_64: +++ obj-i386/drivers/infiniband/hw/mlx5/mlx5_ib.o.pahole.txt 2014-03-28 11:43:07.386413682 +0100 --- obj-x86_64/drivers/infiniband/hw/mlx5/mlx5_ib.o.pahole.txt 2014-03-27 13:06:17.788472721 +0100 @@ -34,9 +34,8 @@ struct mlx5_ib_create_cq { __u64 db_addr; /* 8 8 */ __u32 cqe_size; /* 16 4 */ - /* size: 20, cachelines: 1, members: 3 */ - /* last cacheline: 20 bytes */ + /* size: 24, cachelines: 1, members: 3 */ + /* padding: 4 */ + /* last cacheline: 24 bytes */ }; This ABI disagreement will make an x86_64 kernel try to read past the buffer provided by an i386 binary. When boundary check will be implemented, a x86_64 kernel will refuse to read past the i386 userspace provided buffer and the uverb will fail. Anyway, if the structure lies in memory on a page boundary and next page is not mapped, ib_copy_from_udata() will fail when trying to read the 4 bytes of padding and the uverb will fail. This patch makes create_cq_user() takes care of the input data size to handle the case where no padding is provided. This way, x86_64 kernel will be able to handle struct mlx5_ib_create_cq as sent by unpatched and patched i386 libmlx5. Link: http://marc.info/?i=cover.1399309513.git.ydroneaud@opteya.com Cc: Fixes: e126ba97dba9e ("mlx5: Add driver for Mellanox Connect-IB adapter") Signed-off-by: Yann Droneaud Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 62bb6b4..8ae4f89 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -32,6 +32,7 @@ #include #include +#include #include "mlx5_ib.h" #include "user.h" @@ -602,14 +603,24 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, int *cqe_size, int *index, int *inlen) { struct mlx5_ib_create_cq ucmd; + size_t ucmdlen; int page_shift; int npages; int ncont; int err; - if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) return -EFAULT; + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) return -EINVAL; diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index 0f4f8e4..d44ecd2 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -91,6 +91,7 @@ struct mlx5_ib_create_cq { __u64 buf_addr; __u64 db_addr; __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ }; struct mlx5_ib_create_cq_resp { -- cgit v0.10.2 From 43bc889380c2ad9aa230eccc03a15cc52cf710d4 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Mon, 5 May 2014 19:33:22 +0200 Subject: IB/mlx5: add missing padding at end of struct mlx5_ib_create_srq The i386 ABI disagrees with most other ABIs regarding alignment of data type larger than 4 bytes: on most ABIs a padding must be added at end of the structures, while it is not required on i386. So for most ABIs struct mlx5_ib_create_srq gets implicitly padded to be aligned on a 8 bytes multiple, while for i386, such padding is not added. Tool pahole could be used to find such implicit padding: $ pahole --anon_include \ --nested_anon_include \ --recursive \ --class_name mlx5_ib_create_srq \ drivers/infiniband/hw/mlx5/mlx5_ib.o Then, structure layout can be compared between i386 and x86_64: +++ obj-i386/drivers/infiniband/hw/mlx5/mlx5_ib.o.pahole.txt 2014-03-28 11:43:07.386413682 +0100 --- obj-x86_64/drivers/infiniband/hw/mlx5/mlx5_ib.o.pahole.txt 2014-03-27 13:06:17.788472721 +0100 @@ -69,7 +68,6 @@ struct mlx5_ib_create_srq { __u64 db_addr; /* 8 8 */ __u32 flags; /* 16 4 */ - /* size: 20, cachelines: 1, members: 3 */ - /* last cacheline: 20 bytes */ + /* size: 24, cachelines: 1, members: 3 */ + /* padding: 4 */ + /* last cacheline: 24 bytes */ }; ABI disagreement will make an x86_64 kernel try to read past the buffer provided by an i386 binary. When boundary check will be implemented, the x86_64 kernel will refuse to read past the i386 userspace provided buffer and the uverb will fail. Anyway, if the structure lay in memory on a page boundary and next page is not mapped, ib_copy_from_udata() will fail and the uverb will fail. This patch makes create_srq_user() takes care of the input data size to handle the case where no padding was provided. This way, x86_64 kernel will be able to handle struct mlx5_ib_create_srq as sent by unpatched and patched i386 libmlx5. Link: http://marc.info/?i=cover.1399309513.git.ydroneaud@opteya.com Cc: Fixes: e126ba97dba9e ("mlx5: Add driver for Mellanox Connect-IB adapter") Signed-off-by: Yann Droneaud Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 210b3ea..384af6d 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "mlx5_ib.h" #include "user.h" @@ -78,16 +79,27 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_srq ucmd; + size_t ucmdlen; int err; int npages; int page_shift; int ncont; u32 offset; - if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { mlx5_ib_dbg(dev, "failed copy udata\n"); return -EFAULT; } + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index d44ecd2..d0ba264 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -110,6 +110,7 @@ struct mlx5_ib_create_srq { __u64 buf_addr; __u64 db_addr; __u32 flags; + __u32 reserved; /* explicit padding (optional on i386) */ }; struct mlx5_ib_create_srq_resp { -- cgit v0.10.2 From e4514cbd972786af67dd6c442c072685387e22a2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 27 May 2014 00:04:44 +0300 Subject: RDMA/cxgb3: Fix information leak in send_abort() The cpl_abort_req struct has several reserved members which need to be cleared to avoid disclosing kernel information. I have added a memset() so now it matches the cxgb4 version of this function. Signed-off-by: Dan Carpenter Acked-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 095bb04..cb78b1e 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -418,6 +418,7 @@ static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) skb->priority = CPL_PRIORITY_DATA; set_arp_failure_handler(skb, abort_arp_failure); req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); -- cgit v0.10.2 From 911eccd284d13d78c92ec4f1f1092c03457d732a Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 2 May 2014 11:28:04 -0400 Subject: IB/qib: Fix port in pkey change event The code used a literal 1 in dispatching an IB_EVENT_PKEY_CHANGE. As of the dual port qib QDR card, this is not necessarily correct. Change to use the port as specified in the call. Cc: Reported-by: Alex Estrin Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index edad991..22c720e 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -1028,7 +1028,7 @@ static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) event.event = IB_EVENT_PKEY_CHANGE; event.device = &dd->verbs_dev.ibdev; - event.element.port_num = 1; + event.element.port_num = port; ib_dispatch_event(&event); } return 0; -- cgit v0.10.2 From 7e6d3e5c70f13874fb06e6b67696ed90ce79bd48 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Fri, 2 May 2014 11:40:17 -0400 Subject: IB/ipath: Translate legacy diagpkt into newer extended diagpkt This patch addresses an issue where the legacy diagpacket is sent in from the user, but the driver operates on only the extended diagpkt. This patch specifically initializes the extended diagpkt based on the legacy packet. Cc: Reported-by: Rickard Strandqvist Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index e2f9a51..45802e9 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -346,6 +346,10 @@ static ssize_t ipath_diagpkt_write(struct file *fp, ret = -EFAULT; goto bail; } + dp.len = odp.len; + dp.unit = odp.unit; + dp.data = odp.data; + dp.pbc_wd = 0; } else { ret = -EINVAL; goto bail; -- cgit v0.10.2 From ed477c4c83b31ebc2b143382b0b5d70325fc9643 Mon Sep 17 00:00:00 2001 From: Upinder Malhi Date: Sat, 19 Apr 2014 01:12:18 +0000 Subject: IB/usnic: Fix source file missing copyright and license Prepends copyright and license to usnic_uiom_interval_tree.c Signed-off-by: Upinder Malhi Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c index d135ad9..3a4288e 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -1,3 +1,21 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + #include #include #include -- cgit v0.10.2 From 6c9b5d9b00ed2b1cbd5e5d2c176bf88da7beb224 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 28 May 2014 09:23:03 -0700 Subject: IB/mlx5: Fix warning about cast of wr_id back to pointer on 32 bits We need to cast wr_id to unsigned long before casting to a pointer. This fixes: drivers/infiniband/hw/mlx5/mr.c: In function 'mlx5_umr_cq_handler': >> drivers/infiniband/hw/mlx5/mr.c:724:13: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] context = (struct mlx5_ib_umr_context *)wc.wr_id; Reported-by: kbuild test robot Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 14ee4fd..afa873b 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -721,7 +721,7 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) if (err == 0) break; - context = (struct mlx5_ib_umr_context *)wc.wr_id; + context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; context->status = wc.status; complete(&context->done); } -- cgit v0.10.2 From 49410185c356b2767409de9220d5cf5c8db062e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Sch=C3=B6lling?= Date: Sun, 25 May 2014 15:26:41 +0200 Subject: IB/ipath: Use time_before()/_after() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Time comparisons must use time_after / time_before to avoid problems when jiffies wraps. Signed-off-by: Manuel Schölling Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 26dfbc8..01ba792 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -70,7 +70,7 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd) if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { int i; if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) && - dd->ipath_lastcancel > jiffies) { + time_after(dd->ipath_lastcancel, jiffies)) { __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG, "SendbufErrs %lx %lx", sbuf[0], sbuf[1]); @@ -755,7 +755,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) /* likely due to cancel; so suppress message unless verbose */ if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && - dd->ipath_lastcancel > jiffies) { + time_after(dd->ipath_lastcancel, jiffies)) { /* armlaunch takes precedence; it often causes both. */ ipath_cdbg(VERBOSE, "Suppressed %s error (%llx) after sendbuf cancel\n", diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 98ac18e..17a5177 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -247,7 +247,7 @@ static void sdma_abort_task(unsigned long opaque) /* ipath_sdma_abort() is done, waiting for interrupt */ if (status == IPATH_SDMA_ABORT_DISARMED) { - if (jiffies < dd->ipath_sdma_abort_intr_timeout) + if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout)) goto resched_noprint; /* give up, intr got lost somewhere */ ipath_dbg("give up waiting for SDMADISABLED intr\n"); @@ -341,7 +341,7 @@ resched: * JAG - this is bad to just have default be a loop without * state change */ - if (jiffies > dd->ipath_sdma_abort_jiffies) { + if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) { ipath_dbg("looping with status 0x%08lx\n", dd->ipath_sdma_status); dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ; -- cgit v0.10.2 From bfdfcfee3c9281e9cd28c0b08235aba1762504a6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 17 May 2014 23:33:44 +0100 Subject: IB/mlx4: fix unitialised variable is_mcast Commit 297e0dad7206 ("IB/mlx4: Handle Ethernet L2 parameters for IP based GID addressing") introduced a bug where is_mcast is now no longer initialized on the non-multicast condition and so it can be any random value from the stack. This issue was detected by cppcheck: [drivers/infiniband/hw/mlx4/ah.c:103]: (error) Uninitialized variable: is_mcast Simple fix is to initialise is_mcast to zero. Signed-off-by: Colin Ian King Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 170dca6..2d8c339 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -73,7 +73,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); struct mlx4_dev *dev = ibdev->dev; - int is_mcast; + int is_mcast = 0; struct in6_addr in6; u16 vlan_tag; -- cgit v0.10.2 From 3c735d481b9a507dccee63eeaf6aa6ffa2bccc67 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Feb 2014 15:39:34 +0300 Subject: RDMA/cxgb3: Remove a couple unneeded conditions We know that "reset_tpt_entry" is false on this side of the if else statement so there is no need to check again. Signed-off-by: Dan Carpenter Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index c3f5aca..de1c61b4 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -735,14 +735,12 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, ((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) | V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | V_TPT_PAGE_SIZE(page_size)); - tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); + tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); tpt.len = cpu_to_be32(len); tpt.va_hi = cpu_to_be32((u32) (to >> 32)); tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); tpt.rsvd_bind_cnt_or_pstag = 0; - tpt.rsvd_pbl_size = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); + tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); } err = cxio_hal_ctrl_qp_write_mem(rdev_p, stag_idx + -- cgit v0.10.2 From 0a66d2bd300cbdaa3146c81cb823d01593963066 Mon Sep 17 00:00:00 2001 From: Vinit Agnihotri Date: Thu, 29 May 2014 15:46:09 -0400 Subject: IB/qib: Additional Intel branding changes This patches changes user visible function names containing "qlogic" in module init and cleanup. Reviewed-by: Mike Marciniszyn Signed-off-by: Vinit Agnihotri Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 5b7aeb2..8d3c78d 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1272,7 +1272,7 @@ static int qib_notify_dca(struct notifier_block *nb, unsigned long event, * Do all the generic driver unit- and chip-independent memory * allocation and initialization. */ -static int __init qlogic_ib_init(void) +static int __init qib_ib_init(void) { int ret; @@ -1316,12 +1316,12 @@ bail: return ret; } -module_init(qlogic_ib_init); +module_init(qib_ib_init); /* * Do the non-unit driver cleanup, memory free, etc. at unload. */ -static void __exit qlogic_ib_cleanup(void) +static void __exit qib_ib_cleanup(void) { int ret; @@ -1346,7 +1346,7 @@ static void __exit qlogic_ib_cleanup(void) qib_dev_cleanup(); } -module_exit(qlogic_ib_cleanup); +module_exit(qib_ib_cleanup); /* this can only be called after a successful initialization */ static void cleanup_device_data(struct qib_devdata *dd) -- cgit v0.10.2 From b38f2879b7d2d51747de2ea9062c698a6ac64cb1 Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Thu, 29 May 2014 16:30:59 +0300 Subject: mlx4_core: Fix memory leaks in SR-IOV error paths Fix a few memory leaks that happen if errors happen in SR-IOV mode. Signed-off-by: Dotan Barak Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 7cf9dad..12a7ee2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1696,6 +1696,13 @@ unmap_bf: unmap_internal_clock(dev); unmap_bf_area(dev); + if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + } + err_close: if (mlx4_is_slave(dev)) mlx4_slave_exit(dev); @@ -2565,6 +2572,13 @@ err_master_mfunc: if (mlx4_is_master(dev)) mlx4_multi_func_cleanup(dev); + if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + } + err_close: if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); -- cgit v0.10.2 From bc82878baa10c2a6a4a6affaf52c152935112142 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 May 2014 16:31:00 +0300 Subject: mlx4_core: Fix incorrect FLAGS1 bitmap test in mlx4_QUERY_FUNC_CAP Commit eb17711bc1d6 ("net/mlx4_core: Introduce nic_info new flag in QUERY_FUNC_CAP") did: if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_OFFSET) { which should be: if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN) { Fix that. Fixes: eb17711bc1d6 ("net/mlx4_core: Introduce nic_info new flag in QUERY_FUNC_CAP") Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index d16a4d1..ef242e1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -414,7 +414,7 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, MLX4_GET(func_cap->flags1, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET); if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) { - if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_OFFSET) { + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN) { mlx4_err(dev, "VLAN is enforced on this port\n"); err = -EPROTONOSUPPORT; goto out; -- cgit v0.10.2 From 61565013cf7024e8aa52e0a8e78208a955ce7e5f Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 May 2014 16:31:01 +0300 Subject: IB/mlx4: SET_PORT called by mlx4_ib_modify_port should be wrapped mlx4_ib_modify_port is invoked in IB for resetting the Q_Key violations counters and for modifying the IB port capability flags. For example, when opensm is started up on the hypervisor, mlx4_ib_modify_port is called to set the port's IsSM flag. In multifunction mode, the SET_PORT command used in this flow should be wrapped (so that the PF port capability flags are also tracked, thus enabling the aggregate of all the VF/PF capability flags to be tracked properly). The procedure mlx4_SET_PORT() in main.c is also renamed to mlx4_ib_SET_PORT() to differentiate it from procedure mlx4_SET_PORT() in port.c. mlx4_ib_SET_PORT() is used exclusively by mlx4_ib_modify_port(). Finally, the CM invokes ib_modify_port() to set the IsCMSupported flag even when running over RoCE. Therefore, when RoCE is active, mlx4_ib_modify_port should return OK unconditionally (since the capability flags and qkey violations counter are not relevant). Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1b6dbe15..3c3806a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -544,12 +544,11 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, return 0; } -static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, - u32 cap_mask) +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; - u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) @@ -563,8 +562,8 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } - err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; @@ -573,11 +572,20 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; struct ib_port_attr attr; u32 cap_mask; int err; - mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not meaningful. + */ + if (is_eth) + return 0; + + mutex_lock(&mdev->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) @@ -586,9 +594,9 @@ static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; - err = mlx4_SET_PORT(to_mdev(ibdev), port, - !!(mask & IB_PORT_RESET_QKEY_CNTR), - cap_mask); + err = mlx4_ib_SET_PORT(mdev, port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); -- cgit v0.10.2 From 97982f5a91e91dab26dd0246083b9adf3ba8b2e3 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 May 2014 16:31:02 +0300 Subject: IB/mlx4: Preparation for VFs to issue/receive SMI (QP0) requests/responses Currently, VFs in SRIOV VFs are denied QP0 access. The main reason for this decision is security, since Subnet Management Datagrams (SMPs) are not restricted by network partitioning and may affect the physical network topology. Moreover, even the SM may be denied access from portions of the network by setting management keys unknown to the SM. However, it is desirable to grant SMI access to certain privileged VFs, so that certain network management activities may be conducted within virtual machines instead of the hypervisor. This commit does the following: 1. Create QP0 tunnel QPs for all VFs. 2. Discard SMI mads sent-from/received-for non-privileged VFs in the hypervisor MAD multiplex/demultiplex logic. SMI mads from/for privileged VFs are allowed to pass. 3. MAD_IFC wrapper changes/fixes. For non-privileged VFs, only host-view MAD_IFC commands are allowed, and only for SMI LID-Routed GET mads. For privileged VFs, there are no restrictions. This commit does not allow privileged VFs as yet. To determine if a VF is privileged, it calls function mlx4_vf_smi_enabled(). This function returns 0 unconditionally for now. The next two commits allow defining and activating privileged VFs. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index fd36ec6..287ad05 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -478,10 +478,6 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; - /* QP0 forwarding only for Dom0 */ - if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave)) - return -EINVAL; - if (!dest_qpt) tun_qp = &tun_ctx->qp[0]; else @@ -667,6 +663,21 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, } /* Class-specific handling */ switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_demux_sa_handler(ibdev, port, slave, (struct ib_sa_mad *) mad)) @@ -1165,10 +1176,6 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; - /* QP0 forwarding only for Dom0 */ - if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave)) - return -EINVAL; - if (dest_qpt == IB_QPT_SMI) { src_qpnum = 0; sqp = &sqp_ctx->qp[0]; @@ -1285,11 +1292,6 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc "belongs to another slave\n", wc->src_qp); return; } - if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) { - mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " - "non-master trying to send QP0 packets\n", wc->src_qp); - return; - } /* Map transaction ID */ ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, @@ -1317,6 +1319,12 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc /* Class-specific handling */ switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, (struct ib_sa_mad *) &tunnel->mad)) @@ -1749,9 +1757,9 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port, return -EEXIST; ctx->state = DEMUX_PV_STATE_STARTING; - /* have QP0 only on port owner, and only if link layer is IB */ - if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) && - rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND) + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, ctx->port) == + IB_LINK_LAYER_INFINIBAND) ctx->has_smi = 1; if (ctx->has_smi) { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 41308af..2e8c588 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2370,7 +2370,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, struct mlx4_wqe_datagram_seg *dseg, - struct ib_send_wr *wr, enum ib_qp_type qpt) + struct ib_send_wr *wr, + enum mlx4_ib_qp_type qpt) { union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; struct mlx4_av sqp_av = {0}; @@ -2383,8 +2384,10 @@ static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, cpu_to_be32(0xf0000000); memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); - /* This function used only for sending on QP1 proxies */ - dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]); /* Use QKEY from the QP context, which is set by master */ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); } @@ -2700,16 +2703,13 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += seglen / 16; break; case MLX4_IB_QPT_PROXY_SMI: - /* don't allow QP0 sends on guests */ - err = -ENOSYS; - *bad_wr = wr; - goto out; case MLX4_IB_QPT_PROXY_GSI: /* If we are tunneling special qps, this is a UD qp. * In this case we first add a UD segment targeting * the tunnel qp, and then add a header with address * information */ - set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type); + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, + qp->mlx4_ib_qp_type); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; build_tunnel_header(wr, wqe, &seglen); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 78099ea..b0e48d4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -705,20 +705,28 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, struct ib_smp *smp = inbox->buf; u32 index; u8 port; + u8 opcode_modifier; u16 *table; int err; int vidx, pidx; + int network_view; struct mlx4_priv *priv = mlx4_priv(dev); struct ib_smp *outsmp = outbox->buf; __be16 *outtab = (__be16 *)(outsmp->data); __be32 slave_cap_mask; __be64 slave_node_guid; + port = vhcr->in_modifier; + /* network-view bit is for driver use only, and should not be passed to FW */ + opcode_modifier = vhcr->op_modifier & ~0x8; /* clear netw view bit */ + network_view = !!(vhcr->op_modifier & 0x8); + if (smp->base_version == 1 && smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && smp->class_version == 1) { - if (smp->method == IB_MGMT_METHOD_GET) { + /* host view is paravirtualized */ + if (!network_view && smp->method == IB_MGMT_METHOD_GET) { if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) { index = be32_to_cpu(smp->attr_mod); if (port < 1 || port > dev->caps.num_ports) @@ -743,7 +751,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, /*get the slave specific caps:*/ /*do the command */ err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); /* modify the response for slaves */ if (!err && slave != mlx4_master_func_num(dev)) { @@ -760,7 +768,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, smp->attr_mod = cpu_to_be32(slave / 8); /* execute cmd */ err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (!err) { /* if needed, move slave gid to index 0 */ @@ -774,7 +782,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, } if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) { err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (!err) { slave_node_guid = mlx4_get_slave_node_guid(dev, slave); @@ -784,19 +792,24 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, } } } + + /* Non-privileged VFs are only allowed "host" view LID-routed 'Get' MADs. + * These are the MADs used by ib verbs (such as ib_query_gids). + */ if (slave != mlx4_master_func_num(dev) && - ((smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) || - (smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && - smp->method == IB_MGMT_METHOD_SET))) { - mlx4_err(dev, "slave %d is trying to execute a Subnet MGMT MAD, " - "class 0x%x, method 0x%x for attr 0x%x. Rejecting\n", - slave, smp->method, smp->mgmt_class, - be16_to_cpu(smp->attr_id)); - return -EPERM; + !mlx4_vf_smi_enabled(dev, slave, port)) { + if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->method == IB_MGMT_METHOD_GET) || network_view) { + mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", + slave, smp->method, smp->mgmt_class, + network_view ? "Network" : "Host", + be16_to_cpu(smp->attr_id)); + return -EPERM; + } } - /*default:*/ + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); } @@ -2537,3 +2550,9 @@ int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_stat return 0; } EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state); + +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) +{ + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ba87bd2..83612fa 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1234,4 +1234,5 @@ int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port); int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); #endif /* MLX4_DEVICE_H */ -- cgit v0.10.2 From 99ec41d0a48cb6d14af25765f9449762f9d101f6 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 May 2014 16:31:03 +0300 Subject: mlx4: Add infrastructure for selecting VFs to enable QP0 via MLX proxy QPs This commit adds the infrastructure for enabling selected VFs to operate SMI (QP0) MADs without restriction. Additionally, for these enabled VFs, their QP0 proxy and tunnel QPs are MLX QPs. As such, they operate over VL15. Therefore, they are not affected by "credit" problems or changes in the VLArb table (which may shut down VL0). Non-enabled VFs may only create UD proxy QP0 qps (which are forced by the hypervisor to send packets using the q-key it assigns and places in the qp-context). Thus, non-enabled VFs will not pose a security risk. The hypervisor discards any privileged MADs it receives from these non-enabled VFs. By default, all VFs are NOT enabled, and must explicitly be enabled by the administrator. The sysfs interface which operates the VF enablement infrastructure is provided in the next commit. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 2e8c588..b256009 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -608,6 +608,16 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) return !attr->srq; } +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i]) + return !!dev->caps.qp0_qkey[i]; + } + return 0; +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) @@ -625,10 +635,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { if (init_attr->qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_PROXY_GSI; - else if (mlx4_is_master(dev->dev)) - qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; - else - qp_type = MLX4_IB_QPT_PROXY_SMI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } } qpn = sqpn; /* add extra sg entry for tunneling */ @@ -643,7 +656,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, return -EINVAL; if (tnl_init->proxy_qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_TUN_GSI; - else if (tnl_init->slave == mlx4_master_func_num(dev->dev)) + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; else qp_type = MLX4_IB_QPT_TUN_SMI; @@ -1930,6 +1945,19 @@ out: return err; } +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i] || + qpn == dev->caps.qp0_tunnel[i]) { + *qkey = dev->caps.qp0_qkey[i]; + return 0; + } + } + return -EINVAL; +} + static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) @@ -1987,8 +2015,13 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); - if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) - return -EINVAL; + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } sqp->ud_header.deth.qkey = cpu_to_be32(qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); @@ -2682,11 +2715,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case MLX4_IB_QPT_PROXY_SMI_OWNER: - if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) { - err = -ENOSYS; - *bad_wr = wr; - goto out; - } err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index b0e48d4..26c3eba 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1666,6 +1666,8 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) for (port = min_port; port <= max_port; port++) { if (!test_bit(port - 1, actv_ports.ports)) continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + priv->mfunc.master.vf_admin[slave].enable_smi[port]; vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; vp_oper->state = *vp_admin; @@ -1717,6 +1719,8 @@ static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave for (port = min_port; port <= max_port; port++) { if (!test_bit(port - 1, actv_ports.ports)) continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + MLX4_VF_SMI_DISABLED; vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; if (NO_INDX != vp_oper->vlan_idx) { __mlx4_unregister_vlan(&priv->dev, @@ -2553,6 +2557,13 @@ EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) { - return 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_oper[slave].smi_enabled[port] == + MLX4_VF_SMI_ENABLED; } EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index ef242e1..01e6dd6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -178,8 +178,8 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { struct mlx4_priv *priv = mlx4_priv(dev); - u8 field; - u32 size; + u8 field, port; + u32 size, proxy_qp, qkey; int err = 0; #define QUERY_FUNC_CAP_FLAGS_OFFSET 0x0 @@ -209,6 +209,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, /* when opcode modifier = 1 */ #define QUERY_FUNC_CAP_PHYS_PORT_OFFSET 0x3 +#define QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET 0x4 #define QUERY_FUNC_CAP_FLAGS0_OFFSET 0x8 #define QUERY_FUNC_CAP_FLAGS1_OFFSET 0xc @@ -221,6 +222,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_FLAGS1_FORCE_MAC 0x40 #define QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN 0x80 #define QUERY_FUNC_CAP_FLAGS1_NIC_INFO 0x10 +#define QUERY_FUNC_CAP_VF_ENABLE_QP0 0x08 #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 @@ -234,28 +236,35 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, return -EINVAL; vhcr->in_modifier = converted_port; - /* Set nic_info bit to mark new fields support */ - field = QUERY_FUNC_CAP_FLAGS1_NIC_INFO; - MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET); - /* phys-port = logical-port */ field = vhcr->in_modifier - find_first_bit(actv_ports.ports, dev->caps.num_ports); MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); - field = vhcr->in_modifier; + port = vhcr->in_modifier; + proxy_qp = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1; + + /* Set nic_info bit to mark new fields support */ + field = QUERY_FUNC_CAP_FLAGS1_NIC_INFO; + + if (mlx4_vf_smi_enabled(dev, slave, port) && + !mlx4_get_parav_qkey(dev, proxy_qp, &qkey)) { + field |= QUERY_FUNC_CAP_VF_ENABLE_QP0; + MLX4_PUT(outbox->buf, qkey, + QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + } + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET); + /* size is now the QP number */ - size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + field - 1; + size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL); size += 2; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL); - size = dev->phys_caps.base_proxy_sqpn + 8 * slave + field - 1; - MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_PROXY); - - size += 2; - MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_PROXY); + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP0_PROXY); + proxy_qp += 2; + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP1_PROXY); MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], QUERY_FUNC_CAP_PHYS_PORT_ID); @@ -326,7 +335,7 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, struct mlx4_cmd_mailbox *mailbox; u32 *outbox; u8 field, op_modifier; - u32 size; + u32 size, qkey; int err = 0, quotas = 0; op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ @@ -442,6 +451,13 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, goto out; } + if (func_cap->flags1 & QUERY_FUNC_CAP_VF_ENABLE_QP0) { + MLX4_GET(qkey, outbox, QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + func_cap->qp0_qkey = qkey; + } else { + func_cap->qp0_qkey = 0; + } + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL); func_cap->qp0_tunnel_qpn = size & 0xFFFFFF; diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index 6811ee0..1fce03e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -134,6 +134,7 @@ struct mlx4_func_cap { int max_eq; int reserved_eq; int mcg_quota; + u32 qp0_qkey; u32 qp0_tunnel_qpn; u32 qp0_proxy_qpn; u32 qp1_tunnel_qpn; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 12a7ee2..9083268 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -666,13 +666,15 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) return -ENODEV; } + dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || - !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { + !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy || + !dev->caps.qp0_qkey) { err = -ENOMEM; goto err_mem; } @@ -684,6 +686,7 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) " port %d, aborting (%d).\n", i, err); goto err_mem; } + dev->caps.qp0_qkey[i - 1] = func_cap.qp0_qkey; dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn; dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn; dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn; @@ -729,12 +732,16 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) return 0; err_mem: + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); - dev->caps.qp0_tunnel = dev->caps.qp0_proxy = - dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; + dev->caps.qp0_qkey = NULL; + dev->caps.qp0_tunnel = NULL; + dev->caps.qp0_proxy = NULL; + dev->caps.qp1_tunnel = NULL; + dev->caps.qp1_proxy = NULL; return err; } @@ -1697,6 +1704,7 @@ unmap_bf: unmap_bf_area(dev); if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); @@ -2573,6 +2581,7 @@ err_master_mfunc: mlx4_multi_func_cleanup(dev); if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); @@ -2702,6 +2711,7 @@ static void __mlx4_remove_one(struct pci_dev *pdev) if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index f9c4651..0efd173 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -133,6 +133,11 @@ enum { MLX4_COMM_CMD_FLR = 254 }; +enum { + MLX4_VF_SMI_DISABLED, + MLX4_VF_SMI_ENABLED +}; + /*The flag indicates that the slave should delay the RESET cmd*/ #define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb /*indicates how many retries will be done if we are in the middle of FLR*/ @@ -488,6 +493,7 @@ struct mlx4_vport_state { struct mlx4_vf_admin_state { struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1]; + u8 enable_smi[MLX4_MAX_PORTS + 1]; }; struct mlx4_vport_oper_state { @@ -495,8 +501,10 @@ struct mlx4_vport_oper_state { int mac_idx; int vlan_idx; }; + struct mlx4_vf_oper_state { struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1]; + u8 smi_enabled[MLX4_MAX_PORTS + 1]; }; struct slave_list { diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 1c3fdd4..ad98162 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2827,10 +2827,12 @@ static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start, } static int verify_qp_parameters(struct mlx4_dev *dev, + struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, enum qp_transition transition, u8 slave) { u32 qp_type; + u32 qpn; struct mlx4_qp_context *qp_ctx; enum mlx4_qp_optpar optpar; int port; @@ -2870,6 +2872,20 @@ static int verify_qp_parameters(struct mlx4_dev *dev, return -EINVAL; } break; + case MLX4_QP_ST_MLX: + qpn = vhcr->in_modifier & 0x7fffff; + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (transition == QP_TRANS_INIT2RTR && + slave != mlx4_master_func_num(dev) && + mlx4_is_qp_reserved(dev, qpn) && + !mlx4_vf_smi_enabled(dev, slave, port)) { + /* only enabled VFs may create MLX proxy QPs */ + mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n", + __func__, slave, port); + return -EPERM; + } + break; + default: break; } @@ -3454,7 +3470,7 @@ int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, qpc, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_INIT2RTR, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_INIT2RTR, slave); if (err) return err; @@ -3508,7 +3524,7 @@ int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, context, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_RTR2RTS, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTR2RTS, slave); if (err) return err; @@ -3530,7 +3546,7 @@ int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, context, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_RTS2RTS, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTS2RTS, slave); if (err) return err; @@ -3567,7 +3583,7 @@ int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, context, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2SQD, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2SQD, slave); if (err) return err; @@ -3589,7 +3605,7 @@ int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, context, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2RTS, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2RTS, slave); if (err) return err; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 83612fa..e2fc701 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -401,6 +401,7 @@ struct mlx4_caps { int max_rq_desc_sz; int max_qp_init_rdma; int max_qp_dest_rdma; + u32 *qp0_qkey; u32 *qp0_proxy; u32 *qp1_proxy; u32 *qp0_tunnel; -- cgit v0.10.2 From 65fed8a8c155271cf647651bd62eecb5928ae3a4 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 May 2014 16:31:04 +0300 Subject: IB/mlx4: Add interface for selecting VFs to enable QP0 via MLX proxy QPs This commit adds the sysfs interface for enabling QP0 on VFs for selected VF/port. By default, no VFs are enabled for QP0 operation. To enable QP0 operation on a VF/port, under /sys/class/infiniband/mlx4_x/iov//ports/x there are two new entries: - smi_enabled (read-only). Indicates whether smi is currently enabled for the indicated VF/port - enable_smi_admin (rw). Used by the admin to request that smi capability be enabled or disabled for the indicated VF/port. 0 = disable, 1 = enable. The requested enablement will occur at the next reset of the VF (e.g. driver restart on the VM which owns the VF). Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index 5a38e43..cb4c66e 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -389,8 +389,10 @@ struct mlx4_port { struct mlx4_ib_dev *dev; struct attribute_group pkey_group; struct attribute_group gid_group; - u8 port_num; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; int slave; + u8 port_num; }; @@ -558,6 +560,101 @@ err: return NULL; } +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + p->enable_smi_admin.attr.mode = 0644; + ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) { struct mlx4_port *p; @@ -602,6 +699,10 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) if (ret) goto err_free_gid; + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); return 0; @@ -669,6 +770,7 @@ err_add: mport = container_of(p, struct mlx4_port, kobj); sysfs_remove_group(p, &mport->pkey_group); sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); kobject_put(p); } kobject_put(dev->dev_ports_parent[slave]); @@ -713,6 +815,7 @@ static void unregister_pkey_tree(struct mlx4_ib_dev *device) port = container_of(p, struct mlx4_port, kobj); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); kobject_put(p); kobject_put(device->dev_ports_parent[slave]); } diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 26c3eba..3370ecb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -2567,3 +2567,37 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) MLX4_VF_SMI_ENABLED; } EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); + +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave == mlx4_master_func_num(dev)) + return 1; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_admin[slave].enable_smi[port] == + MLX4_VF_SMI_ENABLED; +} +EXPORT_SYMBOL_GPL(mlx4_vf_get_enable_smi_admin); + +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enabled) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave == mlx4_master_func_num(dev)) + return 0; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS || + enabled < 0 || enabled > 1) + return -EINVAL; + + priv->mfunc.master.vf_admin[slave].enable_smi[port] = enabled; + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_vf_set_enable_smi_admin); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index e2fc701..dcabe11 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1236,4 +1236,7 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enable); #endif /* MLX4_DEVICE_H */ -- cgit v0.10.2 From 8ec0a0e6b58218bdc1db91dd70ebfcd6ad8dd6cd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 May 2014 10:33:41 +0200 Subject: IB/umad: Fix error handling Avoid leaking a kref count in ib_umad_open() if port->ib_dev == NULL or if nonseekable_open() fails. Avoid leaking a kref count, that sm_sem is kept down and also that the IB_PORT_SM capability mask is not cleared in ib_umad_sm_open() if nonseekable_open() fails. Since container_of() never returns NULL, remove the code that tests whether container_of() returns NULL. Moving the kref_get() call from the start of ib_umad_*open() to the end is safe since it is the responsibility of the caller of these functions to ensure that the cdev pointer remains valid until at least when these functions return. Signed-off-by: Bart Van Assche Cc: [ydroneaud@opteya.com: rework a bit to reduce the amount of code changed] Signed-off-by: Yann Droneaud [ nonseekable_open() can't actually fail, but.... - Roland ] Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index f0d588f..9bdf576c 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -780,27 +780,19 @@ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret; + int ret = -ENXIO; port = container_of(inode->i_cdev, struct ib_umad_port, cdev); - if (port) - kref_get(&port->umad_dev->ref); - else - return -ENXIO; mutex_lock(&port->file_mutex); - if (!port->ib_dev) { - ret = -ENXIO; + if (!port->ib_dev) goto out; - } + ret = -ENOMEM; file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) { - kref_put(&port->umad_dev->ref, ib_umad_release_dev); - ret = -ENOMEM; + if (!file) goto out; - } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); @@ -814,6 +806,13 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); ret = nonseekable_open(inode, filp); + if (ret) { + list_del(&file->port_list); + kfree(file); + goto out; + } + + kref_get(&port->umad_dev->ref); out: mutex_unlock(&port->file_mutex); @@ -880,10 +879,6 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) int ret; port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev); - if (port) - kref_get(&port->umad_dev->ref); - else - return -ENXIO; if (filp->f_flags & O_NONBLOCK) { if (down_trylock(&port->sm_sem)) { @@ -898,17 +893,27 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) } ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); - if (ret) { - up(&port->sm_sem); - goto fail; - } + if (ret) + goto err_up_sem; filp->private_data = port; - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret) + goto err_clr_sm_cap; + + kref_get(&port->umad_dev->ref); + + return 0; + +err_clr_sm_cap: + swap(props.set_port_cap_mask, props.clr_port_cap_mask); + ib_modify_port(port->ib_dev, port->port_num, 0, &props); + +err_up_sem: + up(&port->sm_sem); fail: - kref_put(&port->umad_dev->ref, ib_umad_release_dev); return ret; } -- cgit v0.10.2 From d236cd0e209c595434ba15defe56f2e5aa488d1a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 1 Feb 2013 14:33:58 -0800 Subject: IB/srp: Avoid problems if a header uses pr_fmt SRP defines pr_fmt(fmt) to be "PFX fmt", and then includes a bunch of header files before it gets around to defining PFX. This causes problems if any of the header files do a pr_... and use pr_fmt(). Fix this by using KBUILD_MODNAME instead of the private PFX. Acked-by: Chris Metcalf Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index b42f132..e3c2c5b 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -30,7 +30,7 @@ * SOFTWARE. */ -#define pr_fmt(fmt) PFX fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include -- cgit v0.10.2 From b6f04d3d21458818073a2f5af5339f958864bf71 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Mon, 5 May 2014 19:33:23 +0200 Subject: RDMA/cxgb4: Add missing padding at end of struct c4iw_create_cq_resp The i386 ABI disagrees with most other ABIs regarding alignment of data types larger than 4 bytes: on most ABIs a padding must be added at end of the structures, while it is not required on i386. So for most ABI struct c4iw_create_cq_resp gets implicitly padded to be aligned on a 8 bytes multiple, while for i386, such padding is not added. The tool pahole can be used to find such implicit padding: $ pahole --anon_include \ --nested_anon_include \ --recursive \ --class_name c4iw_create_cq_resp \ drivers/infiniband/hw/cxgb4/iw_cxgb4.o Then, structure layout can be compared between i386 and x86_64: +++ obj-i386/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 11:43:05.547432195 +0100 --- obj-x86_64/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 10:55:10.990133017 +0100 @@ -14,9 +13,8 @@ struct c4iw_create_cq_resp { __u32 size; /* 28 4 */ __u32 qid_mask; /* 32 4 */ - /* size: 36, cachelines: 1, members: 6 */ - /* last cacheline: 36 bytes */ + /* size: 40, cachelines: 1, members: 6 */ + /* padding: 4 */ + /* last cacheline: 40 bytes */ }; This ABI disagreement will make an x86_64 kernel try to write past the buffer provided by an i386 binary. When boundary check will be implemented, the x86_64 kernel will refuse to write past the i386 userspace provided buffer and the uverbs will fail. If the structure is on a page boundary and the next page is not mapped, ib_copy_to_udata() will fail and the uverb will fail. This patch adds an explicit padding at end of structure c4iw_create_cq_resp, and, like 92b0ca7cb149 ("IB/mlx5: Fix stack info leak in mlx5_ib_alloc_ucontext()"), makes function c4iw_create_cq() not writting this padding field to userspace. This way, x86_64 kernel will be able to write struct c4iw_create_cq_resp as expected by unpatched and patched i386 libcxgb4. Link: http://marc.info/?i=cover.1399309513.git.ydroneaud@opteya.com Cc: Fixes: cfdda9d764362 ("RDMA/cxgb4: Add driver for Chelsio T4 RNIC") Fixes: e24a72a3302a6 ("RDMA/cxgb4: Fix four byte info leak in c4iw_create_cq()") Cc: Dan Carpenter Signed-off-by: Yann Droneaud Acked-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index cfaa56a..7151a02 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -940,7 +940,6 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, if (!mm2) goto err4; - memset(&uresp, 0, sizeof(uresp)); uresp.qid_mask = rhp->rdev.cqmask; uresp.cqid = chp->cq.cqid; uresp.size = chp->cq.size; @@ -951,7 +950,8 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, uresp.gts_key = ucontext->key; ucontext->key += PAGE_SIZE; spin_unlock(&ucontext->mmap_lock); - ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); if (ret) goto err5; diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h index 11ccd27..9b7534b 100644 --- a/drivers/infiniband/hw/cxgb4/user.h +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -48,6 +48,7 @@ struct c4iw_create_cq_resp { __u32 cqid; __u32 size; __u32 qid_mask; + __u32 reserved; /* explicit padding (optional for i386) */ }; -- cgit v0.10.2 From 165cb465f73c33c4fb9a79951a623083291c6f1e Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 30 May 2014 15:38:58 -0700 Subject: mlx4_core: Move handling of MLX4_QP_ST_MLX to proper switch statement The handling of MLX4_QP_ST_MLX in verify_qp_parameters() was accidentally put inside the inner switch statement (that handles which transition of RC/UC/XRC QPs is happening). Fix this by moving the case to the outer switch statement. The compiler pointed this out with: drivers/net/ethernet/mellanox/mlx4/resource_tracker.c: In function 'verify_qp_parameters': >> drivers/net/ethernet/mellanox/mlx4/resource_tracker.c:2875:3: warning: case value '7' not in enumerated type 'enum qp_transition' [-Wswitch] case MLX4_QP_ST_MLX: Reported-by: kbuild test robot Fixes: 99ec41d0a48c ("mlx4: Add infrastructure for selecting VFs to enable QP0 via MLX proxy QPs") Signed-off-by: Roland Dreier diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index ad98162..10db83e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2872,25 +2872,25 @@ static int verify_qp_parameters(struct mlx4_dev *dev, return -EINVAL; } break; - case MLX4_QP_ST_MLX: - qpn = vhcr->in_modifier & 0x7fffff; - port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; - if (transition == QP_TRANS_INIT2RTR && - slave != mlx4_master_func_num(dev) && - mlx4_is_qp_reserved(dev, qpn) && - !mlx4_vf_smi_enabled(dev, slave, port)) { - /* only enabled VFs may create MLX proxy QPs */ - mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n", - __func__, slave, port); - return -EPERM; - } - break; - default: break; } + break; + case MLX4_QP_ST_MLX: + qpn = vhcr->in_modifier & 0x7fffff; + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (transition == QP_TRANS_INIT2RTR && + slave != mlx4_master_func_num(dev) && + mlx4_is_qp_reserved(dev, qpn) && + !mlx4_vf_smi_enabled(dev, slave, port)) { + /* only enabled VFs may create MLX proxy QPs */ + mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n", + __func__, slave, port); + return -EPERM; + } break; + default: break; } -- cgit v0.10.2 From 729ee4efcc29c040a1729c058f03fae3cebc6035 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Thu, 27 Mar 2014 12:10:33 +0100 Subject: IB: Allow build of hw/ and ulp/ subdirectories independently It is not possible to build only the drivers/infiniband/hw/ (or ulp/) subdirectory with command such as: $ make ARCH=x86_64 O=./obj-x86_64/ drivers/infiniband/hw/ This fails with following error messages: make[2]: Nothing to be done for `all'. make[2]: Nothing to be done for `relocs'. CHK include/config/kernel.release Using /home/ydroneaud/src/linux as source for kernel GEN /home/ydroneaud/src/linux/obj-x86_64/Makefile CHK include/generated/uapi/linux/version.h CHK include/generated/utsrelease.h CALL /home/ydroneaud/src/linux/scripts/checksyscalls.sh /home/ydroneaud/src/linux/scripts/Makefile.build:44: /home/ydroneaud/src/linux/drivers/infiniband/hw/Makefile: No such file or directory make[2]: *** No rule to make target `/home/ydroneaud/src/linux/drivers/infiniband/hw/Makefile'. Stop. make[1]: *** [drivers/infiniband/hw/] Error 2 make: *** [sub-make] Error 2 This patch creates a Makefile in hw/ and ulp/ and moves each corresponding parts of drivers/infiniband/Makefile in the new Makefiles. It should not break build except if some hw/ drivers or ulp/ were allowed previously to be built while CONFIG_INFINIBAND is set to 'n', but according to drivers/infiniband/Kconfig, it's not possible. So it should be safe to apply. Signed-off-by: Yann Droneaud Reviewed-by: Bart Van Assche Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index bf508b5..dc21836 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -1,18 +1,3 @@ obj-$(CONFIG_INFINIBAND) += core/ -obj-$(CONFIG_INFINIBAND_MTHCA) += hw/mthca/ -obj-$(CONFIG_INFINIBAND_IPATH) += hw/ipath/ -obj-$(CONFIG_INFINIBAND_QIB) += hw/qib/ -obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/ -obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/ -obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/ -obj-$(CONFIG_INFINIBAND_CXGB4) += hw/cxgb4/ -obj-$(CONFIG_MLX4_INFINIBAND) += hw/mlx4/ -obj-$(CONFIG_MLX5_INFINIBAND) += hw/mlx5/ -obj-$(CONFIG_INFINIBAND_NES) += hw/nes/ -obj-$(CONFIG_INFINIBAND_OCRDMA) += hw/ocrdma/ -obj-$(CONFIG_INFINIBAND_USNIC) += hw/usnic/ -obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ -obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ -obj-$(CONFIG_INFINIBAND_SRPT) += ulp/srpt/ -obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ -obj-$(CONFIG_INFINIBAND_ISERT) += ulp/isert/ +obj-$(CONFIG_INFINIBAND) += hw/ +obj-$(CONFIG_INFINIBAND) += ulp/ diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile new file mode 100644 index 0000000..e900b03 --- /dev/null +++ b/drivers/infiniband/hw/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/ +obj-$(CONFIG_INFINIBAND_IPATH) += ipath/ +obj-$(CONFIG_INFINIBAND_QIB) += qib/ +obj-$(CONFIG_INFINIBAND_EHCA) += ehca/ +obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/ +obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/ +obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ +obj-$(CONFIG_INFINIBAND_NES) += nes/ +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ +obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile new file mode 100644 index 0000000..f3c7dcf --- /dev/null +++ b/drivers/infiniband/ulp/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_INFINIBAND_IPOIB) += ipoib/ +obj-$(CONFIG_INFINIBAND_SRP) += srp/ +obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ +obj-$(CONFIG_INFINIBAND_ISER) += iser/ +obj-$(CONFIG_INFINIBAND_ISERT) += isert/ -- cgit v0.10.2 From 60093dc0c8b6407bc7494cbcb3e84322cc6782c8 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 11 May 2014 15:15:10 +0300 Subject: IB: Return error for unsupported QP creation flags Fix the usnic and thw qib drivers to err when QP creation flags that they don't understand are provided. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 0cad0c4..7fcc150 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -985,7 +985,8 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, struct ib_qp *ret; if (init_attr->cap.max_send_sge > ib_qib_max_sges || - init_attr->cap.max_send_wr > ib_qib_max_qp_wrs) { + init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || + init_attr->create_flags) { ret = ERR_PTR(-EINVAL); goto bail; } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index d48d2c0..53bd6a2 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -466,6 +466,9 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, ucontext = to_uucontext(pd->uobject->context); us_ibdev = to_usdev(pd->device); + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); if (err) { usnic_err("%s: cannot copy udata for create_qp\n", -- cgit v0.10.2 From 09b93088d75009807b72293f26e2634430ce5ba9 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 11 May 2014 15:15:11 +0300 Subject: IB: Add a QP creation flag to use GFP_NOIO allocations This addresses a problem where NFS client writes over IPoIB connected mode may deadlock on memory allocation/writeback. The problem is not directly memory reclamation. There is an indirect dependency between network filesystems writing back pages and ipoib_cm_tx_init() due to how a kworker is used. Page reclaim cannot make forward progress until ipoib_cm_tx_init() succeeds and it is stuck in page reclaim itself waiting for network transmission. Ordinarily this situation may be avoided by having the caller use GFP_NOFS but ipoib_cm_tx_init() does not have that information. To address this, take a general approach and add a new QP creation flag that tells the low-level hardware driver to use GFP_NOIO for the memory allocations related to the new QP. Use the new flag in the ipoib connected mode path, and if the driver doesn't support it, re-issue the QP creation without the flag. Signed-off-by: Mel Gorman Signed-off-by: Jiri Kosina Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 1377f85..933efce 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1030,10 +1030,20 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, - .qp_context = tx + .qp_context = tx, + .create_flags = IB_QP_CREATE_USE_GFP_NOIO }; - return ib_create_qp(priv->pd, &attr); + struct ib_qp *tx_qp; + + tx_qp = ib_create_qp(priv->pd, &attr); + if (PTR_ERR(tx_qp) == -EINVAL) { + ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", + priv->ca->name); + attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; + tx_qp = ib_create_qp(priv->pd, &attr); + } + return tx_qp; } static int ipoib_cm_send_req(struct net_device *dev, @@ -1104,12 +1114,14 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ipoib_dev_priv *priv = netdev_priv(p->dev); int ret; - p->tx_ring = vzalloc(ipoib_sendq_size * sizeof *p->tx_ring); + p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_NOIO, PAGE_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; goto err_tx; } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); p->qp = ipoib_cm_create_tx_qp(p->dev, p); if (IS_ERR(p->qp)) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index acd8251..d75b02f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -783,6 +783,7 @@ enum ib_qp_create_flags { IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, + IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, -- cgit v0.10.2 From 40f2287bd583f4df4c602c1a29a48df2730fb6d4 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Sun, 11 May 2014 15:15:12 +0300 Subject: IB/mlx4: Implement IB_QP_CREATE_USE_GFP_NOIO Modify the various routines used to allocate memory resources which serve QPs in mlx4 to get an input GFP directive. Have the Ethernet driver to use GFP_KERNEL in it's QP allocations as done prior to this commit, and the IB driver to use GFP_NOIO when the IB verbs IB_QP_CREATE_USE_GFP_NOIO QP creation flag is provided. Signed-off-by: Mel Gorman Signed-off-by: Jiri Kosina Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 5f64081..1066eec 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -102,7 +102,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * int err; err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, - PAGE_SIZE * 2, &buf->buf); + PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); if (err) goto out; @@ -113,7 +113,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); if (err) goto err_mtt; @@ -209,7 +209,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector uar = &to_mucontext(context)->uar; } else { - err = mlx4_db_alloc(dev->dev, &cq->db, 1); + err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); if (err) goto err_cq; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index f589522..bb8c9dd 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -156,6 +156,7 @@ enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1 << 31, }; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 41308af..8710baf 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -610,7 +610,8 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, + gfp_t gfp) { int qpn; int err; @@ -748,14 +749,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; if (qp_has_rq(init_attr)) { - err = mlx4_db_alloc(dev->dev, &qp->db, 0); + err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); if (err) goto err; *qp->db.db = 0; } - if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { err = -ENOMEM; goto err_db; } @@ -765,13 +766,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); - + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -801,7 +801,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_proxy; } - err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); if (err) goto err_qpn; @@ -1040,7 +1040,10 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct mlx4_ib_qp *qp = NULL; int err; u16 xrcdn = 0; + gfp_t gfp; + gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? + GFP_NOIO : GFP_KERNEL; /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. @@ -1049,7 +1052,8 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | - MLX4_IB_QP_NETIF)) + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_USE_GFP_NOIO)) return ERR_PTR(-EINVAL); if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { @@ -1059,7 +1063,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, if (init_attr->create_flags && (udata || - ((init_attr->create_flags & ~MLX4_IB_SRIOV_SQP) && + ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) && init_attr->qp_type != IB_QPT_UD) || ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && init_attr->qp_type > IB_QPT_GSI))) @@ -1079,7 +1083,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_RAW_PACKET: - qp = kzalloc(sizeof *qp, GFP_KERNEL); + qp = kzalloc(sizeof *qp, gfp); if (!qp) return ERR_PTR(-ENOMEM); qp->pri.vid = 0xFFFF; @@ -1088,7 +1092,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_UD: { err = create_qp_common(to_mdev(pd->device), pd, init_attr, - udata, 0, &qp); + udata, 0, &qp, gfp); if (err) return ERR_PTR(err); @@ -1106,7 +1110,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, get_sqp_num(to_mdev(pd->device), init_attr), - &qp); + &qp, gfp); if (err) return ERR_PTR(err); diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 60c5fb0..62d9285 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -134,13 +134,14 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { - err = mlx4_db_alloc(dev->dev, &srq->db, 0); + err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); if (err) goto err_srq; *srq->db.db = 0; - if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, + GFP_KERNEL)) { err = -ENOMEM; goto err_db; } @@ -165,7 +166,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c index c3ad464..b0297da 100644 --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -171,7 +171,7 @@ void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) */ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, - struct mlx4_buf *buf) + struct mlx4_buf *buf, gfp_t gfp) { dma_addr_t t; @@ -180,7 +180,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->npages = 1; buf->page_shift = get_order(size) + PAGE_SHIFT; buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, - size, &t, GFP_KERNEL); + size, &t, gfp); if (!buf->direct.buf) return -ENOMEM; @@ -200,14 +200,14 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), - GFP_KERNEL); + gfp); if (!buf->page_list) return -ENOMEM; for (i = 0; i < buf->nbufs; ++i) { buf->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, - &t, GFP_KERNEL); + &t, gfp); if (!buf->page_list[i].buf) goto err_free; @@ -218,7 +218,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, if (BITS_PER_LONG == 64) { struct page **pages; - pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL); + pages = kmalloc(sizeof *pages * buf->nbufs, gfp); if (!pages) goto err_free; for (i = 0; i < buf->nbufs; ++i) @@ -260,11 +260,12 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) } EXPORT_SYMBOL_GPL(mlx4_buf_free); -static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) +static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device, + gfp_t gfp) { struct mlx4_db_pgdir *pgdir; - pgdir = kzalloc(sizeof *pgdir, GFP_KERNEL); + pgdir = kzalloc(sizeof *pgdir, gfp); if (!pgdir) return NULL; @@ -272,7 +273,7 @@ static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) pgdir->bits[0] = pgdir->order0; pgdir->bits[1] = pgdir->order1; pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE, - &pgdir->db_dma, GFP_KERNEL); + &pgdir->db_dma, gfp); if (!pgdir->db_page) { kfree(pgdir); return NULL; @@ -312,7 +313,7 @@ found: return 0; } -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_db_pgdir *pgdir; @@ -324,7 +325,7 @@ int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) if (!mlx4_alloc_db_from_pgdir(pgdir, db, order)) goto out; - pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev)); + pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev), gfp); if (!pgdir) { ret = -ENOMEM; goto out; @@ -376,13 +377,13 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, { int err; - err = mlx4_db_alloc(dev, &wqres->db, 1); + err = mlx4_db_alloc(dev, &wqres->db, 1, GFP_KERNEL); if (err) return err; *wqres->db.db = 0; - err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf); + err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf, GFP_KERNEL); if (err) goto err_db; @@ -391,7 +392,7 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf); + err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index 0487121..c90cde5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -173,11 +173,11 @@ int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) if (*cqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &cq_table->table, *cqn); + err = mlx4_table_get(dev, &cq_table->table, *cqn, GFP_KERNEL); if (err) goto err_out; - err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn); + err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn, GFP_KERNEL); if (err) goto err_put; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index ba049ae..87857a6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -972,7 +972,7 @@ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, if (!context) return -ENOMEM; - err = mlx4_qp_alloc(mdev->dev, qpn, qp); + err = mlx4_qp_alloc(mdev->dev, qpn, qp, GFP_KERNEL); if (err) { en_err(priv, "Failed to allocate qp #%x\n", qpn); goto out; @@ -1012,7 +1012,7 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) en_err(priv, "Failed reserving drop qpn\n"); return err; } - err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp); + err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating drop qp\n"); mlx4_qp_release_range(priv->mdev->dev, qpn, 1); @@ -1071,7 +1071,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) } /* Configure RSS indirection qp */ - err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp); + err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed to allocate RSS indirection QP\n"); goto rss_err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index dd1f6d3..bc0cc1e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -113,7 +113,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); ring->qpn = qpn; - err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp); + err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_map; diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c index 5fbf492..eb1747e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.c +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -245,7 +245,8 @@ int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, + int gfp) { u32 i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); @@ -259,7 +260,7 @@ int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) } table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT, - (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | + (table->lowmem ? gfp : GFP_HIGHUSER) | __GFP_NOWARN, table->coherent); if (!table->icm[i]) { ret = -ENOMEM; @@ -356,7 +357,7 @@ int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 i; for (i = start; i <= end; i += inc) { - err = mlx4_table_get(dev, table, i); + err = mlx4_table_get(dev, table, i, GFP_KERNEL); if (err) goto fail; } diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h index dee67fa..067e6e0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.h +++ b/drivers/net/ethernet/mellanox/mlx4/icm.h @@ -71,7 +71,8 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask, int coherent); void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, + int gfp); void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 start, u32 end); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index f9c4651..627a54e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -888,7 +888,7 @@ void mlx4_cleanup_cq_table(struct mlx4_dev *dev); void mlx4_cleanup_qp_table(struct mlx4_dev *dev); void mlx4_cleanup_srq_table(struct mlx4_dev *dev); void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); -int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn); +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp); void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); @@ -896,7 +896,7 @@ int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); int __mlx4_mpt_reserve(struct mlx4_dev *dev); void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index); -int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index); +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp); void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index); u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order); void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order); diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 2483585..4c71daf 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -364,14 +364,14 @@ static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index) __mlx4_mpt_release(dev, index); } -int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) { struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; - return mlx4_table_get(dev, &mr_table->dmpt_table, index); + return mlx4_table_get(dev, &mr_table->dmpt_table, index, gfp); } -static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) +static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) { u64 param = 0; @@ -382,7 +382,7 @@ static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } - return __mlx4_mpt_alloc_icm(dev, index); + return __mlx4_mpt_alloc_icm(dev, index, gfp); } void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index) @@ -469,7 +469,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key)); + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key), GFP_KERNEL); if (err) return err; @@ -627,13 +627,14 @@ int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, EXPORT_SYMBOL_GPL(mlx4_write_mtt); int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - struct mlx4_buf *buf) + struct mlx4_buf *buf, gfp_t gfp) { u64 *page_list; int err; int i; - page_list = kmalloc(buf->npages * sizeof *page_list, GFP_KERNEL); + page_list = kmalloc(buf->npages * sizeof *page_list, + gfp); if (!page_list) return -ENOMEM; @@ -680,7 +681,7 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw) struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key)); + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key), GFP_KERNEL); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 61d64eb..917f0d0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -272,29 +272,29 @@ void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) } EXPORT_SYMBOL_GPL(mlx4_qp_release_range); -int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; int err; - err = mlx4_table_get(dev, &qp_table->qp_table, qpn); + err = mlx4_table_get(dev, &qp_table->qp_table, qpn, gfp); if (err) goto err_out; - err = mlx4_table_get(dev, &qp_table->auxc_table, qpn); + err = mlx4_table_get(dev, &qp_table->auxc_table, qpn, gfp); if (err) goto err_put_qp; - err = mlx4_table_get(dev, &qp_table->altc_table, qpn); + err = mlx4_table_get(dev, &qp_table->altc_table, qpn, gfp); if (err) goto err_put_auxc; - err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn); + err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn, gfp); if (err) goto err_put_altc; - err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn); + err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn, gfp); if (err) goto err_put_rdmarc; @@ -316,7 +316,7 @@ err_out: return err; } -static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, int gfp) { u64 param = 0; @@ -326,7 +326,7 @@ static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } - return __mlx4_qp_alloc_icm(dev, qpn); + return __mlx4_qp_alloc_icm(dev, qpn, gfp); } void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) @@ -355,7 +355,7 @@ static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) __mlx4_qp_free_icm(dev, qpn); } -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; @@ -366,7 +366,7 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) qp->qpn = qpn; - err = mlx4_qp_alloc_icm(dev, qpn); + err = mlx4_qp_alloc_icm(dev, qpn, gfp); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 1c3fdd4..45da913 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -1532,7 +1532,7 @@ static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, return err; if (!fw_reserved(dev, qpn)) { - err = __mlx4_qp_alloc_icm(dev, qpn); + err = __mlx4_qp_alloc_icm(dev, qpn, GFP_KERNEL); if (err) { res_abort_move(dev, slave, RES_QP, qpn); return err; @@ -1619,7 +1619,7 @@ static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, if (err) return err; - err = __mlx4_mpt_alloc_icm(dev, mpt->key); + err = __mlx4_mpt_alloc_icm(dev, mpt->key, GFP_KERNEL); if (err) { res_abort_move(dev, slave, RES_MPT, id); return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c index 98faf87..6714662 100644 --- a/drivers/net/ethernet/mellanox/mlx4/srq.c +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -103,11 +103,11 @@ int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) if (*srqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &srq_table->table, *srqn); + err = mlx4_table_get(dev, &srq_table->table, *srqn, GFP_KERNEL); if (err) goto err_out; - err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn); + err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn, GFP_KERNEL); if (err) goto err_put; return 0; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ba87bd2..be60b00 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -837,7 +837,7 @@ static inline int mlx4_is_slave(struct mlx4_dev *dev) } int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, - struct mlx4_buf *buf); + struct mlx4_buf *buf, gfp_t gfp); void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) { @@ -874,9 +874,10 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw); int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int start_index, int npages, u64 *page_list); int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - struct mlx4_buf *buf); + struct mlx4_buf *buf, gfp_t gfp); -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order); +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, + gfp_t gfp); void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db); int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, @@ -892,7 +893,8 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base); void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, + gfp_t gfp); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn, -- cgit v0.10.2 From 5343c00dd0f2543d5751575515bdae4b801820ea Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 3 Jun 2014 10:24:24 -0700 Subject: IB/mad: Fix sparse warning about gfp_t use Properly convert gfp_t & result to bool to fix: drivers/infiniband/core/sa_query.c:621:33: warning: incorrect type in initializer (different base types) drivers/infiniband/core/sa_query.c:621:33: expected bool [unsigned] [usertype] preload drivers/infiniband/core/sa_query.c:621:33: got restricted gfp_t Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index f820958..233eaf5 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -618,7 +618,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { - bool preload = gfp_mask & __GFP_WAIT; + bool preload = !!(gfp_mask & __GFP_WAIT); unsigned long flags; int ret, id; -- cgit v0.10.2 From 8385fd841468868e0b37a722530d75b0e8bfc5a8 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 4 Jun 2014 10:00:16 -0700 Subject: IB/core: Fix sparse warnings about redeclared functions Fix a few functions that are declared with __attribute_const__ in the ib_verbs.h header file but defined without it in verbs.c. This gets rid of the following sparse warnings: drivers/infiniband/core/verbs.c:51:5: error: symbol 'ib_rate_to_mult' redeclared with different type (originally declared at include/rdma/ib_verbs.h:469) - different modifiers drivers/infiniband/core/verbs.c:68:14: error: symbol 'mult_to_ib_rate' redeclared with different type (originally declared at include/rdma/ib_verbs.h:607) - different modifiers drivers/infiniband/core/verbs.c:85:5: error: symbol 'ib_rate_to_mbps' redeclared with different type (originally declared at include/rdma/ib_verbs.h:476) - different modifiers drivers/infiniband/core/verbs.c:111:1: error: symbol 'rdma_node_get_transport' redeclared with different type (originally declared at include/rdma/ib_verbs.h:84) - different modifiers Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 92525f8..c2b89cc 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -48,7 +48,7 @@ #include "core_priv.h" -int ib_rate_to_mult(enum ib_rate rate) +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 1; @@ -65,7 +65,7 @@ int ib_rate_to_mult(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mult); -enum ib_rate mult_to_ib_rate(int mult) +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { case 1: return IB_RATE_2_5_GBPS; @@ -82,7 +82,7 @@ enum ib_rate mult_to_ib_rate(int mult) } EXPORT_SYMBOL(mult_to_ib_rate); -int ib_rate_to_mbps(enum ib_rate rate) +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 2500; @@ -107,7 +107,7 @@ int ib_rate_to_mbps(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mbps); -enum rdma_transport_type +__attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { switch (node_type) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index acd8251..e9da1bc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -80,8 +80,8 @@ enum rdma_transport_type { RDMA_TRANSPORT_USNIC_UDP }; -enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__; +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type); enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, @@ -466,14 +466,14 @@ enum ib_rate { * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ -int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ -int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__; +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); enum ib_mr_create_flags { IB_MR_SIGNATURE_EN = 1, @@ -604,7 +604,7 @@ struct ib_mr_status { * enum. * @mult: multiple to convert. */ -enum ib_rate mult_to_ib_rate(int mult) __attribute_const__; +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct ib_ah_attr { struct ib_global_route grh; -- cgit v0.10.2 From 373c0ea181c8d277a4a51bbd705189a6a030b81c Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Sun, 18 May 2014 11:12:24 +0300 Subject: IB/core: Remove unneeded kobject_get/put calls The ib_core module will call kobject_get on the parent object of each kobject it creates. This is redundant since kobject_add does that anyway. As a side effect, this patch should fix leaking the ports kobject and the device kobject during unregister flow, since the previous code didn't seem to take into account the kobject_get calls on behalf of the child kobjects. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7d3292c..1f77072 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -534,7 +534,7 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - kobject_get(device->ports_parent), + device->ports_parent, "%d", port_num); if (ret) goto err_put; @@ -599,7 +599,6 @@ err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); err_put: - kobject_put(device->ports_parent); kfree(p); return ret; } @@ -835,7 +834,7 @@ int ib_device_register_sysfs(struct ib_device *device, } device->ports_parent = kobject_create_and_add("ports", - kobject_get(&class_dev->kobj)); + &class_dev->kobj); if (!device->ports_parent) { ret = -ENOMEM; goto err_put; @@ -876,8 +875,6 @@ err_put: } } - kobject_put(&class_dev->kobj); - err_unregister: device_unregister(class_dev); -- cgit v0.10.2 From cad6d02acc13b6360e4525f86adb6a8932501e15 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Sun, 18 May 2014 11:12:25 +0300 Subject: IB/core: Fix port kobject deletion during error flow When encountering an error during the add_port function, adding a port to sysfs, the port kobject is freed without being deleted from sysfs. Instead of freeing it directly, the patch uses kobject_put to release the kobject and delete it. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 1f77072..68fa798 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -429,15 +429,19 @@ static void ib_port_release(struct kobject *kobj) struct attribute *a; int i; - for (i = 0; (a = p->gid_group.attrs[i]); ++i) - kfree(a); + if (p->gid_group.attrs) { + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); - kfree(p->gid_group.attrs); + kfree(p->gid_group.attrs); + } - for (i = 0; (a = p->pkey_group.attrs[i]); ++i) - kfree(a); + if (p->pkey_group.attrs) { + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); - kfree(p->pkey_group.attrs); + kfree(p->pkey_group.attrs); + } kfree(p); } @@ -536,8 +540,10 @@ static int add_port(struct ib_device *device, int port_num, ret = kobject_init_and_add(&p->kobj, &port_type, device->ports_parent, "%d", port_num); - if (ret) - goto err_put; + if (ret) { + kfree(p); + return ret; + } ret = sysfs_create_group(&p->kobj, &pma_group); if (ret) @@ -585,6 +591,7 @@ err_free_pkey: kfree(p->pkey_group.attrs[i]); kfree(p->pkey_group.attrs); + p->pkey_group.attrs = NULL; err_remove_gid: sysfs_remove_group(&p->kobj, &p->gid_group); @@ -594,12 +601,13 @@ err_free_gid: kfree(p->gid_group.attrs[i]); kfree(p->gid_group.attrs); + p->gid_group.attrs = NULL; err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); err_put: - kfree(p); + kobject_put(&p->kobj); return ret; } -- cgit v0.10.2 From 4e2c341b863e5bb680a49d5f006d7b75cf93bc93 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 4 Jun 2014 10:19:13 -0700 Subject: mlx4_core: Fix GFP flags parameters to be gfp_t Otherwise sparse gives a bunch of warnings like drivers/net/ethernet/mellanox/mlx4/srq.c:110:66: sparse: incorrect type in argument 4 (different base types) drivers/net/ethernet/mellanox/mlx4/srq.c:110:66: expected int [signed] gfp drivers/net/ethernet/mellanox/mlx4/srq.c:110:66: got restricted gfp_t Signed-off-by: Roland Dreier diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c index eb1747e..97c9b1d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.c +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -246,7 +246,7 @@ int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) } int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, - int gfp) + gfp_t gfp) { u32 i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h index 067e6e0..0c73645 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.h +++ b/drivers/net/ethernet/mellanox/mlx4/icm.h @@ -72,7 +72,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, - int gfp); + gfp_t gfp); void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 start, u32 end); diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 917f0d0..07198ca 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -316,7 +316,7 @@ err_out: return err; } -static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, int gfp) +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) { u64 param = 0; -- cgit v0.10.2 From b7dfa8895f64ffa371d0ed09c1d1ba8c6e19b956 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Mon, 5 May 2014 19:35:26 +0200 Subject: RDMA/cxgb4: add missing padding at end of struct c4iw_alloc_ucontext_resp The i386 ABI disagrees with most other ABIs regarding alignment of data types larger than 4 bytes: on most ABIs a padding must be added at end of the structures, while it is not required on i386. So for most ABI struct c4iw_alloc_ucontext_resp gets implicitly padded to be aligned on a 8 bytes multiple, while for i386, such padding is not added. The tool pahole can be used to find such implicit padding: $ pahole --anon_include \ --nested_anon_include \ --recursive \ --class_name c4iw_alloc_ucontext_resp \ drivers/infiniband/hw/cxgb4/iw_cxgb4.o Then, structure layout can be compared between i386 and x86_64: +++ obj-i386/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 11:43:05.547432195 +0100 --- obj-x86_64/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 10:55:10.990133017 +0100 @@ -2,9 +2,8 @@ struct c4iw_alloc_ucontext_resp { __u64 status_page_key; /* 0 8 */ __u32 status_page_size; /* 8 4 */ - /* size: 12, cachelines: 1, members: 2 */ - /* last cacheline: 12 bytes */ + /* size: 16, cachelines: 1, members: 2 */ + /* padding: 4 */ + /* last cacheline: 16 bytes */ }; This ABI disagreement will make an x86_64 kernel try to write past the buffer provided by an i386 binary. When boundary check will be implemented, the x86_64 kernel will refuse to write past the i386 userspace provided buffer and the uverbs will fail. If the structure is on a page boundary and the next page is not mapped, ib_copy_to_udata() will fail and the uverb will fail. Additionally, as reported by Dan Carpenter, without the implicit padding being properly cleared, an information leak would take place in most architectures. This patch adds an explicit padding to struct c4iw_alloc_ucontext_resp, and, like 92b0ca7cb149 ("IB/mlx5: Fix stack info leak in mlx5_ib_alloc_ucontext()"), makes function c4iw_alloc_ucontext() not writting this padding field to userspace. This way, x86_64 kernel will be able to write struct c4iw_alloc_ucontext_resp as expected by unpatched and patched i386 libcxgb4. Link: http://marc.info/?i=cover.1399309513.git.ydroneaud@opteya.com Link: http://marc.info/?i=1395848977.3297.15.camel@localhost.localdomain Link: http://marc.info/?i=20140328082428.GH25192@mwanda Cc: Fixes: 05eb23893c2c ("cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes") Reported-by: Yann Droneaud Reported-by: Dan Carpenter Signed-off-by: Yann Droneaud Acked-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index a94a3e1..c777e22 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -122,7 +122,7 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); - if (udata->outlen < sizeof(uresp)) { + if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { if (!warned++) pr_err(MOD "Warning - downlevel libcxgb4 (non-fatal), device status page disabled."); rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED; @@ -140,7 +140,8 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, context->key += PAGE_SIZE; spin_unlock(&context->mmap_lock); - ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); if (ret) goto err_mm; diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h index 9b7534b..cbd0ce1 100644 --- a/drivers/infiniband/hw/cxgb4/user.h +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -75,5 +75,6 @@ struct c4iw_create_qp_resp { struct c4iw_alloc_ucontext_resp { __u64 status_page_key; __u32 status_page_size; + __u32 reserved; /* explicit padding (optional for i386) */ }; #endif -- cgit v0.10.2 From 584482ac80e13250061fcc85ea8a68c6a31d5033 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Sun, 18 May 2014 11:12:26 +0300 Subject: IB/core: Fix kobject leak on device register error flow The ports kobject isn't being released during error flow in device registration. This patch refactors the ports kobject cleanup into a single function called from both the error flow in device registration and from the unregistration function. A couple of attributes aren't being deleted (iw_stats_group, and ib_class_attributes). While this may be handled implicitly by the destruction of their kobjects, it seems better to handle all the attributes the same way. Signed-off-by: Haggai Eran [ Make free_port_list_attributes() static. - Roland ] Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 68fa798..cbd0383 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -816,6 +816,22 @@ static struct attribute_group iw_stats_group = { .attrs = iw_proto_stats_attrs, }; +static void free_port_list_attributes(struct ib_device *device) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &device->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); + sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + kobject_put(p); + } + + kobject_put(device->ports_parent); +} + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -869,19 +885,7 @@ int ib_device_register_sysfs(struct ib_device *device, return 0; err_put: - { - struct kobject *p, *t; - struct ib_port *port; - - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } - } + free_port_list_attributes(device); err_unregister: device_unregister(class_dev); @@ -892,22 +896,18 @@ err: void ib_device_unregister_sysfs(struct ib_device *device) { - struct kobject *p, *t; - struct ib_port *port; - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); + struct kobject *kobj_dev = kobject_get(&device->dev.kobj); + int i; - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } + if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) + sysfs_remove_group(kobj_dev, &iw_stats_group); + + free_port_list_attributes(device); + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) + device_remove_file(&device->dev, ib_class_attributes[i]); - kobject_put(device->ports_parent); device_unregister(&device->dev); } -- cgit v0.10.2 From 60e1751cb52cc6d1ae04b6bd3c2b96e770b5823f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 6 Jun 2014 18:25:04 +0200 Subject: IB/umad: Fix use-after-free on close Avoid that closing /dev/infiniband/umad or /dev/infiniband/issm triggers a use-after-free. __fput() invokes f_op->release() before it invokes cdev_put(). Make sure that the ib_umad_device structure is freed by the cdev_put() call instead of f_op->release(). This avoids that changing the port mode from IB into Ethernet and back to IB followed by restarting opensmd triggers the following kernel oops: general protection fault: 0000 [#1] PREEMPT SMP RIP: 0010:[] [] module_put+0x2c/0x170 Call Trace: [] cdev_put+0x20/0x30 [] __fput+0x1ae/0x1f0 [] ____fput+0xe/0x10 [] task_work_run+0xac/0xe0 [] do_notify_resume+0x9f/0xc0 [] int_signal+0x12/0x17 Reference: https://bugzilla.kernel.org/show_bug.cgi?id=75051 Signed-off-by: Bart Van Assche Reviewed-by: Yann Droneaud Cc: # 3.x: 8ec0a0e6b58: IB/umad: Fix error handling Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 9bdf576c..1acb991 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -98,7 +98,7 @@ struct ib_umad_port { struct ib_umad_device { int start_port, end_port; - struct kref ref; + struct kobject kobj; struct ib_umad_port port[0]; }; @@ -134,14 +134,18 @@ static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device); -static void ib_umad_release_dev(struct kref *ref) +static void ib_umad_release_dev(struct kobject *kobj) { struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); + container_of(kobj, struct ib_umad_device, kobj); kfree(dev); } +static struct kobj_type ib_umad_dev_ktype = { + .release = ib_umad_release_dev, +}; + static int hdr_size(struct ib_umad_file *file) { return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : @@ -812,7 +816,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) goto out; } - kref_get(&port->umad_dev->ref); + kobject_get(&port->umad_dev->kobj); out: mutex_unlock(&port->file_mutex); @@ -851,7 +855,7 @@ static int ib_umad_close(struct inode *inode, struct file *filp) mutex_unlock(&file->port->file_mutex); kfree(file); - kref_put(&dev->ref, ib_umad_release_dev); + kobject_put(&dev->kobj); return 0; } @@ -902,7 +906,7 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) if (ret) goto err_clr_sm_cap; - kref_get(&port->umad_dev->ref); + kobject_get(&port->umad_dev->kobj); return 0; @@ -932,7 +936,7 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) up(&port->sm_sem); - kref_put(&port->umad_dev->ref, ib_umad_release_dev); + kobject_put(&port->umad_dev->kobj); return ret; } @@ -1000,6 +1004,7 @@ static int find_overflow_devnum(void) } static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; @@ -1032,6 +1037,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, cdev_init(&port->cdev, &umad_fops); port->cdev.owner = THIS_MODULE; + port->cdev.kobj.parent = &umad_dev->kobj; kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); if (cdev_add(&port->cdev, base, 1)) goto err_cdev; @@ -1050,6 +1056,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, base += IB_UMAD_MAX_PORTS; cdev_init(&port->sm_cdev, &umad_sm_fops); port->sm_cdev.owner = THIS_MODULE; + port->sm_cdev.kobj.parent = &umad_dev->kobj; kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); if (cdev_add(&port->sm_cdev, base, 1)) goto err_sm_cdev; @@ -1143,7 +1150,7 @@ static void ib_umad_add_one(struct ib_device *device) if (!umad_dev) return; - kref_init(&umad_dev->ref); + kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); umad_dev->start_port = s; umad_dev->end_port = e; @@ -1151,7 +1158,8 @@ static void ib_umad_add_one(struct ib_device *device) for (i = s; i <= e; ++i) { umad_dev->port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) + if (ib_umad_init_port(device, i, umad_dev, + &umad_dev->port[i - s])) goto err; } @@ -1163,7 +1171,7 @@ err: while (--i >= s) ib_umad_kill_port(&umad_dev->port[i - s]); - kref_put(&umad_dev->ref, ib_umad_release_dev); + kobject_put(&umad_dev->kobj); } static void ib_umad_remove_one(struct ib_device *device) @@ -1177,7 +1185,7 @@ static void ib_umad_remove_one(struct ib_device *device) for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) ib_umad_kill_port(&umad_dev->port[i]); - kref_put(&umad_dev->ref, ib_umad_release_dev); + kobject_put(&umad_dev->kobj); } static char *umad_devnode(struct device *dev, umode_t *mode) -- cgit v0.10.2 From 6fcd8d0d93fb2f38807371bb144f3f869d4ca5a2 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 9 Jun 2014 16:36:33 +0200 Subject: IB/mlx4: Fix gfp passing in create_qp_common() There are two kzalloc() calls which were not converted to use value of gfp passed to create_qp_common() instead of using hardcoded GFP_KERNEL in 40f2287bd583 ("IB/mlx4: Implement IB_QP_CREATE_USE_GFP_NOIO"). Fix this by passing gfp value down properly. Reported-by: Dan Carpenter Signed-off-by: Jiri Kosina Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 8710baf..a0bfda2 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -659,14 +659,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { - sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL); + sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp); if (!sqp) return -ENOMEM; qp = &sqp->qp; qp->pri.vid = 0xFFFF; qp->alt.vid = 0xFFFF; } else { - qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL); + qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp); if (!qp) return -ENOMEM; qp->pri.vid = 0xFFFF; -- cgit v0.10.2 From 30dc5e63d6a5ad24894b5512d10b228d73645a44 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Wed, 26 Mar 2014 17:07:35 -0500 Subject: RDMA/core: Add support for iWARP Port Mapper user space service This patch adds iWARP Port Mapper (IWPM) Version 2 support. The iWARP Port Mapper implementation is based on the port mapper specification section in the Sockets Direct Protocol paper - http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf Existing iWARP RDMA providers use the same IP address as the native TCP/IP stack when creating RDMA connections. They need a mechanism to claim the TCP ports used for RDMA connections to prevent TCP port collisions when other host applications use TCP ports. The iWARP Port Mapper provides a standard mechanism to accomplish this. Without this service it is possible for RDMA application to bind/listen on the same port which is already being used by native TCP host application. If that happens the incoming TCP connection data can be passed to the RDMA stack with error. The iWARP Port Mapper solution doesn't contain any changes to the existing network stack in the kernel space. All the changes are contained with the infiniband tree and also in user space. The iWARP Port Mapper service is implemented as a user space daemon process. Source for the IWPM service is located at http://git.openfabrics.org/git?p=~tnikolova/libiwpm-1.0.0/.git;a=summary The iWARP driver (port mapper client) sends to the IWPM service the local IP address and TCP port it has received from the RDMA application, when starting a connection. The IWPM service performs a socket bind from user space to get an available TCP port, called a mapped port, and communicates it back to the client. In that sense, the IWPM service is used to map the TCP port, which the RDMA application uses to any port available from the host TCP port space. The mapped ports are used in iWARP RDMA connections to avoid collisions with native TCP stack which is aware that these ports are taken. When an RDMA connection using a mapped port is terminated, the client notifies the IWPM service, which then releases the TCP port. The message exchange between the IWPM service and the iWARP drivers (between user space and kernel space) is implemented using netlink sockets. 1) Netlink interface functions are added: ibnl_unicast() and ibnl_mulitcast() for sending netlink messages to user space 2) The signature of the existing ibnl_put_msg() is changed to be more generic 3) Two netlink clients are added: RDMA_NL_NES, RDMA_NL_C4IW corresponding to the two iWarp drivers - nes and cxgb4 which use the IWPM service 4) Enums are added to enumerate the attributes in the netlink messages, which are exchanged between the user space IWPM service and the iWARP drivers Signed-off-by: Tatyana Nikolova Signed-off-by: Steve Wise Reviewed-by: PJ Waskiewicz [ Fold in range checking fixes and nlh_next removal as suggested by Dan Carpenter and Steve Wise. Fix sparse endianness in hash. - Roland ] Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 3ab3865..ffd0af6 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -18,7 +18,7 @@ ib_sa-y := sa_query.o multicast.o ib_cm-y := cm.o -iw_cm-y := iwcm.o +iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o rdma_cm-y := cma.o diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 42c3058..d570030 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3607,7 +3607,8 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS); + RDMA_NL_RDMA_CM_ID_STATS, + NLM_F_MULTI); if (!id_stats) goto out; diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c new file mode 100644 index 0000000..b85ddbc --- /dev/null +++ b/drivers/infiniband/core/iwpm_msg.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +static const char iwpm_ulib_name[] = "iWarpPortMapperUser"; +static int iwpm_ulib_version = 3; +static int iwpm_user_pid = IWPM_PID_UNDEFINED; +static atomic_t echo_nlmsg_seq; + +int iwpm_valid_pid(void) +{ + return iwpm_user_pid > 0; +} +EXPORT_SYMBOL(iwpm_valid_pid); + +/* + * iwpm_register_pid - Send a netlink query to user space + * for the iwarp port mapper pid + * + * nlmsg attributes: + * [IWPM_NLA_REG_PID_SEQ] + * [IWPM_NLA_REG_IF_NAME] + * [IWPM_NLA_REG_IBDEV_NAME] + * [IWPM_NLA_REG_ULIB_NAME] + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto pid_query_error; + } + if (iwpm_registered_client(nl_client)) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto pid_query_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto pid_query_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the pid request message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_IFNAME_SIZE, + pm_msg->if_name, IWPM_NLA_REG_IF_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE, + pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE, + (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME); + if (ret) + goto pid_query_error; + + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", + __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); + + ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_set_registered(nl_client, 1); + iwpm_user_pid = IWPM_PID_UNAVAILABLE; + err_str = "Unable to send a nlmsg"; + goto pid_query_error; + } + nlmsg_request->req_buffer = pm_msg; + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +pid_query_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_register_pid); + +/* + * iwpm_add_mapping - Send a netlink add mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto add_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto add_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto add_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto add_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + /* fill in the add mapping message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto add_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto add_mapping_error; + nlmsg_request->req_buffer = pm_msg; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto add_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +add_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_add_mapping); + +/* + * iwpm_add_and_query_mapping - Send a netlink add and query + * mapping message to the port mapper + * nlmsg attributes: + * [IWPM_NLA_QUERY_MAPPING_SEQ] + * [IWPM_NLA_QUERY_LOCAL_ADDR] + * [IWPM_NLA_QUERY_REMOTE_ADDR] + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto query_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto query_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + ret = -ENOMEM; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto query_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, + nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto query_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the query message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_QUERY_MAPPING_SEQ); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); + if (ret) + goto query_mapping_error; + nlmsg_request->req_buffer = pm_msg; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + err_str = "Unable to send a nlmsg"; + goto query_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +query_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_add_and_query_mapping); + +/* + * iwpm_remove_mapping - Send a netlink remove mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to create a nlmsg"; + goto remove_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto remove_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + local_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto remove_mapping_error; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto remove_mapping_error; + } + iwpm_print_sockaddr(local_addr, + "remove_mapping: Local sockaddr:"); + return 0; +remove_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb_any(skb); + return ret; +} +EXPORT_SYMBOL(iwpm_remove_mapping); + +/* netlink attribute policy for the received response to register pid request */ +static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { + [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING, + .len = IWPM_DEVNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 }, + [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_register_pid_cb - Process a port mapper response to + * iwpm_register_pid() + */ +int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX]; + struct iwpm_dev_data *pm_msg; + char *dev_name, *iwpm_name; + u32 msg_seq; + u8 nl_client; + u16 iwpm_version; + const char *msg_type = "Register Pid response"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX, + resp_reg_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + nl_client = nlmsg_request->nl_client; + dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]); + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]); + + /* check device name, ulib name and version */ + if (strcmp(pm_msg->dev_name, dev_name) || + strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + + pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", + __func__, dev_name, iwpm_name, iwpm_version); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto register_pid_response_exit; + } + iwpm_user_pid = cb->nlh->nlmsg_pid; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_user_pid); + if (iwpm_valid_client(nl_client)) + iwpm_set_registered(nl_client, 1); +register_pid_response_exit: + nlmsg_request->request_done = 1; + /* always for found nlmsg_request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_register_pid_cb); + +/* netlink attribute policy for the received response to add mapping request */ +static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { + [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_mapping_cb - Process a port mapper response to + * iwpm_add_mapping() + */ +int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr; + struct sockaddr_storage *mapped_sockaddr; + const char *msg_type; + u32 msg_seq; + + msg_type = "Add Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX, + resp_add_policy, nltb, msg_type)) + return -EINVAL; + + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + mapped_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr, + sizeof(*mapped_sockaddr)); + iwpm_print_sockaddr(&pm_msg->loc_addr, + "add_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "add_mapping: Mapped local sockaddr:"); + +add_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_add_mapping_cb); + +/* netlink attribute policy for the response to add and query mapping request */ +static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { + [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_and_query_mapping_cb - Process a port mapper response to + * iwpm_add_and_query_mapping() + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr, *remote_sockaddr; + struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; + const char *msg_type; + u32 msg_seq; + u16 err_code; + + msg_type = "Query Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, + resp_query_policy, nltb, msg_type)) + return -EINVAL; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + remote_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + mapped_loc_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); + mapped_rem_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); + + err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]); + if (err_code == IWPM_REMOTE_QUERY_REJECT) { + pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n", + __func__, cb->nlh->nlmsg_pid, msg_seq); + nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT; + } + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) || + iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) { + pr_info("%s: Incorrect local sockaddr\n", __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || + mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr, + sizeof(*mapped_loc_sockaddr)); + memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr, + sizeof(*mapped_rem_sockaddr)); + + iwpm_print_sockaddr(&pm_msg->loc_addr, + "query_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "query_mapping: Mapped local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->rem_addr, + "query_mapping: Remote sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_rem_addr, + "query_mapping: Mapped remote sockaddr:"); +query_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb); + +/* netlink attribute policy for the received request for mapping info */ +static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { + [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } +}; + +/* + * iwpm_mapping_info_cb - Process a port mapper request for mapping info + */ +int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; + const char *msg_type = "Mapping Info response"; + int iwpm_pid; + u8 nl_client; + char *iwpm_name; + u16 iwpm_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX, + resp_mapinfo_policy, nltb, msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); + if (strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + pr_info("%s: Invalid port mapper name = %s version = %d\n", + __func__, iwpm_name, iwpm_version); + return ret; + } + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registered(nl_client, 0); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + if (!iwpm_mapinfo_available()) + return 0; + iwpm_pid = cb->nlh->nlmsg_pid; + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_pid); + ret = iwpm_send_mapinfo(nl_client, iwpm_pid); + return ret; +} +EXPORT_SYMBOL(iwpm_mapping_info_cb); + +/* netlink attribute policy for the received mapping info ack */ +static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { + [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } +}; + +/* + * iwpm_ack_mapping_info_cb - Process a port mapper ack for + * the provided mapping info records + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX]; + u32 mapinfo_send, mapinfo_ack; + const char *msg_type = "Mapping Info Ack"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX, + ack_mapinfo_policy, nltb, msg_type)) + return -EINVAL; + mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]); + mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]); + if (mapinfo_ack != mapinfo_send) + pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n", + __func__, mapinfo_send, mapinfo_ack); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + return 0; +} +EXPORT_SYMBOL(iwpm_ack_mapping_info_cb); + +/* netlink attribute policy for the received port mapper error message */ +static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { + [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, +}; + +/* + * iwpm_mapping_error_cb - Process a port mapper error message + */ +int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + struct nlattr *nltb[IWPM_NLA_ERR_MAX]; + u32 msg_seq; + u16 err_code; + const char *msg_type = "Mapping Error Msg"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX, + map_error_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]); + err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]); + pr_info("%s: Received msg seq = %u err code = %u client = %d\n", + __func__, msg_seq, err_code, nl_client); + /* look for nlmsg_request */ + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + /* not all errors have associated requests */ + pr_debug("Could not find matching req (seq = %u)\n", msg_seq); + return 0; + } + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + nlmsg_request->err_code = err_code; + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_mapping_error_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c new file mode 100644 index 0000000..69e9f84 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.c @@ -0,0 +1,607 @@ +/* + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +#define IWPM_HASH_BUCKET_SIZE 512 +#define IWPM_HASH_BUCKET_MASK (IWPM_HASH_BUCKET_SIZE - 1) + +static LIST_HEAD(iwpm_nlmsg_req_list); +static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); + +static struct hlist_head *iwpm_hash_bucket; +static DEFINE_SPINLOCK(iwpm_mapinfo_lock); + +static DEFINE_MUTEX(iwpm_admin_lock); +static struct iwpm_admin_data iwpm_admin; + +int iwpm_init(u8 nl_client) +{ + if (iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + iwpm_hash_bucket = kzalloc(IWPM_HASH_BUCKET_SIZE * + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_hash_bucket) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Unable to create mapinfo hash table\n", __func__); + return -ENOMEM; + } + } + atomic_inc(&iwpm_admin.refcount); + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 1); + return 0; +} +EXPORT_SYMBOL(iwpm_init); + +static void free_hash_bucket(void); + +int iwpm_exit(u8 nl_client) +{ + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Incorrect usage - negative refcount\n", __func__); + return -EINVAL; + } + if (atomic_dec_and_test(&iwpm_admin.refcount)) { + free_hash_bucket(); + pr_debug("%s: Mapinfo hash table is destroyed\n", __func__); + } + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 0); + return 0; +} +EXPORT_SYMBOL(iwpm_exit); + +static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage *, + struct sockaddr_storage *); + +int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_sockaddr, + u8 nl_client) +{ + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info; + unsigned long flags; + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); + if (!map_info) { + pr_err("%s: Unable to allocate a mapping info\n", __func__); + return -ENOMEM; + } + memcpy(&map_info->local_sockaddr, local_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, + sizeof(struct sockaddr_storage)); + map_info->nl_client = nl_client; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_hash_bucket_head( + &map_info->local_sockaddr, + &map_info->mapped_sockaddr); + hlist_add_head(&map_info->hlist_node, hash_bucket_head); + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return 0; +} +EXPORT_SYMBOL(iwpm_create_mapinfo); + +int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_local_addr) +{ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info = NULL; + unsigned long flags; + int ret = -EINVAL; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_hash_bucket_head( + local_sockaddr, + mapped_local_addr); + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, + mapped_local_addr)) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + ret = 0; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return ret; +} +EXPORT_SYMBOL(iwpm_remove_mapinfo); + +static void free_hash_bucket(void) +{ + struct hlist_node *tmp_hlist_node; + struct iwpm_mapping_info *map_info; + unsigned long flags; + int i; + + /* remove all the mapinfo data from the list */ + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + &iwpm_hash_bucket[i], hlist_node) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + } + } + /* free the hash list */ + kfree(iwpm_hash_bucket); + iwpm_hash_bucket = NULL; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +} + +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + unsigned long flags; + + nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); + if (!nlmsg_request) { + pr_err("%s Unable to allocate a nlmsg_request\n", __func__); + return NULL; + } + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + kref_init(&nlmsg_request->kref); + kref_get(&nlmsg_request->kref); + nlmsg_request->nlmsg_seq = nlmsg_seq; + nlmsg_request->nl_client = nl_client; + nlmsg_request->request_done = 0; + nlmsg_request->err_code = 0; + return nlmsg_request; +} + +void iwpm_free_nlmsg_request(struct kref *kref) +{ + struct iwpm_nlmsg_request *nlmsg_request; + unsigned long flags; + + nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_del_init(&nlmsg_request->inprocess_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + if (!nlmsg_request->request_done) + pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", + __func__, nlmsg_request->nlmsg_seq); + kfree(nlmsg_request); +} + +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) +{ + struct iwpm_nlmsg_request *nlmsg_request; + struct iwpm_nlmsg_request *found_request = NULL; + unsigned long flags; + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, + inprocess_list) { + if (nlmsg_request->nlmsg_seq == echo_seq) { + found_request = nlmsg_request; + kref_get(&nlmsg_request->kref); + break; + } + } + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + return found_request; +} + +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) +{ + int ret; + init_waitqueue_head(&nlmsg_request->waitq); + + ret = wait_event_timeout(nlmsg_request->waitq, + (nlmsg_request->request_done != 0), IWPM_NL_TIMEOUT); + if (!ret) { + ret = -EINVAL; + pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", + __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); + } else { + ret = nlmsg_request->err_code; + } + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + return ret; +} + +int iwpm_get_nlmsg_seq(void) +{ + return atomic_inc_return(&iwpm_admin.nlmsg_seq); +} + +int iwpm_valid_client(u8 nl_client) +{ + if (nl_client >= RDMA_NL_NUM_CLIENTS) + return 0; + return iwpm_admin.client_list[nl_client]; +} + +void iwpm_set_valid(u8 nl_client, int valid) +{ + if (nl_client >= RDMA_NL_NUM_CLIENTS) + return; + iwpm_admin.client_list[nl_client] = valid; +} + +/* valid client */ +int iwpm_registered_client(u8 nl_client) +{ + return iwpm_admin.reg_list[nl_client]; +} + +/* valid client */ +void iwpm_set_registered(u8 nl_client, int reg) +{ + iwpm_admin.reg_list[nl_client] = reg; +} + +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr) +{ + if (a_sockaddr->ss_family != b_sockaddr->ss_family) + return 1; + if (a_sockaddr->ss_family == AF_INET) { + struct sockaddr_in *a4_sockaddr = + (struct sockaddr_in *)a_sockaddr; + struct sockaddr_in *b4_sockaddr = + (struct sockaddr_in *)b_sockaddr; + if (!memcmp(&a4_sockaddr->sin_addr, + &b4_sockaddr->sin_addr, sizeof(struct in_addr)) + && a4_sockaddr->sin_port == b4_sockaddr->sin_port) + return 0; + + } else if (a_sockaddr->ss_family == AF_INET6) { + struct sockaddr_in6 *a6_sockaddr = + (struct sockaddr_in6 *)a_sockaddr; + struct sockaddr_in6 *b6_sockaddr = + (struct sockaddr_in6 *)b_sockaddr; + if (!memcmp(&a6_sockaddr->sin6_addr, + &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) + && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) + return 0; + + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + } + return 1; +} + +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client) +{ + struct sk_buff *skb = NULL; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + pr_err("%s Unable to allocate skb\n", __func__); + goto create_nlmsg_exit; + } + if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, + NLM_F_REQUEST))) { + pr_warn("%s: Unable to put the nlmsg header\n", __func__); + dev_kfree_skb(skb); + skb = NULL; + } +create_nlmsg_exit: + return skb; +} + +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type) +{ + int nlh_len = 0; + int ret; + const char *err_str = ""; + + ret = nlmsg_validate(cb->nlh, nlh_len, policy_max-1, nlmsg_policy); + if (ret) { + err_str = "Invalid attribute"; + goto parse_nlmsg_error; + } + ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max-1, nlmsg_policy); + if (ret) { + err_str = "Unable to parse the nlmsg"; + goto parse_nlmsg_error; + } + ret = iwpm_validate_nlmsg_attr(nltb, policy_max); + if (ret) { + err_str = "Invalid NULL attribute"; + goto parse_nlmsg_error; + } + return 0; +parse_nlmsg_error: + pr_warn("%s: %s (msg type %s ret = %d)\n", + __func__, err_str, msg_type, ret); + return ret; +} + +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", + msg, &sockaddr_v4->sin_addr, + ntohs(sockaddr_v4->sin_port), + ntohs(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", + msg, &sockaddr_v6->sin6_addr, + ntohs(sockaddr_v6->sin6_port), + ntohs(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} + +static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) +{ + u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); + u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); + return hash; +} + +static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) +{ + u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); + u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); + return hash; +} + +static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage + *local_sockaddr, + struct sockaddr_storage + *mapped_sockaddr) +{ + u32 local_hash, mapped_hash, hash; + + if (local_sockaddr->ss_family == AF_INET) { + local_hash = iwpm_ipv4_jhash((struct sockaddr_in *) local_sockaddr); + mapped_hash = iwpm_ipv4_jhash((struct sockaddr_in *) mapped_sockaddr); + + } else if (local_sockaddr->ss_family == AF_INET6) { + local_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) local_sockaddr); + mapped_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) mapped_sockaddr); + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + return NULL; + } + + if (local_hash == mapped_hash) /* if port mapper isn't available */ + hash = local_hash; + else + hash = jhash_2words(local_hash, mapped_hash, 0); + + return &iwpm_hash_bucket[hash & IWPM_HASH_BUCKET_MASK]; +} + +static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto mapinfo_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + msg_seq = 0; + err_str = "Unable to put attribute of mapinfo number nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); + if (ret) + goto mapinfo_num_error; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); + if (ret) + goto mapinfo_num_error; + ret = ibnl_unicast(skb, nlh, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto mapinfo_num_error; + } + pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num); + return 0; +mapinfo_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} + +static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) +{ + struct nlmsghdr *nlh = NULL; + int ret = 0; + + if (!skb) + return ret; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + return -ENOMEM; + } + nlh->nlmsg_type = NLMSG_DONE; + ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + if (ret) + pr_warn("%s Unable to send a nlmsg\n", __func__); + return ret; +} + +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) +{ + struct iwpm_mapping_info *map_info; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + int skb_num = 0, mapping_num = 0; + int i = 0, nlmsg_bytes = 0; + unsigned long flags; + const char *err_str = ""; + int ret; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + skb_num++; + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], + hlist_node) { + if (map_info->nl_client != nl_client) + continue; + nlh = NULL; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + ret = -ENOMEM; + err_str = "Unable to put the nlmsg header"; + goto send_mapping_info_unlock; + } + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->local_sockaddr, + IWPM_NLA_MAPINFO_LOCAL_ADDR); + if (ret) + goto send_mapping_info_unlock; + + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->mapped_sockaddr, + IWPM_NLA_MAPINFO_MAPPED_ADDR); + if (ret) + goto send_mapping_info_unlock; + + iwpm_print_sockaddr(&map_info->local_sockaddr, + "send_mapping_info: Local sockaddr:"); + iwpm_print_sockaddr(&map_info->mapped_sockaddr, + "send_mapping_info: Mapped local sockaddr:"); + mapping_num++; + nlmsg_bytes += nlh->nlmsg_len; + + /* check if all mappings can fit in one skb */ + if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { + /* and leave room for NLMSG_DONE */ + nlmsg_bytes = 0; + skb_num++; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, + flags); + /* send the skb */ + ret = send_nlmsg_done(skb, nl_client, iwpm_pid); + skb = NULL; + if (ret) { + err_str = "Unable to send map info"; + goto send_mapping_info_exit; + } + if (skb_num == IWPM_MAPINFO_SKB_COUNT) { + ret = -ENOMEM; + err_str = "Insufficient skbs for map info"; + goto send_mapping_info_exit; + } + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + } + } + } +send_mapping_info_unlock: + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +send_mapping_info_exit: + if (ret) { + pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); + if (skb) + dev_kfree_skb(skb); + return ret; + } + send_nlmsg_done(skb, nl_client, iwpm_pid); + return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); +} + +int iwpm_mapinfo_available(void) +{ + unsigned long flags; + int full_bucket = 0, i = 0; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + if (!hlist_empty(&iwpm_hash_bucket[i])) { + full_bucket = 1; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return full_bucket; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h new file mode 100644 index 0000000..9777c86 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWPM_UTIL_H +#define _IWPM_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define IWPM_NL_RETRANS 3 +#define IWPM_NL_TIMEOUT (10*HZ) +#define IWPM_MAPINFO_SKB_COUNT 20 + +#define IWPM_PID_UNDEFINED -1 +#define IWPM_PID_UNAVAILABLE -2 + +struct iwpm_nlmsg_request { + struct list_head inprocess_list; + __u32 nlmsg_seq; + void *req_buffer; + u8 nl_client; + u8 request_done; + u16 err_code; + wait_queue_head_t waitq; + struct kref kref; +}; + +struct iwpm_mapping_info { + struct hlist_node hlist_node; + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + u8 nl_client; +}; + +struct iwpm_admin_data { + atomic_t refcount; + atomic_t nlmsg_seq; + int client_list[RDMA_NL_NUM_CLIENTS]; + int reg_list[RDMA_NL_NUM_CLIENTS]; +}; + +/** + * iwpm_get_nlmsg_request - Allocate and initialize netlink message request + * @nlmsg_seq: Sequence number of the netlink message + * @nl_client: The index of the netlink client + * @gfp: Indicates how the memory for the request should be allocated + * + * Returns the newly allocated netlink request object if successful, + * otherwise returns NULL + */ +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp); + +/** + * iwpm_free_nlmsg_request - Deallocate netlink message request + * @kref: Holds reference of netlink message request + */ +void iwpm_free_nlmsg_request(struct kref *kref); + +/** + * iwpm_find_nlmsg_request - Find netlink message request in the request list + * @echo_seq: Sequence number of the netlink request to find + * + * Returns the found netlink message request, + * if not found, returns NULL + */ +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq); + +/** + * iwpm_wait_complete_req - Block while servicing the netlink request + * @nlmsg_request: Netlink message request to service + * + * Wakes up, after the request is completed or expired + * Returns 0 if the request is complete without error + */ +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request); + +/** + * iwpm_get_nlmsg_seq - Get the sequence number for a netlink + * message to send to the port mapper + * + * Returns the sequence number for the netlink message. + */ +int iwpm_get_nlmsg_seq(void); + +/** + * iwpm_valid_client - Check if the port mapper client is valid + * @nl_client: The index of the netlink client + * + * Valid clients need to call iwpm_init() before using + * the port mapper + */ +int iwpm_valid_client(u8 nl_client); + +/** + * iwpm_set_valid - Set the port mapper client to valid or not + * @nl_client: The index of the netlink client + * @valid: 1 if valid or 0 if invalid + */ +void iwpm_set_valid(u8 nl_client, int valid); + +/** + * iwpm_registered_client - Check if the port mapper client is registered + * @nl_client: The index of the netlink client + * + * Call iwpm_register_pid() to register a client + */ +int iwpm_registered_client(u8 nl_client); + +/** + * iwpm_set_registered - Set the port mapper client to registered or not + * @nl_client: The index of the netlink client + * @reg: 1 if registered or 0 if not + */ +void iwpm_set_registered(u8 nl_client, int reg); + +/** + * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of + * a client to the user space port mapper + * @nl_client: The index of the netlink client + * @iwpm_pid: The pid of the user space port mapper + * + * If successful, returns the number of sent mapping info records + */ +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid); + +/** + * iwpm_mapinfo_available - Check if any mapping info records is available + * in the hash table + * + * Returns 1 if mapping information is available, otherwise returns 0 + */ +int iwpm_mapinfo_available(void); + +/** + * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * + * Returns 0 if they are holding the same ip/tcp address info, + * otherwise returns 1 + */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr); + +/** + * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes + * @nltb: Holds address of each netlink message attributes + * @nla_count: Number of netlink message attributes + * + * Returns error if any of the nla_count attributes is NULL + */ +static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[], + int nla_count) +{ + int i; + for (i = 1; i < nla_count; i++) { + if (!nltb[i]) + return -EINVAL; + } + return 0; +} + +/** + * iwpm_create_nlmsg - Allocate skb and form a netlink message + * @nl_op: Netlink message opcode + * @nlh: Holds address of the netlink message header in skb + * @nl_client: The index of the netlink client + * + * Returns the newly allcated skb, or NULL if the tailroom of the skb + * is insufficient to store the message header and payload + */ +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client); + +/** + * iwpm_parse_nlmsg - Validate and parse the received netlink message + * @cb: Netlink callback structure + * @policy_max: Maximum attribute type to be expected + * @nlmsg_policy: Validation policy + * @nltb: Array to store policy_max parsed elements + * @msg_type: Type of netlink message + * + * Returns 0 on success or a negative error code + */ +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type); + +/** + * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port + * @sockaddr: Socket address to print + * @msg: Message to print + */ +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); +#endif diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index a1e9cba..23dd5a5 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -103,13 +103,13 @@ int ibnl_remove_client(int index) EXPORT_SYMBOL(ibnl_remove_client); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, - int len, int client, int op) + int len, int client, int op, int flags) { unsigned char *prev_tail; prev_tail = skb_tail_pointer(skb); *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), - len, NLM_F_MULTI); + len, flags); if (!*nlh) goto out_nlmsg_trim; (*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail; @@ -172,6 +172,20 @@ static void ibnl_rcv(struct sk_buff *skb) mutex_unlock(&ibnl_mutex); } +int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid) +{ + return nlmsg_unicast(nls, skb, pid); +} +EXPORT_SYMBOL(ibnl_unicast); + +int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, + unsigned int group, gfp_t flags) +{ + return nlmsg_multicast(nls, skb, 0, group, flags); +} +EXPORT_SYMBOL(ibnl_multicast); + int __init ibnl_init(void) { struct netlink_kernel_cfg cfg = { diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h new file mode 100644 index 0000000..928b277 --- /dev/null +++ b/include/rdma/iw_portmap.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IW_PORTMAP_H +#define _IW_PORTMAP_H + +#define IWPM_ULIBNAME_SIZE 32 +#define IWPM_DEVNAME_SIZE 32 +#define IWPM_IFNAME_SIZE 16 +#define IWPM_IPADDR_SIZE 16 + +enum { + IWPM_INVALID_NLMSG_ERR = 10, + IWPM_CREATE_MAPPING_ERR, + IWPM_DUPLICATE_MAPPING_ERR, + IWPM_UNKNOWN_MAPPING_ERR, + IWPM_CLIENT_DEV_INFO_ERR, + IWPM_USER_LIB_INFO_ERR, + IWPM_REMOTE_QUERY_REJECT +}; + +struct iwpm_dev_data { + char dev_name[IWPM_DEVNAME_SIZE]; + char if_name[IWPM_IFNAME_SIZE]; +}; + +struct iwpm_sa_data { + struct sockaddr_storage loc_addr; + struct sockaddr_storage mapped_loc_addr; + struct sockaddr_storage rem_addr; + struct sockaddr_storage mapped_rem_addr; +}; + +/** + * iwpm_init - Allocate resources for the iwarp port mapper + * + * Should be called when network interface goes up. + */ +int iwpm_init(u8); + +/** + * iwpm_exit - Deallocate resources for the iwarp port mapper + * + * Should be called when network interface goes down. + */ +int iwpm_exit(u8); + +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ +int iwpm_valid_pid(void); + +/** + * iwpm_register_pid - Send a netlink query to userspace + * to get the iwarp port mapper pid + * @pm_msg: Contains driver info to send to the userspace port mapper + * @nl_client: The index of the netlink client + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client); + +/** + * iwpm_add_mapping - Send a netlink add mapping request to + * the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * If the request is successful, the pm_msg stores + * the port mapper response (mapped address info) + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); + +/** + * iwpm_add_and_query_mapping - Send a netlink add and query mapping request + * to the userspace port mapper + * @pm_msg: Contains the local and remote ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * If the request is successful, the pm_msg stores the + * port mapper response (mapped local and remote address info) + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); + +/** + * iwpm_remove_mapping - Send a netlink remove mapping request + * to the userspace port mapper + * + * @local_addr: Local ip/tcp address to remove + * @nl_client: The index of the netlink client + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client); + +/** + * iwpm_register_pid_cb - Process the port mapper response to + * iwpm_register_pid query + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * If successful, the function receives the userspace port mapper pid + * which is used in future communication with the port mapper + */ +int iwpm_register_pid_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_add_mapping_cb - Process the port mapper response to + * iwpm_add_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_add_mapping_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_add_and_query_mapping_cb - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_mapping_error_cb - Process port mapper notification for error + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_mapping_error_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_mapping_info_cb - Process a notification that the userspace + * port mapper daemon is started + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send all the local mapping + * info records to the userspace port mapper + */ +int iwpm_mapping_info_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_ack_mapping_info_cb - Process the port mapper ack for + * the provided local mapping info records + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address + * info in a hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * @nl_client: The index of the netlink client + */ +int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, + struct sockaddr_storage *mapped_addr, u8 nl_client); + +/** + * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address + * info from the hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * + * Returns err code if mapping info is not found in the hash table, + * otherwise returns 0 + */ +int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr, + struct sockaddr_storage *mapped_addr); + +#endif /* _IW_PORTMAP_H */ diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index e38de79..0790882 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -43,7 +43,7 @@ int ibnl_remove_client(int index); * Returns the allocated buffer on success and NULL on failure. */ void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, - int len, int client, int op); + int len, int client, int op, int flags); /** * Put a new attribute in a supplied skb. * @skb: The netlink skb. @@ -56,4 +56,25 @@ void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int len, void *data, int type); +/** + * Send the supplied skb to a specific userspace PID. + * @skb: The netlink skb + * @nlh: Header of the netlink message to send + * @pid: Userspace netlink process ID + * Returns 0 on success or a negative error code. + */ +int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid); + +/** + * Send the supplied skb to a netlink group. + * @skb: The netlink skb + * @nlh: Header of the netlink message to send + * @group: Netlink group ID + * @flags: allocation flags + * Returns 0 on success or a negative error code. + */ +int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, + unsigned int group, gfp_t flags); + #endif /* _RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 8297285..de69170 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -4,7 +4,16 @@ #include enum { - RDMA_NL_RDMA_CM = 1 + RDMA_NL_RDMA_CM = 1, + RDMA_NL_NES, + RDMA_NL_C4IW, + RDMA_NL_NUM_CLIENTS +}; + +enum { + RDMA_NL_GROUP_CM = 1, + RDMA_NL_GROUP_IWPM, + RDMA_NL_NUM_GROUPS }; #define RDMA_NL_GET_CLIENT(type) ((type & (((1 << 6) - 1) << 10)) >> 10) @@ -22,6 +31,18 @@ enum { RDMA_NL_RDMA_CM_NUM_ATTR, }; +/* iwarp port mapper op-codes */ +enum { + RDMA_NL_IWPM_REG_PID = 0, + RDMA_NL_IWPM_ADD_MAPPING, + RDMA_NL_IWPM_QUERY_MAPPING, + RDMA_NL_IWPM_REMOVE_MAPPING, + RDMA_NL_IWPM_HANDLE_ERR, + RDMA_NL_IWPM_MAPINFO, + RDMA_NL_IWPM_MAPINFO_NUM, + RDMA_NL_IWPM_NUM_OPS +}; + struct rdma_cm_id_stats { __u32 qp_num; __u32 bound_dev_if; @@ -33,5 +54,78 @@ struct rdma_cm_id_stats { __u8 qp_type; }; +enum { + IWPM_NLA_REG_PID_UNSPEC = 0, + IWPM_NLA_REG_PID_SEQ, + IWPM_NLA_REG_IF_NAME, + IWPM_NLA_REG_IBDEV_NAME, + IWPM_NLA_REG_ULIB_NAME, + IWPM_NLA_REG_PID_MAX +}; + +enum { + IWPM_NLA_RREG_PID_UNSPEC = 0, + IWPM_NLA_RREG_PID_SEQ, + IWPM_NLA_RREG_IBDEV_NAME, + IWPM_NLA_RREG_ULIB_NAME, + IWPM_NLA_RREG_ULIB_VER, + IWPM_NLA_RREG_PID_ERR, + IWPM_NLA_RREG_PID_MAX + +}; + +enum { + IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_MANAGE_MAPPING_SEQ, + IWPM_NLA_MANAGE_ADDR, + IWPM_NLA_MANAGE_MAPPED_LOC_ADDR, + IWPM_NLA_RMANAGE_MAPPING_ERR, + IWPM_NLA_RMANAGE_MAPPING_MAX +}; + +#define IWPM_NLA_MANAGE_MAPPING_MAX 3 +#define IWPM_NLA_QUERY_MAPPING_MAX 4 +#define IWPM_NLA_MAPINFO_SEND_MAX 3 + +enum { + IWPM_NLA_QUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_QUERY_MAPPING_SEQ, + IWPM_NLA_QUERY_LOCAL_ADDR, + IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, + IWPM_NLA_RQUERY_MAPPED_REM_ADDR, + IWPM_NLA_RQUERY_MAPPING_ERR, + IWPM_NLA_RQUERY_MAPPING_MAX +}; + +enum { + IWPM_NLA_MAPINFO_REQ_UNSPEC = 0, + IWPM_NLA_MAPINFO_ULIB_NAME, + IWPM_NLA_MAPINFO_ULIB_VER, + IWPM_NLA_MAPINFO_REQ_MAX +}; + +enum { + IWPM_NLA_MAPINFO_UNSPEC = 0, + IWPM_NLA_MAPINFO_LOCAL_ADDR, + IWPM_NLA_MAPINFO_MAPPED_ADDR, + IWPM_NLA_MAPINFO_MAX +}; + +enum { + IWPM_NLA_MAPINFO_NUM_UNSPEC = 0, + IWPM_NLA_MAPINFO_SEQ, + IWPM_NLA_MAPINFO_SEND_NUM, + IWPM_NLA_MAPINFO_ACK_NUM, + IWPM_NLA_MAPINFO_NUM_MAX +}; + +enum { + IWPM_NLA_ERR_UNSPEC = 0, + IWPM_NLA_ERR_SEQ, + IWPM_NLA_ERR_CODE, + IWPM_NLA_ERR_MAX +}; + #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v0.10.2 From 5647263cb136a2795b67b9ce46f2debb653e3373 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Wed, 26 Mar 2014 17:07:57 -0500 Subject: RDMA/nes: Add support for iWARP Port Mapper user space service Signed-off-by: Tatyana Nikolova Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 353c7b0..3b2a6dc 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -68,7 +68,6 @@ MODULE_VERSION(DRV_VERSION); int max_mtu = 9000; int interrupt_mod_interval = 0; - /* Interoperability */ int mpa_version = 1; module_param(mpa_version, int, 0644); @@ -112,6 +111,16 @@ static struct pci_device_id nes_pci_table[] = { MODULE_DEVICE_TABLE(pci, nes_pci_table); +/* registered nes netlink callbacks */ +static struct ibnl_client_cbs nes_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); static int nes_net_event(struct notifier_block *, unsigned long, void *); static int nes_notifiers_registered; @@ -672,6 +681,17 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) } nes_notifiers_registered++; + if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table)) + printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n", + __func__, __LINE__); + + ret = iwpm_init(RDMA_NL_NES); + if (ret) { + printk(KERN_ERR PFX "%s: port mapper initialization failed\n", + pci_name(pcidev)); + goto bail7; + } + INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status); /* Initialize network devices */ @@ -710,6 +730,7 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", nesdev->netdev_count, nesdev->nesadapter->netdev_count); + ibnl_remove_client(RDMA_NL_NES); nes_notifiers_registered--; if (nes_notifiers_registered == 0) { @@ -773,6 +794,8 @@ static void nes_remove(struct pci_dev *pcidev) nesdev->nesadapter->netdev_count--; } } + ibnl_remove_client(RDMA_NL_NES); + iwpm_exit(RDMA_NL_NES); nes_notifiers_registered--; if (nes_notifiers_registered == 0) { diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 33cc589..bd9d132 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -51,6 +51,8 @@ #include #include #include +#include +#include #define NES_SEND_FIRST_WRITE @@ -130,6 +132,7 @@ #define NES_DBG_IW_TX 0x00040000 #define NES_DBG_SHUTDOWN 0x00080000 #define NES_DBG_PAU 0x00100000 +#define NES_DBG_NLMSG 0x00200000 #define NES_DBG_RSVD1 0x10000000 #define NES_DBG_RSVD2 0x20000000 #define NES_DBG_RSVD3 0x40000000 diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index dfa9df4..6f09a72 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -59,6 +59,7 @@ #include #include #include +#include #include "nes.h" @@ -166,7 +167,6 @@ int nes_rem_ref_cm_node(struct nes_cm_node *cm_node) { return rem_ref_cm_node(cm_node->cm_core, cm_node); } - /** * create_event */ @@ -482,11 +482,11 @@ static void form_cm_frame(struct sk_buff *skb, iph->ttl = 0x40; iph->protocol = 0x06; /* IPPROTO_TCP */ - iph->saddr = htonl(cm_node->loc_addr); - iph->daddr = htonl(cm_node->rem_addr); + iph->saddr = htonl(cm_node->mapped_loc_addr); + iph->daddr = htonl(cm_node->mapped_rem_addr); - tcph->source = htons(cm_node->loc_port); - tcph->dest = htons(cm_node->rem_port); + tcph->source = htons(cm_node->mapped_loc_port); + tcph->dest = htons(cm_node->mapped_rem_port); tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); if (flags & SET_ACK) { @@ -525,6 +525,100 @@ static void form_cm_frame(struct sk_buff *skb, cm_packets_created++; } +/* + * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct + */ +static void nes_create_sockaddr(__be32 ip_addr, __be16 port, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr; + nes_sockaddr->sin_family = AF_INET; + memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32)); + nes_sockaddr->sin_port = port; +} + +/* + * nes_create_mapinfo - Create a mapinfo object in the port mapper data base + */ +static int nes_create_mapinfo(struct nes_cm_info *cm_info) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &local_sockaddr); + nes_create_sockaddr(htonl(cm_info->mapped_loc_addr), + htons(cm_info->mapped_loc_port), &mapped_sockaddr); + + return iwpm_create_mapinfo(&local_sockaddr, + &mapped_sockaddr, RDMA_NL_NES); +} + +/* + * nes_remove_mapinfo - Remove a mapinfo object from the port mapper data base + * and send a remove mapping op message to + * the userspace port mapper + */ +static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port, + u32 mapped_loc_addr, u16 mapped_loc_port) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr); + nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port), + &mapped_sockaddr); + + iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr); + return iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES); +} + +/* + * nes_form_pm_msg - Form a port mapper message with mapping info + */ +static void nes_form_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &pm_msg->loc_addr); + nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port), + &pm_msg->rem_addr); +} + +/* + * nes_form_reg_msg - Form a port mapper message with dev info + */ +static void nes_form_reg_msg(struct nes_vnic *nesvnic, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name, + IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE); +} + +/* + * nes_record_pm_msg - Save the received mapping info + */ +static void nes_record_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + struct sockaddr_in *mapped_loc_addr = + (struct sockaddr_in *)&pm_msg->mapped_loc_addr; + struct sockaddr_in *mapped_rem_addr = + (struct sockaddr_in *)&pm_msg->mapped_rem_addr; + + if (mapped_loc_addr->sin_family == AF_INET) { + cm_info->mapped_loc_addr = + ntohl(mapped_loc_addr->sin_addr.s_addr); + cm_info->mapped_loc_port = ntohs(mapped_loc_addr->sin_port); + } + if (mapped_rem_addr->sin_family == AF_INET) { + cm_info->mapped_rem_addr = + ntohl(mapped_rem_addr->sin_addr.s_addr); + cm_info->mapped_rem_port = ntohs(mapped_rem_addr->sin_port); + } +} + /** * print_core - dump a cm core */ @@ -1147,8 +1241,11 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, loc_addr, loc_port, cm_node->rem_addr, cm_node->rem_port, rem_addr, rem_port); - if ((cm_node->loc_addr == loc_addr) && (cm_node->loc_port == loc_port) && - (cm_node->rem_addr == rem_addr) && (cm_node->rem_port == rem_port)) { + if ((cm_node->mapped_loc_addr == loc_addr) && + (cm_node->mapped_loc_port == loc_port) && + (cm_node->mapped_rem_addr == rem_addr) && + (cm_node->mapped_rem_port == rem_port)) { + add_ref_cm_node(cm_node); spin_unlock_irqrestore(&cm_core->ht_lock, flags); return cm_node; @@ -1165,18 +1262,28 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, * find_listener - find a cm node listening on this addr-port pair */ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, - nes_addr_t dst_addr, u16 dst_port, enum nes_cm_listener_state listener_state) + nes_addr_t dst_addr, u16 dst_port, + enum nes_cm_listener_state listener_state, int local) { unsigned long flags; struct nes_cm_listener *listen_node; + nes_addr_t listen_addr; + u16 listen_port; /* walk list and find cm_node associated with this session ID */ spin_lock_irqsave(&cm_core->listen_list_lock, flags); list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { + if (local) { + listen_addr = listen_node->loc_addr; + listen_port = listen_node->loc_port; + } else { + listen_addr = listen_node->mapped_loc_addr; + listen_port = listen_node->mapped_loc_port; + } /* compare node pair, return node handle if a match */ - if (((listen_node->loc_addr == dst_addr) || - listen_node->loc_addr == 0x00000000) && - (listen_node->loc_port == dst_port) && + if (((listen_addr == dst_addr) || + listen_addr == 0x00000000) && + (listen_port == dst_port) && (listener_state & listen_node->listener_state)) { atomic_inc(&listen_node->ref_count); spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); @@ -1189,7 +1296,6 @@ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, return NULL; } - /** * add_hte_node - add a cm node to the hash table */ @@ -1310,9 +1416,20 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - if (listener->nesvnic) - nes_manage_apbvt(listener->nesvnic, listener->loc_port, - PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); + if (listener->nesvnic) { + nes_manage_apbvt(listener->nesvnic, + listener->mapped_loc_port, + PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_remove_mapinfo(listener->loc_addr, + listener->loc_port, + listener->mapped_loc_addr, + listener->mapped_loc_port); + nes_debug(NES_DBG_NLMSG, + "Delete APBVT mapped_loc_port = %04X\n", + listener->mapped_loc_port); + } nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); @@ -1454,6 +1571,11 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loc_port = cm_info->loc_port; cm_node->rem_port = cm_info->rem_port; + cm_node->mapped_loc_addr = cm_info->mapped_loc_addr; + cm_node->mapped_rem_addr = cm_info->mapped_rem_addr; + cm_node->mapped_loc_port = cm_info->mapped_loc_port; + cm_node->mapped_rem_port = cm_info->mapped_rem_port; + cm_node->mpa_frame_rev = mpa_version; cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; cm_node->mpav2_ird_ord = 0; @@ -1500,8 +1622,10 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loopbackpartner = NULL; /* get the mac addr for the remote node */ - oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); - arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr, oldarpindex); + oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr, + NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, + cm_node->mapped_rem_addr, oldarpindex); if (arpindex < 0) { kfree(cm_node); return NULL; @@ -1563,11 +1687,14 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core, mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); } else { if (cm_node->apbvt_set && cm_node->nesvnic) { - nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, - PCI_FUNC( - cm_node->nesvnic->nesdev->pcidev->devfn), + nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port, + PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); } + nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n", + cm_node->mapped_loc_port); + nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port, + cm_node->mapped_loc_addr, cm_node->mapped_loc_port); } atomic_dec(&cm_core->node_cnt); @@ -2235,17 +2362,21 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, * mini_cm_listen - create a listen node with params */ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) { struct nes_cm_listener *listener; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; unsigned long flags; + int iwpm_err = 0; nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", cm_info->loc_addr, cm_info->loc_port); /* cannot have multiple matching listeners */ - listener = find_listener(cm_core, htonl(cm_info->loc_addr), - htons(cm_info->loc_port), NES_CM_LISTENER_EITHER_STATE); + listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, + NES_CM_LISTENER_EITHER_STATE, 1); + if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { /* find automatically incs ref count ??? */ atomic_dec(&listener->ref_count); @@ -2254,6 +2385,22 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, } if (!listener) { + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(cm_info, &pm_msg); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(cm_info, &pm_msg); + } + /* create a CM listen node (1/2 node to compare incoming traffic to) */ listener = kzalloc(sizeof(*listener), GFP_ATOMIC); if (!listener) { @@ -2261,8 +2408,10 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, return NULL; } - listener->loc_addr = htonl(cm_info->loc_addr); - listener->loc_port = htons(cm_info->loc_port); + listener->loc_addr = cm_info->loc_addr; + listener->loc_port = cm_info->loc_port; + listener->mapped_loc_addr = cm_info->mapped_loc_addr; + listener->mapped_loc_port = cm_info->mapped_loc_port; listener->reused_node = 0; atomic_set(&listener->ref_count, 1); @@ -2324,14 +2473,18 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (cm_info->loc_addr == cm_info->rem_addr) { loopbackremotelistener = find_listener(cm_core, - ntohl(nesvnic->local_ipaddr), cm_node->rem_port, - NES_CM_LISTENER_ACTIVE_STATE); + cm_node->mapped_loc_addr, cm_node->mapped_rem_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); if (loopbackremotelistener == NULL) { create_event(cm_node, NES_CM_EVENT_ABORTED); } else { loopback_cm_info = *cm_info; loopback_cm_info.loc_port = cm_info->rem_port; loopback_cm_info.rem_port = cm_info->loc_port; + loopback_cm_info.mapped_loc_port = + cm_info->mapped_rem_port; + loopback_cm_info.mapped_rem_port = + cm_info->mapped_loc_port; loopback_cm_info.cm_id = loopbackremotelistener->cm_id; loopbackremotenode = make_cm_node(cm_core, nesvnic, &loopback_cm_info, loopbackremotelistener); @@ -2560,6 +2713,12 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, nfo.rem_addr = ntohl(iph->saddr); nfo.rem_port = ntohs(tcph->source); + /* If port mapper is available these should be mapped address info */ + nfo.mapped_loc_addr = ntohl(iph->daddr); + nfo.mapped_loc_port = ntohs(tcph->dest); + nfo.mapped_rem_addr = ntohl(iph->saddr); + nfo.mapped_rem_port = ntohs(tcph->source); + tmp_daddr = cpu_to_be32(iph->daddr); tmp_saddr = cpu_to_be32(iph->saddr); @@ -2568,8 +2727,8 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, do { cm_node = find_node(cm_core, - nfo.rem_port, nfo.rem_addr, - nfo.loc_port, nfo.loc_addr); + nfo.mapped_rem_port, nfo.mapped_rem_addr, + nfo.mapped_loc_port, nfo.mapped_loc_addr); if (!cm_node) { /* Only type of packet accepted are for */ @@ -2578,9 +2737,9 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, skb_handled = 0; break; } - listener = find_listener(cm_core, nfo.loc_addr, - nfo.loc_port, - NES_CM_LISTENER_ACTIVE_STATE); + listener = find_listener(cm_core, nfo.mapped_loc_addr, + nfo.mapped_loc_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); if (!listener) { nfo.cm_id = NULL; nfo.conn_type = 0; @@ -3184,10 +3343,12 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nes_cm_init_tsa_conn(nesqp, cm_node); - nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(laddr->sin_port)); - nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(raddr->sin_port)); + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->mapped_loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->mapped_rem_port); - nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(raddr->sin_addr.s_addr)); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); nesqp->nesqp_context->misc2 |= cpu_to_le32( (u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3211,9 +3372,9 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) memset(&nes_quad, 0, sizeof(nes_quad)); nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = raddr->sin_addr.s_addr; - nes_quad.TcpPorts[0] = raddr->sin_port; - nes_quad.TcpPorts[1] = laddr->sin_port; + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); /* Produce hash key */ crc_value = get_crc_value(&nes_quad); @@ -3315,6 +3476,9 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) int apbvt_set = 0; struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; if (cm_id->remote_addr.ss_family != AF_INET) return -ENOSYS; @@ -3352,20 +3516,44 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nes_debug(NES_DBG_CM, "mpa private data len =%u\n", conn_param->private_data_len); + /* set up the connection params for the node */ + cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr); + cm_info.rem_port = ntohs(raddr->sin_port); + cm_info.cm_id = cm_id; + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + /* No port mapper available, go with the specified peer information */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; + cm_info.mapped_rem_addr = cm_info.rem_addr; + cm_info.mapped_rem_port = cm_info.rem_port; + + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(&cm_info, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(&cm_info, &pm_msg); + } + if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { - nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), - PCI_FUNC(nesdev->pcidev->devfn), - NES_MANAGE_APBVT_ADD); + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, + PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); apbvt_set = 1; } - /* set up the connection params for the node */ - cm_info.loc_addr = htonl(laddr->sin_addr.s_addr); - cm_info.loc_port = htons(laddr->sin_port); - cm_info.rem_addr = htonl(raddr->sin_addr.s_addr); - cm_info.rem_port = htons(raddr->sin_port); - cm_info.cm_id = cm_id; - cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; cm_id->add_ref(cm_id); @@ -3375,10 +3563,14 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) &cm_info); if (!cm_node) { if (apbvt_set) - nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); + nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n", + cm_info.mapped_loc_port); + nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port, + cm_info.mapped_loc_addr, cm_info.mapped_loc_port); cm_id->rem_ref(cm_id); return -ENOMEM; } @@ -3424,13 +3616,16 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) nesvnic->local_ipaddr, laddr->sin_addr.s_addr); /* setup listen params in our api call struct */ - cm_info.loc_addr = nesvnic->local_ipaddr; - cm_info.loc_port = laddr->sin_port; + cm_info.loc_addr = ntohl(nesvnic->local_ipaddr); + cm_info.loc_port = ntohs(laddr->sin_port); cm_info.backlog = backlog; cm_info.cm_id = cm_id; cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + /* No port mapper available, go with the specified info */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); if (!cm_node) { @@ -3442,7 +3637,10 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = cm_node; if (!cm_node->reused_node) { - err = nes_manage_apbvt(nesvnic, ntohs(laddr->sin_port), + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; + + err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port, PCI_FUNC(nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); if (err) { @@ -3567,9 +3765,11 @@ static void cm_event_connected(struct nes_cm_event *event) nes_cm_init_tsa_conn(nesqp, cm_node); /* set the QP tsa context */ - nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(laddr->sin_port)); - nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(raddr->sin_port)); - nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(raddr->sin_addr.s_addr)); + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->mapped_loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->mapped_rem_port); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); nesqp->nesqp_context->misc2 |= cpu_to_le32( (u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3599,9 +3799,9 @@ static void cm_event_connected(struct nes_cm_event *event) nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = raddr->sin_addr.s_addr; - nes_quad.TcpPorts[0] = raddr->sin_port; - nes_quad.TcpPorts[1] = laddr->sin_port; + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); /* Produce hash key */ crc_value = get_crc_value(&nes_quad); @@ -3629,7 +3829,7 @@ static void cm_event_connected(struct nes_cm_event *event) cm_event.ird = cm_node->ird_size; cm_event.ord = cm_node->ord_size; - cm_event_laddr->sin_addr.s_addr = event->cm_info.rem_addr; + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); ret = cm_id->event_handler(cm_id, &cm_event); nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 522c99c..f522cf6 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -293,8 +293,8 @@ struct nes_cm_listener { struct list_head list; struct nes_cm_core *cm_core; u8 loc_mac[ETH_ALEN]; - nes_addr_t loc_addr; - u16 loc_port; + nes_addr_t loc_addr, mapped_loc_addr; + u16 loc_port, mapped_loc_port; struct iw_cm_id *cm_id; enum nes_cm_conn_type conn_type; atomic_t ref_count; @@ -308,7 +308,9 @@ struct nes_cm_listener { /* per connection node and node state information */ struct nes_cm_node { nes_addr_t loc_addr, rem_addr; + nes_addr_t mapped_loc_addr, mapped_rem_addr; u16 loc_port, rem_port; + u16 mapped_loc_port, mapped_rem_port; u8 loc_mac[ETH_ALEN]; u8 rem_mac[ETH_ALEN]; @@ -364,6 +366,10 @@ struct nes_cm_info { u16 rem_port; nes_addr_t loc_addr; nes_addr_t rem_addr; + u16 mapped_loc_port; + u16 mapped_rem_port; + nes_addr_t mapped_loc_addr; + nes_addr_t mapped_rem_addr; enum nes_cm_conn_type conn_type; int backlog; -- cgit v0.10.2 From 9eccfe109b276fddf2908d1a70f7f4449b92f08f Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 26 Mar 2014 17:08:09 -0500 Subject: RDMA/cxgb4: Add support for iWARP Port Mapper user space service Based on original work by Vipul Pandya. Signed-off-by: Steve Wise [ Fix htons -> ntohs to make sparse happy. - Roland ] Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 1f863a9..7de6c9f 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -294,6 +294,12 @@ void _c4iw_free_ep(struct kref *kref) dst_release(ep->dst); cxgb4_l2t_release(ep->l2t); } + if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) { + print_addr(&ep->com, __func__, "remove_mapinfo/mapping"); + iwpm_remove_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr); + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + } kfree(ep); } @@ -528,6 +534,38 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } +/* + * c4iw_form_pm_msg - Form a port mapper message with mapping info + */ +static void c4iw_form_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&pm_msg->loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&pm_msg->rem_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); +} + +/* + * c4iw_form_reg_msg - Form a port mapper message with dev info + */ +static void c4iw_form_reg_msg(struct c4iw_dev *dev, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name, + IWPM_IFNAME_SIZE); +} + +static void c4iw_record_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr, + sizeof(ep->com.mapped_remote_addr)); +} + static int send_connect(struct c4iw_ep *ep) { struct cpl_act_open_req *req; @@ -546,10 +584,14 @@ static int send_connect(struct c4iw_ep *ep) int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ? sizeof(struct cpl_act_open_req6) : sizeof(struct cpl_t5_act_open_req6); - struct sockaddr_in *la = (struct sockaddr_in *)&ep->com.local_addr; - struct sockaddr_in *ra = (struct sockaddr_in *)&ep->com.remote_addr; - struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&ep->com.local_addr; - struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr; + struct sockaddr_in *la = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *ra = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; wrlen = (ep->com.remote_addr.ss_family == AF_INET) ? roundup(sizev4, 16) : @@ -1627,10 +1669,10 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) req->le.filter = cpu_to_be32(cxgb4_select_ntuple( ep->com.dev->rdev.lldi.ports[0], ep->l2t)); - sin = (struct sockaddr_in *)&ep->com.local_addr; + sin = (struct sockaddr_in *)&ep->com.mapped_local_addr; req->le.lport = sin->sin_port; req->le.u.ipv4.lip = sin->sin_addr.s_addr; - sin = (struct sockaddr_in *)&ep->com.remote_addr; + sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr; req->le.pport = sin->sin_port; req->le.u.ipv4.pip = sin->sin_addr.s_addr; req->tcb.t_state_to_astid = @@ -1870,10 +1912,10 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct sockaddr_in6 *ra6; ep = lookup_atid(t, atid); - la = (struct sockaddr_in *)&ep->com.local_addr; - ra = (struct sockaddr_in *)&ep->com.remote_addr; - la6 = (struct sockaddr_in6 *)&ep->com.local_addr; - ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr; + la = (struct sockaddr_in *)&ep->com.mapped_local_addr; + ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr; PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid, status, status2errno(status)); @@ -2730,13 +2772,15 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_ep *ep; int err = 0; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; - struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *)&cm_id->local_addr; - struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *) - &cm_id->remote_addr; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in6 *laddr6; + struct sockaddr_in6 *raddr6; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; __u8 *ra; int iptype; + int iwpm_err = 0; if ((conn_param->ord > c4iw_max_read_depth) || (conn_param->ird > c4iw_max_read_depth)) { @@ -2767,7 +2811,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!ep->com.qp) { PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); err = -EINVAL; - goto fail2; + goto fail1; } ref_qp(ep); PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, @@ -2780,10 +2824,50 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (ep->atid == -1) { printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); err = -ENOMEM; - goto fail2; + goto fail1; } insert_handle(dev, &dev->atid_idr, ep, ep->atid); + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + sizeof(ep->com.remote_addr)); + + /* No port mapper available, go with the specified peer information */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr, + sizeof(ep->com.mapped_remote_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + c4iw_form_pm_msg(ep, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + c4iw_record_pm_msg(ep, &pm_msg); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + err = -ENOMEM; + goto fail1; + } + print_addr(&ep->com, __func__, "add_query/create_mapinfo"); + set_bit(RELEASE_MAPINFO, &ep->com.flags); + + laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr; + raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + laddr6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr; + if (cm_id->remote_addr.ss_family == AF_INET) { iptype = 4; ra = (__u8 *)&raddr->sin_addr; @@ -2794,7 +2878,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) { err = pick_local_ipaddrs(dev, cm_id); if (err) - goto fail2; + goto fail1; } /* find a route */ @@ -2814,7 +2898,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) { err = pick_local_ip6addrs(dev, cm_id); if (err) - goto fail2; + goto fail1; } /* find a route */ @@ -2830,13 +2914,13 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!ep->dst) { printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); err = -EHOSTUNREACH; - goto fail3; + goto fail2; } err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true); if (err) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); - goto fail4; + goto fail3; } PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", @@ -2845,10 +2929,6 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) state_set(&ep->com, CONNECTING); ep->tos = 0; - memcpy(&ep->com.local_addr, &cm_id->local_addr, - sizeof(ep->com.local_addr)); - memcpy(&ep->com.remote_addr, &cm_id->remote_addr, - sizeof(ep->com.remote_addr)); /* send connect request to rnic */ err = send_connect(ep); @@ -2856,12 +2936,12 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) goto out; cxgb4_l2t_release(ep->l2t); -fail4: - dst_release(ep->dst); fail3: + dst_release(ep->dst); +fail2: remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); -fail2: +fail1: cm_id->rem_ref(cm_id); c4iw_put_ep(&ep->com); out: @@ -2871,7 +2951,8 @@ out: static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) { int err; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ep->com.local_addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; c4iw_init_wr_wait(&ep->com.wr_wait); err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], @@ -2892,7 +2973,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) { int err; - struct sockaddr_in *sin = (struct sockaddr_in *)&ep->com.local_addr; + struct sockaddr_in *sin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; if (dev->rdev.lldi.enable_fw_ofld_conn) { do { @@ -2927,6 +3009,9 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) int err = 0; struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_listen_ep *ep; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; might_sleep(); @@ -2961,6 +3046,37 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) goto fail2; } insert_handle(dev, &dev->stid_idr, ep, ep->stid); + + /* No port mapper available, go with the specified info */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + memcpy(&pm_msg.loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + memcpy(&ep->com.mapped_local_addr, + &pm_msg.mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + err = -ENOMEM; + goto fail3; + } + print_addr(&ep->com, __func__, "add_mapping/create_mapinfo"); + + set_bit(RELEASE_MAPINFO, &ep->com.flags); state_set(&ep->com, LISTEN); if (ep->com.local_addr.ss_family == AF_INET) err = create_server4(dev, ep); @@ -2970,6 +3086,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = ep; goto out; } + +fail3: cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); fail2: diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index f4fa50a..bd749e7 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -77,6 +77,16 @@ struct c4iw_debugfs_data { int pos; }; +/* registered cxgb4 netlink callbacks */ +static struct ibnl_client_cbs c4iw_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + static int count_idrs(int id, void *p, void *data) { int *countp = data; @@ -113,35 +123,49 @@ static int dump_qp(int id, void *p, void *data) &qp->ep->com.local_addr; struct sockaddr_in *rsin = (struct sockaddr_in *) &qp->ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &qp->ep->com.mapped_remote_addr; cc = snprintf(qpd->buf + qpd->pos, space, "rc qp sq id %u rq id %u state %u " "onchip %u ep tid %u state %u " - "%pI4:%u->%pI4:%u\n", + "%pI4:%u/%u->%pI4:%u/%u\n", qp->wq.sq.qid, qp->wq.rq.qid, (int)qp->attr.state, qp->wq.sq.flags & T4_SQ_ONCHIP, qp->ep->hwtid, (int)qp->ep->com.state, &lsin->sin_addr, ntohs(lsin->sin_port), - &rsin->sin_addr, ntohs(rsin->sin_port)); + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) &qp->ep->com.local_addr; struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) &qp->ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_remote_addr; cc = snprintf(qpd->buf + qpd->pos, space, "rc qp sq id %u rq id %u state %u " "onchip %u ep tid %u state %u " - "%pI6:%u->%pI6:%u\n", + "%pI6:%u/%u->%pI6:%u/%u\n", qp->wq.sq.qid, qp->wq.rq.qid, (int)qp->attr.state, qp->wq.sq.flags & T4_SQ_ONCHIP, qp->ep->hwtid, (int)qp->ep->com.state, &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port), &rsin6->sin6_addr, - ntohs(rsin6->sin6_port)); + ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); } } else cc = snprintf(qpd->buf + qpd->pos, space, @@ -386,31 +410,43 @@ static int dump_ep(int id, void *p, void *data) &ep->com.local_addr; struct sockaddr_in *rsin = (struct sockaddr_in *) &ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p qp %p state %d flags 0x%lx " "history 0x%lx hwtid %d atid %d " - "%pI4:%d <-> %pI4:%d\n", + "%pI4:%d/%d <-> %pI4:%d/%d\n", ep, ep->com.cm_id, ep->com.qp, (int)ep->com.state, ep->com.flags, ep->com.history, ep->hwtid, ep->atid, &lsin->sin_addr, ntohs(lsin->sin_port), - &rsin->sin_addr, ntohs(rsin->sin_port)); + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) &ep->com.local_addr; struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) &ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p qp %p state %d flags 0x%lx " "history 0x%lx hwtid %d atid %d " - "%pI6:%d <-> %pI6:%d\n", + "%pI6:%d/%d <-> %pI6:%d/%d\n", ep, ep->com.cm_id, ep->com.qp, (int)ep->com.state, ep->com.flags, ep->com.history, ep->hwtid, ep->atid, &lsin6->sin6_addr, ntohs(lsin6->sin6_port), - &rsin6->sin6_addr, ntohs(rsin6->sin6_port)); + ntohs(mapped_lsin6->sin6_port), + &rsin6->sin6_addr, ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); } if (cc < space) epd->pos += cc; @@ -431,23 +467,29 @@ static int dump_listen_ep(int id, void *p, void *data) if (ep->com.local_addr.ss_family == AF_INET) { struct sockaddr_in *lsin = (struct sockaddr_in *) &ep->com.local_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p state %d flags 0x%lx stid %d " - "backlog %d %pI4:%d\n", + "backlog %d %pI4:%d/%d\n", ep, ep->com.cm_id, (int)ep->com.state, ep->com.flags, ep->stid, ep->backlog, - &lsin->sin_addr, ntohs(lsin->sin_port)); + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) &ep->com.local_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p state %d flags 0x%lx stid %d " - "backlog %d %pI6:%d\n", + "backlog %d %pI6:%d/%d\n", ep, ep->com.cm_id, (int)ep->com.state, ep->com.flags, ep->stid, ep->backlog, - &lsin6->sin6_addr, ntohs(lsin6->sin6_port)); + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port)); } if (cc < space) epd->pos += cc; @@ -687,6 +729,7 @@ static void c4iw_dealloc(struct uld_ctx *ctx) if (ctx->dev->rdev.oc_mw_kva) iounmap(ctx->dev->rdev.oc_mw_kva); ib_dealloc_device(&ctx->dev->ibdev); + iwpm_exit(RDMA_NL_C4IW); ctx->dev = NULL; } @@ -780,6 +823,14 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) c4iw_debugfs_root); setup_debugfs(devp); } + + ret = iwpm_init(RDMA_NL_C4IW); + if (ret) { + pr_err("port mapper initialization failed with %d\n", ret); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(ret); + } + return devp; } @@ -1274,6 +1325,11 @@ static int __init c4iw_init_module(void) printk(KERN_WARNING MOD "could not create debugfs entry, continuing\n"); + if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS, + c4iw_nl_cb_table)) + pr_err("%s[%u]: Failed to add netlink callback\n" + , __func__, __LINE__); + cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info); return 0; @@ -1291,6 +1347,7 @@ static void __exit c4iw_exit_module(void) } mutex_unlock(&dev_mutex); cxgb4_unregister_uld(CXGB4_ULD_RDMA); + ibnl_remove_client(RDMA_NL_C4IW); c4iw_cm_term(); debugfs_remove_recursive(c4iw_debugfs_root); } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 7474b49..6f533fb 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -52,6 +52,8 @@ #include #include +#include +#include #include "cxgb4.h" #include "cxgb4_uld.h" @@ -728,6 +730,7 @@ enum c4iw_ep_flags { CLOSE_SENT = 3, TIMEOUT = 4, QP_REFERENCED = 5, + RELEASE_MAPINFO = 6, }; enum c4iw_ep_history { @@ -764,6 +767,8 @@ struct c4iw_ep_common { struct mutex mutex; struct sockaddr_storage local_addr; struct sockaddr_storage remote_addr; + struct sockaddr_storage mapped_local_addr; + struct sockaddr_storage mapped_remote_addr; struct c4iw_wr_wait wr_wait; unsigned long flags; unsigned long history; @@ -807,6 +812,45 @@ struct c4iw_ep { unsigned int retry_count; }; +static inline void print_addr(struct c4iw_ep_common *epc, const char *func, + const char *msg) +{ + +#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr)) +#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port) +#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr)) +#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port) + + if (c4iw_debug) { + switch (epc->local_addr.ss_family) { + case AF_INET: + PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n", + func, msg, SINA(&epc->local_addr), + SINP(&epc->local_addr), + SINP(&epc->mapped_local_addr), + SINA(&epc->remote_addr), + SINP(&epc->remote_addr), + SINP(&epc->mapped_remote_addr)); + break; + case AF_INET6: + PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n", + func, msg, SIN6A(&epc->local_addr), + SIN6P(&epc->local_addr), + SIN6P(&epc->mapped_local_addr), + SIN6A(&epc->remote_addr), + SIN6P(&epc->remote_addr), + SIN6P(&epc->mapped_remote_addr)); + break; + default: + break; + } + } +#undef SINA +#undef SINP +#undef SIN6A +#undef SIN6P +} + static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) { return cm_id->provider_data; -- cgit v0.10.2