From c8e081a1bf11c5cbac5f2f9f53c040be61d7b29e Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 27 Sep 2010 17:51:04 -0700 Subject: RDMA/cxgb4: Fix warnings about casts to/from pointers of different sizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: drivers/infiniband/hw/cxgb4/qp.c: In function ‘create_qp’: drivers/infiniband/hw/cxgb4/qp.c:147: warning: cast from pointer to integer of different size drivers/infiniband/hw/cxgb4/qp.c: In function ‘rdma_fini’: drivers/infiniband/hw/cxgb4/qp.c:988: warning: cast from pointer to integer of different size drivers/infiniband/hw/cxgb4/qp.c: In function ‘rdma_init’: drivers/infiniband/hw/cxgb4/qp.c:1063: warning: cast from pointer to integer of different size drivers/infiniband/hw/cxgb4/mem.c: In function ‘write_adapter_mem’: drivers/infiniband/hw/cxgb4/mem.c:74: warning: cast from pointer to integer of different size drivers/infiniband/hw/cxgb4/cq.c: In function ‘destroy_cq’: drivers/infiniband/hw/cxgb4/cq.c:58: warning: cast from pointer to integer of different size drivers/infiniband/hw/cxgb4/cq.c: In function ‘create_cq’: drivers/infiniband/hw/cxgb4/cq.c:135: warning: cast from pointer to integer of different size drivers/infiniband/hw/cxgb4/cm.c: In function ‘fw6_msg’: drivers/infiniband/hw/cxgb4/cm.c:2326: warning: cast to pointer from integer of different size by casting pointers to unsigned long instead of u64. Reported-by: Andrew Morton Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 32d352a..ea54c6a 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2323,7 +2323,7 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) switch (rpl->type) { case 1: ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff); - wr_waitp = (__force struct c4iw_wr_wait *)rpl->data[1]; + wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1]; PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret); if (wr_waitp) { wr_waitp->ret = ret; diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index b3daf39..af684fc 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -55,7 +55,7 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, V_FW_RI_RES_WR_NRES(1) | FW_WR_COMPL(1)); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); - res_wr->cookie = (u64)&wr_wait; + res_wr->cookie = (unsigned long) &wr_wait; res = res_wr->res; res->u.cq.restype = FW_RI_RES_TYPE_CQ; res->u.cq.op = FW_RI_RES_OP_RESET; @@ -132,7 +132,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, V_FW_RI_RES_WR_NRES(1) | FW_WR_COMPL(1)); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); - res_wr->cookie = (u64)&wr_wait; + res_wr->cookie = (unsigned long) &wr_wait; res = res_wr->res; res->u.cq.restype = FW_RI_RES_TYPE_CQ; res->u.cq.op = FW_RI_RES_OP_WRITE; diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 269373a..f61562c 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -71,7 +71,7 @@ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, if (i == (num_wqe-1)) { req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) | FW_WR_COMPL(1)); - req->wr.wr_lo = (__force __be64)&wr_wait; + req->wr.wr_lo = (__force __be64)(unsigned long) &wr_wait; } else req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR)); req->wr.wr_mid = cpu_to_be32( diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 93f6e5b..5d11f86 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -144,7 +144,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, V_FW_RI_RES_WR_NRES(2) | FW_WR_COMPL(1)); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); - res_wr->cookie = (u64)&wr_wait; + res_wr->cookie = (unsigned long) &wr_wait; res = res_wr->res; res->u.sqrq.restype = FW_RI_RES_TYPE_SQ; res->u.sqrq.op = FW_RI_RES_OP_WRITE; @@ -985,7 +985,7 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, wqe->flowid_len16 = cpu_to_be32( FW_WR_FLOWID(ep->hwtid) | FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); - wqe->cookie = (u64)&wr_wait; + wqe->cookie = (unsigned long) &wr_wait; wqe->u.fini.type = FW_RI_TYPE_FINI; c4iw_init_wr_wait(&wr_wait); @@ -1060,7 +1060,7 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) FW_WR_FLOWID(qhp->ep->hwtid) | FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); - wqe->cookie = (u64)&wr_wait; + wqe->cookie = (unsigned long) &wr_wait; wqe->u.init.type = FW_RI_TYPE_INIT; wqe->u.init.mpareqbit_p2ptype = -- cgit v0.10.2 From a3f5adaf491490089215f863a61b9422fae902f8 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 27 Sep 2010 17:51:10 -0700 Subject: IB/core: Add link layer property to ports This patch allows ports to have different link layers: IB_LINK_LAYER_INFINIBAND or IB_LINK_LAYER_ETHERNET. This is required for adding IBoE (InfiniBand-over-Ethernet, aka RoCE) support. For devices that do not provide an implementation for querying the link layer property of a port, we return a default value based on the transport: RMA_TRANSPORT_IB nodes will return IB_LINK_LAYER_INFINIBAND and RDMA_TRANSPORT_IWARP nodes will return IB_LINK_LAYER_ETHERNET. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e0fa222..af7a8b0 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -94,6 +94,22 @@ rdma_node_get_transport(enum rdma_node_type node_type) } EXPORT_SYMBOL(rdma_node_get_transport); +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) +{ + if (device->get_link_layer) + return device->get_link_layer(device, port_num); + + switch (rdma_node_get_transport(device->node_type)) { + case RDMA_TRANSPORT_IB: + return IB_LINK_LAYER_INFINIBAND; + case RDMA_TRANSPORT_IWARP: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} +EXPORT_SYMBOL(rdma_port_get_link_layer); + /* Protection domains */ struct ib_pd *ib_alloc_pd(struct ib_device *device) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 857b3b9..e04c488 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -75,6 +75,12 @@ enum rdma_transport_type { enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__; +enum rdma_link_layer { + IB_LINK_LAYER_UNSPECIFIED, + IB_LINK_LAYER_INFINIBAND, + IB_LINK_LAYER_ETHERNET, +}; + enum ib_device_cap_flags { IB_DEVICE_RESIZE_MAX_WR = 1, IB_DEVICE_BAD_PKEY_CNTR = (1<<1), @@ -1010,6 +1016,8 @@ struct ib_device { int (*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); + enum rdma_link_layer (*get_link_layer)(struct ib_device *device, + u8 port_num); int (*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); @@ -1222,6 +1230,9 @@ int ib_query_device(struct ib_device *device, int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, + u8 port_num); + int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); -- cgit v0.10.2 From 605841f5d263892d9254f09fcf265a26e0bede7e Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 27 Sep 2010 17:51:24 -0700 Subject: MAINTAINERS: Fix broken link to http://www.openib.org/ openib.org is now openfabrics.org. Signed-off-by: Roland Dreier diff --git a/MAINTAINERS b/MAINTAINERS index 668682d..d133984 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2951,7 +2951,7 @@ M: Roland Dreier M: Sean Hefty M: Hal Rosenstock L: linux-rdma@vger.kernel.org -W: http://www.openib.org/ +W: http://www.openfabrics.org/ Q: http://patchwork.kernel.org/project/linux-rdma/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git S: Supported -- cgit v0.10.2 From 183ae74bda75b1cfda632b42cdc916853e2dded4 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 27 Sep 2010 17:51:33 -0700 Subject: RDMA/nes: Fix cast-to-pointer warnings on 32-bit Fix: drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_fast_reg_page_list': drivers/infiniband/hw/nes/nes_verbs.c:477: warning: cast to pointer from integer of different size drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_post_send': drivers/infiniband/hw/nes/nes_verbs.c:3486: warning: cast to pointer from integer of different size drivers/infiniband/hw/nes/nes_verbs.c:3486: warning: cast to pointer from integer of different size by printing u64 quantities by casting to unsigned long and long and using %llx, rather than casting to void* and using %p. Reported-by: Andrew Morton Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 9046e66..b89972c 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -476,9 +476,9 @@ static struct ib_fast_reg_page_list *nes_alloc_fast_reg_page_list( } nes_debug(NES_DBG_MR, "nes_alloc_fast_reg_pbl: nes_frpl = %p, " "ibfrpl = %p, ibfrpl.page_list = %p, pbl.kva = %p, " - "pbl.paddr= %p\n", pnesfrpl, &pnesfrpl->ibfrpl, + "pbl.paddr = %llx\n", pnesfrpl, &pnesfrpl->ibfrpl, pnesfrpl->ibfrpl.page_list, pnesfrpl->nes_wqe_pbl.kva, - (void *)pnesfrpl->nes_wqe_pbl.paddr); + (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr); return pifrpl; } @@ -3483,13 +3483,13 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, for (i = 0; i < ib_wr->wr.fast_reg.page_list_len; i++) dst_page_list[i] = cpu_to_le64(src_page_list[i]); - nes_debug(NES_DBG_IW_TX, "SQ_FMR: iova_start: %p, " - "length: %d, rkey: %0x, pgl_paddr: %p, " + nes_debug(NES_DBG_IW_TX, "SQ_FMR: iova_start: %llx, " + "length: %d, rkey: %0x, pgl_paddr: %llx, " "page_list_len: %u, wqe_misc: %x\n", - (void *)ib_wr->wr.fast_reg.iova_start, + (unsigned long long) ib_wr->wr.fast_reg.iova_start, ib_wr->wr.fast_reg.length, ib_wr->wr.fast_reg.rkey, - (void *)pnesfrpl->nes_wqe_pbl.paddr, + (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr, ib_wr->wr.fast_reg.page_list_len, wqe_misc); break; -- cgit v0.10.2 From af93fb5dcc6b1cba5fd0861d349b3f9c93144bc0 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:14:48 -0500 Subject: RDMA/cxgb4: Don't use null ep ptr In c4iw_modify_qp() error path, only use qhp->ep if ep is not already set. Otherwise qhp->ep can be NULL and we crash. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 5d11f86..4f5dd66 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1305,7 +1305,8 @@ err: /* disassociate the LLP connection */ qhp->attr.llp_stream_handle = NULL; - ep = qhp->ep; + if (!ep) + ep = qhp->ep; qhp->ep = NULL; qhp->attr.state = C4IW_QP_STATE_ERROR; free = 1; -- cgit v0.10.2 From 13fecb83b410b147343e6c7b0427d244ef77b526 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:14:53 -0500 Subject: RDMA/cxgb4: Zero out ISGL padding The HW design requires zeroing any pad in SGLs. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 4f5dd66..bdbf54d 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -263,6 +263,9 @@ static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, rem -= len; } } + len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp); + if (len) + memset(dstp, 0, len); immdp->op = FW_RI_DATA_IMMD; immdp->r1 = 0; immdp->r2 = 0; @@ -292,6 +295,7 @@ static int build_isgl(__be64 *queue_start, __be64 *queue_end, if (++flitp == queue_end) flitp = queue_start; } + *flitp = (__force __be64)0; isglp->op = FW_RI_DATA_ISGL; isglp->r1 = 0; isglp->nsge = cpu_to_be16(num_sge); -- cgit v0.10.2 From 7459486133dc726ff2edf0957d9cb5c954aedbc7 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:14:58 -0500 Subject: RDMA/cxgb4: Ignore positive return values from cxgb4_*_send() functions The cxgb4_*_send() functions return NET_XMIT_ values, which are positive integers or negative errno values. So don't treat positive return values as an error. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index ea54c6a..2c60266 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -172,7 +172,7 @@ static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb, error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e); if (error < 0) kfree_skb(skb); - return error; + return error < 0 ? error : 0; } int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb) @@ -187,7 +187,7 @@ int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb) error = cxgb4_ofld_send(rdev->lldi.ports[0], skb); if (error < 0) kfree_skb(skb); - return error; + return error < 0 ? error : 0; } static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb) -- cgit v0.10.2 From 6ff0e343b3356897cef1f09452f93acb13703911 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:15:04 -0500 Subject: RDMA/cxgb4: Ignore TERMINATE CQEs T4 incorrectly inserts TERM CQEs into the CQ. Silently ignore them. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index af684fc..70371e9 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -476,6 +476,11 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, goto proc_cqe; } + if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) { + ret = -EAGAIN; + goto skip_cqe; + } + /* * RECV completion. */ @@ -696,6 +701,7 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) case T4_ERR_MSN_RANGE: case T4_ERR_IRD_OVERFLOW: case T4_ERR_OPCODE: + case T4_ERR_INTERNAL_ERR: wc->status = IB_WC_FATAL_ERR; break; case T4_ERR_SWFLUSH: -- cgit v0.10.2 From 0e42c1f4303f3f8d5b2c257dc5488b0ad465097d Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:15:09 -0500 Subject: RDMA/cxgb4: Handle CPL_RDMA_TERMINATE messages T4 FW sends up CPL_RDMA_TERMINATE to indicate a peer TERM. This triggers the QP moving to TERMINATE state. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 2c60266..2f0a26c 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1725,23 +1725,24 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) { - struct c4iw_ep *ep; - struct cpl_rdma_terminate *term = cplhdr(skb); + struct cpl_rdma_terminate *rpl = cplhdr(skb); struct tid_info *t = dev->rdev.lldi.tids; - unsigned int tid = GET_TID(term); + unsigned int tid = GET_TID(rpl); + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; ep = lookup_tid(t, tid); + BUG_ON(!ep); - if (state_read(&ep->com) != FPDU_MODE) - return 0; + if (ep->com.qp) { + printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n", tid, + ep->com.qp->wq.sq.qid); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } else + printk(KERN_WARNING MOD "TERM received tid %u no qp\n", tid); - PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - skb_pull(skb, sizeof *term); - PDBG("%s saving %d bytes of term msg\n", __func__, skb->len); - skb_copy_from_linear_data(skb, ep->com.qp->attr.terminate_buffer, - skb->len); - ep->com.qp->attr.terminate_msg_len = skb->len; - ep->com.qp->attr.is_terminate_local = 0; return 0; } diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c index 491e76a..c13041a 100644 --- a/drivers/infiniband/hw/cxgb4/ev.c +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -60,7 +60,7 @@ static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp, if (qhp->attr.state == C4IW_QP_STATE_RTS) { attrs.next_state = C4IW_QP_STATE_TERMINATE; c4iw_modify_qp(qhp->rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, - &attrs, 1); + &attrs, 0); } event.event = ib_event; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index bdbf54d..1199d1b 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1238,7 +1238,8 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, t4_set_wq_in_error(&qhp->wq); ep = qhp->ep; c4iw_get_ep(&ep->com); - terminate = 1; + if (!internal) + terminate = 1; disconnect = 1; break; case C4IW_QP_STATE_ERROR: -- cgit v0.10.2 From 05fb9629473690e4be4112f22e1adeb1fe4ad733 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:15:14 -0500 Subject: RDMA/cxgb4: Log HW lack-of-resource errors This helps debug cases where HW resources are depleted. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c index 83b23df..26365f6 100644 --- a/drivers/infiniband/hw/cxgb4/resource.c +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -311,6 +311,9 @@ u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size) { unsigned long addr = gen_pool_alloc(rdev->pbl_pool, size); PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + if (!addr && printk_ratelimit()) + printk(KERN_WARNING MOD "%s: Out of PBL memory\n", + pci_name(rdev->lldi.pdev)); return (u32)addr; } @@ -370,6 +373,9 @@ u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size) { unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6); PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6); + if (!addr && printk_ratelimit()) + printk(KERN_WARNING MOD "%s: Out of RQT memory\n", + pci_name(rdev->lldi.pdev)); return (u32)addr; } -- cgit v0.10.2 From 9e8d1fa3420f489da8a5da47c026511aa71fa50b Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:15:20 -0500 Subject: RDMA/cxgb4: debugfs files for dumping active stags Add "stags" debugfs file. This is useful for examining the TPTE and PBL entries in adapter memory. It allows scripts to dump just the active entries. Also clean up the "qps" file handlers and shared common code. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 9bbf491..2851bf8 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -49,29 +49,57 @@ static DEFINE_MUTEX(dev_mutex); static struct dentry *c4iw_debugfs_root; -struct debugfs_qp_data { +struct c4iw_debugfs_data { struct c4iw_dev *devp; char *buf; int bufsize; int pos; }; -static int count_qps(int id, void *p, void *data) +static int count_idrs(int id, void *p, void *data) { - struct c4iw_qp *qp = p; int *countp = data; - if (id != qp->wq.sq.qid) - return 0; - *countp = *countp + 1; return 0; } -static int dump_qps(int id, void *p, void *data) +static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct c4iw_debugfs_data *d = file->private_data; + loff_t pos = *ppos; + loff_t avail = d->pos; + + if (pos < 0) + return -EINVAL; + if (pos >= avail) + return 0; + if (count > avail - pos) + count = avail - pos; + + while (count) { + size_t len = 0; + + len = min((int)count, (int)d->pos - (int)pos); + if (copy_to_user(buf, d->buf + pos, len)) + return -EFAULT; + if (len == 0) + return -EINVAL; + + buf += len; + pos += len; + count -= len; + } + count = pos - *ppos; + *ppos = pos; + return count; +} + +static int dump_qp(int id, void *p, void *data) { struct c4iw_qp *qp = p; - struct debugfs_qp_data *qpd = data; + struct c4iw_debugfs_data *qpd = data; int space; int cc; @@ -101,7 +129,7 @@ static int dump_qps(int id, void *p, void *data) static int qp_release(struct inode *inode, struct file *file) { - struct debugfs_qp_data *qpd = file->private_data; + struct c4iw_debugfs_data *qpd = file->private_data; if (!qpd) { printk(KERN_INFO "%s null qpd?\n", __func__); return 0; @@ -113,7 +141,7 @@ static int qp_release(struct inode *inode, struct file *file) static int qp_open(struct inode *inode, struct file *file) { - struct debugfs_qp_data *qpd; + struct c4iw_debugfs_data *qpd; int ret = 0; int count = 1; @@ -126,7 +154,7 @@ static int qp_open(struct inode *inode, struct file *file) qpd->pos = 0; spin_lock_irq(&qpd->devp->lock); - idr_for_each(&qpd->devp->qpidr, count_qps, &count); + idr_for_each(&qpd->devp->qpidr, count_idrs, &count); spin_unlock_irq(&qpd->devp->lock); qpd->bufsize = count * 128; @@ -137,7 +165,7 @@ static int qp_open(struct inode *inode, struct file *file) } spin_lock_irq(&qpd->devp->lock); - idr_for_each(&qpd->devp->qpidr, dump_qps, qpd); + idr_for_each(&qpd->devp->qpidr, dump_qp, qpd); spin_unlock_irq(&qpd->devp->lock); qpd->buf[qpd->pos++] = 0; @@ -149,43 +177,84 @@ out: return ret; } -static ssize_t qp_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static const struct file_operations qp_debugfs_fops = { + .owner = THIS_MODULE, + .open = qp_open, + .release = qp_release, + .read = debugfs_read, +}; + +static int dump_stag(int id, void *p, void *data) { - struct debugfs_qp_data *qpd = file->private_data; - loff_t pos = *ppos; - loff_t avail = qpd->pos; + struct c4iw_debugfs_data *stagd = data; + int space; + int cc; - if (pos < 0) - return -EINVAL; - if (pos >= avail) + space = stagd->bufsize - stagd->pos - 1; + if (space == 0) + return 1; + + cc = snprintf(stagd->buf + stagd->pos, space, "0x%x\n", id<<8); + if (cc < space) + stagd->pos += cc; + return 0; +} + +static int stag_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *stagd = file->private_data; + if (!stagd) { + printk(KERN_INFO "%s null stagd?\n", __func__); return 0; - if (count > avail - pos) - count = avail - pos; + } + kfree(stagd->buf); + kfree(stagd); + return 0; +} - while (count) { - size_t len = 0; +static int stag_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *stagd; + int ret = 0; + int count = 1; - len = min((int)count, (int)qpd->pos - (int)pos); - if (copy_to_user(buf, qpd->buf + pos, len)) - return -EFAULT; - if (len == 0) - return -EINVAL; + stagd = kmalloc(sizeof *stagd, GFP_KERNEL); + if (!stagd) { + ret = -ENOMEM; + goto out; + } + stagd->devp = inode->i_private; + stagd->pos = 0; - buf += len; - pos += len; - count -= len; + spin_lock_irq(&stagd->devp->lock); + idr_for_each(&stagd->devp->mmidr, count_idrs, &count); + spin_unlock_irq(&stagd->devp->lock); + + stagd->bufsize = count * sizeof("0x12345678\n"); + stagd->buf = kmalloc(stagd->bufsize, GFP_KERNEL); + if (!stagd->buf) { + ret = -ENOMEM; + goto err1; } - count = pos - *ppos; - *ppos = pos; - return count; + + spin_lock_irq(&stagd->devp->lock); + idr_for_each(&stagd->devp->mmidr, dump_stag, stagd); + spin_unlock_irq(&stagd->devp->lock); + + stagd->buf[stagd->pos++] = 0; + file->private_data = stagd; + goto out; +err1: + kfree(stagd); +out: + return ret; } -static const struct file_operations qp_debugfs_fops = { +static const struct file_operations stag_debugfs_fops = { .owner = THIS_MODULE, - .open = qp_open, - .release = qp_release, - .read = qp_read, + .open = stag_open, + .release = stag_release, + .read = debugfs_read, }; static int setup_debugfs(struct c4iw_dev *devp) @@ -199,6 +268,11 @@ static int setup_debugfs(struct c4iw_dev *devp) (void *)devp, &qp_debugfs_fops); if (de && de->d_inode) de->d_inode->i_size = 4096; + + de = debugfs_create_file("stags", S_IWUSR, devp->debugfs_root, + (void *)devp, &stag_debugfs_fops); + if (de && de->d_inode) + de->d_inode->i_size = 4096; return 0; } -- cgit v0.10.2 From aadc4df3087a33ca7fc37f91a024e7b7efdafa75 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:15:25 -0500 Subject: RDMA/cxgb4: Centralize the wait logic Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 2f0a26c..6819ac4 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -252,7 +252,7 @@ static void *alloc_ep(int size, gfp_t gfp) if (epc) { kref_init(&epc->kref); spin_lock_init(&epc->lock); - init_waitqueue_head(&epc->waitq); + c4iw_init_wr_wait(&epc->wr_wait); } PDBG("%s alloc ep %p\n", __func__, epc); return epc; @@ -1213,9 +1213,9 @@ static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) } PDBG("%s ep %p status %d error %d\n", __func__, ep, rpl->status, status2errno(rpl->status)); - ep->com.rpl_err = status2errno(rpl->status); - ep->com.rpl_done = 1; - wake_up(&ep->com.waitq); + ep->com.wr_wait.ret = status2errno(rpl->status); + ep->com.wr_wait.done = 1; + wake_up(&ep->com.wr_wait.wait); return 0; } @@ -1249,9 +1249,9 @@ static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_listen_ep *ep = lookup_stid(t, stid); PDBG("%s ep %p\n", __func__, ep); - ep->com.rpl_err = status2errno(rpl->status); - ep->com.rpl_done = 1; - wake_up(&ep->com.waitq); + ep->com.wr_wait.ret = status2errno(rpl->status); + ep->com.wr_wait.done = 1; + wake_up(&ep->com.wr_wait.wait); return 0; } @@ -1507,17 +1507,17 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) * in rdma connection migration (see c4iw_accept_cr()). */ __state_set(&ep->com, CLOSING); - ep->com.rpl_done = 1; - ep->com.rpl_err = -ECONNRESET; + ep->com.wr_wait.done = 1; + ep->com.wr_wait.ret = -ECONNRESET; PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); - wake_up(&ep->com.waitq); + wake_up(&ep->com.wr_wait.wait); break; case MPA_REP_SENT: __state_set(&ep->com, CLOSING); - ep->com.rpl_done = 1; - ep->com.rpl_err = -ECONNRESET; + ep->com.wr_wait.done = 1; + ep->com.wr_wait.ret = -ECONNRESET; PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); - wake_up(&ep->com.waitq); + wake_up(&ep->com.wr_wait.wait); break; case FPDU_MODE: start_ep_timer(ep); @@ -1605,10 +1605,10 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) connect_reply_upcall(ep, -ECONNRESET); break; case MPA_REP_SENT: - ep->com.rpl_done = 1; - ep->com.rpl_err = -ECONNRESET; + ep->com.wr_wait.done = 1; + ep->com.wr_wait.ret = -ECONNRESET; PDBG("waking up ep %p\n", ep); - wake_up(&ep->com.waitq); + wake_up(&ep->com.wr_wait.wait); break; case MPA_REQ_RCVD: @@ -1618,10 +1618,10 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) * rejects the CR. Also wake up anyone waiting * in rdma connection migration (see c4iw_accept_cr()). */ - ep->com.rpl_done = 1; - ep->com.rpl_err = -ECONNRESET; + ep->com.wr_wait.done = 1; + ep->com.wr_wait.ret = -ECONNRESET; PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); - wake_up(&ep->com.waitq); + wake_up(&ep->com.wr_wait.wait); break; case MORIBUND: case CLOSING: @@ -2043,6 +2043,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) } state_set(&ep->com, LISTEN); + c4iw_init_wr_wait(&ep->com.wr_wait); err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0], ep->stid, ep->com.local_addr.sin_addr.s_addr, ep->com.local_addr.sin_port, @@ -2051,15 +2052,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) goto fail3; /* wait for pass_open_rpl */ - wait_event_timeout(ep->com.waitq, ep->com.rpl_done, C4IW_WR_TO); - if (ep->com.rpl_done) - err = ep->com.rpl_err; - else { - printk(KERN_ERR MOD "Device %s not responding!\n", - pci_name(ep->com.dev->rdev.lldi.pdev)); - ep->com.dev->rdev.flags = T4_FATAL_ERROR; - err = -EIO; - } + err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0, + __func__); if (!err) { cm_id->provider_data = ep; goto out; @@ -2083,20 +2077,12 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id) might_sleep(); state_set(&ep->com, DEAD); - ep->com.rpl_done = 0; - ep->com.rpl_err = 0; + c4iw_init_wr_wait(&ep->com.wr_wait); err = listen_stop(ep); if (err) goto done; - wait_event_timeout(ep->com.waitq, ep->com.rpl_done, C4IW_WR_TO); - if (ep->com.rpl_done) - err = ep->com.rpl_err; - else { - printk(KERN_ERR MOD "Device %s not responding!\n", - pci_name(ep->com.dev->rdev.lldi.pdev)); - ep->com.dev->rdev.flags = T4_FATAL_ERROR; - err = -EIO; - } + err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0, + __func__); cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, PF_INET); done: cm_id->rem_ref(cm_id); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 70371e9..8d8f8ad 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -64,14 +64,7 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, c4iw_init_wr_wait(&wr_wait); ret = c4iw_ofld_send(rdev, skb); if (!ret) { - wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO); - if (!wr_wait.done) { - printk(KERN_ERR MOD "Device %s not responding!\n", - pci_name(rdev->lldi.pdev)); - rdev->flags = T4_FATAL_ERROR; - ret = -EIO; - } else - ret = wr_wait.ret; + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); } kfree(cq->sw_queue); @@ -157,14 +150,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, if (ret) goto err4; PDBG("%s wait_event wr_wait %p\n", __func__, &wr_wait); - wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO); - if (!wr_wait.done) { - printk(KERN_ERR MOD "Device %s not responding!\n", - pci_name(rdev->lldi.pdev)); - rdev->flags = T4_FATAL_ERROR; - ret = -EIO; - } else - ret = wr_wait.ret; + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); if (ret) goto err4; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index ed459b8..7780116 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -79,21 +79,6 @@ static inline void *cplhdr(struct sk_buff *skb) return skb->data; } -#define C4IW_WR_TO (10*HZ) - -struct c4iw_wr_wait { - wait_queue_head_t wait; - int done; - int ret; -}; - -static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp) -{ - wr_waitp->ret = 0; - wr_waitp->done = 0; - init_waitqueue_head(&wr_waitp->wait); -} - struct c4iw_resource { struct kfifo tpt_fifo; spinlock_t tpt_fifo_lock; @@ -141,6 +126,44 @@ static inline int c4iw_num_stags(struct c4iw_rdev *rdev) return min((int)T4_MAX_NUM_STAG, (int)(rdev->lldi.vr->stag.size >> 5)); } +#define C4IW_WR_TO (10*HZ) + +struct c4iw_wr_wait { + wait_queue_head_t wait; + int done; + int ret; +}; + +static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp) +{ + wr_waitp->ret = 0; + wr_waitp->done = 0; + init_waitqueue_head(&wr_waitp->wait); +} + +static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, + struct c4iw_wr_wait *wr_waitp, + u32 hwtid, u32 qpid, + const char *func) +{ + unsigned to = C4IW_WR_TO; + do { + + wait_event_timeout(wr_waitp->wait, wr_waitp->done, to); + if (!wr_waitp->done) { + printk(KERN_ERR MOD "%s - Device %s not responding - " + "tid %u qpid %u\n", func, + pci_name(rdev->lldi.pdev), hwtid, qpid); + to = to << 2; + } + } while (!wr_waitp->done); + if (wr_waitp->ret) + printk(KERN_WARNING MOD "%s: FW reply %d tid %u qpid %u\n", + pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid); + return wr_waitp->ret; +} + + struct c4iw_dev { struct ib_device ibdev; struct c4iw_rdev rdev; @@ -582,9 +605,7 @@ struct c4iw_ep_common { spinlock_t lock; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; - wait_queue_head_t waitq; - int rpl_done; - int rpl_err; + struct c4iw_wr_wait wr_wait; unsigned long flags; }; diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index f61562c..273ffe4 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -103,14 +103,7 @@ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, len -= C4IW_MAX_INLINE_SIZE; } - wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO); - if (!wr_wait.done) { - printk(KERN_ERR MOD "Device %s not responding!\n", - pci_name(rdev->lldi.pdev)); - rdev->flags = T4_FATAL_ERROR; - ret = -EIO; - } else - ret = wr_wait.ret; + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); return ret; } diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 1199d1b..40187e2 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -198,14 +198,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, ret = c4iw_ofld_send(rdev, skb); if (ret) goto err7; - wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO); - if (!wr_wait.done) { - printk(KERN_ERR MOD "Device %s not responding!\n", - pci_name(rdev->lldi.pdev)); - rdev->flags = T4_FATAL_ERROR; - ret = -EIO; - } else - ret = wr_wait.ret; + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid, __func__); if (ret) goto err7; @@ -997,20 +990,8 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, if (ret) goto out; - wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO); - if (!wr_wait.done) { - printk(KERN_ERR MOD "Device %s not responding!\n", - pci_name(rhp->rdev.lldi.pdev)); - rhp->rdev.flags = T4_FATAL_ERROR; - ret = -EIO; - } else { - ret = wr_wait.ret; - if (ret) - printk(KERN_WARNING MOD - "%s: Abnormal close qpid %d ret %u\n", - pci_name(rhp->rdev.lldi.pdev), qhp->wq.sq.qid, - ret); - } + ret = c4iw_wait_for_reply(&rhp->rdev, &wr_wait, qhp->ep->hwtid, + qhp->wq.sq.qid, __func__); out: PDBG("%s ret %d\n", __func__, ret); return ret; @@ -1106,14 +1087,8 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) if (ret) goto out; - wait_event_timeout(wr_wait.wait, wr_wait.done, C4IW_WR_TO); - if (!wr_wait.done) { - printk(KERN_ERR MOD "Device %s not responding!\n", - pci_name(rhp->rdev.lldi.pdev)); - rhp->rdev.flags = T4_FATAL_ERROR; - ret = -EIO; - } else - ret = wr_wait.ret; + ret = c4iw_wait_for_reply(&rhp->rdev, &wr_wait, qhp->ep->hwtid, + qhp->wq.sq.qid, __func__); out: PDBG("%s ret %d\n", __func__, ret); return ret; -- cgit v0.10.2 From c6d7b26791a2aefdf97f2af1e93161ed05acd631 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 13 Sep 2010 11:23:57 -0500 Subject: RDMA/cxgb4: Support on-chip SQs T4 support on-chip SQs to reduce latency. This patch adds support for this in iw_cxgb4: - Manage ocqp memory like other adapter mem resources. - Allocate user mode SQs from ocqp mem if available. - Map ocqp mem to user process using write combining. - Map PCIE_MA_SYNC reg to user process. Bump uverbs ABI. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 2851bf8..986cfd7 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -364,7 +364,14 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) printk(KERN_ERR MOD "error %d initializing rqt pool\n", err); goto err3; } + err = c4iw_ocqp_pool_create(rdev); + if (err) { + printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err); + goto err4; + } return 0; +err4: + c4iw_rqtpool_destroy(rdev); err3: c4iw_pblpool_destroy(rdev); err2: @@ -391,6 +398,7 @@ static void c4iw_remove(struct c4iw_dev *dev) idr_destroy(&dev->cqidr); idr_destroy(&dev->qpidr); idr_destroy(&dev->mmidr); + iounmap(dev->rdev.oc_mw_kva); ib_dealloc_device(&dev->ibdev); } @@ -406,6 +414,17 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) } devp->rdev.lldi = *infop; + devp->rdev.oc_mw_pa = pci_resource_start(devp->rdev.lldi.pdev, 2) + + (pci_resource_len(devp->rdev.lldi.pdev, 2) - + roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size)); + devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, + devp->rdev.lldi.vr->ocq.size); + + printk(KERN_INFO MOD "ocq memory: " + "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n", + devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size, + devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva); + mutex_lock(&dev_mutex); ret = c4iw_rdev_open(&devp->rdev); diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 7780116..1c26922 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -112,8 +112,11 @@ struct c4iw_rdev { struct c4iw_dev_ucontext uctx; struct gen_pool *pbl_pool; struct gen_pool *rqt_pool; + struct gen_pool *ocqp_pool; u32 flags; struct cxgb4_lld_info lldi; + unsigned long oc_mw_pa; + void __iomem *oc_mw_kva; }; static inline int c4iw_fatal_error(struct c4iw_rdev *rdev) @@ -675,8 +678,10 @@ int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid); int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_pblpool_create(struct c4iw_rdev *rdev); int c4iw_rqtpool_create(struct c4iw_rdev *rdev); +int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev); void c4iw_pblpool_destroy(struct c4iw_rdev *rdev); void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev); +void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev); void c4iw_destroy_resource(struct c4iw_resource *rscp); int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_register_device(struct c4iw_dev *dev); @@ -742,6 +747,8 @@ u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size); void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size); u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size); void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size); +u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size); int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb); void c4iw_flush_hw_cq(struct t4_cq *cq); void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 8f645c8..a49a9c1 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -149,19 +149,28 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) addr = mm->addr; kfree(mm); - if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 2) + - pci_resource_len(rdev->lldi.pdev, 2)))) { + if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) && + (addr < (pci_resource_start(rdev->lldi.pdev, 0) + + pci_resource_len(rdev->lldi.pdev, 0)))) { /* - * Map T4 DB register. + * MA_SYNC register... */ - if (vma->vm_flags & VM_READ) - return -EPERM; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; - vma->vm_flags &= ~VM_MAYREAD; + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && + (addr < (pci_resource_start(rdev->lldi.pdev, 2) + + pci_resource_len(rdev->lldi.pdev, 2)))) { + + /* + * Map user DB or OCQP memory... + */ + if (addr >= rdev->oc_mw_pa) + vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, len, vma->vm_page_prot); @@ -472,6 +481,7 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.post_send = c4iw_post_send; dev->ibdev.post_recv = c4iw_post_receive; dev->ibdev.get_protocol_stats = c4iw_get_mib; + dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); if (!dev->ibdev.iwcm) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 40187e2..7e45f73 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -31,6 +31,55 @@ */ #include "iw_cxgb4.h" +static int ocqp_support; +module_param(ocqp_support, int, 0644); +MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=0)"); + +static void dealloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + c4iw_ocqp_pool_free(rdev, sq->dma_addr, sq->memsize); +} + +static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + dma_free_coherent(&(rdev->lldi.pdev->dev), sq->memsize, sq->queue, + pci_unmap_addr(sq, mapping)); +} + +static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + if (t4_sq_onchip(sq)) + dealloc_oc_sq(rdev, sq); + else + dealloc_host_sq(rdev, sq); +} + +static int alloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + if (!ocqp_support || !t4_ocqp_supported()) + return -ENOSYS; + sq->dma_addr = c4iw_ocqp_pool_alloc(rdev, sq->memsize); + if (!sq->dma_addr) + return -ENOMEM; + sq->phys_addr = rdev->oc_mw_pa + sq->dma_addr - + rdev->lldi.vr->ocq.start; + sq->queue = (__force union t4_wr *)(rdev->oc_mw_kva + sq->dma_addr - + rdev->lldi.vr->ocq.start); + sq->flags |= T4_SQ_ONCHIP; + return 0; +} + +static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + sq->queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), sq->memsize, + &(sq->dma_addr), GFP_KERNEL); + if (!sq->queue) + return -ENOMEM; + sq->phys_addr = virt_to_phys(sq->queue); + pci_unmap_addr_set(sq, mapping, sq->dma_addr); + return 0; +} + static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, struct c4iw_dev_ucontext *uctx) { @@ -41,9 +90,7 @@ static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, dma_free_coherent(&(rdev->lldi.pdev->dev), wq->rq.memsize, wq->rq.queue, dma_unmap_addr(&wq->rq, mapping)); - dma_free_coherent(&(rdev->lldi.pdev->dev), - wq->sq.memsize, wq->sq.queue, - dma_unmap_addr(&wq->sq, mapping)); + dealloc_sq(rdev, &wq->sq); c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); kfree(wq->rq.sw_rq); kfree(wq->sq.sw_sq); @@ -93,11 +140,12 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, if (!wq->rq.rqt_hwaddr) goto err4; - wq->sq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), - wq->sq.memsize, &(wq->sq.dma_addr), - GFP_KERNEL); - if (!wq->sq.queue) - goto err5; + if (user) { + if (alloc_oc_sq(rdev, &wq->sq) && alloc_host_sq(rdev, &wq->sq)) + goto err5; + } else + if (alloc_host_sq(rdev, &wq->sq)) + goto err5; memset(wq->sq.queue, 0, wq->sq.memsize); dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr); @@ -158,6 +206,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, V_FW_RI_RES_WR_HOSTFCMODE(0) | /* no host cidx updates */ V_FW_RI_RES_WR_CPRIO(0) | /* don't keep in chip cache */ V_FW_RI_RES_WR_PCIECHN(0) | /* set by uP at ri_init time */ + t4_sq_onchip(&wq->sq) ? F_FW_RI_RES_WR_ONCHIP : 0 | V_FW_RI_RES_WR_IQID(scq->cqid)); res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( V_FW_RI_RES_WR_DCAEN(0) | @@ -212,9 +261,7 @@ err7: wq->rq.memsize, wq->rq.queue, dma_unmap_addr(&wq->rq, mapping)); err6: - dma_free_coherent(&(rdev->lldi.pdev->dev), - wq->sq.memsize, wq->sq.queue, - dma_unmap_addr(&wq->sq, mapping)); + dealloc_sq(rdev, &wq->sq); err5: c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); err4: @@ -1361,7 +1408,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, int sqsize, rqsize; struct c4iw_ucontext *ucontext; int ret; - struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4; + struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL; PDBG("%s ib_pd %p\n", __func__, pd); @@ -1459,7 +1506,15 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, ret = -ENOMEM; goto err6; } - + if (t4_sq_onchip(&qhp->wq.sq)) { + mm5 = kmalloc(sizeof *mm5, GFP_KERNEL); + if (!mm5) { + ret = -ENOMEM; + goto err7; + } + uresp.flags = C4IW_QPF_ONCHIP; + } else + uresp.flags = 0; uresp.qid_mask = rhp->rdev.qpmask; uresp.sqid = qhp->wq.sq.qid; uresp.sq_size = qhp->wq.sq.size; @@ -1468,6 +1523,10 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, uresp.rq_size = qhp->wq.rq.size; uresp.rq_memsize = qhp->wq.rq.memsize; spin_lock(&ucontext->mmap_lock); + if (mm5) { + uresp.ma_sync_key = ucontext->key; + ucontext->key += PAGE_SIZE; + } uresp.sq_key = ucontext->key; ucontext->key += PAGE_SIZE; uresp.rq_key = ucontext->key; @@ -1479,9 +1538,9 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, spin_unlock(&ucontext->mmap_lock); ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); if (ret) - goto err7; + goto err8; mm1->key = uresp.sq_key; - mm1->addr = virt_to_phys(qhp->wq.sq.queue); + mm1->addr = qhp->wq.sq.phys_addr; mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize); insert_mmap(ucontext, mm1); mm2->key = uresp.rq_key; @@ -1496,6 +1555,13 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, mm4->addr = qhp->wq.rq.udb; mm4->len = PAGE_SIZE; insert_mmap(ucontext, mm4); + if (mm5) { + mm5->key = uresp.ma_sync_key; + mm5->addr = (pci_resource_start(rhp->rdev.lldi.pdev, 0) + + A_PCIE_MA_SYNC) & PAGE_MASK; + mm5->len = PAGE_SIZE; + insert_mmap(ucontext, mm5); + } } qhp->ibqp.qp_num = qhp->wq.sq.qid; init_timer(&(qhp->timer)); @@ -1503,6 +1569,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, __func__, qhp, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, qhp->wq.sq.qid); return &qhp->ibqp; +err8: + kfree(mm5); err7: kfree(mm4); err6: diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c index 26365f6..4fb50d5 100644 --- a/drivers/infiniband/hw/cxgb4/resource.c +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -422,3 +422,59 @@ void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev) { gen_pool_destroy(rdev->rqt_pool); } + +/* + * On-Chip QP Memory. + */ +#define MIN_OCQP_SHIFT 12 /* 4KB == min ocqp size */ + +u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->ocqp_pool, size); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + return (u32)addr; +} + +void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size); + gen_pool_free(rdev->ocqp_pool, (unsigned long)addr, size); +} + +int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev) +{ + unsigned start, chunk, top; + + rdev->ocqp_pool = gen_pool_create(MIN_OCQP_SHIFT, -1); + if (!rdev->ocqp_pool) + return -ENOMEM; + + start = rdev->lldi.vr->ocq.start; + chunk = rdev->lldi.vr->ocq.size; + top = start + chunk; + + while (start < top) { + chunk = min(top - start + 1, chunk); + if (gen_pool_add(rdev->ocqp_pool, start, chunk, -1)) { + PDBG("%s failed to add OCQP chunk (%x/%x)\n", + __func__, start, chunk); + if (chunk <= 1024 << MIN_OCQP_SHIFT) { + printk(KERN_WARNING MOD + "Failed to add all OCQP chunks (%x/%x)\n", + start, top - start); + return 0; + } + chunk >>= 1; + } else { + PDBG("%s added OCQP chunk (%x/%x)\n", + __func__, start, chunk); + start += chunk; + } + } + return 0; +} + +void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev) +{ + gen_pool_destroy(rdev->ocqp_pool); +} diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 24f3690..17ea5fc 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -52,6 +52,7 @@ #define T4_STAG_UNSET 0xffffffff #define T4_FW_MAJ 0 #define T4_EQ_STATUS_ENTRIES (L1_CACHE_BYTES > 64 ? 2 : 1) +#define A_PCIE_MA_SYNC 0x30b4 struct t4_status_page { __be32 rsvd1; /* flit 0 - hw owns */ @@ -266,10 +267,36 @@ struct t4_swsqe { u16 idx; }; +static inline pgprot_t t4_pgprot_wc(pgprot_t prot) +{ +#if defined(__i386__) || defined(__x86_64__) + return pgprot_writecombine(prot); +#elif defined(CONFIG_PPC64) + return __pgprot((pgprot_val(prot) | _PAGE_NO_CACHE) & + ~(pgprot_t)_PAGE_GUARDED); +#else + return pgprot_noncached(prot); +#endif +} + +static inline int t4_ocqp_supported(void) +{ +#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64) + return 1; +#else + return 0; +#endif +} + +enum { + T4_SQ_ONCHIP = (1<<0), +}; + struct t4_sq { union t4_wr *queue; dma_addr_t dma_addr; DEFINE_DMA_UNMAP_ADDR(mapping); + unsigned long phys_addr; struct t4_swsqe *sw_sq; struct t4_swsqe *oldest_read; u64 udb; @@ -280,6 +307,7 @@ struct t4_sq { u16 cidx; u16 pidx; u16 wq_pidx; + u16 flags; }; struct t4_swrqe { @@ -350,6 +378,11 @@ static inline void t4_rq_consume(struct t4_wq *wq) wq->rq.cidx = 0; } +static inline int t4_sq_onchip(struct t4_sq *sq) +{ + return sq->flags & T4_SQ_ONCHIP; +} + static inline int t4_sq_empty(struct t4_wq *wq) { return wq->sq.in_use == 0; @@ -396,30 +429,27 @@ static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc) static inline int t4_wq_in_error(struct t4_wq *wq) { - return wq->sq.queue[wq->sq.size].status.qp_err; + return wq->rq.queue[wq->rq.size].status.qp_err; } static inline void t4_set_wq_in_error(struct t4_wq *wq) { - wq->sq.queue[wq->sq.size].status.qp_err = 1; wq->rq.queue[wq->rq.size].status.qp_err = 1; } static inline void t4_disable_wq_db(struct t4_wq *wq) { - wq->sq.queue[wq->sq.size].status.db_off = 1; wq->rq.queue[wq->rq.size].status.db_off = 1; } static inline void t4_enable_wq_db(struct t4_wq *wq) { - wq->sq.queue[wq->sq.size].status.db_off = 0; wq->rq.queue[wq->rq.size].status.db_off = 0; } static inline int t4_wq_db_enabled(struct t4_wq *wq) { - return !wq->sq.queue[wq->sq.size].status.db_off; + return !wq->rq.queue[wq->rq.size].status.db_off; } struct t4_cq { diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h index ed6414a..e6669d5 100644 --- a/drivers/infiniband/hw/cxgb4/user.h +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -50,7 +50,13 @@ struct c4iw_create_cq_resp { __u32 qid_mask; }; + +enum { + C4IW_QPF_ONCHIP = (1<<0) +}; + struct c4iw_create_qp_resp { + __u64 ma_sync_key; __u64 sq_key; __u64 rq_key; __u64 sq_db_gts_key; @@ -62,5 +68,6 @@ struct c4iw_create_qp_resp { __u32 sq_size; __u32 rq_size; __u32 qid_mask; + __u32 flags; }; #endif -- cgit v0.10.2 From 2f5b48c3ad84fda9efe35122b058ccffc0c2c7cf Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:15:36 -0500 Subject: RDMA/cxgb4: Use a mutex for QP and EP state transitions Move the connection setup/teardown paths to the workq thread removing spin lock/irq disable requirements for these paths. This allows calls down to the LLD for EP and QP state transition actions to be atomic with respect to processing CPL messages coming up from the HW. Namely, calls to rdma_init() and rdma_fini() can now be called with the mutex held avoiding many race conditions with the abort path. The QP spinlock is still used but only to manipulate the qp state. This allows the fastpaths, poll, post_send, and pos_recv, to run in the irq context. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 6819ac4..9b5c3e3 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -219,12 +219,11 @@ static void set_emss(struct c4iw_ep *ep, u16 opt) static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc) { - unsigned long flags; enum c4iw_ep_state state; - spin_lock_irqsave(&epc->lock, flags); + mutex_lock(&epc->mutex); state = epc->state; - spin_unlock_irqrestore(&epc->lock, flags); + mutex_unlock(&epc->mutex); return state; } @@ -235,12 +234,10 @@ static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) { - unsigned long flags; - - spin_lock_irqsave(&epc->lock, flags); + mutex_lock(&epc->mutex); PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]); __state_set(epc, new); - spin_unlock_irqrestore(&epc->lock, flags); + mutex_unlock(&epc->mutex); return; } @@ -251,7 +248,7 @@ static void *alloc_ep(int size, gfp_t gfp) epc = kzalloc(size, gfp); if (epc) { kref_init(&epc->kref); - spin_lock_init(&epc->lock); + mutex_init(&epc->mutex); c4iw_init_wr_wait(&epc->wr_wait); } PDBG("%s alloc ep %p\n", __func__, epc); @@ -1131,7 +1128,6 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { struct c4iw_ep *ep; struct cpl_abort_rpl_rss *rpl = cplhdr(skb); - unsigned long flags; int release = 0; unsigned int tid = GET_TID(rpl); struct tid_info *t = dev->rdev.lldi.tids; @@ -1139,7 +1135,7 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) ep = lookup_tid(t, tid); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); BUG_ON(!ep); - spin_lock_irqsave(&ep->com.lock, flags); + mutex_lock(&ep->com.mutex); switch (ep->com.state) { case ABORTING: __state_set(&ep->com, DEAD); @@ -1150,7 +1146,7 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) __func__, ep, ep->com.state); break; } - spin_unlock_irqrestore(&ep->com.lock, flags); + mutex_unlock(&ep->com.mutex); if (release) release_ep_resources(ep); @@ -1478,7 +1474,6 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_peer_close *hdr = cplhdr(skb); struct c4iw_ep *ep; struct c4iw_qp_attributes attrs; - unsigned long flags; int disconnect = 1; int release = 0; int closing = 0; @@ -1489,7 +1484,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); dst_confirm(ep->dst); - spin_lock_irqsave(&ep->com.lock, flags); + mutex_lock(&ep->com.mutex); switch (ep->com.state) { case MPA_REQ_WAIT: __state_set(&ep->com, CLOSING); @@ -1550,7 +1545,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) default: BUG_ON(1); } - spin_unlock_irqrestore(&ep->com.lock, flags); + mutex_unlock(&ep->com.mutex); if (closing) { attrs.next_state = C4IW_QP_STATE_CLOSING; c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, @@ -1581,7 +1576,6 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_qp_attributes attrs; int ret; int release = 0; - unsigned long flags; struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(req); @@ -1591,9 +1585,17 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) ep->hwtid); return 0; } - spin_lock_irqsave(&ep->com.lock, flags); PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); + + /* + * Wake up any threads in rdma_init() or rdma_fini(). + */ + ep->com.wr_wait.done = 1; + ep->com.wr_wait.ret = -ECONNRESET; + wake_up(&ep->com.wr_wait.wait); + + mutex_lock(&ep->com.mutex); switch (ep->com.state) { case CONNECTING: break; @@ -1605,23 +1607,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) connect_reply_upcall(ep, -ECONNRESET); break; case MPA_REP_SENT: - ep->com.wr_wait.done = 1; - ep->com.wr_wait.ret = -ECONNRESET; - PDBG("waking up ep %p\n", ep); - wake_up(&ep->com.wr_wait.wait); break; case MPA_REQ_RCVD: - - /* - * We're gonna mark this puppy DEAD, but keep - * the reference on it until the ULP accepts or - * rejects the CR. Also wake up anyone waiting - * in rdma connection migration (see c4iw_accept_cr()). - */ - ep->com.wr_wait.done = 1; - ep->com.wr_wait.ret = -ECONNRESET; - PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); - wake_up(&ep->com.wr_wait.wait); break; case MORIBUND: case CLOSING: @@ -1644,7 +1631,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) break; case DEAD: PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); - spin_unlock_irqrestore(&ep->com.lock, flags); + mutex_unlock(&ep->com.mutex); return 0; default: BUG_ON(1); @@ -1655,7 +1642,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) __state_set(&ep->com, DEAD); release = 1; } - spin_unlock_irqrestore(&ep->com.lock, flags); + mutex_unlock(&ep->com.mutex); rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); if (!rpl_skb) { @@ -1681,7 +1668,6 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_ep *ep; struct c4iw_qp_attributes attrs; struct cpl_close_con_rpl *rpl = cplhdr(skb); - unsigned long flags; int release = 0; struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(rpl); @@ -1692,7 +1678,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) BUG_ON(!ep); /* The cm_id may be null if we failed to connect */ - spin_lock_irqsave(&ep->com.lock, flags); + mutex_lock(&ep->com.mutex); switch (ep->com.state) { case CLOSING: __state_set(&ep->com, MORIBUND); @@ -1717,7 +1703,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) BUG_ON(1); break; } - spin_unlock_irqrestore(&ep->com.lock, flags); + mutex_unlock(&ep->com.mutex); if (release) release_ep_resources(ep); return 0; @@ -2093,12 +2079,11 @@ done: int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) { int ret = 0; - unsigned long flags; int close = 0; int fatal = 0; struct c4iw_rdev *rdev; - spin_lock_irqsave(&ep->com.lock, flags); + mutex_lock(&ep->com.mutex); PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, states[ep->com.state], abrupt); @@ -2145,7 +2130,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) break; } - spin_unlock_irqrestore(&ep->com.lock, flags); + mutex_unlock(&ep->com.mutex); if (close) { if (abrupt) ret = abort_connection(ep, NULL, gfp); @@ -2159,6 +2144,13 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) return ret; } +static int async_event(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_fw6_msg *rpl = cplhdr(skb); + c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]); + return 0; +} + /* * These are the real handlers that are called from a * work queue. @@ -2177,7 +2169,8 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { [CPL_ABORT_REQ_RSS] = peer_abort, [CPL_CLOSE_CON_RPL] = close_con_rpl, [CPL_RDMA_TERMINATE] = terminate, - [CPL_FW4_ACK] = fw4_ack + [CPL_FW4_ACK] = fw4_ack, + [CPL_FW6_MSG] = async_event }; static void process_timeout(struct c4iw_ep *ep) @@ -2185,7 +2178,7 @@ static void process_timeout(struct c4iw_ep *ep) struct c4iw_qp_attributes attrs; int abort = 1; - spin_lock_irq(&ep->com.lock); + mutex_lock(&ep->com.mutex); PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid, ep->com.state); switch (ep->com.state) { @@ -2212,7 +2205,7 @@ static void process_timeout(struct c4iw_ep *ep) WARN_ON(1); abort = 0; } - spin_unlock_irq(&ep->com.lock); + mutex_unlock(&ep->com.mutex); if (abort) abort_connection(ep, NULL, GFP_KERNEL); c4iw_put_ep(&ep->com); @@ -2296,6 +2289,7 @@ static int set_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb) printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u " "for tid %u\n", rpl->status, GET_TID(rpl)); } + kfree_skb(skb); return 0; } @@ -2313,17 +2307,22 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1]; PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret); if (wr_waitp) { - wr_waitp->ret = ret; + if (ret) + wr_waitp->ret = -ret; + else + wr_waitp->ret = 0; wr_waitp->done = 1; wake_up(&wr_waitp->wait); } + kfree_skb(skb); break; case 2: - c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]); + sched(dev, skb); break; default: printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__, rpl->type); + kfree_skb(skb); break; } return 0; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 1c26922..16032cd 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -353,6 +354,7 @@ struct c4iw_qp { struct c4iw_qp_attributes attr; struct t4_wq wq; spinlock_t lock; + struct mutex mutex; atomic_t refcnt; wait_queue_head_t wait; struct timer_list timer; @@ -605,7 +607,7 @@ struct c4iw_ep_common { struct c4iw_dev *dev; enum c4iw_ep_state state; struct kref kref; - spinlock_t lock; + struct mutex mutex; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; struct c4iw_wr_wait wr_wait; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 7e45f73..76a286f 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -35,6 +35,14 @@ static int ocqp_support; module_param(ocqp_support, int, 0644); MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=0)"); +static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state) +{ + unsigned long flag; + spin_lock_irqsave(&qhp->lock, flag); + qhp->attr.state = state; + spin_unlock_irqrestore(&qhp->lock, flag); +} + static void dealloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) { c4iw_ocqp_pool_free(rdev, sq->dma_addr, sq->memsize); @@ -949,46 +957,38 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, * Assumes qhp lock is held. */ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, - struct c4iw_cq *schp, unsigned long *flag) + struct c4iw_cq *schp) { int count; int flushed; + unsigned long flag; PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); - /* take a ref on the qhp since we must release the lock */ - atomic_inc(&qhp->refcnt); - spin_unlock_irqrestore(&qhp->lock, *flag); /* locking hierarchy: cq lock first, then qp lock. */ - spin_lock_irqsave(&rchp->lock, *flag); + spin_lock_irqsave(&rchp->lock, flag); spin_lock(&qhp->lock); c4iw_flush_hw_cq(&rchp->cq); c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); - spin_unlock_irqrestore(&rchp->lock, *flag); + spin_unlock_irqrestore(&rchp->lock, flag); if (flushed) (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); /* locking hierarchy: cq lock first, then qp lock. */ - spin_lock_irqsave(&schp->lock, *flag); + spin_lock_irqsave(&schp->lock, flag); spin_lock(&qhp->lock); c4iw_flush_hw_cq(&schp->cq); c4iw_count_scqes(&schp->cq, &qhp->wq, &count); flushed = c4iw_flush_sq(&qhp->wq, &schp->cq, count); spin_unlock(&qhp->lock); - spin_unlock_irqrestore(&schp->lock, *flag); + spin_unlock_irqrestore(&schp->lock, flag); if (flushed) (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); - - /* deref */ - if (atomic_dec_and_test(&qhp->refcnt)) - wake_up(&qhp->wait); - - spin_lock_irqsave(&qhp->lock, *flag); } -static void flush_qp(struct c4iw_qp *qhp, unsigned long *flag) +static void flush_qp(struct c4iw_qp *qhp) { struct c4iw_cq *rchp, *schp; @@ -1002,7 +1002,7 @@ static void flush_qp(struct c4iw_qp *qhp, unsigned long *flag) t4_set_cq_in_error(&schp->cq); return; } - __flush_qp(qhp, rchp, schp, flag); + __flush_qp(qhp, rchp, schp); } static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, @@ -1010,7 +1010,6 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, { struct fw_ri_wr *wqe; int ret; - struct c4iw_wr_wait wr_wait; struct sk_buff *skb; PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid, @@ -1029,15 +1028,15 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, wqe->flowid_len16 = cpu_to_be32( FW_WR_FLOWID(ep->hwtid) | FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); - wqe->cookie = (unsigned long) &wr_wait; + wqe->cookie = (unsigned long) &ep->com.wr_wait; wqe->u.fini.type = FW_RI_TYPE_FINI; - c4iw_init_wr_wait(&wr_wait); + c4iw_init_wr_wait(&ep->com.wr_wait); ret = c4iw_ofld_send(&rhp->rdev, skb); if (ret) goto out; - ret = c4iw_wait_for_reply(&rhp->rdev, &wr_wait, qhp->ep->hwtid, + ret = c4iw_wait_for_reply(&rhp->rdev, &ep->com.wr_wait, qhp->ep->hwtid, qhp->wq.sq.qid, __func__); out: PDBG("%s ret %d\n", __func__, ret); @@ -1072,7 +1071,6 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) { struct fw_ri_wr *wqe; int ret; - struct c4iw_wr_wait wr_wait; struct sk_buff *skb; PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid, @@ -1092,7 +1090,7 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) FW_WR_FLOWID(qhp->ep->hwtid) | FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); - wqe->cookie = (unsigned long) &wr_wait; + wqe->cookie = (unsigned long) &qhp->ep->com.wr_wait; wqe->u.init.type = FW_RI_TYPE_INIT; wqe->u.init.mpareqbit_p2ptype = @@ -1129,13 +1127,13 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) if (qhp->attr.mpa_attr.initiator) build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init); - c4iw_init_wr_wait(&wr_wait); + c4iw_init_wr_wait(&qhp->ep->com.wr_wait); ret = c4iw_ofld_send(&rhp->rdev, skb); if (ret) goto out; - ret = c4iw_wait_for_reply(&rhp->rdev, &wr_wait, qhp->ep->hwtid, - qhp->wq.sq.qid, __func__); + ret = c4iw_wait_for_reply(&rhp->rdev, &qhp->ep->com.wr_wait, + qhp->ep->hwtid, qhp->wq.sq.qid, __func__); out: PDBG("%s ret %d\n", __func__, ret); return ret; @@ -1148,7 +1146,6 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, { int ret = 0; struct c4iw_qp_attributes newattr = qhp->attr; - unsigned long flag; int disconnect = 0; int terminate = 0; int abort = 0; @@ -1159,7 +1156,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep, qhp->attr.state, (mask & C4IW_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1); - spin_lock_irqsave(&qhp->lock, flag); + mutex_lock(&qhp->mutex); /* Process attr changes if in IDLE */ if (mask & C4IW_QP_ATTR_VALID_MODIFY) { @@ -1210,7 +1207,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, qhp->attr.mpa_attr = attrs->mpa_attr; qhp->attr.llp_stream_handle = attrs->llp_stream_handle; qhp->ep = qhp->attr.llp_stream_handle; - qhp->attr.state = C4IW_QP_STATE_RTS; + set_state(qhp, C4IW_QP_STATE_RTS); /* * Ref the endpoint here and deref when we @@ -1219,15 +1216,13 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, * transition. */ c4iw_get_ep(&qhp->ep->com); - spin_unlock_irqrestore(&qhp->lock, flag); ret = rdma_init(rhp, qhp); - spin_lock_irqsave(&qhp->lock, flag); if (ret) goto err; break; case C4IW_QP_STATE_ERROR: - qhp->attr.state = C4IW_QP_STATE_ERROR; - flush_qp(qhp, &flag); + set_state(qhp, C4IW_QP_STATE_ERROR); + flush_qp(qhp); break; default: ret = -EINVAL; @@ -1238,39 +1233,38 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, switch (attrs->next_state) { case C4IW_QP_STATE_CLOSING: BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); - qhp->attr.state = C4IW_QP_STATE_CLOSING; + set_state(qhp, C4IW_QP_STATE_CLOSING); ep = qhp->ep; if (!internal) { abort = 0; disconnect = 1; - c4iw_get_ep(&ep->com); + c4iw_get_ep(&qhp->ep->com); } - spin_unlock_irqrestore(&qhp->lock, flag); ret = rdma_fini(rhp, qhp, ep); - spin_lock_irqsave(&qhp->lock, flag); if (ret) { - c4iw_get_ep(&ep->com); + if (internal) + c4iw_get_ep(&qhp->ep->com); disconnect = abort = 1; goto err; } break; case C4IW_QP_STATE_TERMINATE: - qhp->attr.state = C4IW_QP_STATE_TERMINATE; + set_state(qhp, C4IW_QP_STATE_TERMINATE); if (qhp->ibqp.uobject) t4_set_wq_in_error(&qhp->wq); ep = qhp->ep; - c4iw_get_ep(&ep->com); if (!internal) terminate = 1; disconnect = 1; + c4iw_get_ep(&qhp->ep->com); break; case C4IW_QP_STATE_ERROR: - qhp->attr.state = C4IW_QP_STATE_ERROR; + set_state(qhp, C4IW_QP_STATE_ERROR); if (!internal) { abort = 1; disconnect = 1; ep = qhp->ep; - c4iw_get_ep(&ep->com); + c4iw_get_ep(&qhp->ep->com); } goto err; break; @@ -1286,8 +1280,8 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, } switch (attrs->next_state) { case C4IW_QP_STATE_IDLE: - flush_qp(qhp, &flag); - qhp->attr.state = C4IW_QP_STATE_IDLE; + flush_qp(qhp); + set_state(qhp, C4IW_QP_STATE_IDLE); qhp->attr.llp_stream_handle = NULL; c4iw_put_ep(&qhp->ep->com); qhp->ep = NULL; @@ -1309,7 +1303,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, ret = -EINVAL; goto out; } - qhp->attr.state = C4IW_QP_STATE_IDLE; + set_state(qhp, C4IW_QP_STATE_IDLE); break; case C4IW_QP_STATE_TERMINATE: if (!internal) { @@ -1335,13 +1329,13 @@ err: if (!ep) ep = qhp->ep; qhp->ep = NULL; - qhp->attr.state = C4IW_QP_STATE_ERROR; + set_state(qhp, C4IW_QP_STATE_ERROR); free = 1; wake_up(&qhp->wait); BUG_ON(!ep); - flush_qp(qhp, &flag); + flush_qp(qhp); out: - spin_unlock_irqrestore(&qhp->lock, flag); + mutex_unlock(&qhp->mutex); if (terminate) post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL); @@ -1363,7 +1357,6 @@ out: */ if (free) c4iw_put_ep(&ep->com); - PDBG("%s exit state %d\n", __func__, qhp->attr.state); return ret; } @@ -1478,6 +1471,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, qhp->attr.max_ord = 1; qhp->attr.max_ird = 1; spin_lock_init(&qhp->lock); + mutex_init(&qhp->mutex); init_waitqueue_head(&qhp->wait); atomic_set(&qhp->refcnt, 1); -- cgit v0.10.2 From 98ae68b7ee6adb75ede42d84eae4032dbb122b81 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 10 Sep 2010 11:15:41 -0500 Subject: RDMA/cxgb4: Set the default TCP send window to 128KB This helps with large IO throughput. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 9b5c3e3..0f68e2b 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -117,9 +117,9 @@ static int rcv_win = 256 * 1024; module_param(rcv_win, int, 0644); MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)"); -static int snd_win = 32 * 1024; +static int snd_win = 128 * 1024; module_param(snd_win, int, 0644); -MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)"); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=128KB)"); static struct workqueue_struct *workq; -- cgit v0.10.2 From 410ade4c26bdf256fea3246e968a12409eb08763 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 17 Sep 2010 15:40:09 -0500 Subject: RDMA/cxgb4: Don't set completion flag for read requests Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 76a286f..ff04e5c 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -639,7 +639,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_opcode = FW_RI_RDMA_READ_WR; swsqe->opcode = FW_RI_READ_REQ; if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) - fw_flags |= FW_RI_RDMA_READ_INVALIDATE; + fw_flags = FW_RI_RDMA_READ_INVALIDATE; else fw_flags = 0; err = build_rdma_read(wqe, wr, &len16); -- cgit v0.10.2 From 40dbf6ee381008e471d3c4a332971247b7799744 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 17 Sep 2010 15:40:15 -0500 Subject: RDMA/cxgb4: Fastreg NSMR fixes - Remove dsgl support - doesn't work in T4. - Wrap the immediate PBL as needed when building it in the wr. - Adjust max pbl depth allowed based on ulptx alignment requirements. - Bump the slots per SQ to 5 to allow up to 128MB fast registers. - Advertise fastreg support by default. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index a49a9c1..81e1277 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -54,9 +54,9 @@ #include "iw_cxgb4.h" -static int fastreg_support; +static int fastreg_support = 1; module_param(fastreg_support, int, 0644); -MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=0)"); +MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)"); static int c4iw_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index ff04e5c..057cb25 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -505,13 +505,15 @@ static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, return 0; } -static int build_fastreg(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) +static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) { struct fw_ri_immd *imdp; __be64 *p; int i; int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32); + int rem; if (wr->wr.fast_reg.page_list_len > T4_MAX_FR_DEPTH) return -EINVAL; @@ -526,32 +528,28 @@ static int build_fastreg(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32); wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff); - if (pbllen > T4_MAX_FR_IMMD) { - struct c4iw_fr_page_list *c4pl = - to_c4iw_fr_page_list(wr->wr.fast_reg.page_list); - struct fw_ri_dsgl *sglp; - - sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1); - sglp->op = FW_RI_DATA_DSGL; - sglp->r1 = 0; - sglp->nsge = cpu_to_be16(1); - sglp->addr0 = cpu_to_be64(c4pl->dma_addr); - sglp->len0 = cpu_to_be32(pbllen); - - *len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *sglp, 16); - } else { - imdp = (struct fw_ri_immd *)(&wqe->fr + 1); - imdp->op = FW_RI_DATA_IMMD; - imdp->r1 = 0; - imdp->r2 = 0; - imdp->immdlen = cpu_to_be32(pbllen); - p = (__be64 *)(imdp + 1); - for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++) - *p = cpu_to_be64( - (u64)wr->wr.fast_reg.page_list->page_list[i]); - *len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *imdp + pbllen, - 16); + WARN_ON(pbllen > T4_MAX_FR_IMMD); + imdp = (struct fw_ri_immd *)(&wqe->fr + 1); + imdp->op = FW_RI_DATA_IMMD; + imdp->r1 = 0; + imdp->r2 = 0; + imdp->immdlen = cpu_to_be32(pbllen); + p = (__be64 *)(imdp + 1); + rem = pbllen; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + *p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]); + rem -= sizeof *p; + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + BUG_ON(rem < 0); + while (rem) { + *p = 0; + rem -= sizeof *p; + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; } + *len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *imdp + pbllen, 16); return 0; } @@ -652,7 +650,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, case IB_WR_FAST_REG_MR: fw_opcode = FW_RI_FR_NSMR_WR; swsqe->opcode = FW_RI_FAST_REGISTER; - err = build_fastreg(wqe, wr, &len16); + err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16); break; case IB_WR_LOCAL_INV: if (wr->send_flags & IB_SEND_FENCE) diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 17ea5fc..7000442 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -66,7 +66,7 @@ struct t4_status_page { #define T4_EQ_ENTRY_SIZE 64 -#define T4_SQ_NUM_SLOTS 4 +#define T4_SQ_NUM_SLOTS 5 #define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS) #define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \ sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) @@ -79,7 +79,7 @@ struct t4_status_page { sizeof(struct fw_ri_rdma_write_wr) - \ sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) #define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \ - sizeof(struct fw_ri_immd))) + sizeof(struct fw_ri_immd)) & ~31UL) #define T4_MAX_FR_DEPTH (T4_MAX_FR_IMMD / sizeof(u64)) #define T4_RQ_NUM_SLOTS 2 -- cgit v0.10.2 From fed1db33fe85573487a4732d628ac5afdb5dc776 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 27 Aug 2010 08:29:38 -0500 Subject: IPoIB: Set pkt_type correctly for multicast packets (fix IGMP breakage) IGMP processing is broken because the IPOIB does not set the skb->pkt_type the right way for multicast traffic. All incoming packets are set to PACKET_HOST which means that igmp_recv() will ignore the IGMP broadcasts/multicasts. This in turn means that the IGMP timers are firing and are sending information about multicast subscriptions unnecessarily. In a large private network this can cause traffic spikes. Signed-off-by: Christoph Lameter Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index ec6b4fb..dfa7190 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -223,6 +223,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; struct sk_buff *skb; u64 mapping[IPOIB_UD_RX_SG]; + union ib_gid *dgid; ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", wr_id, wc->status); @@ -271,6 +272,16 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ipoib_ud_dma_unmap_rx(priv, mapping); ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + /* First byte of dgid signals multicast when 0xff */ + dgid = &((struct ib_grh *)skb->data)->dgid; + + if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + skb_pull(skb, IB_GRH_BYTES); skb->protocol = ((struct ipoib_header *) skb->data)->proto; @@ -281,9 +292,6 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) dev->stats.rx_bytes += skb->len; skb->dev = dev; - /* XXX get correct PACKET_ type here */ - skb->pkt_type = PACKET_HOST; - if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) skb->ip_summed = CHECKSUM_UNNECESSARY; -- cgit v0.10.2 From 0498628f974d4132ba34b06fc24786a00401b79c Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 20 Sep 2010 08:42:38 +0200 Subject: mlx4_core: Allow log_mtts_per_seg up to 7 As there are now machines with a lot more physical memory, we need to be able to register more memory. This patch lifts the upper limit of log_mtts_per_seg from 5 to 7, increasing the amount of memory that can be registered. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 5102ab1..4f6a859 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -103,7 +103,7 @@ MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports " static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); -MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)"); int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) @@ -1304,7 +1304,7 @@ static int __init mlx4_verify_params(void) return -1; } - if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); return -1; } -- cgit v0.10.2 From 4d8d6389b26d8bee06cd706caf1fd3f1ccaa0205 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 Sep 2010 18:32:39 +0200 Subject: RDMA/nes: Remove unneeded variable Just a small cleanup. The "passive_state" variable isn't used any more after commit dae58728dc ("RDMA/nes: Fix double CLOSE event indication crash") Signed-off-by: Dan Carpenter Acked-by: Faisal Latif Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 61e0efd..5c8d34c 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1424,7 +1424,6 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, { int reset = 0; /* whether to send reset in case of err.. */ - int passive_state; atomic_inc(&cm_resets_recvd); nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u." " refcnt=%d\n", cm_node, cm_node->state, @@ -1439,7 +1438,7 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, active_open_err(cm_node, skb, reset); break; case NES_CM_STATE_MPAREQ_RCVD: - passive_state = atomic_add_return(1, &cm_node->passive_state); + atomic_inc(&cm_node->passive_state); dev_kfree_skb_any(skb); break; case NES_CM_STATE_ESTABLISHED: -- cgit v0.10.2 From fc4ec9bd82dbe08499d801f45a1d49e4c7b66b20 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 11 Sep 2010 22:10:55 -0700 Subject: RDMA/amso1100: Remove KERN_ from pr_ use Signed-off-by: Joe Perches Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c index 3b50954..0ebe4e8 100644 --- a/drivers/infiniband/hw/amso1100/c2_intr.c +++ b/drivers/infiniband/hw/amso1100/c2_intr.c @@ -62,8 +62,8 @@ void c2_rnic_interrupt(struct c2_dev *c2dev) static void handle_mq(struct c2_dev *c2dev, u32 mq_index) { if (c2dev->qptr_array[mq_index] == NULL) { - pr_debug(KERN_INFO "handle_mq: stray activity for mq_index=%d\n", - mq_index); + pr_debug("handle_mq: stray activity for mq_index=%d\n", + mq_index); return; } -- cgit v0.10.2 From 557d0540b96176dc42943e84c88c288f523388ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Sep 2010 14:33:31 +0000 Subject: IB/umad: Make user_mad semaphore a real one Get rid of init_MUTEX[_LOCKED]() and use sema_init() instead. Signed-off-by: Thomas Gleixner Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 5fa8569..cd1996d 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1022,7 +1022,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, port->ib_dev = device; port->port_num = port_num; - init_MUTEX(&port->sm_sem); + sema_init(&port->sm_sem, 1); mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); -- cgit v0.10.2 From 625fbd3a36d836efaaee4b6d9c2fcd25e3654624 Mon Sep 17 00:00:00 2001 From: Sonny Rao Date: Fri, 20 Aug 2010 04:10:19 +0000 Subject: IB/ehca: Fix driver on relocatable kernel the eHCA driver registers a MR for all of kernel memory, but makes the assumption that valid memory exists at KERNELBASE. This assumption may not be true in the case of a relocatable kernel, so use KERNELBASE + PHYSICAL_START to get the true beginning of usable kernel memory. cc: Joachim Fenkes cc: Christoph Raisch cc: Hoan-Ham Hguyen Signed-off-by: Sonny Rao Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index 53f4cd4..43cae84 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -171,7 +171,7 @@ struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags) } ret = ehca_reg_maxmr(shca, e_maxmr, - (void *)ehca_map_vaddr((void *)KERNELBASE), + (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)), mr_access_flags, e_pd, &e_maxmr->ib.ib_mr.lkey, &e_maxmr->ib.ib_mr.rkey); @@ -1636,7 +1636,7 @@ int ehca_reg_internal_maxmr( /* register internal max-MR on HCA */ size_maxmr = ehca_mr_len; - iova_start = (u64 *)ehca_map_vaddr((void *)KERNELBASE); + iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)); ib_pbuf.addr = 0; ib_pbuf.size = size_maxmr; num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr, @@ -2209,7 +2209,7 @@ int ehca_mr_is_maxmr(u64 size, { /* a MR is treated as max-MR only if it fits following: */ if ((size == ehca_mr_len) && - (iova_start == (void *)ehca_map_vaddr((void *)KERNELBASE))) { + (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) { ehca_gen_dbg("this is a max-MR"); return 1; } else -- cgit v0.10.2 From 293277215636e1586fcf6386b0b64457c6dfdb68 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Mon, 23 Aug 2010 14:32:36 +0000 Subject: RDMA/nes: Report correct port state if interface is down With commit cd6860eb ("RDMA/nes: Fix hangs on ifdown") we no longer remove nes interfaces on ifdown. On nes_query_port(), add an additional check of the netdev queue and report IB_PORT_DOWN if the queue is not running. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index b89972c..2374efbd 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -584,7 +584,9 @@ static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr props->lmc = 0; props->sm_lid = 0; props->sm_sl = 0; - if (nesvnic->linkup) + if (netif_queue_stopped(netdev)) + props->state = IB_PORT_DOWN; + else if (nesvnic->linkup) props->state = IB_PORT_ACTIVE; else props->state = IB_PORT_DOWN; -- cgit v0.10.2 From 52106bd24c8d5e8a26b98ad93a755b0827029860 Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Wed, 1 Sep 2010 13:35:35 +0000 Subject: RDMA/nes: Turn carrier off on ifdown This lets the bonding driver to detect when an interface goes down. Signed-off-by: Maciej Sosnowski Signed-off-by: Chien Tung Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 10560c7..3892e2c 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -271,6 +271,7 @@ static int nes_netdev_stop(struct net_device *netdev) if (netif_msg_ifdown(nesvnic)) printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name); + netif_carrier_off(netdev); /* Disable network packets */ napi_disable(&nesvnic->napi); -- cgit v0.10.2 From 0f2f930a67c763a71aacfdbc76de9a76de7d1a9e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 5 Sep 2010 01:52:43 +0000 Subject: IB/qib: Remove unnecessary casts of private_data Signed-off-by: Joe Perches Acked-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 6b11645..cef5d67 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -1722,7 +1722,7 @@ static int qib_close(struct inode *in, struct file *fp) mutex_lock(&qib_mutex); - fd = (struct qib_filedata *) fp->private_data; + fd = fp->private_data; fp->private_data = NULL; rcd = fd->rcd; if (!rcd) { @@ -1808,7 +1808,7 @@ static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo) struct qib_ctxtdata *rcd = ctxt_fp(fp); struct qib_filedata *fd; - fd = (struct qib_filedata *) fp->private_data; + fd = fp->private_data; info.num_active = qib_count_active_units(); info.unit = rcd->dd->unit; -- cgit v0.10.2 From fb50a83d8c7cf8c6548166fe6775ad4f41e5ab72 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 7 Oct 2010 09:54:53 -0700 Subject: MAINTAINERS: Hand off SCSI RDMA Protocol (SRP) initiator to Dave Dillow Signed-off-by: Roland Dreier diff --git a/MAINTAINERS b/MAINTAINERS index 668682d..68aa1ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5035,6 +5035,16 @@ W: http://www.kernel.dk S: Maintained F: drivers/scsi/sr* +SCSI RDMA PROTOCOL (SRP) INITIATOR +M: David Dillow +L: linux-rdma@vger.kernel.org +S: Supported +W: http://www.openfabrics.org +Q: http://patchwork.kernel.org/project/linux-rdma/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/dad/srp-initiator.git +F: drivers/infiniband/ulp/srp/ +F: include/scsi/srp.h + SCSI SG DRIVER M: Doug Gilbert L: linux-scsi@vger.kernel.org -- cgit v0.10.2 From 5a0fd09428e47fb08d5a887515d92bb2447f4b65 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 7 Oct 2010 16:24:16 +0200 Subject: IB/mlx4: Limit size of fast registration WRs Fix the limit on the size of max fast registration WRs that can be posted to match hardware capabilities. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 4e94e36..62e8cd6 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -135,7 +135,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; - props->max_fast_reg_page_list_len = PAGE_SIZE / sizeof (u64); + props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 1d27b9a..dca55b1 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -226,7 +226,7 @@ struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device struct mlx4_ib_fast_reg_page_list *mfrpl; int size = page_list_len * sizeof (u64); - if (size > PAGE_SIZE) + if (page_list_len > MLX4_MAX_FAST_REG_PAGES) return ERR_PTR(-EINVAL); mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7a7f9c1..ada6938 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -171,6 +171,10 @@ enum { MLX4_NUM_FEXCH = 64 * 1024, }; +enum { + MLX4_MAX_FAST_REG_PAGES = 511, +}; + static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) { return (major << 32) | (minor << 16) | subminor; -- cgit v0.10.2 From 8bbac892fb75d20fa274ca026e24faf00afbf9dd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 29 Sep 2010 14:11:12 +0000 Subject: RDMA/cxgb4: Add default_llseek to debugfs files Incorporate BKL removal changes. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 986cfd7..b254357 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -182,6 +182,7 @@ static const struct file_operations qp_debugfs_fops = { .open = qp_open, .release = qp_release, .read = debugfs_read, + .llseek = default_llseek, }; static int dump_stag(int id, void *p, void *data) @@ -255,6 +256,7 @@ static const struct file_operations stag_debugfs_fops = { .open = stag_open, .release = stag_release, .read = debugfs_read, + .llseek = default_llseek, }; static int setup_debugfs(struct c4iw_dev *devp) -- cgit v0.10.2 From 3160977a6e66ea4c4b4f33010f5d04f0004b938c Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 29 Sep 2010 18:21:33 +0000 Subject: RDMA/cxgb4: Use simple_read_from_buffer() for debugfs handlers We can replace our equivalent open-coded version. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index b254357..22a290d 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -68,32 +68,8 @@ static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct c4iw_debugfs_data *d = file->private_data; - loff_t pos = *ppos; - loff_t avail = d->pos; - if (pos < 0) - return -EINVAL; - if (pos >= avail) - return 0; - if (count > avail - pos) - count = avail - pos; - - while (count) { - size_t len = 0; - - len = min((int)count, (int)d->pos - (int)pos); - if (copy_to_user(buf, d->buf + pos, len)) - return -EFAULT; - if (len == 0) - return -EINVAL; - - buf += len; - pos += len; - count -= len; - } - count = pos - *ppos; - *ppos = pos; - return count; + return simple_read_from_buffer(buf, count, ppos, d->buf, d->pos); } static int dump_qp(int id, void *p, void *data) -- cgit v0.10.2 From 26012f0750dd73348b0a0a680a4bee2715d4a334 Mon Sep 17 00:00:00 2001 From: Animesh K Trivedi Date: Tue, 28 Sep 2010 14:44:02 +0000 Subject: RDMA/iwcm: Fix hang in uninterruptible wait on cm_id destroy A process can get stuck in an uninterruptible wait in the kernel while destroying a cm_id when iw_cm_connect() fails: For example, When creation of a PD fails but the user continues with an attempt to connect to the server without checking the return value, in iw_cm_connect() a NULL qp is found so the call fails. However the IWCM_F_CONNECT_WAIT bit is not cleared. destroy_cm_id() then waits forever for IWCM_F_CONNECT_WAIT to be cleared. The same problem exists on the passive side with the accept call. Fix this by clearing the bit and waking up any waiters in the appropriate spots. Signed-off-by: Animesh Trivedi Acked-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index bfead5b..2a1e9ae 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -506,6 +506,8 @@ int iw_cm_accept(struct iw_cm_id *cm_id, qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); @@ -565,6 +567,8 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); -- cgit v0.10.2 From 7b4c876961ad6ddcfacd69b25fe7e13ff41fe322 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 27 Sep 2010 17:51:11 -0700 Subject: IPoIB: Skip IBoE ports IPoIB is IP-over-Infiniband link layer. In the case of IBoE, the link layer is Ethernet and IP can work directly over Ethernet, so disable IPoIB for non-IB_LINK_LAYER_INFINIBAND ports. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index b4b2257..9141779 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1362,6 +1362,8 @@ static void ipoib_add_one(struct ib_device *device) } for (p = s; p <= e; ++p) { + if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; dev = ipoib_add_port("ib%d", device, p); if (!IS_ERR(dev)) { priv = netdev_priv(dev); -- cgit v0.10.2 From fac70d51914674ce8ae742ed73441ddb4770ad20 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 27 Sep 2010 17:51:11 -0700 Subject: IB/mad: IBoE supports only QP1 (no QP0) Since IBoE is using Ethernet as its link layer, there is no central management entity so there is need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will skip QP0 and create only QP1 for IBoE ports. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..91916a8 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -59,8 +59,8 @@ __ib_get_agent_port(struct ib_device *device, int port_num) struct ib_agent_port_private *entry; list_for_each_entry(entry, &ib_agent_port_list, port_list) { - if (entry->agent[0]->device == device && - entry->agent[0]->port_num == port_num) + if (entry->agent[1]->device == device && + entry->agent[1]->port_num == port_num) return entry; } return NULL; @@ -155,14 +155,16 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error1; } - /* Obtain send only MAD agent for SMI QP */ - port_priv->agent[0] = ib_register_mad_agent(device, port_num, - IB_QPT_SMI, NULL, 0, - &agent_send_handler, - NULL, NULL); - if (IS_ERR(port_priv->agent[0])) { - ret = PTR_ERR(port_priv->agent[0]); - goto error2; + if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) { + /* Obtain send only MAD agent for SMI QP */ + port_priv->agent[0] = ib_register_mad_agent(device, port_num, + IB_QPT_SMI, NULL, 0, + &agent_send_handler, + NULL, NULL); + if (IS_ERR(port_priv->agent[0])) { + ret = PTR_ERR(port_priv->agent[0]); + goto error2; + } } /* Obtain send only MAD agent for GSI QP */ @@ -182,7 +184,8 @@ int ib_agent_port_open(struct ib_device *device, int port_num) return 0; error3: - ib_unregister_mad_agent(port_priv->agent[0]); + if (port_priv->agent[0]) + ib_unregister_mad_agent(port_priv->agent[0]); error2: kfree(port_priv); error1: @@ -205,7 +208,9 @@ int ib_agent_port_close(struct ib_device *device, int port_num) spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); ib_unregister_mad_agent(port_priv->agent[1]); - ib_unregister_mad_agent(port_priv->agent[0]); + if (port_priv->agent[0]) + ib_unregister_mad_agent(port_priv->agent[0]); + kfree(port_priv); return 0; } diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ef1304f..822cfdc 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2598,6 +2598,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info->qp) + return; + while (!list_empty(&qp_info->recv_queue.list)) { mad_list = list_entry(qp_info->recv_queue.list.next, @@ -2639,6 +2642,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) for (i = 0; i < IB_MAD_QPS_CORE; i++) { qp = port_priv->qp_info[i].qp; + if (!qp) + continue; + /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition @@ -2680,6 +2686,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) } for (i = 0; i < IB_MAD_QPS_CORE; i++) { + if (!port_priv->qp_info[i].qp) + continue; + ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); if (ret) { printk(KERN_ERR PFX "Couldn't post receive WRs\n"); @@ -2758,6 +2767,9 @@ error: static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { + if (!qp_info->qp) + return; + ib_destroy_qp(qp_info->qp); kfree(qp_info->snoop_table); } @@ -2773,6 +2785,7 @@ static int ib_mad_port_open(struct ib_device *device, struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof "ib_mad123"]; + int has_smi; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); @@ -2788,7 +2801,11 @@ static int ib_mad_port_open(struct ib_device *device, init_mad_qp(port_priv, &port_priv->qp_info[0]); init_mad_qp(port_priv, &port_priv->qp_info[1]); - cq_size = (mad_sendq_size + mad_recvq_size) * 2; + cq_size = mad_sendq_size + mad_recvq_size; + has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND; + if (has_smi) + cq_size *= 2; + port_priv->cq = ib_create_cq(port_priv->device, ib_mad_thread_completion_handler, NULL, port_priv, cq_size, 0); @@ -2812,9 +2829,11 @@ static int ib_mad_port_open(struct ib_device *device, goto error5; } - ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); - if (ret) - goto error6; + if (has_smi) { + ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); + if (ret) + goto error6; + } ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); if (ret) goto error7; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index a519801..68b4162 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -774,6 +774,10 @@ static void mcast_event_handler(struct ib_event_handler *handler, int index; dev = container_of(handler, struct mcast_device, event_handler); + if (rdma_port_get_link_layer(dev->device, event->element.port_num) != + IB_LINK_LAYER_INFINIBAND) + return; + index = event->element.port_num - dev->start_port; switch (event->event) { @@ -796,6 +800,7 @@ static void mcast_add_one(struct ib_device *device) struct mcast_device *dev; struct mcast_port *port; int i; + int count = 0; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; @@ -813,6 +818,9 @@ static void mcast_add_one(struct ib_device *device) } for (i = 0; i <= dev->end_port - dev->start_port; i++) { + if (rdma_port_get_link_layer(device, dev->start_port + i) != + IB_LINK_LAYER_INFINIBAND) + continue; port = &dev->port[i]; port->dev = dev; port->port_num = dev->start_port + i; @@ -820,6 +828,12 @@ static void mcast_add_one(struct ib_device *device) port->table = RB_ROOT; init_completion(&port->comp); atomic_set(&port->refcount, 1); + ++count; + } + + if (!count) { + kfree(dev); + return; } dev->device = device; @@ -843,9 +857,12 @@ static void mcast_remove_one(struct ib_device *device) flush_workqueue(mcast_wq); for (i = 0; i <= dev->end_port - dev->start_port; i++) { - port = &dev->port[i]; - deref_port(port); - wait_for_completion(&port->comp); + if (rdma_port_get_link_layer(device, dev->start_port + i) == + IB_LINK_LAYER_INFINIBAND) { + port = &dev->port[i]; + deref_port(port); + wait_for_completion(&port->comp); + } } kfree(dev); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7e1ffd8c..27674c7 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -416,6 +416,9 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event struct ib_sa_port *port = &sa_dev->port[event->element.port_num - sa_dev->start_port]; + if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND) + return; + spin_lock_irqsave(&port->ah_lock, flags); if (port->sm_ah) kref_put(&port->sm_ah->ref, free_sm_ah); @@ -1007,7 +1010,7 @@ static void ib_sa_add_one(struct ib_device *device) e = device->phys_port_cnt; } - sa_dev = kmalloc(sizeof *sa_dev + + sa_dev = kzalloc(sizeof *sa_dev + (e - s + 1) * sizeof (struct ib_sa_port), GFP_KERNEL); if (!sa_dev) @@ -1017,9 +1020,12 @@ static void ib_sa_add_one(struct ib_device *device) sa_dev->end_port = e; for (i = 0; i <= e - s; ++i) { + spin_lock_init(&sa_dev->port[i].ah_lock); + if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND) + continue; + sa_dev->port[i].sm_ah = NULL; sa_dev->port[i].port_num = i + s; - spin_lock_init(&sa_dev->port[i].ah_lock); sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, @@ -1045,13 +1051,15 @@ static void ib_sa_add_one(struct ib_device *device) goto err; for (i = 0; i <= e - s; ++i) - update_sm_ah(&sa_dev->port[i].update_task); + if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + update_sm_ah(&sa_dev->port[i].update_task); return; err: while (--i >= 0) - ib_unregister_mad_agent(sa_dev->port[i].agent); + if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + ib_unregister_mad_agent(sa_dev->port[i].agent); kfree(sa_dev); @@ -1071,9 +1079,12 @@ static void ib_sa_remove_one(struct ib_device *device) flush_scheduled_work(); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { - ib_unregister_mad_agent(sa_dev->port[i].agent); - if (sa_dev->port[i].sm_ah) - kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); + if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { + ib_unregister_mad_agent(sa_dev->port[i].agent); + if (sa_dev->port[i].sm_ah) + kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); + } + } kfree(sa_dev); -- cgit v0.10.2 From 3c86aa70bf677a31b71c8292e349242e26cbc743 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 13 Oct 2010 21:26:51 +0200 Subject: RDMA/cm: Add RDMA CM support for IBoE devices Add support for IBoE device binding and IP --> GID resolution. Path resolving and multicast joining are implemented within cma.c by filling in the responses and running callbacks in the CMA work queue. IP --> GID resolution always yields IPv6 link local addresses; remote GIDs are derived from the destination MAC address of the remote port. Multicast GIDs are always mapped to multicast MACs as is done in IPv6. (IPv4 multicast is enabled by translating IPv4 multicast addresses to IPv6 multicast as described in .) Some helper functions are added to ib_addr.h. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index b930b81..f61bc07 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -59,6 +59,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) +#define CMA_IBOE_PACKET_LIFETIME 18 static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -157,6 +158,7 @@ struct cma_multicast { struct list_head list; void *context; struct sockaddr_storage addr; + struct kref mcref; }; struct cma_work { @@ -173,6 +175,12 @@ struct cma_ndev_work { struct rdma_cm_event event; }; +struct iboe_mcast_work { + struct work_struct work; + struct rdma_id_private *id; + struct cma_multicast *mc; +}; + union cma_ip_addr { struct in6_addr ip6; struct { @@ -281,6 +289,8 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv, atomic_inc(&cma_dev->refcount); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; + id_priv->id.route.addr.dev_addr.transport = + rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } @@ -290,6 +300,14 @@ static inline void cma_deref_dev(struct cma_device *cma_dev) complete(&cma_dev->comp); } +static inline void release_mc(struct kref *kref) +{ + struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); + + kfree(mc->multicast.ib); + kfree(mc); +} + static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(&id_priv->list); @@ -323,22 +341,63 @@ static int cma_set_qkey(struct rdma_id_private *id_priv) return ret; } +static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) +{ + int i; + int err; + struct ib_port_attr props; + union ib_gid tmp; + + err = ib_query_port(device, port_num, &props); + if (err) + return 1; + + for (i = 0; i < props.gid_tbl_len; ++i) { + err = ib_query_gid(device, port_num, i, &tmp); + if (err) + return 1; + if (!memcmp(&tmp, gid, sizeof tmp)) + return 0; + } + + return -EAGAIN; +} + static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid; + union ib_gid gid, iboe_gid; int ret = -ENODEV; + u8 port; + enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; - rdma_addr_get_sgid(dev_addr, &gid); + iboe_addr_get_sgid(dev_addr, &iboe_gid); + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof gid); list_for_each_entry(cma_dev, &dev_list, list) { - ret = ib_find_cached_gid(cma_dev->device, &gid, - &id_priv->id.port_num, NULL); - if (!ret) { - cma_attach_to_dev(id_priv, cma_dev); - break; + for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { + if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) + ret = find_gid_port(cma_dev->device, &iboe_gid, port); + else + ret = find_gid_port(cma_dev->device, &gid, port); + + if (!ret) { + id_priv->id.port_num = port; + goto out; + } else if (ret == 1) + break; + } } } + +out: + if (!ret) + cma_attach_to_dev(id_priv, cma_dev); + return ret; } @@ -556,10 +615,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; + u16 pkey; + + if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == + IB_LINK_LAYER_INFINIBAND) + pkey = ib_addr_get_pkey(dev_addr); + else + pkey = 0xffff; ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, - ib_addr_get_pkey(dev_addr), - &qp_attr->pkey_index); + pkey, &qp_attr->pkey_index); if (ret) return ret; @@ -737,8 +802,8 @@ static inline int cma_user_data_offset(enum rdma_port_space ps) static void cma_cancel_route(struct rdma_id_private *id_priv) { - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: + switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { + case IB_LINK_LAYER_INFINIBAND: if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); break; @@ -816,8 +881,17 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); + switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_put(&mc->mcref, release_mc); + break; + default: + break; + } } } @@ -833,7 +907,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_lock(&lock); if (id_priv->cma_dev) { mutex_unlock(&lock); - switch (rdma_node_get_transport(id->device->node_type)) { + switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) ib_destroy_cm_id(id_priv->cm_id.ib); @@ -1708,6 +1782,77 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) return 0; } +static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + struct rdma_addr *addr = &route->addr; + struct cma_work *work; + int ret; + struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; + struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; + struct net_device *ndev = NULL; + + if (src_addr->sin_family != dst_addr->sin_family) + return -EINVAL; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + + route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + route->num_paths = 1; + + iboe_mac_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr); + iboe_mac_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr); + + route->path_rec->hop_limit = 1; + route->path_rec->reversible = 1; + route->path_rec->pkey = cpu_to_be16(0xffff); + route->path_rec->mtu_selector = IB_SA_EQ; + + if (addr->dev_addr.bound_dev_if) + ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + + route->path_rec->mtu = iboe_get_mtu(ndev->mtu); + route->path_rec->rate_selector = IB_SA_EQ; + route->path_rec->rate = iboe_get_rate(ndev); + dev_put(ndev); + route->path_rec->packet_life_time_selector = IB_SA_EQ; + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + if (!route->path_rec->mtu) { + ret = -EINVAL; + goto err2; + } + + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + work->event.status = 0; + + queue_work(cma_wq, &work->work); + + return 0; + +err2: + kfree(route->path_rec); + route->path_rec = NULL; +err1: + kfree(work); + return ret; +} + int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct rdma_id_private *id_priv; @@ -1720,7 +1865,16 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) atomic_inc(&id_priv->refcount); switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - ret = cma_resolve_ib_route(id_priv, timeout_ms); + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ret = cma_resolve_ib_route(id_priv, timeout_ms); + break; + case IB_LINK_LAYER_ETHERNET: + ret = cma_resolve_iboe_route(id_priv); + break; + default: + ret = -ENOSYS; + } break; case RDMA_TRANSPORT_IWARP: ret = cma_resolve_iw_route(id_priv, timeout_ms); @@ -1773,7 +1927,7 @@ port_found: goto out; id_priv->id.route.addr.dev_addr.dev_type = - (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB) ? + (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); @@ -2758,6 +2912,102 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, return 0; } +static void iboe_mcast_work_handler(struct work_struct *work) +{ + struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); + struct cma_multicast *mc = mw->mc; + struct ib_sa_multicast *m = mc->multicast.ib; + + mc->multicast.ib->context = mc; + cma_ib_mc_handler(0, m); + kref_put(&mc->mcref, release_mc); + kfree(mw); +} + +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if (addr->sa_family == AF_INET6) { + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else { + mgid->raw[0] = 0xff; + mgid->raw[1] = 0x0e; + mgid->raw[2] = 0; + mgid->raw[3] = 0; + mgid->raw[4] = 0; + mgid->raw[5] = 0; + mgid->raw[6] = 0; + mgid->raw[7] = 0; + mgid->raw[8] = 0; + mgid->raw[9] = 0; + mgid->raw[10] = 0xff; + mgid->raw[11] = 0xff; + *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; + } +} + +static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct iboe_mcast_work *work; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int err; + struct sockaddr *addr = (struct sockaddr *)&mc->addr; + struct net_device *ndev = NULL; + + if (cma_zero_addr((struct sockaddr *)&mc->addr)) + return -EINVAL; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); + if (!mc->multicast.ib) { + err = -ENOMEM; + goto out1; + } + + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); + + mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); + if (id_priv->id.ps == RDMA_PS_UDP) + mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!ndev) { + err = -ENODEV; + goto out2; + } + mc->multicast.ib->rec.rate = iboe_get_rate(ndev); + mc->multicast.ib->rec.hop_limit = 1; + mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + dev_put(ndev); + if (!mc->multicast.ib->rec.mtu) { + err = -EINVAL; + goto out2; + } + iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid); + work->id = id_priv; + work->mc = mc; + INIT_WORK(&work->work, iboe_mcast_work_handler); + kref_get(&mc->mcref); + queue_work(cma_wq, &work->work); + + return 0; + +out2: + kfree(mc->multicast.ib); +out1: + kfree(work); + return err; +} + int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { @@ -2784,7 +3034,17 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - ret = cma_join_ib_multicast(id_priv, mc); + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ret = cma_join_ib_multicast(id_priv, mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_init(&mc->mcref); + ret = cma_iboe_join_multicast(id_priv, mc); + break; + default: + ret = -EINVAL; + } break; default: ret = -ENOSYS; @@ -2817,8 +3077,19 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, mc->multicast.ib->rec.mlid); - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); + if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_put(&mc->mcref, release_mc); + break; + default: + break; + } + } return; } } diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 27674c7..91a6603 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -496,6 +496,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, { int ret; u16 gid_index; + int force_grh; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); @@ -505,7 +506,9 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - if (rec->hop_limit > 1) { + force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET; + + if (rec->hop_limit > 1 || force_grh) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ac7edc2..3d3c926 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -583,6 +583,34 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, } } +static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + resp->num_paths = route->num_paths; + switch (route->num_paths) { + case 0: + dev_addr = &route->addr.dev_addr; + iboe_mac_to_ll((union ib_gid *) &resp->ib_route[0].dgid, + dev_addr->dst_dev_addr); + iboe_addr_get_sgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].sgid); + resp->ib_route[0].pkey = cpu_to_be16(0xffff); + break; + case 2: + ib_copy_path_rec_to_user(&resp->ib_route[1], + &route->path_rec[1]); + /* fall through */ + case 1: + ib_copy_path_rec_to_user(&resp->ib_route[0], + &route->path_rec[0]); + break; + default: + break; + } +} + static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -617,12 +645,17 @@ static ssize_t ucma_query_route(struct ucma_file *file, resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.port_num = ctx->cm_id->port_num; - switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { - case RDMA_TRANSPORT_IB: - ucma_copy_ib_route(&resp, &ctx->cm_id->route); - break; - default: - break; + if (rdma_node_get_transport(ctx->cm_id->device->node_type) == RDMA_TRANSPORT_IB) { + switch (rdma_port_get_link_layer(ctx->cm_id->device, ctx->cm_id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ucma_copy_ib_route(&resp, &ctx->cm_id->route); + break; + case IB_LINK_LAYER_ETHERNET: + ucma_copy_iboe_route(&resp, &ctx->cm_id->route); + break; + default: + break; + } } out: diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index fa0d52b..904ffa9 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -40,6 +40,7 @@ #include #include #include +#include struct rdma_addr_client { atomic_t refcount; @@ -63,6 +64,7 @@ struct rdma_dev_addr { unsigned char broadcast[MAX_ADDR_LEN]; unsigned short dev_type; int bound_dev_if; + enum rdma_transport_type transport; }; /** @@ -127,9 +129,31 @@ static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } +static inline void iboe_mac_to_ll(union ib_gid *gid, u8 *mac) +{ + memset(gid->raw, 0, 16); + *((__be32 *) gid->raw) = cpu_to_be32(0xfe800000); + gid->raw[12] = 0xfe; + gid->raw[11] = 0xff; + memcpy(gid->raw + 13, mac + 3, 3); + memcpy(gid->raw + 8, mac, 3); + gid->raw[8] ^= 2; +} + +static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + iboe_mac_to_ll(gid, dev_addr->src_dev_addr); +} + static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); + if (dev_addr->transport == RDMA_TRANSPORT_IB && + dev_addr->dev_type != ARPHRD_INFINIBAND) + iboe_addr_get_sgid(dev_addr, gid); + else + memcpy(gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) @@ -147,4 +171,77 @@ static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_g memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } +static inline enum ib_mtu iboe_get_mtu(int mtu) +{ + /* + * reduce IB headers from effective IBoE MTU. 28 stands for + * atomic header which is the biggest possible header after BTH + */ + mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28; + + if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) + return IB_MTU_4096; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) + return IB_MTU_2048; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) + return IB_MTU_1024; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) + return IB_MTU_512; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) + return IB_MTU_256; + else + return 0; +} + +static inline int iboe_get_rate(struct net_device *dev) +{ + struct ethtool_cmd cmd; + + if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings || + dev->ethtool_ops->get_settings(dev, &cmd)) + return IB_RATE_PORT_CURRENT; + + if (cmd.speed >= 40000) + return IB_RATE_40_GBPS; + else if (cmd.speed >= 30000) + return IB_RATE_30_GBPS; + else if (cmd.speed >= 20000) + return IB_RATE_20_GBPS; + else if (cmd.speed >= 10000) + return IB_RATE_10_GBPS; + else + return IB_RATE_PORT_CURRENT; +} + +static inline int rdma_link_local_addr(struct in6_addr *addr) +{ + if (addr->s6_addr32[0] == htonl(0xfe800000) && + addr->s6_addr32[1] == 0) + return 1; + + return 0; +} + +static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) +{ + memcpy(mac, &addr->s6_addr[8], 3); + memcpy(mac + 3, &addr->s6_addr[13], 3); + mac[0] ^= 2; +} + +static inline int rdma_is_multicast_addr(struct in6_addr *addr) +{ + return addr->s6_addr[0] == 0xff; +} + +static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) +{ + int i; + + mac[0] = 0x33; + mac[1] = 0x33; + for (i = 2; i < 6; ++i) + mac[i] = addr->s6_addr[i + 10]; +} + #endif /* IB_ADDR_H */ -- cgit v0.10.2 From ff7f5aab354dee01f29c9c00933f6d4aa590eadb Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 26 Aug 2010 14:17:56 +0000 Subject: IB/pack: IBoE UD packet packing support Add support for packing IBoE packet headers. Signed-off-by: Eli Cohen [ Clean up and fix ib_ud_header_init() a bit. - Roland ] Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 650b501..cb0dd5a 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -80,6 +80,29 @@ static const struct ib_field lrh_table[] = { .size_bits = 16 } }; +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -180,38 +203,38 @@ static const struct ib_field deth_table[] = { /** * ib_ud_header_init - Initialize UD header structure * @payload_bytes:Length of packet payload + * @lrh_present: specify if LRH is present + * @eth_present: specify if Eth header is present * @grh_present:GRH flag (if non-zero, GRH will be included) - * @immediate_present: specify if immediate data should be used + * @immediate_present: specify if immediate data is present * @header:Structure to initialize - * - * ib_ud_header_init() initializes the lrh.link_version, lrh.link_next_header, - * lrh.packet_length, grh.ip_version, grh.payload_length, - * grh.next_header, bth.opcode, bth.pad_count and - * bth.transport_header_version fields of a &struct ib_ud_header given - * the payload length and whether a GRH will be included. */ void ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, int grh_present, int immediate_present, struct ib_ud_header *header) { - u16 packet_length; - memset(header, 0, sizeof *header); - header->lrh.link_version = 0; - header->lrh.link_next_header = - grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL; - packet_length = (IB_LRH_BYTES + - IB_BTH_BYTES + - IB_DETH_BYTES + - payload_bytes + - 4 + /* ICRC */ - 3) / 4; /* round up */ - - header->grh_present = grh_present; + if (lrh_present) { + u16 packet_length; + + header->lrh.link_version = 0; + header->lrh.link_next_header = + grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL; + packet_length = (IB_LRH_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + (grh_present ? IB_GRH_BYTES : 0) + + payload_bytes + + 4 + /* ICRC */ + 3) / 4; /* round up */ + header->lrh.packet_length = cpu_to_be16(packet_length); + } + if (grh_present) { - packet_length += IB_GRH_BYTES / 4; header->grh.ip_version = 6; header->grh.payload_length = cpu_to_be16((IB_BTH_BYTES + @@ -222,19 +245,51 @@ void ib_ud_header_init(int payload_bytes, header->grh.next_header = 0x1b; } - header->lrh.packet_length = cpu_to_be16(packet_length); - - header->immediate_present = immediate_present; if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; else header->bth.opcode = IB_OPCODE_UD_SEND_ONLY; header->bth.pad_count = (4 - payload_bytes) & 3; header->bth.transport_header_version = 0; + + header->lrh_present = lrh_present; + header->eth_present = eth_present; + header->grh_present = grh_present; + header->immediate_present = immediate_present; } EXPORT_SYMBOL(ib_ud_header_init); /** + * ib_lrh_header_pack - Pack LRH header struct into wire format + * @lrh:unpacked LRH header struct + * @buf:Buffer to pack into + * + * ib_lrh_header_pack() packs the LRH header structure @lrh into + * wire format in the buffer @buf. + */ +int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf) +{ + ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf); + return 0; +} +EXPORT_SYMBOL(ib_lrh_header_pack); + +/** + * ib_lrh_header_unpack - Unpack LRH structure from wire format + * @lrh:unpacked LRH header struct + * @buf:Buffer to pack into + * + * ib_lrh_header_unpack() unpacks the LRH header structure from + * wire format (in buf) into @lrh. + */ +int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh) +{ + ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh); + return 0; +} +EXPORT_SYMBOL(ib_lrh_header_unpack); + +/** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into @@ -247,10 +302,16 @@ int ib_ud_header_pack(struct ib_ud_header *header, { int len = 0; - ib_pack(lrh_table, ARRAY_SIZE(lrh_table), - &header->lrh, buf); - len += IB_LRH_BYTES; - + if (header->lrh_present) { + ib_pack(lrh_table, ARRAY_SIZE(lrh_table), + &header->lrh, buf + len); + len += IB_LRH_BYTES; + } + if (header->eth_present) { + ib_pack(eth_table, ARRAY_SIZE(eth_table), + &header->eth, buf + len); + len += IB_ETH_BYTES; + } if (header->grh_present) { ib_pack(grh_table, ARRAY_SIZE(grh_table), &header->grh, buf + len); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 6a60827..bb1277c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1231,7 +1231,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, for (i = 0; i < wr->num_sge; ++i) send_size += wr->sg_list[i].length; - ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), 0, &sqp->ud_header); + ib_ud_header_init(send_size, 1, 0, mlx4_ib_ah_grh_present(ah), 0, &sqp->ud_header); sqp->ud_header.lrh.service_level = be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index d2d172e..1a1c55f 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1493,7 +1493,7 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, int err; u16 pkey; - ib_ud_header_init(256, /* assume a MAD */ + ib_ud_header_init(256, /* assume a MAD */ 1, 0, mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), 0, &sqp->ud_header); diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index cbb50f4..6b91d8e 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -37,6 +37,7 @@ enum { IB_LRH_BYTES = 8, + IB_ETH_BYTES = 14, IB_GRH_BYTES = 40, IB_BTH_BYTES = 12, IB_DETH_BYTES = 8 @@ -210,14 +211,25 @@ struct ib_unpacked_deth { __be32 source_qpn; }; +struct ib_unpacked_eth { + u8 dmac_h[4]; + u8 dmac_l[2]; + u8 smac_h[2]; + u8 smac_l[4]; + __be16 type; +}; + struct ib_ud_header { + int lrh_present; struct ib_unpacked_lrh lrh; - int grh_present; - struct ib_unpacked_grh grh; - struct ib_unpacked_bth bth; + int eth_present; + struct ib_unpacked_eth eth; + int grh_present; + struct ib_unpacked_grh grh; + struct ib_unpacked_bth bth; struct ib_unpacked_deth deth; - int immediate_present; - __be32 immediate_data; + int immediate_present; + __be32 immediate_data; }; void ib_pack(const struct ib_field *desc, @@ -230,9 +242,11 @@ void ib_unpack(const struct ib_field *desc, void *buf, void *structure); -void ib_ud_header_init(int payload_bytes, - int grh_present, - int immediate_present, +void ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int grh_present, + int immediate_present, struct ib_ud_header *header); int ib_ud_header_pack(struct ib_ud_header *header, -- cgit v0.10.2 From de5dd81b49c27c7818492be0746bfed6ac3b1c8d Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 18 Oct 2010 15:16:40 +0000 Subject: RDMA/cxgb4: Export T4 TCP MIB Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 81e1277..f66dd8b 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -391,7 +391,17 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr, static int c4iw_get_mib(struct ib_device *ibdev, union rdma_protocol_stats *stats) { - return -ENOSYS; + struct tp_tcp_stats v4, v6; + struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev); + + cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6); + memset(stats, 0, sizeof *stats); + stats->iw.tcpInSegs = v4.tcpInSegs + v6.tcpInSegs; + stats->iw.tcpOutSegs = v4.tcpOutSegs + v6.tcpOutSegs; + stats->iw.tcpRetransSegs = v4.tcpRetransSegs + v6.tcpRetransSegs; + stats->iw.tcpOutRsts = v4.tcpOutRsts + v6.tcpOutSegs; + + return 0; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -- cgit v0.10.2 From da411ba1daf895bdae9420101e8e2741d6633342 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 18 Oct 2010 15:16:45 +0000 Subject: RDMA/cxgb4: Use cxgb4 service for packet gl to skb Remove the local service t4_pktgl_to_skb() and use cxgb4_pktgl_to_skb() exported by cxgb4. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 22a290d..54fbc11 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -454,46 +454,6 @@ out: return dev; } -static struct sk_buff *t4_pktgl_to_skb(const struct pkt_gl *gl, - unsigned int skb_len, - unsigned int pull_len) -{ - struct sk_buff *skb; - struct skb_shared_info *ssi; - - if (gl->tot_len <= 512) { - skb = alloc_skb(gl->tot_len, GFP_ATOMIC); - if (unlikely(!skb)) - goto out; - __skb_put(skb, gl->tot_len); - skb_copy_to_linear_data(skb, gl->va, gl->tot_len); - } else { - skb = alloc_skb(skb_len, GFP_ATOMIC); - if (unlikely(!skb)) - goto out; - __skb_put(skb, pull_len); - skb_copy_to_linear_data(skb, gl->va, pull_len); - - ssi = skb_shinfo(skb); - ssi->frags[0].page = gl->frags[0].page; - ssi->frags[0].page_offset = gl->frags[0].page_offset + pull_len; - ssi->frags[0].size = gl->frags[0].size - pull_len; - if (gl->nfrags > 1) - memcpy(&ssi->frags[1], &gl->frags[1], - (gl->nfrags - 1) * sizeof(skb_frag_t)); - ssi->nr_frags = gl->nfrags; - - skb->len = gl->tot_len; - skb->data_len = skb->len - pull_len; - skb->truesize += skb->data_len; - - /* Get a reference for the last page, we don't own it */ - get_page(gl->frags[gl->nfrags - 1].page); - } -out: - return skb; -} - static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, const struct pkt_gl *gl) { @@ -518,7 +478,7 @@ static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, c4iw_ev_handler(dev, qid); return 0; } else { - skb = t4_pktgl_to_skb(gl, 128, 128); + skb = cxgb4_pktgl_to_skb(gl, 128, 128); if (unlikely(!skb)) goto nomem; } -- cgit v0.10.2 From b955150ea784af4c193b708a2e8091673bf23004 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 21 Oct 2010 12:37:06 +0000 Subject: RDMA/cxgb3: When a user QP is marked in error, also mark the CQs in error The flushing of work requests for user QPs is implemented entirely in the user mode library. The only kernel interaction is to mark the user QP object indicating it is in error when the QP exits RTS. When the user QP operations are called by the application (eg: post_send, post_recv), the QP in error bit is checked and if set, the library flushes the QP. If, however, the application is not doing IO, but rather just polling the CQ, it will never get flushed work requests. This breaks some classes of applications. This patch adds logic to mark user CQs in error when a QP that is bound to the CQ is marked in error. The library poll code can then notice the CQ is in error and flush all the in error QPs bound to that CQ. Design: - add 1 extra CQE entry to the CQ memory that will be used to indicate in error status. - return the desired CQ memory size that should be mapped by the library - bump the ABI since the create_cq uverbs response changes. - detect older libraries and reduce the mmap size accordingly. (The ABI bump doesn't break old libraries, since they didn't check the ABI field anyway) Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 005b7b5..09dda0b 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -160,6 +160,7 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel) struct rdma_cq_setup setup; int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe); + size += 1; /* one extra page for storing cq-in-err state */ cq->cqid = cxio_hal_get_cqid(rdev_p->rscp); if (!cq->cqid) return -ENOMEM; diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h index e5ddb63..4bb997a 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_wr.h +++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -728,6 +728,22 @@ struct t3_cq { #define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \ CQE_GENBIT(*cqe)) +struct t3_cq_status_page { + u32 cq_err; +}; + +static inline int cxio_cq_in_error(struct t3_cq *cq) +{ + return ((struct t3_cq_status_page *) + &cq->queue[1 << cq->size_log2])->cq_err; +} + +static inline void cxio_set_cq_in_error(struct t3_cq *cq) +{ + ((struct t3_cq_status_page *) + &cq->queue[1 << cq->size_log2])->cq_err = 1; +} + static inline void cxio_set_wq_in_error(struct t3_wq *wq) { wq->queue->wq_in_err.err |= 1; diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c index 6afc89e..71e0d84 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_ev.c +++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c @@ -76,6 +76,14 @@ static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, atomic_inc(&qhp->refcnt); spin_unlock(&rnicp->lock); + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } + event.event = ib_event; event.device = chp->ibcq.device; if (ib_event == IB_EVENT_CQ_ERR) @@ -86,13 +94,7 @@ static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, if (qhp->ibqp.event_handler) (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); - if (qhp->attr.state == IWCH_QP_STATE_RTS) { - attrs.next_state = IWCH_QP_STATE_TERMINATE; - iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, - &attrs, 1); - if (send_term) - iwch_post_terminate(qhp, rsp_msg); - } + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); if (atomic_dec_and_test(&qhp->refcnt)) wake_up(&qhp->wait); @@ -179,7 +181,6 @@ void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb) case TPT_ERR_BOUND: case TPT_ERR_INVALIDATE_SHARED_MR: case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: - (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); break; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index fca0b4b..2e27413 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -154,6 +154,8 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int ve struct iwch_create_cq_resp uresp; struct iwch_create_cq_req ureq; struct iwch_ucontext *ucontext = NULL; + static int warned; + size_t resplen; PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries); rhp = to_iwch_dev(ibdev); @@ -217,15 +219,26 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int ve uresp.key = ucontext->key; ucontext->key += PAGE_SIZE; spin_unlock(&ucontext->mmap_lock); - if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + if (udata->outlen < sizeof uresp) { + if (!warned++) + printk(KERN_WARNING MOD "Warning - " + "downlevel libcxgb3 (non-fatal).\n"); + mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * + sizeof(struct t3_cqe)); + resplen = sizeof(struct iwch_create_cq_resp_v0); + } else { + mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) * + sizeof(struct t3_cqe)); + uresp.memsize = mm->len; + resplen = sizeof uresp; + } + if (ib_copy_to_udata(udata, &uresp, resplen)) { kfree(mm); iwch_destroy_cq(&chp->ibcq); return ERR_PTR(-EFAULT); } - mm->key = uresp.key; - mm->addr = virt_to_phys(chp->cq.queue); - mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * - sizeof (struct t3_cqe)); insert_mmap(ucontext, mm); } PDBG("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n", @@ -1414,6 +1427,7 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.post_send = iwch_post_send; dev->ibdev.post_recv = iwch_post_receive; dev->ibdev.get_protocol_stats = iwch_get_mib; + dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); if (!dev->ibdev.iwcm) diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index c64d27b..0993137 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -802,14 +802,12 @@ int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) /* * Assumes qhp lock is held. */ -static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag) +static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, + struct iwch_cq *schp, unsigned long *flag) { - struct iwch_cq *rchp, *schp; int count; int flushed; - rchp = get_chp(qhp->rhp, qhp->attr.rcq); - schp = get_chp(qhp->rhp, qhp->attr.scq); PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); /* take a ref on the qhp since we must release the lock */ @@ -847,10 +845,23 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag) static void flush_qp(struct iwch_qp *qhp, unsigned long *flag) { - if (qhp->ibqp.uobject) + struct iwch_cq *rchp, *schp; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + if (qhp->ibqp.uobject) { cxio_set_wq_in_error(&qhp->wq); - else - __flush_qp(qhp, flag); + cxio_set_cq_in_error(&rchp->cq); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + if (schp != rchp) { + cxio_set_cq_in_error(&schp->cq); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + } + return; + } + __flush_qp(qhp, rchp, schp, flag); } diff --git a/drivers/infiniband/hw/cxgb3/iwch_user.h b/drivers/infiniband/hw/cxgb3/iwch_user.h index cb7086f..a277c31 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_user.h +++ b/drivers/infiniband/hw/cxgb3/iwch_user.h @@ -45,10 +45,18 @@ struct iwch_create_cq_req { __u64 user_rptr_addr; }; +struct iwch_create_cq_resp_v0 { + __u64 key; + __u32 cqid; + __u32 size_log2; +}; + struct iwch_create_cq_resp { __u64 key; __u32 cqid; __u32 size_log2; + __u32 memsize; + __u32 reserved; }; struct iwch_create_qp_resp { -- cgit v0.10.2 From 5715f5d44b93507693feca6c62290e123e5ef688 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 22 Oct 2010 22:00:48 +0000 Subject: IB/qib: Process RDMA WRITE ONLY with IMMEDIATE properly See table 35 in IBA - the header order for RDMA_WRITE_ONLY_WITH_IMMEDIATE and SEND_LAST_WITH_IMMEDIATE is different: the RDMA_WRITE_ONLY has a RETH header before the immediate data, so we need a different code path to extract the immediate data. I tested this with a userspace app that does RDMA_WRITE with immediate on a QLE7140. Signed-off-by: Jason Gunthorpe Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index a093111..955fb71 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -2068,7 +2068,10 @@ send_last: goto nack_op_err; if (!ret) goto rnr_nak; - goto send_last_imm; + wc.ex.imm_data = ohdr->u.rc.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; case OP(RDMA_READ_REQUEST): { struct qib_ack_entry *e; diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index b9c8b63..32ccf3c 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -457,8 +457,10 @@ rdma_first: } if (opcode == OP(RDMA_WRITE_ONLY)) goto rdma_last; - else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + wc.ex.imm_data = ohdr->u.rc.imm_data; goto rdma_last_imm; + } /* FALLTHROUGH */ case OP(RDMA_WRITE_MIDDLE): /* Check for invalid length PMTU or posted rwqe len. */ @@ -471,8 +473,8 @@ rdma_first: break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): -rdma_last_imm: wc.ex.imm_data = ohdr->u.imm_data; +rdma_last_imm: hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; -- cgit v0.10.2 From dd5e6e38b2b8bd8bf71cae800e2b613e85ef1522 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Aug 2010 19:27:20 +0000 Subject: IB/srp: Preparation for transmit ring response allocation The transmit ring in ib_srp (srp_target.tx_ring) is currently only used for allocating requests sent by the initiator to the target. This patch prepares using that ring for allocation of both requests and responses. Also, this patch differentiates the uses of SRP_SQ_SIZE, increases the size of the IB send completion queue by one element and reserves one transmit ring slot for SRP_TSK_MGMT requests. Signed-off-by: Bart Van Assche Signed-off-by: David Dillow Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 7f8f16b..b8b09a4 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -291,7 +291,7 @@ static void srp_free_target_ib(struct srp_target_port *target) for (i = 0; i < SRP_RQ_SIZE; ++i) srp_free_iu(target->srp_host, target->rx_ring[i]); - for (i = 0; i < SRP_SQ_SIZE + 1; ++i) + for (i = 0; i < SRP_SQ_SIZE; ++i) srp_free_iu(target->srp_host, target->tx_ring[i]); } @@ -822,7 +822,7 @@ static int srp_post_recv(struct srp_target_port *target) spin_lock_irqsave(target->scsi_host->host_lock, flags); - next = target->rx_head & (SRP_RQ_SIZE - 1); + next = target->rx_head & SRP_RQ_MASK; wr.wr_id = next; iu = target->rx_ring[next]; @@ -989,19 +989,19 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, enum srp_request_type req_type) { - s32 min = (req_type == SRP_REQ_TASK_MGMT) ? 1 : 2; + s32 rsv = (req_type == SRP_REQ_TASK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; srp_send_completion(target->send_cq, target); if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) return NULL; - if (target->req_lim < min) { + if (target->req_lim <= rsv) { ++target->zero_req_lim; return NULL; } - return target->tx_ring[target->tx_head & SRP_SQ_SIZE]; + return target->tx_ring[target->tx_head & SRP_SQ_MASK]; } /* @@ -1020,7 +1020,7 @@ static int __srp_post_send(struct srp_target_port *target, list.lkey = target->srp_host->srp_dev->mr->lkey; wr.next = NULL; - wr.wr_id = target->tx_head & SRP_SQ_SIZE; + wr.wr_id = target->tx_head & SRP_SQ_MASK; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IB_WR_SEND; @@ -1121,7 +1121,7 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) goto err; } - for (i = 0; i < SRP_SQ_SIZE + 1; ++i) { + for (i = 0; i < SRP_SQ_SIZE; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, srp_max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); @@ -1137,7 +1137,7 @@ err: target->rx_ring[i] = NULL; } - for (i = 0; i < SRP_SQ_SIZE + 1; ++i) { + for (i = 0; i < SRP_SQ_SIZE; ++i) { srp_free_iu(target->srp_host, target->tx_ring[i]); target->tx_ring[i] = NULL; } @@ -1626,9 +1626,9 @@ static struct scsi_host_template srp_template = { .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, - .can_queue = SRP_SQ_SIZE, + .can_queue = SRP_CMD_SQ_SIZE, .this_id = -1, - .cmd_per_lun = SRP_SQ_SIZE, + .cmd_per_lun = SRP_CMD_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = srp_host_attrs }; @@ -1813,7 +1813,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) printk(KERN_WARNING PFX "bad max cmd_per_lun parameter '%s'\n", p); goto out; } - target->scsi_host->cmd_per_lun = min(token, SRP_SQ_SIZE); + target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE); break; case SRP_OPT_IO_CLASS: @@ -1891,7 +1891,7 @@ static ssize_t srp_create_target(struct device *dev, INIT_LIST_HEAD(&target->free_reqs); INIT_LIST_HEAD(&target->req_queue); - for (i = 0; i < SRP_SQ_SIZE; ++i) { + for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { target->req_ring[i].index = i; list_add_tail(&target->req_ring[i].list, &target->free_reqs); } @@ -2159,6 +2159,9 @@ static int __init srp_init_module(void) { int ret; + BUILD_BUG_ON_NOT_POWER_OF_2(SRP_SQ_SIZE); + BUILD_BUG_ON_NOT_POWER_OF_2(SRP_RQ_SIZE); + if (srp_sg_tablesize > 255) { printk(KERN_WARNING PFX "Clamping srp_sg_tablesize to 255\n"); srp_sg_tablesize = 255; diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 5a80eac..7a959d5 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -59,7 +59,14 @@ enum { SRP_RQ_SHIFT = 6, SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, - SRP_SQ_SIZE = SRP_RQ_SIZE - 1, + SRP_RQ_MASK = SRP_RQ_SIZE - 1, + + SRP_SQ_SIZE = SRP_RQ_SIZE, + SRP_SQ_MASK = SRP_SQ_SIZE - 1, + SRP_RSP_SQ_SIZE = 1, + SRP_REQ_SQ_SIZE = SRP_SQ_SIZE - SRP_RSP_SQ_SIZE, + SRP_TSK_MGMT_SQ_SIZE = 1, + SRP_CMD_SQ_SIZE = SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, SRP_TAG_TSK_MGMT = 1 << (SRP_RQ_SHIFT + 1), @@ -144,11 +151,11 @@ struct srp_target_port { unsigned tx_head; unsigned tx_tail; - struct srp_iu *tx_ring[SRP_SQ_SIZE + 1]; + struct srp_iu *tx_ring[SRP_SQ_SIZE]; struct list_head free_reqs; struct list_head req_queue; - struct srp_request req_ring[SRP_SQ_SIZE]; + struct srp_request req_ring[SRP_CMD_SQ_SIZE]; struct work_struct work; -- cgit v0.10.2 From bb12588a38e6db85e01dceadff7bc161fc92e7d2 Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 8 Oct 2010 14:40:47 -0400 Subject: IB/srp: Implement SRP_CRED_REQ and SRP_AER_REQ This patch adds support for SRP_CRED_REQ to avoid a lockup by targets that use that mechanism to return credits to the initiator. This prevents a lockup observed in the field where we would never add the credits from the SRP_CRED_REQ to our current count, and would therefore never send another command to the target. Minimal support for SRP_AER_REQ is also added, as these messages can also be used to convey additional credits to the initiator. Based upon extensive debugging and code by Bart Van Assche and a bug report by Chris Worley. Signed-off-by: David Dillow Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index b8b09a4..a54eee9 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -83,6 +83,10 @@ static void srp_remove_one(struct ib_device *device); static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); static void srp_send_completion(struct ib_cq *cq, void *target_ptr); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); +static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, + enum srp_iu_type iu_type); +static int __srp_post_send(struct srp_target_port *target, + struct srp_iu *iu, int len); static struct scsi_transport_template *ib_srp_transport_template; @@ -896,6 +900,71 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) spin_unlock_irqrestore(target->scsi_host->host_lock, flags); } +static int srp_response_common(struct srp_target_port *target, s32 req_delta, + void *rsp, int len) +{ + struct ib_device *dev; + unsigned long flags; + struct srp_iu *iu; + int err = 1; + + dev = target->srp_host->srp_dev->dev; + + spin_lock_irqsave(target->scsi_host->host_lock, flags); + target->req_lim += req_delta; + + iu = __srp_get_tx_iu(target, SRP_IU_RSP); + if (!iu) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "no IU available to send response\n"); + goto out; + } + + ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE); + memcpy(iu->buf, rsp, len); + ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); + + err = __srp_post_send(target, iu, len); + if (err) + shost_printk(KERN_ERR, target->scsi_host, PFX + "unable to post response: %d\n", err); + +out: + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + return err; +} + +static void srp_process_cred_req(struct srp_target_port *target, + struct srp_cred_req *req) +{ + struct srp_cred_rsp rsp = { + .opcode = SRP_CRED_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + if (srp_response_common(target, delta, &rsp, sizeof rsp)) + shost_printk(KERN_ERR, target->scsi_host, PFX + "problems processing SRP_CRED_REQ\n"); +} + +static void srp_process_aer_req(struct srp_target_port *target, + struct srp_aer_req *req) +{ + struct srp_aer_rsp rsp = { + .opcode = SRP_AER_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + shost_printk(KERN_ERR, target->scsi_host, PFX + "ignoring AER for LUN %llu\n", be64_to_cpu(req->lun)); + + if (srp_response_common(target, delta, &rsp, sizeof rsp)) + shost_printk(KERN_ERR, target->scsi_host, PFX + "problems processing SRP_AER_REQ\n"); +} + static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) { struct ib_device *dev; @@ -923,6 +992,14 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) srp_process_rsp(target, iu->buf); break; + case SRP_CRED_REQ: + srp_process_cred_req(target, iu->buf); + break; + + case SRP_AER_REQ: + srp_process_aer_req(target, iu->buf); + break; + case SRP_T_LOGOUT: /* XXX Handle target logout */ shost_printk(KERN_WARNING, target->scsi_host, @@ -985,23 +1062,36 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) * Must be called with target->scsi_host->host_lock held to protect * req_lim and tx_head. Lock cannot be dropped between call here and * call to __srp_post_send(). + * + * Note: + * An upper limit for the number of allocated information units for each + * request type is: + * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues + * more than Scsi_Host.can_queue requests. + * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE. + * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than + * one unanswered SRP request to an initiator. */ static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, - enum srp_request_type req_type) + enum srp_iu_type iu_type) { - s32 rsv = (req_type == SRP_REQ_TASK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; + s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; + struct srp_iu *iu; srp_send_completion(target->send_cq, target); if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) return NULL; - if (target->req_lim <= rsv) { + /* Initiator responses to target requests do not consume credits */ + if (target->req_lim <= rsv && iu_type != SRP_IU_RSP) { ++target->zero_req_lim; return NULL; } - return target->tx_ring[target->tx_head & SRP_SQ_MASK]; + iu = target->tx_ring[target->tx_head & SRP_SQ_MASK]; + iu->type = iu_type; + return iu; } /* @@ -1030,7 +1120,8 @@ static int __srp_post_send(struct srp_target_port *target, if (!ret) { ++target->tx_head; - --target->req_lim; + if (iu->type != SRP_IU_RSP) + --target->req_lim; } return ret; @@ -1056,7 +1147,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, return 0; } - iu = __srp_get_tx_iu(target, SRP_REQ_NORMAL); + iu = __srp_get_tx_iu(target, SRP_IU_CMD); if (!iu) goto err; @@ -1363,7 +1454,7 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, init_completion(&req->done); - iu = __srp_get_tx_iu(target, SRP_REQ_TASK_MGMT); + iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); if (!iu) goto out; diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 7a959d5..ed0dce9 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -82,9 +82,10 @@ enum srp_target_state { SRP_TARGET_REMOVED }; -enum srp_request_type { - SRP_REQ_NORMAL, - SRP_REQ_TASK_MGMT, +enum srp_iu_type { + SRP_IU_CMD, + SRP_IU_TSK_MGMT, + SRP_IU_RSP, }; struct srp_device { @@ -171,6 +172,7 @@ struct srp_iu { void *buf; size_t size; enum dma_data_direction direction; + enum srp_iu_type type; }; #endif /* IB_SRP_H */ diff --git a/include/scsi/srp.h b/include/scsi/srp.h index ad178fa..1ae84db 100644 --- a/include/scsi/srp.h +++ b/include/scsi/srp.h @@ -239,4 +239,42 @@ struct srp_rsp { u8 data[0]; } __attribute__((packed)); +struct srp_cred_req { + u8 opcode; + u8 sol_not; + u8 reserved[2]; + __be32 req_lim_delta; + u64 tag; +}; + +struct srp_cred_rsp { + u8 opcode; + u8 reserved[7]; + u64 tag; +}; + +/* + * The SRP spec defines the fixed portion of the AER_REQ structure to be + * 36 bytes, so it needs to be packed to avoid having it padded to 40 bytes + * on 64-bit architectures. + */ +struct srp_aer_req { + u8 opcode; + u8 sol_not; + u8 reserved[2]; + __be32 req_lim_delta; + u64 tag; + u32 reserved2; + __be64 lun; + __be32 sense_data_len; + u32 reserved3; + u8 sense_data[0]; +} __attribute__((packed)); + +struct srp_aer_rsp { + u8 opcode; + u8 reserved[7]; + u64 tag; +}; + #endif /* SCSI_SRP_H */ -- cgit v0.10.2 From c3aa9b186b95025d4ba4e90d6140c9887dfaae0a Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 20 Sep 2010 07:05:06 +0000 Subject: IPoIB: Set dev_id field of net_device Use the net device's dev_id field to encode the port number of the pci device. This can be used to to associate a net device with the pci device's port. The encoding is: dev_id = port - 1. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index b4b2257..30d2333 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1240,6 +1240,7 @@ static struct net_device *ipoib_add_port(const char *format, goto alloc_mem_failed; SET_NETDEV_DEV(priv->dev, hca->dma_device); + priv->dev->dev_id = port - 1; if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); -- cgit v0.10.2 From 97cb7e40c69618ba03030b467faf60dc3ee982ee Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 13 Aug 2010 20:56:34 +0000 Subject: RDMA/ucma: Allow tuning the max listen backlog For iWARP connections, the connect request is carried in a TCP payload on an already established TCP connection. So if the ucma's backlog is full, the connection request is transmitted and acked at the TCP level by the time the connect request gets dropped in the ucma. The end result is the connection gets rejected by the iWARP provider. Further, a 32 node 256NP OpenMPI job will generate > 128 connect requests on some ranks. This patch increases the default max backlog to 1024, and adds a sysctl variable so the backlog can be adjusted at run time. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ac7edc2..b2748a6 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -50,8 +51,24 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); MODULE_LICENSE("Dual BSD/GPL"); -enum { - UCMA_MAX_BACKLOG = 128 +static unsigned int max_backlog = 1024; + +static struct ctl_table_header *ucma_ctl_table_hdr; +static ctl_table ucma_ctl_table[] = { + { + .procname = "max_backlog", + .data = &max_backlog, + .maxlen = sizeof max_backlog, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_path ucma_ctl_path[] = { + { .procname = "net" }, + { .procname = "rdma_ucm" }, + { } }; struct ucma_file { @@ -686,8 +703,8 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ctx->backlog = cmd.backlog > 0 && cmd.backlog < UCMA_MAX_BACKLOG ? - cmd.backlog : UCMA_MAX_BACKLOG; + ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? + cmd.backlog : max_backlog; ret = rdma_listen(ctx->cm_id, ctx->backlog); ucma_put_ctx(ctx); return ret; @@ -1279,16 +1296,26 @@ static int __init ucma_init(void) ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); - goto err; + goto err1; + } + + ucma_ctl_table_hdr = register_sysctl_paths(ucma_ctl_path, ucma_ctl_table); + if (!ucma_ctl_table_hdr) { + printk(KERN_ERR "rdma_ucm: couldn't register sysctl paths\n"); + ret = -ENOMEM; + goto err2; } return 0; -err: +err2: + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); +err1: misc_deregister(&ucma_misc); return ret; } static void __exit ucma_cleanup(void) { + unregister_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); idr_destroy(&ctx_idr); -- cgit v0.10.2 From 7454159d3c01371c1abed9d941044d95d2efde97 Mon Sep 17 00:00:00 2001 From: matt mooney Date: Fri, 24 Sep 2010 19:17:18 +0000 Subject: IB: Replace EXTRA_CFLAGS with ccflags-y Signed-off-by: matt mooney Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/amso1100/Kbuild b/drivers/infiniband/hw/amso1100/Kbuild index 06964c4..950dfab 100644 --- a/drivers/infiniband/hw/amso1100/Kbuild +++ b/drivers/infiniband/hw/amso1100/Kbuild @@ -1,6 +1,4 @@ -ifdef CONFIG_INFINIBAND_AMSO1100_DEBUG -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile index 7e7b5a6..621619c 100644 --- a/drivers/infiniband/hw/cxgb3/Makefile +++ b/drivers/infiniband/hw/cxgb3/Makefile @@ -1,10 +1,8 @@ -EXTRA_CFLAGS += -Idrivers/net/cxgb3 +ccflags-y := -Idrivers/net/cxgb3 obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o iw_cxgb3-y := iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \ iwch_provider.o iwch.o cxio_hal.o cxio_resource.o -ifdef CONFIG_INFINIBAND_CXGB3_DEBUG -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_INFINIBAND_CXGB3_DEBUG) += -DDEBUG diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile index e31a499..cd20b13 100644 --- a/drivers/infiniband/hw/cxgb4/Makefile +++ b/drivers/infiniband/hw/cxgb4/Makefile @@ -1,4 +1,4 @@ -EXTRA_CFLAGS += -Idrivers/net/cxgb4 +ccflags-y := -Idrivers/net/cxgb4 obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile index fa3df82..4496f28 100644 --- a/drivers/infiniband/hw/ipath/Makefile +++ b/drivers/infiniband/hw/ipath/Makefile @@ -1,4 +1,4 @@ -EXTRA_CFLAGS += -DIPATH_IDSTR='"QLogic kernel.org driver"' \ +ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \ -DIPATH_KERN_TYPE=0 obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o -- cgit v0.10.2 From d0d68b8693bd16bfbbc93b89f1d9f3351723307c Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 4 Oct 2010 12:11:34 +0000 Subject: IB/mlx4: Signal node desc changes to SM by using FW to generate trap 144 The Node Description cannot be changed via MADs (it is read-only). Until now, it was changed in the driver via sysfs, and the new Node Description was simply inserted by the driver into MAD responses (replacing the description returned by FW). System startup scripts use the sysfs interface to change the node description at driver startup to show the hostname, etc. However, this has a race condition: the SM could discover the original FW node description rather than the system-specific description if it queried the port before the startup scripts finish running. For mlx4, we fix this with a new FW command (SET_NODE) that allows passing the new node description to FW. When this command is invoked, FW sends a trap 144 to the SM. When it gets this trap, the SM can query the node to obtain the new node description -- thus eliminating the effects of the race. This patch simply calls SET_NODE command when a new node description is entered via sysfs (thus causing trap 144 to be issued by the FW). We ignore all failures of the SET_NODE command (including those caused by using a device FW that predates the SET_NODE command), since in that case things work just as before. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 62e8cd6..ac6951d 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -272,14 +272,32 @@ out: static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { + struct mlx4_cmd_mailbox *mailbox; + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; - if (mask & IB_DEVICE_MODIFY_NODE_DESC) { - spin_lock(&to_mdev(ibdev)->sm_lock); - memcpy(ibdev->node_desc, props->node_desc, 64); - spin_unlock(&to_mdev(ibdev)->sm_lock); - } + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + spin_lock(&to_mdev(ibdev)->sm_lock); + memcpy(ibdev->node_desc, props->node_desc, 64); + spin_unlock(&to_mdev(ibdev)->sm_lock); + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); + if (IS_ERR(mailbox)) + return 0; + + memset(mailbox->buf, 0, 256); + memcpy(mailbox->buf, props->node_desc, 64); + mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, + MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A); + + mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 0f82293..2731266 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -57,6 +57,7 @@ enum { MLX4_CMD_QUERY_PORT = 0x43, MLX4_CMD_SENSE_PORT = 0x4d, MLX4_CMD_SET_PORT = 0xc, + MLX4_CMD_SET_NODE = 0x5a, MLX4_CMD_ACCESS_DDR = 0x2e, MLX4_CMD_MAP_ICM = 0xffa, MLX4_CMD_UNMAP_ICM = 0xff9, -- cgit v0.10.2 From 05a1d7504f836ee67e27f2488cb5b8126b51dbd4 Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 8 Oct 2010 14:48:14 -0400 Subject: IB/srp: Eliminate two forward declarations Signed-off-by: David Dillow Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index a54eee9..d4c08d6 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -83,10 +83,6 @@ static void srp_remove_one(struct ib_device *device); static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); static void srp_send_completion(struct ib_cq *cq, void *target_ptr); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); -static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, - enum srp_iu_type iu_type); -static int __srp_post_send(struct srp_target_port *target, - struct srp_iu *iu, int len); static struct scsi_transport_template *ib_srp_transport_template; @@ -815,6 +811,75 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, return len; } +/* + * Must be called with target->scsi_host->host_lock held to protect + * req_lim and tx_head. Lock cannot be dropped between call here and + * call to __srp_post_send(). + * + * Note: + * An upper limit for the number of allocated information units for each + * request type is: + * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues + * more than Scsi_Host.can_queue requests. + * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE. + * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than + * one unanswered SRP request to an initiator. + */ +static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, + enum srp_iu_type iu_type) +{ + s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; + struct srp_iu *iu; + + srp_send_completion(target->send_cq, target); + + if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) + return NULL; + + /* Initiator responses to target requests do not consume credits */ + if (target->req_lim <= rsv && iu_type != SRP_IU_RSP) { + ++target->zero_req_lim; + return NULL; + } + + iu = target->tx_ring[target->tx_head & SRP_SQ_MASK]; + iu->type = iu_type; + return iu; +} + +/* + * Must be called with target->scsi_host->host_lock held to protect + * req_lim and tx_head. + */ +static int __srp_post_send(struct srp_target_port *target, + struct srp_iu *iu, int len) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + int ret = 0; + + list.addr = iu->dma; + list.length = len; + list.lkey = target->srp_host->srp_dev->mr->lkey; + + wr.next = NULL; + wr.wr_id = target->tx_head & SRP_SQ_MASK; + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(target->qp, &wr, &bad_wr); + + if (!ret) { + ++target->tx_head; + if (iu->type != SRP_IU_RSP) + --target->req_lim; + } + + return ret; +} + static int srp_post_recv(struct srp_target_port *target) { unsigned long flags; @@ -1058,75 +1123,6 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) } } -/* - * Must be called with target->scsi_host->host_lock held to protect - * req_lim and tx_head. Lock cannot be dropped between call here and - * call to __srp_post_send(). - * - * Note: - * An upper limit for the number of allocated information units for each - * request type is: - * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues - * more than Scsi_Host.can_queue requests. - * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE. - * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than - * one unanswered SRP request to an initiator. - */ -static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, - enum srp_iu_type iu_type) -{ - s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; - struct srp_iu *iu; - - srp_send_completion(target->send_cq, target); - - if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) - return NULL; - - /* Initiator responses to target requests do not consume credits */ - if (target->req_lim <= rsv && iu_type != SRP_IU_RSP) { - ++target->zero_req_lim; - return NULL; - } - - iu = target->tx_ring[target->tx_head & SRP_SQ_MASK]; - iu->type = iu_type; - return iu; -} - -/* - * Must be called with target->scsi_host->host_lock held to protect - * req_lim and tx_head. - */ -static int __srp_post_send(struct srp_target_port *target, - struct srp_iu *iu, int len) -{ - struct ib_sge list; - struct ib_send_wr wr, *bad_wr; - int ret = 0; - - list.addr = iu->dma; - list.length = len; - list.lkey = target->srp_host->srp_dev->mr->lkey; - - wr.next = NULL; - wr.wr_id = target->tx_head & SRP_SQ_MASK; - wr.sg_list = &list; - wr.num_sge = 1; - wr.opcode = IB_WR_SEND; - wr.send_flags = IB_SEND_SIGNALED; - - ret = ib_post_send(target->qp, &wr, &bad_wr); - - if (!ret) { - ++target->tx_head; - if (iu->type != SRP_IU_RSP) - --target->req_lim; - } - - return ret; -} - static int srp_queuecommand(struct scsi_cmnd *scmnd, void (*done)(struct scsi_cmnd *)) { -- cgit v0.10.2 From 7ade400aba9a675b610074d6609658661db07eeb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Aug 2010 19:27:36 +0000 Subject: IB/srp: Reduce number of BUSY conditions As proposed by the SRP (draft) standard, ib_srp reserves one ring element for SRP_TSK_MGMT requests. This patch makes sure that the SCSI mid-layer never tries to queue more than (SRP request limit) - 1 SCSI commands to ib_srp. This improves performance for targets whose request limit is less than or equal to SRP_NORMAL_REQ_SQ_SIZE by reducing the number of BUSY responses reported by ib_srp to the SCSI mid-layer. Signed-off-by: Bart Van Assche Signed-off-by: David Dillow Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index d4c08d6..4df8275 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1339,8 +1339,13 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) target->max_ti_iu_len = be32_to_cpu(rsp->max_ti_iu_len); target->req_lim = be32_to_cpu(rsp->req_lim_delta); - target->scsi_host->can_queue = min(target->req_lim, - target->scsi_host->can_queue); + /* + * Reserve credits for task management so we don't + * bounce requests back to the SCSI mid-layer. + */ + target->scsi_host->can_queue + = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, + target->scsi_host->can_queue); } else { shost_printk(KERN_WARNING, target->scsi_host, PFX "Unhandled RSP opcode %#x\n", opcode); -- cgit v0.10.2 From 21c1a90769e680e7c1f49bae4c5804cf0c7bc814 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Aug 2010 19:27:40 +0000 Subject: IB/srp: Use list_first_entry() Use the list_first_entry() macro in ib_srp instead of open-coding the equivalent, which makes the source code slightly more descriptive. The list_first_entry() macro itself was introduced in kernel 2.6.22. Signed-off-by: Bart Van Assche Signed-off-by: David Dillow Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 4df8275..9b4bc5a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1151,7 +1151,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, ib_dma_sync_single_for_cpu(dev, iu->dma, srp_max_iu_len, DMA_TO_DEVICE); - req = list_entry(target->free_reqs.next, struct srp_request, list); + req = list_first_entry(&target->free_reqs, struct srp_request, list); scmnd->scsi_done = done; scmnd->result = 0; -- cgit v0.10.2 From 19081f31ce941a22bfc681d18ae2d31e31084df5 Mon Sep 17 00:00:00 2001 From: David Dillow Date: Mon, 18 Oct 2010 08:54:49 -0400 Subject: IB/srp: Sync buffer before posting send srp_send_tsk_mgmt() was missing the proper DMA sync calls before posting the buffer to the device. Signed-off-by: David Dillow Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9b4bc5a..cfc1d65 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1442,6 +1442,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) static int srp_send_tsk_mgmt(struct srp_target_port *target, struct srp_request *req, u8 func) { + struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; @@ -1459,6 +1460,8 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, if (!iu) goto out; + ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); tsk_mgmt = iu->buf; memset(tsk_mgmt, 0, sizeof *tsk_mgmt); @@ -1468,6 +1471,8 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, tsk_mgmt->tsk_mgmt_func = func; tsk_mgmt->task_tag = req->index; + ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); if (__srp_post_send(target, iu, sizeof *tsk_mgmt)) goto out; -- cgit v0.10.2 From 2420b60b1dc4ed98cb1788e928bc57ff2efa1a8d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 18 Oct 2010 14:45:20 -0700 Subject: IB/uverbs: Return link layer type to userspace for query port operation Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6fcfbeb..b342248 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -460,6 +460,8 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width = attr.active_width; resp.active_speed = attr.active_speed; resp.phys_state = attr.phys_state; + resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, + cmd.port_num); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..fe5b051 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -205,7 +205,8 @@ struct ib_uverbs_query_port_resp { __u8 active_width; __u8 active_speed; __u8 phys_state; - __u8 reserved[3]; + __u8 link_layer; + __u8 reserved[2]; }; struct ib_uverbs_alloc_pd { -- cgit v0.10.2 From 33c87f0af60146b375220809c1cb745ac1a86edf Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 26 Aug 2010 14:18:43 +0000 Subject: mlx4_core: Allow protocol drivers to find corresponding interfaces Add a mechanism for mlx4 protocol drivers to get a pointer to other drivers's device objects. For this, an exported function, mlx4_get_protocol_dev() is added, which allows a driver to get some other driver's device based on the protocol that the driver implements. Two protocols are added: MLX4_PROTOCOL_IB and MLX4_PROTOCOL_EN. This will be used in mlx4 IBoE support so that mlx4_ib can find the corresponding mlx4_en netdev. Signed-off-by: Eli Cohen [ Clean up and rename a few things. - Roland ] Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/en_main.c b/drivers/net/mlx4/en_main.c index 97934f1..b2df3ee 100644 --- a/drivers/net/mlx4/en_main.c +++ b/drivers/net/mlx4/en_main.c @@ -124,6 +124,13 @@ static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) return 0; } +static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) +{ + struct mlx4_en_dev *endev = ctx; + + return endev->pndev[port]; +} + static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, enum mlx4_dev_event event, int port) { @@ -282,9 +289,11 @@ err_free_res: } static struct mlx4_interface mlx4_en_interface = { - .add = mlx4_en_add, - .remove = mlx4_en_remove, - .event = mlx4_en_event, + .add = mlx4_en_add, + .remove = mlx4_en_remove, + .event = mlx4_en_event, + .get_dev = mlx4_en_get_netdev, + .protocol = MLX4_PROTOCOL_EN, }; static int __init mlx4_en_init(void) diff --git a/drivers/net/mlx4/intf.c b/drivers/net/mlx4/intf.c index 5550678..73c94fc 100644 --- a/drivers/net/mlx4/intf.c +++ b/drivers/net/mlx4/intf.c @@ -161,3 +161,24 @@ void mlx4_unregister_device(struct mlx4_dev *dev) mutex_unlock(&intf_mutex); } + +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx; + unsigned long flags; + void *result = NULL; + + spin_lock_irqsave(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_dev) { + result = dev_ctx->intf->get_dev(dev, dev_ctx->context, port); + break; + } + + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + return result; +} +EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev); diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 53c5fdb..f407cd4 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -44,15 +44,24 @@ enum mlx4_dev_event { MLX4_DEV_EVENT_PORT_REINIT, }; +enum mlx4_protocol { + MLX4_PROTOCOL_IB, + MLX4_PROTOCOL_EN, +}; + struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); void (*event) (struct mlx4_dev *dev, void *context, enum mlx4_dev_event event, int port); + void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); struct list_head list; + enum mlx4_protocol protocol; }; int mlx4_register_interface(struct mlx4_interface *intf); void mlx4_unregister_interface(struct mlx4_interface *intf); +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); + #endif /* MLX4_DRIVER_H */ -- cgit v0.10.2 From 96dfa684c85d24b697f865f37a4f0c8678fc86e9 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 20 Oct 2010 21:57:02 -0700 Subject: mlx4_core: Update data structures and constants for IBoE Add fields to hardware data structures and add new constants required for IBoE support. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 04f42ae..5b3593d 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -98,7 +98,8 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags) [20] = "Address vector port checking support", [21] = "UD multicast support", [24] = "Demand paging support", - [25] = "Router support" + [25] = "Router support", + [30] = "IBoE support" }; int i; diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 0f82293..22bd8d3 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -140,6 +140,7 @@ enum { MLX4_SET_PORT_MAC_TABLE = 0x2, MLX4_SET_PORT_VLAN_TABLE = 0x3, MLX4_SET_PORT_PRIO_MAP = 0x4, + MLX4_SET_PORT_GID_TABLE = 0x5, }; struct mlx4_dev; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7a7f9c1..47e163a 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -67,7 +67,8 @@ enum { MLX4_DEV_CAP_FLAG_ATOMIC = 1 << 18, MLX4_DEV_CAP_FLAG_RAW_MCAST = 1 << 19, MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1 << 20, - MLX4_DEV_CAP_FLAG_UD_MCAST = 1 << 21 + MLX4_DEV_CAP_FLAG_UD_MCAST = 1 << 21, + MLX4_DEV_CAP_FLAG_IBOE = 1 << 30 }; enum { diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 7abe643..97cfdc8 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -112,7 +112,8 @@ struct mlx4_qp_path { u8 snooper_flags; u8 reserved3[2]; u8 counter_index; - u8 reserved4[7]; + u8 reserved4; + u8 dmac[6]; }; struct mlx4_qp_context { @@ -166,6 +167,7 @@ enum { MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5, MLX4_WQE_CTRL_INS_VLAN = 1 << 6, MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7, + MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0, }; struct mlx4_wqe_ctrl_seg { @@ -219,7 +221,8 @@ struct mlx4_wqe_datagram_seg { __be32 av[8]; __be32 dqpn; __be32 qkey; - __be32 reservd[2]; + __be16 vlan; + u8 mac[6]; }; struct mlx4_wqe_lso_seg { -- cgit v0.10.2 From 7ac870ed7d2316587ec06747c28cb9f44f67997d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 21 Oct 2010 12:39:08 -0700 Subject: mlx4_en: Change multicast promiscuous mode to support IBoE Change multicast promiscuous mode to pass packets through the multicast group distribution table before sending packets that miss to the default multicast QP. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/en_port.c b/drivers/net/mlx4/en_port.c index a29abe8..a249887 100644 --- a/drivers/net/mlx4/en_port.c +++ b/drivers/net/mlx4/en_port.c @@ -127,8 +127,8 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, memset(context, 0, sizeof *context); context->base_qpn = cpu_to_be32(base_qpn); - context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | base_qpn); - context->mcast = cpu_to_be32(1 << SET_PORT_PROMISC_SHIFT | base_qpn); + context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_EN_SHIFT | base_qpn); + context->mcast = cpu_to_be32(1 << SET_PORT_PROMISC_MODE_SHIFT | base_qpn); context->intra_no_vlan = 0; context->no_vlan = MLX4_NO_VLAN_IDX; context->intra_vlan_miss = 0; diff --git a/drivers/net/mlx4/en_port.h b/drivers/net/mlx4/en_port.h index e6477f1..9354891 100644 --- a/drivers/net/mlx4/en_port.h +++ b/drivers/net/mlx4/en_port.h @@ -36,7 +36,8 @@ #define SET_PORT_GEN_ALL_VALID 0x7 -#define SET_PORT_PROMISC_SHIFT 31 +#define SET_PORT_PROMISC_EN_SHIFT 31 +#define SET_PORT_PROMISC_MODE_SHIFT 30 enum { MLX4_CMD_SET_VLAN_FLTR = 0x47, -- cgit v0.10.2 From fa417f7b520ee60b39f7e23528d2030af30a07d1 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 24 Oct 2010 21:08:52 -0700 Subject: IB/mlx4: Add support for IBoE Add support for IBoE to mlx4_ib. The bulk of the code is handling the new address vector fields; mlx4 needs the MAC address of a remote node to include it in a WQE (for datagrams) or in the QP context (for connected QPs). Address resolution is done by assuming all unicast GIDs are either link-local IPv6 addresses. Multicast group attach/detach needs to update the NIC's multicast filters; but since attaching a QP to a multicast group can be done before the QP is bound to a port, for IBoE we need to keep track of all multicast groups that a QP is attached too before it transitions from INIT to RTR (since it does not have a port in the INIT state). Signed-off-by: Eli Cohen [ Many things cleaned up and otherwise monkeyed with; hope I didn't introduce too many bugs. - Roland ] Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 11a236f..3bf3544 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -30,66 +30,153 @@ * SOFTWARE. */ +#include + #include +#include +#include #include "mlx4_ib.h" -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast, u8 port) { - struct mlx4_dev *dev = to_mdev(pd->device)->dev; - struct mlx4_ib_ah *ah; + struct in6_addr in6; - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); + *is_mcast = 0; - memset(&ah->av, 0, sizeof ah->av); + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6); + if (rdma_link_local_addr(&in6)) + rdma_get_ll_mac(&in6, mac); + else if (rdma_is_multicast_addr(&in6)) { + rdma_get_mcast_mac(&in6, mac); + *is_mcast = 1; + } else + return -EINVAL; - ah->av.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); - ah->av.g_slid = ah_attr->src_path_bits; - ah->av.dlid = cpu_to_be16(ah_attr->dlid); - if (ah_attr->static_rate) { - ah->av.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; - while (ah->av.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && - !(1 << ah->av.stat_rate & dev->caps.stat_rate_support)) - --ah->av.stat_rate; - } - ah->av.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); + return 0; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd->device)->dev; + + ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); + ah->av.ib.g_slid = ah_attr->src_path_bits; if (ah_attr->ah_flags & IB_AH_GRH) { - ah->av.g_slid |= 0x80; - ah->av.gid_index = ah_attr->grh.sgid_index; - ah->av.hop_limit = ah_attr->grh.hop_limit; - ah->av.sl_tclass_flowlabel |= + ah->av.ib.g_slid |= 0x80; + ah->av.ib.gid_index = ah_attr->grh.sgid_index; + ah->av.ib.hop_limit = ah_attr->grh.hop_limit; + ah->av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr->grh.traffic_class << 20) | ah_attr->grh.flow_label); - memcpy(ah->av.dgid, ah_attr->grh.dgid.raw, 16); + memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16); + } + + ah->av.ib.dlid = cpu_to_be16(ah_attr->dlid); + if (ah_attr->static_rate) { + ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; + while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support)) + --ah->av.ib.stat_rate; } + ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); return &ah->ibah; } +static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd->device); + struct mlx4_dev *dev = ibdev->dev; + u8 mac[6]; + int err; + int is_mcast; + + err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num); + if (err) + return ERR_PTR(err); + + memcpy(ah->av.eth.mac, mac, 6); + ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); + ah->av.eth.gid_index = ah_attr->grh.sgid_index; + if (ah_attr->static_rate) { + ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; + while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support)) + --ah->av.eth.stat_rate; + } + + /* + * HW requires multicast LID so we just choose one. + */ + if (is_mcast) + ah->av.ib.dlid = cpu_to_be16(0xc000); + + memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16); + ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); + + return &ah->ibah; +} + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx4_ib_ah *ah; + struct ib_ah *ret; + + ah = kzalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) { + if (!(ah_attr->ah_flags & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + } else { + /* + * TBD: need to handle the case when we get + * called in an atomic context and there we + * might sleep. We don't expect this + * currently since we're working with link + * local addresses which we can translate + * without going to sleep. + */ + ret = create_iboe_ah(pd, ah_attr, ah); + } + + if (IS_ERR(ret)) + kfree(ah); + + return ret; + } else + return create_ib_ah(pd, ah_attr, ah); /* never fails */ +} + int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) { struct mlx4_ib_ah *ah = to_mah(ibah); + enum rdma_link_layer ll; memset(ah_attr, 0, sizeof *ah_attr); - ah_attr->dlid = be16_to_cpu(ah->av.dlid); - ah_attr->sl = be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28; - ah_attr->port_num = be32_to_cpu(ah->av.port_pd) >> 24; - if (ah->av.stat_rate) - ah_attr->static_rate = ah->av.stat_rate - MLX4_STAT_RATE_OFFSET; - ah_attr->src_path_bits = ah->av.g_slid & 0x7F; + ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; + ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num); + ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0; + if (ah->av.ib.stat_rate) + ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET; + ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F; if (mlx4_ib_ah_grh_present(ah)) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.traffic_class = - be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20; + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20; ah_attr->grh.flow_label = - be32_to_cpu(ah->av.sl_tclass_flowlabel) & 0xfffff; - ah_attr->grh.hop_limit = ah->av.hop_limit; - ah_attr->grh.sgid_index = ah->av.gid_index; - memcpy(ah_attr->grh.dgid.raw, ah->av.dgid, 16); + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff; + ah_attr->grh.hop_limit = ah->av.ib.hop_limit; + ah_attr->grh.sgid_index = ah->av.ib.gid_index; + memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16); } return 0; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index f38d5b1..c9a8dd6 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -311,19 +311,25 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) struct ib_mad_agent *agent; int p, q; int ret; + enum rdma_link_layer ll; - for (p = 0; p < dev->num_ports; ++p) + for (p = 0; p < dev->num_ports; ++p) { + ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1); for (q = 0; q <= 1; ++q) { - agent = ib_register_mad_agent(&dev->ib_dev, p + 1, - q ? IB_QPT_GSI : IB_QPT_SMI, - NULL, 0, send_handler, - NULL, NULL); - if (IS_ERR(agent)) { - ret = PTR_ERR(agent); - goto err; - } - dev->send_agent[p][q] = agent; + if (ll == IB_LINK_LAYER_INFINIBAND) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } else + dev->send_agent[p][q] = NULL; } + } return 0; @@ -344,8 +350,10 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) for (p = 0; p < dev->num_ports; ++p) { for (q = 0; q <= 1; ++q) { agent = dev->send_agent[p][q]; - dev->send_agent[p][q] = NULL; - ib_unregister_mad_agent(agent); + if (agent) { + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } } if (dev->sm_ah[p]) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 4e94e36..e65db73 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -35,9 +35,13 @@ #include #include #include +#include +#include +#include #include #include +#include #include #include @@ -58,6 +62,15 @@ static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; +struct update_gid_work { + struct work_struct work; + union ib_gid gids[128]; + struct mlx4_ib_dev *dev; + int port; +}; + +static struct workqueue_struct *wq; + static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -154,28 +167,19 @@ out: return err; } -static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) +static enum rdma_link_layer +mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { - struct ib_smp *in_mad = NULL; - struct ib_smp *out_mad = NULL; - int err = -ENOMEM; + struct mlx4_dev *dev = to_mdev(device)->dev; - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) - goto out; - - memset(props, 0, sizeof *props); - - init_query_mad(in_mad); - in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; - in_mad->attr_mod = cpu_to_be32(port); - - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); - if (err) - goto out; + return dev->caps.port_mask & (1 << (port_num - 1)) ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; +} +static int ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); @@ -196,6 +200,80 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; + return 0; +} + +static u8 state_to_phys_state(enum ib_port_state state) +{ + return state == IB_PORT_ACTIVE ? 5 : 3; +} + +static int eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ + struct mlx4_ib_iboe *iboe = &to_mdev(ibdev)->iboe; + struct net_device *ndev; + enum ib_mtu tmp; + + props->active_width = IB_WIDTH_4X; + props->active_speed = 4; + props->port_cap_flags = IB_PORT_CM_SUP; + props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; + props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->max_mtu = IB_MTU_2048; + props->subnet_timeout = 0; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = 0; + props->state = IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); + props->active_mtu = IB_MTU_256; + spin_lock(&iboe->lock); + ndev = iboe->netdevs[port - 1]; + if (!ndev) + goto out; + + tmp = iboe_get_mtu(ndev->mtu); + props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; + + props->state = netif_running(ndev) && netif_oper_up(ndev) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); + +out: + spin_unlock(&iboe->lock); + return 0; +} + +static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof *props); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? + ib_link_query_port(ibdev, port, props, out_mad) : + eth_link_query_port(ibdev, port, props, out_mad); + out: kfree(in_mad); kfree(out_mad); @@ -203,8 +281,8 @@ out: return err; } -static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, - union ib_gid *gid) +static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; @@ -241,6 +319,25 @@ out: return err; } +static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + + *gid = dev->iboe.gid_table[port - 1][index]; + + return 0; +} + +static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + return __mlx4_ib_query_gid(ibdev, port, index, gid); + else + return iboe_query_gid(ibdev, port, index, gid); +} + static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { @@ -289,6 +386,7 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, { struct mlx4_cmd_mailbox *mailbox; int err; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) @@ -304,7 +402,7 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } - err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B); mlx4_free_cmd_mailbox(dev->dev, mailbox); @@ -447,18 +545,132 @@ static int mlx4_ib_dealloc_pd(struct ib_pd *pd) return 0; } +static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_gid_entry *ge; + + ge = kzalloc(sizeof *ge, GFP_KERNEL); + if (!ge) + return -ENOMEM; + + ge->gid = *gid; + if (mlx4_ib_add_mc(mdev, mqp, gid)) { + ge->port = mqp->port; + ge->added = 1; + } + + mutex_lock(&mqp->mutex); + list_add_tail(&ge->list, &mqp->gid_list); + mutex_unlock(&mqp->mutex); + + return 0; +} + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid) +{ + u8 mac[6]; + struct net_device *ndev; + int ret = 0; + + if (!mqp->port) + return 0; + + spin_lock(&mdev->iboe.lock); + ndev = mdev->iboe.netdevs[mqp->port - 1]; + if (ndev) + dev_hold(ndev); + spin_unlock(&mdev->iboe.lock); + + if (ndev) { + rdma_get_mcast_mac((struct in6_addr *)gid, mac); + rtnl_lock(); + dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac); + ret = 1; + rtnl_unlock(); + dev_put(ndev); + } + + return ret; +} + static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { - return mlx4_multicast_attach(to_mdev(ibqp->device)->dev, - &to_mqp(ibqp)->mqp, gid->raw, - !!(to_mqp(ibqp)->flags & - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)); + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)); + if (err) + return err; + + err = add_gid_entry(ibqp, gid); + if (err) + goto err_add; + + return 0; + +err_add: + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw); + return err; +} + +static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +{ + struct mlx4_ib_gid_entry *ge; + struct mlx4_ib_gid_entry *tmp; + struct mlx4_ib_gid_entry *ret = NULL; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!memcmp(raw, ge->gid.raw, 16)) { + ret = ge; + break; + } + } + + return ret; } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { - return mlx4_multicast_detach(to_mdev(ibqp->device)->dev, - &to_mqp(ibqp)->mqp, gid->raw); + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + u8 mac[6]; + struct net_device *ndev; + struct mlx4_ib_gid_entry *ge; + + err = mlx4_multicast_detach(mdev->dev, + &mqp->mqp, gid->raw); + if (err) + return err; + + mutex_lock(&mqp->mutex); + ge = find_gid_entry(mqp, gid->raw); + if (ge) { + spin_lock(&mdev->iboe.lock); + ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; + if (ndev) + dev_hold(ndev); + spin_unlock(&mdev->iboe.lock); + rdma_get_mcast_mac((struct in6_addr *)gid, mac); + if (ndev) { + rtnl_lock(); + dev_mc_del(mdev->iboe.netdevs[ge->port - 1], mac); + rtnl_unlock(); + dev_put(ndev); + } + list_del(&ge->list); + kfree(ge); + } else + printk(KERN_WARNING "could not find mgid entry\n"); + + mutex_unlock(&mqp->mutex); + + return 0; } static int init_node_data(struct mlx4_ib_dev *dev) @@ -543,15 +755,143 @@ static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_board_id }; +static void mlx4_addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +{ + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr + 3, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; +} + +static void update_gids_task(struct work_struct *work) +{ + struct update_gid_work *gw = container_of(work, struct update_gid_work, work); + struct mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + struct ib_event event; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + printk(KERN_WARNING "update gid table failed %ld\n", PTR_ERR(mailbox)); + return; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof gw->gids); + + err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B); + if (err) + printk(KERN_WARNING "set port command failed\n"); + else { + memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); + event.device = &gw->dev->ib_dev; + event.element.port_num = gw->port; + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + mlx4_free_cmd_mailbox(dev, mailbox); + kfree(gw); +} + +static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) +{ + struct net_device *ndev = dev->iboe.netdevs[port - 1]; + struct update_gid_work *work; + + work = kzalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + if (!clear) { + mlx4_addrconf_ifid_eui48(&work->gids[0].raw[8], ndev); + work->gids[0].global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + } + + INIT_WORK(&work->work, update_gids_task); + work->port = port; + work->dev = dev; + queue_work(wq, &work->work); + + return 0; +} + +static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) +{ + switch (event) { + case NETDEV_UP: + update_ipv6_gids(dev, port, 0); + break; + + case NETDEV_DOWN: + update_ipv6_gids(dev, port, 1); + dev->iboe.netdevs[port - 1] = NULL; + } +} + +static void netdev_added(struct mlx4_ib_dev *dev, int port) +{ + update_ipv6_gids(dev, port, 0); +} + +static void netdev_removed(struct mlx4_ib_dev *dev, int port) +{ + update_ipv6_gids(dev, port, 1); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct mlx4_ib_dev *ibdev; + struct net_device *oldnd; + struct mlx4_ib_iboe *iboe; + int port; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); + iboe = &ibdev->iboe; + + spin_lock(&iboe->lock); + mlx4_foreach_ib_transport_port(port, ibdev->dev) { + oldnd = iboe->netdevs[port - 1]; + iboe->netdevs[port - 1] = + mlx4_get_protocol_dev(ibdev->dev, MLX4_PROTOCOL_EN, port); + if (oldnd != iboe->netdevs[port - 1]) { + if (iboe->netdevs[port - 1]) + netdev_added(ibdev, port); + else + netdev_removed(ibdev, port); + } + } + + if (dev == iboe->netdevs[0]) + handle_en_event(ibdev, 1, event); + else if (dev == iboe->netdevs[1]) + handle_en_event(ibdev, 2, event); + + spin_unlock(&iboe->lock); + + return NOTIFY_DONE; +} + static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; int num_ports = 0; int i; + int err; + struct mlx4_ib_iboe *iboe; printk_once(KERN_INFO "%s", mlx4_ib_version); - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + mlx4_foreach_ib_transport_port(i, dev) num_ports++; /* No point in registering a device with no ports... */ @@ -564,6 +904,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) return NULL; } + iboe = &ibdev->iboe; + if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) goto err_dealloc; @@ -612,6 +954,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; + ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; ibdev->ib_dev.query_gid = mlx4_ib_query_gid; ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; ibdev->ib_dev.modify_device = mlx4_ib_modify_device; @@ -656,6 +999,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + spin_lock_init(&iboe->lock); + if (init_node_data(ibdev)) goto err_map; @@ -668,16 +1013,28 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_mad_init(ibdev)) goto err_reg; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) + goto err_reg; + } + for (i = 0; i < ARRAY_SIZE(mlx4_class_attributes); ++i) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[i])) - goto err_reg; + goto err_notif; } ibdev->ib_active = true; return ibdev; +err_notif: + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + printk(KERN_WARNING "failure unregistering notifier\n"); + flush_workqueue(wq); + err_reg: ib_unregister_device(&ibdev->ib_dev); @@ -703,11 +1060,16 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) mlx4_ib_mad_cleanup(ibdev); ib_unregister_device(&ibdev->ib_dev); + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + printk(KERN_WARNING "failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + iounmap(ibdev->uar_map); - for (p = 1; p <= ibdev->num_ports; ++p) + mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); - iounmap(ibdev->uar_map); mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); @@ -747,19 +1109,33 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, } static struct mlx4_interface mlx4_ib_interface = { - .add = mlx4_ib_add, - .remove = mlx4_ib_remove, - .event = mlx4_ib_event + .add = mlx4_ib_add, + .remove = mlx4_ib_remove, + .event = mlx4_ib_event, + .protocol = MLX4_PROTOCOL_IB }; static int __init mlx4_ib_init(void) { - return mlx4_register_interface(&mlx4_ib_interface); + int err; + + wq = create_singlethread_workqueue("mlx4_ib"); + if (!wq) + return -ENOMEM; + + err = mlx4_register_interface(&mlx4_ib_interface); + if (err) { + destroy_workqueue(wq); + return err; + } + + return 0; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); + destroy_workqueue(wq); } module_init(mlx4_ib_init); diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 3486d76..2a322f2 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -112,6 +112,13 @@ enum mlx4_ib_qp_flags { MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1, }; +struct mlx4_ib_gid_entry { + struct list_head list; + union ib_gid gid; + int added; + u8 port; +}; + struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; @@ -138,6 +145,8 @@ struct mlx4_ib_qp { u8 resp_depth; u8 sq_no_prefetch; u8 state; + int mlx_type; + struct list_head gid_list; }; struct mlx4_ib_srq { @@ -157,7 +166,14 @@ struct mlx4_ib_srq { struct mlx4_ib_ah { struct ib_ah ibah; - struct mlx4_av av; + union mlx4_ext_av av; +}; + +struct mlx4_ib_iboe { + spinlock_t lock; + struct net_device *netdevs[MLX4_MAX_PORTS]; + struct notifier_block nb; + union ib_gid gid_table[MLX4_MAX_PORTS][128]; }; struct mlx4_ib_dev { @@ -176,6 +192,7 @@ struct mlx4_ib_dev { struct mutex cap_mask_mutex; bool ib_active; + struct mlx4_ib_iboe iboe; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) @@ -314,9 +331,20 @@ int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, int mlx4_ib_unmap_fmr(struct list_head *fmr_list); int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast, u8 port); + static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { - return !!(ah->av.g_slid & 0x80); + u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; + + if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) + return 1; + + return !!(ah->av.ib.g_slid & 0x80); } +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index bb1277c..17f60fe 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -48,17 +49,25 @@ enum { enum { MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, - MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX4_IB_LINK_TYPE_IB = 0, + MLX4_IB_LINK_TYPE_ETH = 1 }; enum { /* - * Largest possible UD header: send with GRH and immediate data. + * Largest possible UD header: send with GRH and immediate + * data plus 14 bytes for an Ethernet header. (LRH would only + * use 8 bytes, so Ethernet is the biggest case) */ - MLX4_IB_UD_HEADER_SIZE = 72, + MLX4_IB_UD_HEADER_SIZE = 78, MLX4_IB_LSO_HEADER_SPARE = 128, }; +enum { + MLX4_IB_IBOE_ETHERTYPE = 0x8915 +}; + struct mlx4_ib_sqp { struct mlx4_ib_qp qp; int pkey_index; @@ -462,6 +471,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); + INIT_LIST_HEAD(&qp->gid_list); qp->state = IB_QPS_RESET; if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) @@ -649,6 +659,16 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re } } +static void del_gid_entries(struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + list_del(&ge->list); + kfree(ge); + } +} + static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, int is_user) { @@ -695,6 +715,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, if (!qp->ibqp.srq) mlx4_db_free(dev->dev, &qp->db); } + + del_gid_entries(qp); } struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, @@ -852,6 +874,12 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, struct mlx4_qp_path *path, u8 port) { + int err; + int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET; + u8 mac[6]; + int is_mcast; + path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); if (ah->static_rate) { @@ -882,9 +910,35 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 0xf) << 2); + if (is_eth) { + if (!(ah->ah_flags & IB_AH_GRH)) + return -1; + + err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port); + if (err) + return err; + + memcpy(path->dmac, mac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* use index 0 into MAC table for IBoE */ + path->grh_mylmc &= 0x80; + } + return 0; } +static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { + ge->added = 1; + ge->port = qp->port; + } + } +} + static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -980,7 +1034,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_TIMEOUT) { - context->pri_path.ackto = attr->timeout << 3; + context->pri_path.ackto |= attr->timeout << 3; optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; } @@ -1118,8 +1172,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, qp->atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) qp->resp_depth = attr->max_dest_rd_atomic; - if (attr_mask & IB_QP_PORT) + if (attr_mask & IB_QP_PORT) { qp->port = attr->port_num; + update_mcg_macs(dev, qp); + } if (attr_mask & IB_QP_ALT_PATH) qp->alt_port = attr->alt_port_num; @@ -1226,35 +1282,45 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, int header_size; int spc; int i; + int is_eth; + int is_grh; send_size = 0; for (i = 0; i < wr->num_sge; ++i) send_size += wr->sg_list[i].length; - ib_ud_header_init(send_size, 1, 0, mlx4_ib_ah_grh_present(ah), 0, &sqp->ud_header); + is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + ib_ud_header_init(send_size, !is_eth, is_eth, is_grh, 0, &sqp->ud_header); + + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } - sqp->ud_header.lrh.service_level = - be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28; - sqp->ud_header.lrh.destination_lid = ah->av.dlid; - sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.g_slid & 0x7f); - if (mlx4_ib_ah_grh_present(ah)) { + if (is_grh) { sqp->ud_header.grh.traffic_class = - (be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff; + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; sqp->ud_header.grh.flow_label = - ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff); - sqp->ud_header.grh.hop_limit = ah->av.hop_limit; - ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24, - ah->av.gid_index, &sqp->ud_header.grh.source_gid); + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); memcpy(sqp->ud_header.grh.destination_gid.raw, - ah->av.dgid, 16); + ah->av.ib.dgid, 16); } mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | - (sqp->ud_header.lrh.destination_lid == - IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | - (sqp->ud_header.lrh.service_level << 8)); - mlx->rlid = sqp->ud_header.lrh.destination_lid; + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } switch (wr->opcode) { case IB_WR_SEND: @@ -1270,9 +1336,21 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, return -EINVAL; } - sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; - if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) - sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + if (is_eth) { + u8 *smac; + + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + /* FIXME: cache smac value? */ + smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; + memcpy(sqp->ud_header.eth.smac_h, smac, 6); + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); if (!sqp->qp.ibqp.qp_num) ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); @@ -1434,6 +1512,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); } static void set_mlx_icrc_seg(void *dseg) diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 47e163a..ca5645c 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -374,6 +374,27 @@ struct mlx4_av { u8 dgid[16]; }; +struct mlx4_eth_av { + __be32 port_pd; + u8 reserved1; + u8 smac_idx; + u16 reserved2; + u8 reserved3; + u8 gid_index; + u8 stat_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 dgid[16]; + u32 reserved4[2]; + __be16 vlan; + u8 mac[6]; +}; + +union mlx4_ext_av { + struct mlx4_av ib; + struct mlx4_eth_av eth; +}; + struct mlx4_dev { struct pci_dev *pdev; unsigned long flags; @@ -402,6 +423,12 @@ struct mlx4_init_port_param { if (((type) == MLX4_PORT_TYPE_IB ? (dev)->caps.port_mask : \ ~(dev)->caps.port_mask) & 1 << ((port) - 1)) +#define mlx4_foreach_ib_transport_port(port, dev) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + if (((dev)->caps.port_mask & 1 << ((port) - 1)) || \ + ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + + int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, struct mlx4_buf *buf); void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); -- cgit v0.10.2 From af7bd463761c6abd8ca8d831f9cc0ac19f3b7d4b Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 26 Aug 2010 17:18:59 +0300 Subject: IB/core: Add VLAN support for IBoE Add 802.1q VLAN support to IBoE. The VLAN tag is encoded within the GID derived from a link local address in the following way: GID[11] GID[12] contain the VLAN ID when the GID contains a VLAN. The 3 bits user priority field of the packets are identical to the 3 bits of the SL. In case of rdma_cm apps, the TOS field is used to generate the SL field by doing a shift right of 5 bits effectively taking to 3 MS bits of the TOS field. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f61bc07..6884da2 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1791,6 +1791,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; + u16 vid; if (src_addr->sin_family != dst_addr->sin_family) return -EINVAL; @@ -1810,14 +1811,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->num_paths = 1; - iboe_mac_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr); - iboe_mac_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr); - - route->path_rec->hop_limit = 1; - route->path_rec->reversible = 1; - route->path_rec->pkey = cpu_to_be16(0xffff); - route->path_rec->mtu_selector = IB_SA_EQ; - if (addr->dev_addr.bound_dev_if) ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); if (!ndev) { @@ -1825,6 +1818,17 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } + vid = rdma_vlan_dev_vlan_id(ndev); + + iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid); + iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid); + + route->path_rec->hop_limit = 1; + route->path_rec->reversible = 1; + route->path_rec->pkey = cpu_to_be16(0xffff); + route->path_rec->mtu_selector = IB_SA_EQ; + route->path_rec->sl = id_priv->tos >> 5; + route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 3d3c926..357a766 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -587,13 +587,21 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; + struct net_device *dev; + u16 vid = 0; resp->num_paths = route->num_paths; switch (route->num_paths) { case 0: dev_addr = &route->addr.dev_addr; - iboe_mac_to_ll((union ib_gid *) &resp->ib_route[0].dgid, - dev_addr->dst_dev_addr); + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (dev) { + vid = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + } + + iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid, + dev_addr->dst_dev_addr, vid); iboe_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index cb0dd5a..bb7e192 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -33,6 +33,7 @@ #include #include +#include #include @@ -103,6 +104,17 @@ static const struct ib_field eth_table[] = { .size_bits = 16 } }; +static const struct ib_field vlan_table[] = { + { STRUCT_FIELD(vlan, tag), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(vlan, type), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -205,6 +217,7 @@ static const struct ib_field deth_table[] = { * @payload_bytes:Length of packet payload * @lrh_present: specify if LRH is present * @eth_present: specify if Eth header is present + * @vlan_present: packet is tagged vlan * @grh_present:GRH flag (if non-zero, GRH will be included) * @immediate_present: specify if immediate data is present * @header:Structure to initialize @@ -212,6 +225,7 @@ static const struct ib_field deth_table[] = { void ib_ud_header_init(int payload_bytes, int lrh_present, int eth_present, + int vlan_present, int grh_present, int immediate_present, struct ib_ud_header *header) @@ -234,6 +248,9 @@ void ib_ud_header_init(int payload_bytes, header->lrh.packet_length = cpu_to_be16(packet_length); } + if (vlan_present) + header->eth.type = cpu_to_be16(ETH_P_8021Q); + if (grh_present) { header->grh.ip_version = 6; header->grh.payload_length = @@ -254,6 +271,7 @@ void ib_ud_header_init(int payload_bytes, header->lrh_present = lrh_present; header->eth_present = eth_present; + header->vlan_present = vlan_present; header->grh_present = grh_present; header->immediate_present = immediate_present; } @@ -312,6 +330,11 @@ int ib_ud_header_pack(struct ib_ud_header *header, &header->eth, buf + len); len += IB_ETH_BYTES; } + if (header->vlan_present) { + ib_pack(vlan_table, ARRAY_SIZE(vlan_table), + &header->vlan, buf + len); + len += IB_VLAN_BYTES; + } if (header->grh_present) { ib_pack(grh_table, ARRAY_SIZE(grh_table), &header->grh, buf + len); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 17f60fe..2696484 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1291,7 +1291,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; is_grh = mlx4_ib_ah_grh_present(ah); - ib_ud_header_init(send_size, !is_eth, is_eth, is_grh, 0, &sqp->ud_header); + ib_ud_header_init(send_size, !is_eth, is_eth, 0, is_grh, 0, &sqp->ud_header); if (!is_eth) { sqp->ud_header.lrh.service_level = diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 1a1c55f..a34c9d3 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1493,7 +1493,7 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, int err; u16 pkey; - ib_ud_header_init(256, /* assume a MAD */ 1, 0, + ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0, mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), 0, &sqp->ud_header); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 904ffa9..b5fc9f3 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -129,21 +130,41 @@ static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } -static inline void iboe_mac_to_ll(union ib_gid *gid, u8 *mac) +static inline void iboe_mac_vlan_to_ll(union ib_gid *gid, u8 *mac, u16 vid) { memset(gid->raw, 0, 16); *((__be32 *) gid->raw) = cpu_to_be32(0xfe800000); - gid->raw[12] = 0xfe; - gid->raw[11] = 0xff; + if (vid < 0x1000) { + gid->raw[12] = vid & 0xff; + gid->raw[11] = vid >> 8; + } else { + gid->raw[12] = 0xfe; + gid->raw[11] = 0xff; + } memcpy(gid->raw + 13, mac + 3, 3); memcpy(gid->raw + 8, mac, 3); gid->raw[8] ^= 2; } +static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) +{ + return dev->priv_flags & IFF_802_1Q_VLAN ? + vlan_dev_vlan_id(dev) : 0xffff; +} + static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - iboe_mac_to_ll(gid, dev_addr->src_dev_addr); + struct net_device *dev; + u16 vid = 0xffff; + + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (dev) { + vid = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + } + + iboe_mac_vlan_to_ll(gid, dev_addr->src_dev_addr, vid); } static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) @@ -244,4 +265,18 @@ static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) mac[i] = addr->s6_addr[i + 10]; } +static inline u16 rdma_get_vlan_id(union ib_gid *dgid) +{ + u16 vid; + + vid = dgid->raw[11] << 8 | dgid->raw[12]; + return vid < 0x1000 ? vid : 0xffff; +} + +static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) +{ + return dev->priv_flags & IFF_802_1Q_VLAN ? + vlan_dev_real_dev(dev) : 0; +} + #endif /* IB_ADDR_H */ diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index 6b91d8e..b37fe3b 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -38,6 +38,7 @@ enum { IB_LRH_BYTES = 8, IB_ETH_BYTES = 14, + IB_VLAN_BYTES = 4, IB_GRH_BYTES = 40, IB_BTH_BYTES = 12, IB_DETH_BYTES = 8 @@ -219,11 +220,18 @@ struct ib_unpacked_eth { __be16 type; }; +struct ib_unpacked_vlan { + __be16 tag; + __be16 type; +}; + struct ib_ud_header { int lrh_present; struct ib_unpacked_lrh lrh; int eth_present; struct ib_unpacked_eth eth; + int vlan_present; + struct ib_unpacked_vlan vlan; int grh_present; struct ib_unpacked_grh grh; struct ib_unpacked_bth bth; @@ -245,6 +253,7 @@ void ib_unpack(const struct ib_field *desc, void ib_ud_header_init(int payload_bytes, int lrh_present, int eth_present, + int vlan_present, int grh_present, int immediate_present, struct ib_ud_header *header); -- cgit v0.10.2 From 4c3eb3ca13966508bcb64f39dcdef48be22f1731 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 26 Aug 2010 17:19:22 +0300 Subject: IB/mlx4: Add VLAN support for IBoE This patch allows IBoE traffic to be encapsulated in 802.1Q tagged VLAN frames. The VLAN tag is encoded in the GID and derived from it by a simple computation. The netdev notifier callback is modified to catch VLAN device addition/removal and the port's GID table is updated to reflect the change, so that for each netdevice there is an entry in the GID table. When the port's GID table is exhausted, GID entries will not be added. Only children of the main interfaces can add to the GID table; if a VLAN interface is added on another VLAN interface (e.g. "vconfig add eth2.6 8"), then that interfaces will not add an entry to the GID table. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 3bf3544..4b8f9c4 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,6 +31,7 @@ */ #include +#include #include #include @@ -91,17 +92,26 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); struct mlx4_dev *dev = ibdev->dev; + union ib_gid sgid; u8 mac[6]; int err; int is_mcast; + u16 vlan_tag; err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num); if (err) return ERR_PTR(err); memcpy(ah->av.eth.mac, mac, 6); + err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid); + if (err) + return ERR_PTR(err); + vlan_tag = rdma_get_vlan_id(&sgid); + if (vlan_tag < 0x1000) + vlan_tag |= (ah_attr->sl & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); ah->av.eth.gid_index = ah_attr->grh.sgid_index; + ah->av.eth.vlan = cpu_to_be16(vlan_tag); if (ah_attr->static_rate) { ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index e65db73..8736bd8 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -79,6 +80,8 @@ static void init_query_mad(struct ib_smp *mad) mad->method = IB_MGMT_METHOD_GET; } +static union ib_gid zgid; + static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { @@ -755,12 +758,17 @@ static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_board_id }; -static void mlx4_addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) { memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); - eui[3] = 0xFF; - eui[4] = 0xFE; + if (vlan_id < 0x1000) { + eui[3] = vlan_id >> 8; + eui[4] = vlan_id & 0xff; + } else { + eui[3] = 0xff; + eui[4] = 0xfe; + } eui[0] ^= 2; } @@ -802,28 +810,93 @@ static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) { struct net_device *ndev = dev->iboe.netdevs[port - 1]; struct update_gid_work *work; + struct net_device *tmp; + int i; + u8 *hits; + int ret; + union ib_gid gid; + int free; + int found; + int need_update = 0; + u16 vid; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; - if (!clear) { - mlx4_addrconf_ifid_eui48(&work->gids[0].raw[8], ndev); - work->gids[0].global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + hits = kzalloc(128, GFP_ATOMIC); + if (!hits) { + ret = -ENOMEM; + goto out; + } + + read_lock(&dev_base_lock); + for_each_netdev(&init_net, tmp) { + if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { + gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + vid = rdma_vlan_dev_vlan_id(tmp); + mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); + found = 0; + free = -1; + for (i = 0; i < 128; ++i) { + if (free < 0 && + !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) + free = i; + if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { + hits[i] = 1; + found = 1; + break; + } + } + + if (!found) { + if (tmp == ndev && + (memcmp(&dev->iboe.gid_table[port - 1][0], + &gid, sizeof gid) || + !memcmp(&dev->iboe.gid_table[port - 1][0], + &zgid, sizeof gid))) { + dev->iboe.gid_table[port - 1][0] = gid; + ++need_update; + hits[0] = 1; + } else if (free >= 0) { + dev->iboe.gid_table[port - 1][free] = gid; + hits[free] = 1; + ++need_update; + } + } + } } + read_unlock(&dev_base_lock); + + for (i = 0; i < 128; ++i) + if (!hits[i]) { + if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) + ++need_update; + dev->iboe.gid_table[port - 1][i] = zgid; + } - INIT_WORK(&work->work, update_gids_task); - work->port = port; - work->dev = dev; - queue_work(wq, &work->work); + if (need_update) { + memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); + INIT_WORK(&work->work, update_gids_task); + work->port = port; + work->dev = dev; + queue_work(wq, &work->work); + } else + kfree(work); + kfree(hits); return 0; + +out: + kfree(work); + return ret; } static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) { switch (event) { case NETDEV_UP: + case NETDEV_CHANGEADDR: update_ipv6_gids(dev, port, 0); break; @@ -871,9 +944,11 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event } } - if (dev == iboe->netdevs[0]) + if (dev == iboe->netdevs[0] || + (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) handle_en_event(ibdev, 1, event); - else if (dev == iboe->netdevs[1]) + else if (dev == iboe->netdevs[1] + || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) handle_en_event(ibdev, 2, event); spin_unlock(&iboe->lock); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 2696484..9a7794a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -37,6 +37,7 @@ #include #include +#include #include @@ -57,10 +58,11 @@ enum { enum { /* * Largest possible UD header: send with GRH and immediate - * data plus 14 bytes for an Ethernet header. (LRH would only - * use 8 bytes, so Ethernet is the biggest case) + * data plus 18 bytes for an Ethernet header with VLAN/802.1Q + * tag. (LRH would only use 8 bytes, so Ethernet is the + * biggest case) */ - MLX4_IB_UD_HEADER_SIZE = 78, + MLX4_IB_UD_HEADER_SIZE = 82, MLX4_IB_LSO_HEADER_SPARE = 128, }; @@ -879,6 +881,8 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, IB_LINK_LAYER_ETHERNET; u8 mac[6]; int is_mcast; + u16 vlan_tag; + int vidx; path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); @@ -907,10 +911,10 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, memcpy(path->rgid, ah->grh.dgid.raw, 16); } - path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | - ((port - 1) << 6) | ((ah->sl & 0xf) << 2); - if (is_eth) { + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 7) << 3) | ((ah->sl & 8) >> 1); + if (!(ah->ah_flags & IB_AH_GRH)) return -1; @@ -922,7 +926,18 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, path->ackto = MLX4_IB_LINK_TYPE_ETH; /* use index 0 into MAC table for IBoE */ path->grh_mylmc &= 0x80; - } + + vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); + if (vlan_tag < 0x1000) { + if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx)) + return -ENOENT; + + path->vlan_index = vidx; + path->fl = 1 << 6; + } + } else + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 0xf) << 2); return 0; } @@ -1277,13 +1292,16 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, struct mlx4_wqe_mlx_seg *mlx = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + union ib_gid sgid; u16 pkey; int send_size; int header_size; int spc; int i; int is_eth; + int is_vlan = 0; int is_grh; + u16 vlan; send_size = 0; for (i = 0; i < wr->num_sge; ++i) @@ -1291,7 +1309,13 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; is_grh = mlx4_ib_ah_grh_present(ah); - ib_ud_header_init(send_size, !is_eth, is_eth, 0, is_grh, 0, &sqp->ud_header); + if (is_eth) { + ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + vlan = rdma_get_vlan_id(&sgid); + is_vlan = vlan < 0x1000; + } + ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); if (!is_eth) { sqp->ud_header.lrh.service_level = @@ -1345,7 +1369,15 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); - sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + if (!is_vlan) { + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + } else { + u16 pcp; + + sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13; + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } } else { sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) @@ -1507,13 +1539,14 @@ static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, } static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, - struct ib_send_wr *wr) + struct ib_send_wr *wr, __be16 *vlan) { memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); + *vlan = dseg->vlan; } static void set_mlx_icrc_seg(void *dseg) @@ -1616,6 +1649,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 uninitialized_var(lso_hdr_sz); __be32 blh; int i; + __be16 vlan = cpu_to_be16(0xffff); spin_lock_irqsave(&qp->sq.lock, flags); @@ -1719,7 +1753,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case IB_QPT_UD: - set_datagram_seg(wqe, wr); + set_datagram_seg(wqe, wr, &vlan); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; @@ -1797,6 +1831,11 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; + if (be16_to_cpu(vlan) < 0x1000) { + ctrl->ins_vlan = 1 << 6; + ctrl->vlan_tag = vlan; + } + stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); @@ -1946,17 +1985,27 @@ static int to_ib_qp_access_flags(int mlx4_flags) return ib_flags; } -static void to_ib_ah_attr(struct mlx4_dev *dev, struct ib_ah_attr *ib_ah_attr, +static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, struct mlx4_qp_path *path) { + struct mlx4_dev *dev = ibdev->dev; + int is_eth; + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1; if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) return; + is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) + ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | + ((path->sched_queue & 4) << 1); + else + ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; + ib_ah_attr->dlid = be16_to_cpu(path->rlid); - ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0; @@ -2009,8 +2058,8 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr to_ib_qp_access_flags(be32_to_cpu(context.params2)); if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { - to_ib_ah_attr(dev->dev, &qp_attr->ah_attr, &context.pri_path); - to_ib_ah_attr(dev->dev, &qp_attr->alt_ah_attr, &context.alt_path); + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; } diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index a0d8a26..9a87c4f 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -69,6 +69,7 @@ static void mlx4_en_vlan_rx_add_vid(struct net_device *dev, unsigned short vid) struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int err; + int idx; if (!priv->vlgrp) return; @@ -83,7 +84,10 @@ static void mlx4_en_vlan_rx_add_vid(struct net_device *dev, unsigned short vid) if (err) en_err(priv, "Failed configuring VLAN filter\n"); } + if (mlx4_register_vlan(mdev->dev, priv->port, vid, &idx)) + en_err(priv, "failed adding vlan %d\n", vid); mutex_unlock(&mdev->state_lock); + } static void mlx4_en_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) @@ -91,6 +95,7 @@ static void mlx4_en_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int err; + int idx; if (!priv->vlgrp) return; @@ -101,6 +106,11 @@ static void mlx4_en_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) /* Remove VID from port VLAN filter */ mutex_lock(&mdev->state_lock); + if (!mlx4_find_cached_vlan(mdev->dev, priv->port, vid, &idx)) + mlx4_unregister_vlan(mdev->dev, priv->port, idx); + else + en_err(priv, "could not find vid %d in cache\n", vid); + if (mdev->device_up && priv->port_up) { err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp); if (err) diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h index 4492109..dab5eaf 100644 --- a/drivers/net/mlx4/mlx4_en.h +++ b/drivers/net/mlx4/mlx4_en.h @@ -463,6 +463,7 @@ struct mlx4_en_priv { char *mc_addrs; int mc_addrs_cnt; struct mlx4_en_stat_out_mbox hw_stats; + int vids[128]; }; diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c index 606aa58..56371ef 100644 --- a/drivers/net/mlx4/port.c +++ b/drivers/net/mlx4/port.c @@ -182,6 +182,25 @@ static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, return err; } +int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx) +{ + struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + int i; + + for (i = 0; i < MLX4_MAX_VLAN_NUM; ++i) { + if (table->refs[i] && + (vid == (MLX4_VLAN_MASK & + be32_to_cpu(table->entries[i])))) { + /* VLAN already registered, increase reference count */ + *idx = i; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan); + int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ca5645c..ff9893a3 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -496,6 +496,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index); void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index); +int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 97cfdc8..0eeb2a1 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -109,7 +109,7 @@ struct mlx4_qp_path { __be32 tclass_flowlabel; u8 rgid[16]; u8 sched_queue; - u8 snooper_flags; + u8 vlan_index; u8 reserved3[2]; u8 counter_index; u8 reserved4; -- cgit v0.10.2 From 8ad330a002ffbc422f32a77398666832f0df8a3d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Fri, 22 Oct 2010 14:32:05 +0200 Subject: IB/core: Add link layer type information to sysfs Since an IB transport port may use either IB or Ethernet as its link layer, add the file /sys/class/infiniband//ports//link_layer to show the link layer for the port. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 3627300..9ab5df7 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -222,6 +222,19 @@ static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, } } +static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + return sprintf(buf, "%s\n", "InfiniBand"); + case IB_LINK_LAYER_ETHERNET: + return sprintf(buf, "%s\n", "Ethernet"); + default: + return sprintf(buf, "%s\n", "Unknown"); + } +} + static PORT_ATTR_RO(state); static PORT_ATTR_RO(lid); static PORT_ATTR_RO(lid_mask_count); @@ -230,6 +243,7 @@ static PORT_ATTR_RO(sm_sl); static PORT_ATTR_RO(cap_mask); static PORT_ATTR_RO(rate); static PORT_ATTR_RO(phys_state); +static PORT_ATTR_RO(link_layer); static struct attribute *port_default_attrs[] = { &port_attr_state.attr, @@ -240,6 +254,7 @@ static struct attribute *port_default_attrs[] = { &port_attr_cap_mask.attr, &port_attr_rate.attr, &port_attr_phys_state.attr, + &port_attr_link_layer.attr, NULL }; -- cgit v0.10.2 From ca7cf94f8bf77bf0dfb35b615d82ac76a0ed77ff Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 25 Oct 2010 19:44:22 -0700 Subject: RDMA/cxgb3: Remove unnecessary KERN_ use Signed-off-by: Joe Perches Acked-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 13c8887..d02dcc6 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -1093,8 +1093,8 @@ static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) PDBG("%s ep %p credits %u\n", __func__, ep, credits); if (credits == 0) { - PDBG(KERN_ERR "%s 0 credit ack ep %p state %u\n", - __func__, ep, state_read(&ep->com)); + PDBG("%s 0 credit ack ep %p state %u\n", + __func__, ep, state_read(&ep->com)); return CPL_RET_BUF_DONE; } -- cgit v0.10.2 From aa1ad26089a90d5b7e2c908835ba6dc930eed019 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 25 Oct 2010 19:44:22 -0700 Subject: RDMA/cxgb4: Remove unnecessary KERN_ use Signed-off-by: Joe Perches Acked-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 0f68e2b..0dc62b1 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1749,8 +1749,8 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) ep = lookup_tid(t, tid); PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits); if (credits == 0) { - PDBG(KERN_ERR "%s 0 credit ack ep %p tid %u state %u\n", - __func__, ep, ep->hwtid, state_read(&ep->com)); + PDBG("%s 0 credit ack ep %p tid %u state %u\n", + __func__, ep, ep->hwtid, state_read(&ep->com)); return 0; } -- cgit v0.10.2 From 82fdb0ab54096b8dbc8558e2dd37e9e0ac180db8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 22 Oct 2010 20:41:24 +0000 Subject: IB/qib: Fix extra log level in qib_early_err() Noticed this odd looking thing in dmesg: ib_qib 0000:02:00.0: <3>ib_qib: Unable to enable pcie error reporting: -5 which is due to a bad use of dev_info. Signed-off-by: Jason Gunthorpe Acked-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 61de065..64c9e7d 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1406,7 +1406,7 @@ extern struct mutex qib_mutex; */ #define qib_early_err(dev, fmt, ...) \ do { \ - dev_info(dev, KERN_ERR QIB_DRV_NAME ": " fmt, ##__VA_ARGS__); \ + dev_err(dev, fmt, ##__VA_ARGS__); \ } while (0) #define qib_dev_err(dd, fmt, ...) \ -- cgit v0.10.2 From 9e43e0106d6f526724911e80adb97dbcec520b5d Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 22 Oct 2010 15:29:46 -0700 Subject: IB/qib: Fix uninitialized pointer if CONFIG_PCI_MSI not set If CONFIG_PCI_MSI is not set, and a QLE7140 is present, the pointer "dd" is uninitialized. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index f1d16d3..f3b5039 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1243,6 +1243,7 @@ static int __devinit qib_init_one(struct pci_dev *pdev, qib_early_err(&pdev->dev, "QLogic PCIE device 0x%x cannot " "work if CONFIG_PCI_MSI is not enabled\n", ent->device); + dd = ERR_PTR(-ENODEV); #endif break; -- cgit v0.10.2 From 5d26a1df23f7e904e7b03bb2580288f6ae7cfe48 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 22 Oct 2010 15:29:54 -0700 Subject: IB/qib: Allow driver to load if PCIe AER fails Some PCIe root complex chip sets don't support advanced error reporting. Allow the driver to load OK if pci_enable_pcie_error_reporting() fails. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 7fa6e55..8a64426 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -109,10 +109,12 @@ int qib_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_master(pdev); ret = pci_enable_pcie_error_reporting(pdev); - if (ret) + if (ret) { qib_early_err(&pdev->dev, "Unable to enable pcie error reporting: %d\n", ret); + ret = 0; + } goto done; bail: -- cgit v0.10.2 From 2ca78d23a758d3299abd128556debff4c1fefb16 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 25 Oct 2010 21:19:06 -0700 Subject: IB/qib: clean up properly if pci_set_consistent_dma_mask() fails Clean up properly if pci_set_consistent_dma_mask() fails. Signed-off-by: Jason Gunthorpe Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 8a64426..48b6674 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -103,9 +103,11 @@ int qib_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); } else ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (ret) + if (ret) { qib_early_err(&pdev->dev, "Unable to set DMA consistent mask: %d\n", ret); + goto bail; + } pci_set_master(pdev); ret = pci_enable_pcie_error_reporting(pdev); -- cgit v0.10.2