summaryrefslogtreecommitdiff
path: root/drivers/infiniband/core/verbs.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-01-24 02:45:06 (GMT)
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-24 02:45:06 (GMT)
commit048ccca8c1c8f583deec3367d7df521bb1f542ae (patch)
treeefa882c88f658f711d63581a3063203c63682338 /drivers/infiniband/core/verbs.c
parentb3e27d5d4a29bcc8e057b496d5ef5194addaaac0 (diff)
parent34356f64ac0df2326fa50e2d4bca6f7c03ed16c1 (diff)
downloadlinux-048ccca8c1c8f583deec3367d7df521bb1f542ae.tar.xz
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
Pull rdma updates from Doug Ledford: "Initial roundup of 4.5 merge window patches - Remove usage of ib_query_device and instead store attributes in ib_device struct - Move iopoll out of block and into lib, rename to irqpoll, and use in several places in the rdma stack as our new completion queue polling library mechanism. Update the other block drivers that already used iopoll to use the new mechanism too. - Replace the per-entry GID table locks with a single GID table lock - IPoIB multicast cleanup - Cleanups to the IB MR facility - Add support for 64bit extended IB counters - Fix for netlink oops while parsing RDMA nl messages - RoCEv2 support for the core IB code - mlx4 RoCEv2 support - mlx5 RoCEv2 support - Cross Channel support for mlx5 - Timestamp support for mlx5 - Atomic support for mlx5 - Raw QP support for mlx5 - MAINTAINERS update for mlx4/mlx5 - Misc ocrdma, qib, nes, usNIC, cxgb3, cxgb4, mlx4, mlx5 updates - Add support for remote invalidate to the iSER driver (pushed through the RDMA tree due to dependencies, acknowledged by nab) - Update to NFSoRDMA (pushed through the RDMA tree due to dependencies, acknowledged by Bruce)" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (169 commits) IB/mlx5: Unify CQ create flags check IB/mlx5: Expose Raw Packet QP to user space consumers {IB, net}/mlx5: Move the modify QP operation table to mlx5_ib IB/mlx5: Support setting Ethernet priority for Raw Packet QPs IB/mlx5: Add Raw Packet QP query functionality IB/mlx5: Add create and destroy functionality for Raw Packet QP IB/mlx5: Refactor mlx5_ib_qp to accommodate other QP types IB/mlx5: Allocate a Transport Domain for each ucontext net/mlx5_core: Warn on unsupported events of QP/RQ/SQ net/mlx5_core: Add RQ and SQ event handling net/mlx5_core: Export transport objects IB/mlx5: Expose CQE version to user-space IB/mlx5: Add CQE version 1 support to user QPs and SRQs IB/mlx5: Fix data validation in mlx5_ib_alloc_ucontext IB/sa: Fix netlink local service GFP crash IB/srpt: Remove redundant wc array IB/qib: Improve ipoib UD performance IB/mlx4: Advertise RoCE v2 support IB/mlx4: Create and use another QP1 for RoCEv2 IB/mlx4: Enable send of RoCE QP1 packets with IP/UDP headers ...
Diffstat (limited to 'drivers/infiniband/core/verbs.c')
-rw-r--r--drivers/infiniband/core/verbs.c238
1 files changed, 158 insertions, 80 deletions
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 545906d..5af6d02 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -229,12 +229,6 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
struct ib_pd *ib_alloc_pd(struct ib_device *device)
{
struct ib_pd *pd;
- struct ib_device_attr devattr;
- int rc;
-
- rc = ib_query_device(device, &devattr);
- if (rc)
- return ERR_PTR(rc);
pd = device->alloc_pd(device, NULL, NULL);
if (IS_ERR(pd))
@@ -245,7 +239,7 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device)
pd->local_mr = NULL;
atomic_set(&pd->usecnt, 0);
- if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
pd->local_dma_lkey = device->local_dma_lkey;
else {
struct ib_mr *mr;
@@ -311,8 +305,61 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
}
EXPORT_SYMBOL(ib_create_ah);
+static int ib_get_header_version(const union rdma_network_hdr *hdr)
+{
+ const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
+ struct iphdr ip4h_checked;
+ const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;
+
+ /* If it's IPv6, the version must be 6, otherwise, the first
+ * 20 bytes (before the IPv4 header) are garbled.
+ */
+ if (ip6h->version != 6)
+ return (ip4h->version == 4) ? 4 : 0;
+ /* version may be 6 or 4 because the first 20 bytes could be garbled */
+
+ /* RoCE v2 requires no options, thus header length
+ * must be 5 words
+ */
+ if (ip4h->ihl != 5)
+ return 6;
+
+ /* Verify checksum.
+ * We can't write on scattered buffers so we need to copy to
+ * temp buffer.
+ */
+ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+ ip4h_checked.check = 0;
+ ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
+ /* if IPv4 header checksum is OK, believe it */
+ if (ip4h->check == ip4h_checked.check)
+ return 4;
+ return 6;
+}
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+ u8 port_num,
+ const struct ib_grh *grh)
+{
+ int grh_version;
+
+ if (rdma_protocol_ib(device, port_num))
+ return RDMA_NETWORK_IB;
+
+ grh_version = ib_get_header_version((union rdma_network_hdr *)grh);
+
+ if (grh_version == 4)
+ return RDMA_NETWORK_IPV4;
+
+ if (grh->next_hdr == IPPROTO_UDP)
+ return RDMA_NETWORK_IPV6;
+
+ return RDMA_NETWORK_ROCE_V1;
+}
+
struct find_gid_index_context {
u16 vlan_id;
+ enum ib_gid_type gid_type;
};
static bool find_gid_index(const union ib_gid *gid,
@@ -322,6 +369,9 @@ static bool find_gid_index(const union ib_gid *gid,
struct find_gid_index_context *ctx =
(struct find_gid_index_context *)context;
+ if (ctx->gid_type != gid_attr->gid_type)
+ return false;
+
if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
(is_vlan_dev(gid_attr->ndev) &&
vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
@@ -332,14 +382,49 @@ static bool find_gid_index(const union ib_gid *gid,
static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
u16 vlan_id, const union ib_gid *sgid,
+ enum ib_gid_type gid_type,
u16 *gid_index)
{
- struct find_gid_index_context context = {.vlan_id = vlan_id};
+ struct find_gid_index_context context = {.vlan_id = vlan_id,
+ .gid_type = gid_type};
return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
&context, gid_index);
}
+static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr,
+ enum rdma_network_type net_type,
+ union ib_gid *sgid, union ib_gid *dgid)
+{
+ struct sockaddr_in src_in;
+ struct sockaddr_in dst_in;
+ __be32 src_saddr, dst_saddr;
+
+ if (!sgid || !dgid)
+ return -EINVAL;
+
+ if (net_type == RDMA_NETWORK_IPV4) {
+ memcpy(&src_in.sin_addr.s_addr,
+ &hdr->roce4grh.saddr, 4);
+ memcpy(&dst_in.sin_addr.s_addr,
+ &hdr->roce4grh.daddr, 4);
+ src_saddr = src_in.sin_addr.s_addr;
+ dst_saddr = dst_in.sin_addr.s_addr;
+ ipv6_addr_set_v4mapped(src_saddr,
+ (struct in6_addr *)sgid);
+ ipv6_addr_set_v4mapped(dst_saddr,
+ (struct in6_addr *)dgid);
+ return 0;
+ } else if (net_type == RDMA_NETWORK_IPV6 ||
+ net_type == RDMA_NETWORK_IB) {
+ *dgid = hdr->ibgrh.dgid;
+ *sgid = hdr->ibgrh.sgid;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
const struct ib_wc *wc, const struct ib_grh *grh,
struct ib_ah_attr *ah_attr)
@@ -347,33 +432,72 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
u32 flow_class;
u16 gid_index;
int ret;
+ enum rdma_network_type net_type = RDMA_NETWORK_IB;
+ enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+ int hoplimit = 0xff;
+ union ib_gid dgid;
+ union ib_gid sgid;
memset(ah_attr, 0, sizeof *ah_attr);
if (rdma_cap_eth_ah(device, port_num)) {
+ if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+ net_type = wc->network_hdr_type;
+ else
+ net_type = ib_get_net_type_by_grh(device, port_num, grh);
+ gid_type = ib_network_to_gid_type(net_type);
+ }
+ ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+ &sgid, &dgid);
+ if (ret)
+ return ret;
+
+ if (rdma_protocol_roce(device, port_num)) {
+ int if_index = 0;
u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
wc->vlan_id : 0xffff;
+ struct net_device *idev;
+ struct net_device *resolved_dev;
if (!(wc->wc_flags & IB_WC_GRH))
return -EPROTOTYPE;
- if (!(wc->wc_flags & IB_WC_WITH_SMAC) ||
- !(wc->wc_flags & IB_WC_WITH_VLAN)) {
- ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
- ah_attr->dmac,
- wc->wc_flags & IB_WC_WITH_VLAN ?
- NULL : &vlan_id,
- 0);
- if (ret)
- return ret;
+ if (!device->get_netdev)
+ return -EOPNOTSUPP;
+
+ idev = device->get_netdev(device, port_num);
+ if (!idev)
+ return -ENODEV;
+
+ ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid,
+ ah_attr->dmac,
+ wc->wc_flags & IB_WC_WITH_VLAN ?
+ NULL : &vlan_id,
+ &if_index, &hoplimit);
+ if (ret) {
+ dev_put(idev);
+ return ret;
}
- ret = get_sgid_index_from_eth(device, port_num, vlan_id,
- &grh->dgid, &gid_index);
+ resolved_dev = dev_get_by_index(&init_net, if_index);
+ if (resolved_dev->flags & IFF_LOOPBACK) {
+ dev_put(resolved_dev);
+ resolved_dev = idev;
+ dev_hold(resolved_dev);
+ }
+ rcu_read_lock();
+ if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev,
+ resolved_dev))
+ ret = -EHOSTUNREACH;
+ rcu_read_unlock();
+ dev_put(idev);
+ dev_put(resolved_dev);
if (ret)
return ret;
- if (wc->wc_flags & IB_WC_WITH_SMAC)
- memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+ ret = get_sgid_index_from_eth(device, port_num, vlan_id,
+ &dgid, gid_type, &gid_index);
+ if (ret)
+ return ret;
}
ah_attr->dlid = wc->slid;
@@ -383,10 +507,11 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
if (wc->wc_flags & IB_WC_GRH) {
ah_attr->ah_flags = IB_AH_GRH;
- ah_attr->grh.dgid = grh->sgid;
+ ah_attr->grh.dgid = sgid;
if (!rdma_cap_eth_ah(device, port_num)) {
- ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+ ret = ib_find_cached_gid_by_port(device, &dgid,
+ IB_GID_TYPE_IB,
port_num, NULL,
&gid_index);
if (ret)
@@ -396,7 +521,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
ah_attr->grh.sgid_index = (u8) gid_index;
flow_class = be32_to_cpu(grh->version_tclass_flow);
ah_attr->grh.flow_label = flow_class & 0xFFFFF;
- ah_attr->grh.hop_limit = 0xFF;
+ ah_attr->grh.hop_limit = hoplimit;
ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
}
return 0;
@@ -1014,6 +1139,7 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
union ib_gid sgid;
struct ib_gid_attr sgid_attr;
int ifindex;
+ int hop_limit;
ret = ib_query_gid(qp->device,
qp_attr->ah_attr.port_num,
@@ -1028,12 +1154,14 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
ifindex = sgid_attr.ndev->ifindex;
- ret = rdma_addr_find_dmac_by_grh(&sgid,
- &qp_attr->ah_attr.grh.dgid,
- qp_attr->ah_attr.dmac,
- NULL, ifindex);
+ ret = rdma_addr_find_l2_eth_by_grh(&sgid,
+ &qp_attr->ah_attr.grh.dgid,
+ qp_attr->ah_attr.dmac,
+ NULL, &ifindex, &hop_limit);
dev_put(sgid_attr.ndev);
+
+ qp_attr->ah_attr.grh.hop_limit = hop_limit;
}
}
out:
@@ -1215,29 +1343,17 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
mr->pd = pd;
mr->uobject = NULL;
atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
}
return mr;
}
EXPORT_SYMBOL(ib_get_dma_mr);
-int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
- return mr->device->query_mr ?
- mr->device->query_mr(mr, mr_attr) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_mr);
-
int ib_dereg_mr(struct ib_mr *mr)
{
- struct ib_pd *pd;
+ struct ib_pd *pd = mr->pd;
int ret;
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- pd = mr->pd;
ret = mr->device->dereg_mr(mr);
if (!ret)
atomic_dec(&pd->usecnt);
@@ -1273,49 +1389,12 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
mr->pd = pd;
mr->uobject = NULL;
atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
}
return mr;
}
EXPORT_SYMBOL(ib_alloc_mr);
-/* Memory windows */
-
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
-{
- struct ib_mw *mw;
-
- if (!pd->device->alloc_mw)
- return ERR_PTR(-ENOSYS);
-
- mw = pd->device->alloc_mw(pd, type);
- if (!IS_ERR(mw)) {
- mw->device = pd->device;
- mw->pd = pd;
- mw->uobject = NULL;
- mw->type = type;
- atomic_inc(&pd->usecnt);
- }
-
- return mw;
-}
-EXPORT_SYMBOL(ib_alloc_mw);
-
-int ib_dealloc_mw(struct ib_mw *mw)
-{
- struct ib_pd *pd;
- int ret;
-
- pd = mw->pd;
- ret = mw->device->dealloc_mw(mw);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_dealloc_mw);
-
/* "Fast" memory regions */
struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
@@ -1530,7 +1609,7 @@ int ib_sg_to_pages(struct ib_mr *mr,
int (*set_page)(struct ib_mr *, u64))
{
struct scatterlist *sg;
- u64 last_end_dma_addr = 0, last_page_addr = 0;
+ u64 last_end_dma_addr = 0;
unsigned int last_page_off = 0;
u64 page_mask = ~((u64)mr->page_size - 1);
int i, ret;
@@ -1572,7 +1651,6 @@ next_page:
mr->length += dma_len;
last_end_dma_addr = end_dma_addr;
- last_page_addr = end_dma_addr & page_mask;
last_page_off = end_dma_addr & ~page_mask;
}