From 571e09eeff544e5562bd2a704f1fe91083f7592f Mon Sep 17 00:00:00 2001 From: Abhilash Jindal Date: Sun, 31 Jan 2016 13:53:31 -0500 Subject: IB/mlx4: Use boottime Wall time obtained from ktime_get_real_ns is susceptible to sudden jumps due to user setting the time or due to NTP. Boot time is constantly increasing time better suited for comparing two timestamps. Signed-off-by: Abhilash Jindal Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 21cb41a..c74ef26 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -310,7 +310,7 @@ static void aliasguid_query_handler(int status, if (status) { pr_debug("(port: %d) failed: status = %d\n", cb_ctx->port, status); - rec->time_to_run = ktime_get_real_ns() + 1 * NSEC_PER_SEC; + rec->time_to_run = ktime_get_boot_ns() + 1 * NSEC_PER_SEC; goto out; } @@ -416,7 +416,7 @@ next_entry: be64_to_cpu((__force __be64)rec->guid_indexes), be64_to_cpu((__force __be64)applied_guid_indexes), be64_to_cpu((__force __be64)declined_guid_indexes)); - rec->time_to_run = ktime_get_real_ns() + + rec->time_to_run = ktime_get_boot_ns() + resched_delay_sec * NSEC_PER_SEC; } else { rec->status = MLX4_GUID_INFO_STATUS_SET; @@ -708,7 +708,7 @@ static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, } } if (resched_delay_sec) { - u64 curr_time = ktime_get_real_ns(); + u64 curr_time = ktime_get_boot_ns(); *resched_delay_sec = (low_record_time < curr_time) ? 0 : div_u64((low_record_time - curr_time), NSEC_PER_SEC); -- cgit v0.10.2 From a3100a78794175d7f2488a3155d247da3d7390e4 Mon Sep 17 00:00:00 2001 From: Marina Varshaver Date: Thu, 18 Feb 2016 18:31:05 +0200 Subject: IB/core: Add don't trap flag to flow creation Don't trap flag (i.e. IB_FLOW_ATTR_FLAGS_DONT_TRAP) indicates that QP will receive traffic, but will not steal it. When a packet matches a flow steering rule that was created with the don't trap flag, the QPs assigned to this rule will get this packet, but matching will continue to other equal/lower priority rules. This will let other QPs assigned to those rules to get the packet too. If both don't trap rule and other rules have the same priority and match the same packet, the behavior is undefined. The don't trap flag can't be set with default rule types (i.e. IB_FLOW_ATTR_ALL_DEFAULT, IB_FLOW_ATTR_MC_DEFAULT) as default rules don't have rules after them and don't trap has no meaning here. Signed-off-by: Marina Varshaver Reviewed-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6ffc9c4..0f05de6 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3085,6 +3085,14 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) return -EPERM; + if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) + return -EINVAL; + + if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) || + (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT))) + return -EINVAL; + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1c7ab6c..41f2c25 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1653,6 +1653,9 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct mlx4_dev *dev = (to_mdev(qp->device))->dev; int is_bonded = mlx4_is_bonded(dev); + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) + return ERR_PTR(-EOPNOTSUPP); + memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 284b00c..514223f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1487,6 +1487,11 @@ enum ib_flow_domain { IB_FLOW_DOMAIN_NUM /* Must be last */ }; +enum ib_flow_flags { + IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ + IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */ +}; + struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; -- cgit v0.10.2 From 0e451e883bd13ce616f439e2414b8c17fa28318a Mon Sep 17 00:00:00 2001 From: Marina Varshaver Date: Thu, 18 Feb 2016 18:31:06 +0200 Subject: IB/mlx4: Add support for the don't trap rule Add support for receiving multicast/unicast traffic with the don't trap rule. Sniffing these packets requires a flow steering rule of type NORMAL at priority 0 with flag IB_FLOW_ATTR_FLAGS_DONT_TRAP set. Choosing between multicast or unicast is done via ethernet L2 dest_mac mask and value: - If mask is all zeros - unicast and multicast are set. - If mask non zero - only mask with multicast bit 1 and rest 0 is supported, the mac value will choose if it is multicast or unicast rule. If the mask multicast bit is on and some other bits are on too, it means a request for specific multicast or unicast, this is not supported, either receive all multicast or all unicast. Only when limitations are met registered QP will receive requested type but other QPs can receive same traffic if registered for it. Otherwise, if limitations are not met, an error will be returned. Limitations: - Rule must be with priority 0. - A0 mode is not supported. - Sniffer QP cannot appear in any other flow steering rule. Signed-off-by: Marina Varshaver Reviewed-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 41f2c25..914bc98 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1643,6 +1643,56 @@ static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_ return err; } +static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev, + struct ib_flow_attr *flow_attr, + enum mlx4_net_trans_promisc_mode *type) +{ + int err = 0; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER) || + (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) || + (flow_attr->num_of_specs > 1) || (flow_attr->priority != 0)) { + return -EOPNOTSUPP; + } + + if (flow_attr->num_of_specs == 0) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + union ib_flow_spec *ib_spec; + + ib_spec = (union ib_flow_spec *)(flow_attr + 1); + if (ib_spec->type != IB_FLOW_SPEC_ETH) + return -EINVAL; + + /* if all is zero than MC and UC */ + if (is_zero_ether_addr(ib_spec->eth.mask.dst_mac)) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + u8 mac[ETH_ALEN] = {ib_spec->eth.mask.dst_mac[0] ^ 0x01, + ib_spec->eth.mask.dst_mac[1], + ib_spec->eth.mask.dst_mac[2], + ib_spec->eth.mask.dst_mac[3], + ib_spec->eth.mask.dst_mac[4], + ib_spec->eth.mask.dst_mac[5]}; + + /* Above xor was only on MC bit, non empty mask is valid + * only if this bit is set and rest are zero. + */ + if (!is_zero_ether_addr(&mac[0])) + return -EINVAL; + + if (is_multicast_ether_addr(ib_spec->eth.val.dst_mac)) + type[0] = MLX4_FS_MC_SNIFFER; + else + type[0] = MLX4_FS_UC_SNIFFER; + } + } + + return err; +} + static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) @@ -1653,7 +1703,8 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct mlx4_dev *dev = (to_mdev(qp->device))->dev; int is_bonded = mlx4_is_bonded(dev); - if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) + if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + (flow_attr->type != IB_FLOW_ATTR_NORMAL)) return ERR_PTR(-EOPNOTSUPP); memset(type, 0, sizeof(type)); @@ -1666,7 +1717,19 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: - type[0] = MLX4_FS_REGULAR; + /* If dont trap flag (continue match) is set, under specific + * condition traffic be replicated to given qp, + * without stealing it + */ + if (unlikely(flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)) { + err = mlx4_ib_add_dont_trap_rule(dev, + flow_attr, + type); + if (err) + goto err_free; + } else { + type[0] = MLX4_FS_REGULAR; + } break; case IB_FLOW_ATTR_ALL_DEFAULT: @@ -1678,8 +1741,8 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, break; case IB_FLOW_ATTR_SNIFFER: - type[0] = MLX4_FS_UC_SNIFFER; - type[1] = MLX4_FS_MC_SNIFFER; + type[0] = MLX4_FS_MIRROR_RX_PORT; + type[1] = MLX4_FS_MIRROR_SX_PORT; break; default: diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index d66c690..e970945 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -157,7 +157,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [29] = "802.1ad offload support", [31] = "Modifying loopback source checks using UPDATE_QP support", [32] = "Loopback source checks support", - [33] = "RoCEv2 support" + [33] = "RoCEv2 support", + [34] = "DMFS Sniffer support (UC & MC)" }; int i; @@ -810,6 +811,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN; dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER; MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_BEACON_OFFSET); if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_BEACON; diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c index 1d4e2e0..42d8de8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -752,8 +752,10 @@ static const u8 __promisc_mode[] = { [MLX4_FS_REGULAR] = 0x0, [MLX4_FS_ALL_DEFAULT] = 0x1, [MLX4_FS_MC_DEFAULT] = 0x3, - [MLX4_FS_UC_SNIFFER] = 0x4, - [MLX4_FS_MC_SNIFFER] = 0x5, + [MLX4_FS_MIRROR_RX_PORT] = 0x4, + [MLX4_FS_MIRROR_SX_PORT] = 0x5, + [MLX4_FS_UC_SNIFFER] = 0x6, + [MLX4_FS_MC_SNIFFER] = 0x7, }; int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a0e8cc8..8541a91 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -219,6 +219,7 @@ enum { MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31, MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, + MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, }; enum { @@ -1160,6 +1161,8 @@ enum mlx4_net_trans_promisc_mode { MLX4_FS_REGULAR = 1, MLX4_FS_ALL_DEFAULT, MLX4_FS_MC_DEFAULT, + MLX4_FS_MIRROR_RX_PORT, + MLX4_FS_MIRROR_SX_PORT, MLX4_FS_UC_SNIFFER, MLX4_FS_MC_SNIFFER, MLX4_FS_MODE_NUM, /* should be last */ -- cgit v0.10.2 From e1614869d370d4d1599d771346d7da570f1d2bfa Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 28 Jan 2016 08:59:56 -0500 Subject: RDMA/ocrdma: Export udp encapsulation capability Add support to read device configuration and initialize port-immutables to report UDP-Encap flag during port query. Signed-off-by: Devesh Sharma Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 12503f1..b58833d 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -114,6 +114,7 @@ struct ocrdma_dev_attr { u8 local_ca_ack_delay; u8 ird; u8 num_ird_pages; + u8 udp_encap; }; struct ocrdma_dma_mem { @@ -598,4 +599,10 @@ static inline u8 ocrdma_get_ae_link_state(u32 ae_state) return ((ae_state & OCRDMA_AE_LSC_LS_MASK) >> OCRDMA_AE_LSC_LS_SHIFT); } +static inline bool ocrdma_is_udp_encap_supported(struct ocrdma_dev *dev) +{ + return (dev->attr.udp_encap & OCRDMA_L3_TYPE_IPV4) || + (dev->attr.udp_encap & OCRDMA_L3_TYPE_IPV6); +} + #endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 283ca84..aea7f17 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1144,6 +1144,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_pd = (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT; + attr->udp_encap = (rsp->max_pd_ca_ack_delay & + OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK) >> + OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT; attr->max_dpp_pds = (rsp->max_dpp_pds_credits & OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index f387430..3d75f65 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -89,8 +89,10 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; + struct ocrdma_dev *dev; int err; + dev = get_ocrdma_dev(ibdev); err = ocrdma_query_port(ibdev, port_num, &attr); if (err) return err; @@ -98,6 +100,8 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (ocrdma_is_udp_encap_supported(dev)) + immutable->core_cap_flags |= RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 99dd6fd..8d75bd4 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -140,7 +140,11 @@ enum { OCRDMA_DB_RQ_SHIFT = 24 }; -#define OCRDMA_ROUDP_FLAGS_SHIFT 0x03 +enum { + OCRDMA_L3_TYPE_IB_GRH = 0x00, + OCRDMA_L3_TYPE_IPV4 = 0x01, + OCRDMA_L3_TYPE_IPV6 = 0x02 +}; #define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ #define OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* bits 10-11 of qid at 12-11 */ @@ -546,7 +550,8 @@ enum { OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT = 8, OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK = 0xFF << OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT, - + OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT = 3, + OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK = 0x18, OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT = 0, OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK = 0xFFFF, OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT = 16, -- cgit v0.10.2 From 6b0626679d81626eac47cba7940f1435ec480b2e Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 28 Jan 2016 08:59:57 -0500 Subject: RDMA/ocrdma: Support RoCE-v2 in the UD path This patch adds following changes to support RoCE-v2 in the UD path. * During AH creation GID-type is resolved for a given gid-index. * Based on GID-type protocol header is built. * Work completion reports network header type and set IB_WC_WITH_NETWORK_HDR_TYPE flag in wc->wc_flags to indicate that the network header type is valid. Signed-off-by: Somnath Kotur Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index b58833d..45bdfa0 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -357,6 +357,7 @@ struct ocrdma_ah { struct ocrdma_av *av; u16 sgid_index; u32 id; + u8 hdr_type; }; struct ocrdma_qp_hwq_info { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 3790771..4aed1db 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -55,6 +55,21 @@ #define OCRDMA_VID_PCP_SHIFT 0xD +static u16 ocrdma_hdr_type_to_proto_num(int devid, u8 hdr_type) +{ + switch (hdr_type) { + case OCRDMA_L3_TYPE_IB_GRH: + return (u16)0x8915; + case OCRDMA_L3_TYPE_IPV4: + return (u16)0x0800; + case OCRDMA_L3_TYPE_IPV6: + return (u16)0x86dd; + default: + pr_err("ocrdma%d: Invalid network header\n", devid); + return 0; + } +} + static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, struct ib_ah_attr *attr, union ib_gid *sgid, int pdid, bool *isvlan, u16 vlan_tag) @@ -63,10 +78,23 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, struct ocrdma_eth_vlan eth; struct ocrdma_grh grh; int eth_sz; + u16 proto_num = 0; + u8 nxthdr = 0x11; + struct iphdr ipv4; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; memset(ð, 0, sizeof(eth)); memset(&grh, 0, sizeof(grh)); + /* Protocol Number */ + proto_num = ocrdma_hdr_type_to_proto_num(dev->id, ah->hdr_type); + if (!proto_num) + return -EINVAL; + nxthdr = (proto_num == 0x8915) ? 0x1b : 0x11; /* VLAN */ if (!vlan_tag || (vlan_tag > 0xFFF)) vlan_tag = dev->pvid; @@ -78,13 +106,13 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, dev->id); } eth.eth_type = cpu_to_be16(0x8100); - eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); + eth.roce_eth_type = cpu_to_be16(proto_num); vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT; eth.vlan_tag = cpu_to_be16(vlan_tag); eth_sz = sizeof(struct ocrdma_eth_vlan); *isvlan = true; } else { - eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); + eth.eth_type = cpu_to_be16(proto_num); eth_sz = sizeof(struct ocrdma_eth_basic); } /* MAC */ @@ -93,18 +121,33 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, if (status) return status; ah->sgid_index = attr->grh.sgid_index; - memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid)); - memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw)); - - grh.tclass_flow = cpu_to_be32((6 << 28) | - (attr->grh.traffic_class << 24) | - attr->grh.flow_label); - /* 0x1b is next header value in GRH */ - grh.pdid_hoplimit = cpu_to_be32((pdid << 16) | - (0x1b << 8) | attr->grh.hop_limit); /* Eth HDR */ memcpy(&ah->av->eth_hdr, ð, eth_sz); - memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); + if (ah->hdr_type == RDMA_NETWORK_IPV4) { + *((__be16 *)&ipv4) = htons((4 << 12) | (5 << 8) | + attr->grh.traffic_class); + ipv4.id = cpu_to_be16(pdid); + ipv4.frag_off = htons(IP_DF); + ipv4.tot_len = htons(0); + ipv4.ttl = attr->grh.hop_limit; + ipv4.protocol = nxthdr; + rdma_gid2ip(&sgid_addr._sockaddr, sgid); + ipv4.saddr = sgid_addr._sockaddr_in.sin_addr.s_addr; + rdma_gid2ip(&dgid_addr._sockaddr, &attr->grh.dgid); + ipv4.daddr = dgid_addr._sockaddr_in.sin_addr.s_addr; + memcpy((u8 *)ah->av + eth_sz, &ipv4, sizeof(struct iphdr)); + } else { + memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid)); + grh.tclass_flow = cpu_to_be32((6 << 28) | + (attr->grh.traffic_class << 24) | + attr->grh.flow_label); + memcpy(&grh.dgid[0], attr->grh.dgid.raw, + sizeof(attr->grh.dgid.raw)); + grh.pdid_hoplimit = cpu_to_be32((pdid << 16) | + (nxthdr << 8) | + attr->grh.hop_limit); + memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); + } if (*isvlan) ah->av->valid |= OCRDMA_AV_VLAN_VALID; ah->av->valid = cpu_to_le32(ah->av->valid); @@ -128,6 +171,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) if (atomic_cmpxchg(&dev->update_sl, 1, 0)) ocrdma_init_service_level(dev); + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); @@ -148,6 +192,8 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev); dev_put(sgid_attr.ndev); } + /* Get network header type for this GID */ + ah->hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); if ((pd->uctx) && (!rdma_is_multicast_addr((struct in6_addr *)attr->grh.dgid.raw)) && diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 8d75bd4..3d15948 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -1740,8 +1740,11 @@ enum { /* w1 */ OCRDMA_CQE_UD_XFER_LEN_SHIFT = 16, + OCRDMA_CQE_UD_XFER_LEN_MASK = 0x1FFF, OCRDMA_CQE_PKEY_SHIFT = 0, OCRDMA_CQE_PKEY_MASK = 0xFFFF, + OCRDMA_CQE_UD_L3TYPE_SHIFT = 29, + OCRDMA_CQE_UD_L3TYPE_MASK = 0x07, /* w2 */ OCRDMA_CQE_QPN_SHIFT = 0, @@ -1866,7 +1869,7 @@ struct ocrdma_ewqe_ud_hdr { u32 rsvd_dest_qpn; u32 qkey; u32 rsvd_ahid; - u32 rsvd; + u32 hdr_type; }; /* extended wqe followed by hdr_wqe for Fast Memory register */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 12420e4..4df3f13 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -2005,6 +2005,7 @@ static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp, else ud_hdr->qkey = ud_wr(wr)->remote_qkey; ud_hdr->rsvd_ahid = ah->id; + ud_hdr->hdr_type = ah->hdr_type; if (ah->av->valid & OCRDMA_AV_VLAN_VALID) hdr->cw |= (OCRDMA_FLAG_AH_VLAN_PR << OCRDMA_WQE_FLAGS_SHIFT); } @@ -2717,9 +2718,11 @@ static bool ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, return expand; } -static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe) +static int ocrdma_update_ud_rcqe(struct ocrdma_dev *dev, struct ib_wc *ibwc, + struct ocrdma_cqe *cqe) { int status; + u16 hdr_type = 0; status = (le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT; @@ -2728,7 +2731,17 @@ static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe) ibwc->pkey_index = 0; ibwc->wc_flags = IB_WC_GRH; ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >> - OCRDMA_CQE_UD_XFER_LEN_SHIFT); + OCRDMA_CQE_UD_XFER_LEN_SHIFT) & + OCRDMA_CQE_UD_XFER_LEN_MASK; + + if (ocrdma_is_udp_encap_supported(dev)) { + hdr_type = (le32_to_cpu(cqe->ud.rxlen_pkey) >> + OCRDMA_CQE_UD_L3TYPE_SHIFT) & + OCRDMA_CQE_UD_L3TYPE_MASK; + ibwc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; + ibwc->network_hdr_type = hdr_type; + } + return status; } @@ -2791,12 +2804,15 @@ static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, struct ib_wc *ibwc) { + struct ocrdma_dev *dev; + + dev = get_ocrdma_dev(qp->ibqp.device); ibwc->opcode = IB_WC_RECV; ibwc->qp = &qp->ibqp; ibwc->status = IB_WC_SUCCESS; if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) - ocrdma_update_ud_rcqe(ibwc, cqe); + ocrdma_update_ud_rcqe(dev, ibwc, cqe); else ibwc->byte_len = le32_to_cpu(cqe->rq.rxlen); -- cgit v0.10.2 From bcf117e2cf6f451b46780e0660e9ae7ab33a33ea Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 28 Jan 2016 08:59:58 -0500 Subject: RDMA/ocrdma: Support RoCE-v2 in the RC path This patch implements following changes to support RoCE-v2 in the RC path: * Get the GID-type for a given sgid. * Based on the GID-type get IPv4/IPv6 L3-address and give those to underlying device. * Resolve and provide network header type to device. Signed-off-by: Somnath Kotur Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index aea7f17..2cfbf15 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -2504,7 +2504,12 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, union ib_gid sgid, zgid; struct ib_gid_attr sgid_attr; u32 vlan_id = 0xFFFF; - u8 mac_addr[6]; + u8 mac_addr[6], hdr_type; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); if ((ah_attr->ah_flags & IB_AH_GRH) == 0) @@ -2519,6 +2524,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, cmd->params.hop_lmt_rq_psn |= (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT); cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID; + + /* GIDs */ memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0], sizeof(cmd->params.dgid)); @@ -2541,6 +2548,16 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, return status; cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) | (mac_addr[2] << 16) | (mac_addr[3] << 24); + + hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + if (hdr_type == RDMA_NETWORK_IPV4) { + rdma_gid2ip(&sgid_addr._sockaddr, &sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &ah_attr->grh.dgid); + memcpy(&cmd->params.dgid[0], + &dgid_addr._sockaddr_in.sin_addr.s_addr, 4); + memcpy(&cmd->params.sgid[0], + &sgid_addr._sockaddr_in.sin_addr.s_addr, 4); + } /* convert them to LE format. */ ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid)); ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid)); @@ -2561,7 +2578,9 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, cmd->params.rnt_rc_sl_fl |= (dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT; } - + cmd->params.max_sge_recv_flags |= ((hdr_type << + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_SHIFT) & + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_MASK); return 0; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 3d15948..0efc966 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -1112,6 +1112,8 @@ enum { OCRDMA_QP_PARAMS_STATE_MASK = BIT(5) | BIT(6) | BIT(7), OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC = BIT(8), OCRDMA_QP_PARAMS_FLAGS_INB_ATEN = BIT(9), + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_SHIFT = 11, + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_MASK = BIT(11) | BIT(12) | BIT(13), OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT = 16, OCRDMA_QP_PARAMS_MAX_SGE_RECV_MASK = 0xFFFF << OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT, -- cgit v0.10.2 From 834d16d66ebc2b5faa06af0bda3bb6f9c71b3996 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 28 Jan 2016 08:59:59 -0500 Subject: RDMA/ocrdma: Support user AH creation for RoCE-v2 This patch adds support to create RoCE-v2 compatible AH. It uses ahid field to tell network-header-type to user space library. The library has to decode network-header-type from ahid field. Signed-off-by: Somnath Kotur Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 4aed1db..e3c4f17 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -218,6 +218,11 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) ahid_addr = pd->uctx->ah_tbl.va + attr->dlid; *ahid_addr = 0; *ahid_addr |= ah->id & OCRDMA_AH_ID_MASK; + if (ocrdma_is_udp_encap_supported(dev)) { + *ahid_addr |= ((u32)ah->hdr_type & + OCRDMA_AH_L3_TYPE_MASK) << + OCRDMA_AH_L3_TYPE_SHIFT; + } if (isvlan) *ahid_addr |= (OCRDMA_AH_VLAN_VALID_MASK << OCRDMA_AH_VLAN_VALID_SHIFT); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h index 04a30ae..3856dd4 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h @@ -46,9 +46,10 @@ enum { OCRDMA_AH_ID_MASK = 0x3FF, OCRDMA_AH_VLAN_VALID_MASK = 0x01, - OCRDMA_AH_VLAN_VALID_SHIFT = 0x1F + OCRDMA_AH_VLAN_VALID_SHIFT = 0x1F, + OCRDMA_AH_L3_TYPE_MASK = 0x03, + OCRDMA_AH_L3_TYPE_SHIFT = 0x1D /* 29 bits */ }; - struct ib_ah *ocrdma_create_ah(struct ib_pd *, struct ib_ah_attr *); int ocrdma_destroy_ah(struct ib_ah *); int ocrdma_query_ah(struct ib_ah *, struct ib_ah_attr *); -- cgit v0.10.2 From b54ba2772b7af82a07eb48f88c88f7cadfb33401 Mon Sep 17 00:00:00 2001 From: Meny Yossefi Date: Thu, 18 Feb 2016 18:14:59 +0200 Subject: net/mlx5_core: Add helper function to read virtual port counters Added helper function to read 64bit virtual port Infiniband traffic counters. Signed-off-by: Meny Yossefi Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index c7398b9..90ab09e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -850,3 +850,43 @@ int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); } EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); + +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, void *out, size_t out_sz) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); + int is_group_manager; + void *in; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + in = mlx5_vzalloc(in_sz); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_vport_counter_in, in, other_vport, 1); + MLX5_SET(query_vport_counter_in, in, vport_number, 0); + } else { + err = -EPERM; + goto free; + } + } + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_vport_counter_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto free; + err = mlx5_cmd_status_to_err_v2(out); + +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_vport_counter); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 51f1e54..0732e6c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3126,7 +3126,8 @@ struct mlx5_ifc_query_vport_counter_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 reserved_at_41[0xb]; + u8 port_num[0x4]; u8 vport_number[0x10]; u8 reserved_at_60[0x60]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 1237710..a9f2bcc 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -92,5 +92,7 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev); int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev); +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, void *out, size_t out_sz); #endif /* __MLX5_VPORT_H__ */ -- cgit v0.10.2 From 1c64bf6f291cae7cbe779e407db9477378bb4e7d Mon Sep 17 00:00:00 2001 From: Meny Yossefi Date: Thu, 18 Feb 2016 18:15:00 +0200 Subject: net/mlx5_core: Add helper function to read IB error counters Added helper function to read IB standard error counters via the PPCNT register. The PPCNT register read command provides the 32-bit error counters of both IB/RoCE link layer and transport layer. Signed-off-by: Meny Yossefi Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index a87e773..5635ce7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -324,6 +324,29 @@ int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap); +int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + u8 port_num, void *out, size_t sz) +{ + u32 *in; + int err; + + in = mlx5_vzalloc(sz); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(ppcnt_reg, in, local_port, port_num); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP); + err = mlx5_core_access_reg(dev, in, sz, out, + sz, MLX5_REG_PPCNT, 0, 0); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_ib_ppcnt); + int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause) { u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 987764a..99e2edc 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1284,7 +1284,8 @@ enum { MLX5_RFC_3635_COUNTERS_GROUP = 0x3, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5, MLX5_PER_PRIORITY_COUNTERS_GROUP = 0x10, - MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11 + MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11, + MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, }; static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1e3006d..8edcd08 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -847,6 +847,8 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); +int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + u8 port_num, void *out, size_t sz); static inline int fw_initializing(struct mlx5_core_dev *dev) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 0732e6c..9f404de 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1208,6 +1208,36 @@ struct mlx5_ifc_phys_layer_cntrs_bits { u8 reserved_at_640[0x180]; }; +struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits { + u8 symbol_error_counter[0x10]; + + u8 link_error_recovery_counter[0x8]; + + u8 link_downed_counter[0x8]; + + u8 port_rcv_errors[0x10]; + + u8 port_rcv_remote_physical_errors[0x10]; + + u8 port_rcv_switch_relay_errors[0x10]; + + u8 port_xmit_discards[0x10]; + + u8 port_xmit_constraint_errors[0x8]; + + u8 port_rcv_constraint_errors[0x8]; + + u8 reserved_at_70[0x8]; + + u8 link_overrun_errors[0x8]; + + u8 reserved_at_80[0x10]; + + u8 vl_15_dropped[0x10]; + + u8 reserved_at_a0[0xa0]; +}; + struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits { u8 transmit_queue_high[0x20]; @@ -2618,6 +2648,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout; struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout; struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout; + struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; u8 reserved_at_0[0x7c0]; }; @@ -6955,6 +6986,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_peir_reg_bits peir_reg; struct mlx5_ifc_pelc_reg_bits pelc_reg; struct mlx5_ifc_pfcc_reg_bits pfcc_reg; + struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_pifr_reg_bits pifr_reg; struct mlx5_ifc_pipg_reg_bits pipg_reg; -- cgit v0.10.2 From 3efd9a11212d500e36c2837db853178cdaa86d5a Mon Sep 17 00:00:00 2001 From: Meny Yossefi Date: Thu, 18 Feb 2016 18:15:01 +0200 Subject: IB/mlx5: Modify MAD reading counters method to use counter registers Modify mlx5_ib_process_mad to use PPCNT and query_vport commands instead of MAD_IFC, as MAD_IFC is deprecated on new firmware versions (and doesn't support RoCE anyway). Traffic counters exist in both 32-bit and 64-bit forms. Declaring support of extended coutners results in traffic counters to be read in their 64-bit form only via the query_vport command. Error counters exist only in 32-bit form and read via PPCNT command. This commit also adds counters support in RoCE. Signed-off-by: Meny Yossefi Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index b84d13a..41d8a00 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -31,8 +31,10 @@ */ #include +#include #include #include +#include #include "mlx5_ib.h" enum { @@ -57,20 +59,12 @@ int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); } -int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - const struct ib_wc *in_wc, const struct ib_grh *in_grh, - const struct ib_mad_hdr *in, size_t in_mad_size, - struct ib_mad_hdr *out, size_t *out_mad_size, - u16 *out_mad_pkey_index) +static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) { u16 slid; int err; - const struct ib_mad *in_mad = (const struct ib_mad *)in; - struct ib_mad *out_mad = (struct ib_mad *)out; - - if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad))) - return IB_MAD_RESULT_FAILURE; slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); @@ -117,6 +111,156 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } +static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, + void *out) +{ +#define MLX5_SUM_CNT(p, cntr1, cntr2) \ + (MLX5_GET64(query_vport_counter_out, p, cntr1) + \ + MLX5_GET64(query_vport_counter_out, p, cntr2)) + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets, + transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets, + received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.packets, + transmitted_ib_multicast.packets)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.packets, + received_ib_multicast.packets)); + pma_cnt_ext->port_unicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_multicast.packets); +} + +static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, + void *out) +{ + /* Traffic counters will be reported in + * their 64bit form via ib_pma_portcounters_ext by default. + */ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_ASSIGN_PMA_CNTR(counter_var, counter_name) { \ + counter_var = MLX5_GET_BE(typeof(counter_var), \ + ib_port_cntrs_grp_data_layout, \ + out_pma, counter_name); \ + } + + MLX5_ASSIGN_PMA_CNTR(pma_cnt->symbol_error_counter, + symbol_error_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_error_recovery_counter, + link_error_recovery_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_downed_counter, + link_downed_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_errors, + port_rcv_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_remphys_errors, + port_rcv_remote_physical_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_switch_relay_errors, + port_rcv_switch_relay_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_discards, + port_xmit_discards); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_constraint_errors, + port_xmit_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_constraint_errors, + port_rcv_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_overrun_errors, + link_overrun_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->vl15_dropped, + vl_15_dropped); +} + +static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int err; + void *out_cnt; + + /* Decalring support of extended counters */ + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { + struct ib_class_port_info cpi = {}; + + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + } + + if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + + out_cnt = mlx5_vzalloc(sz); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_vport_counter(dev->mdev, 0, + port_num, out_cnt, sz); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + out_cnt = mlx5_vzalloc(sz); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_ib_ppcnt(dev->mdev, port_num, + out_cnt, sz); + if (!err) + pma_cnt_assign(pma_cnt, out_cnt); + } + + kvfree(out_cnt); + if (err) + return IB_MAD_RESULT_FAILURE; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + memset(out_mad->data, 0, sizeof(out_mad->data)); + + if (MLX5_CAP_GEN(mdev, vport_counters) && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { + return process_pma_cmd(ibdev, port_num, in_mad, out_mad); + } else { + return process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + in_mad, out_mad); + } +} + int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) { struct ib_smp *in_mad = NULL; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 99e2edc..12079fd 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -105,6 +105,29 @@ __mlx5_mask(typ, fld)) ___t; \ }) +/* Big endian getters */ +#define MLX5_GET64_BE(typ, p, fld) (*((__be64 *)(p) +\ + __mlx5_64_off(typ, fld))) + +#define MLX5_GET_BE(type_t, typ, p, fld) ({ \ + type_t tmp; \ + switch (sizeof(tmp)) { \ + case sizeof(u8): \ + tmp = (__force type_t)MLX5_GET(typ, p, fld); \ + break; \ + case sizeof(u16): \ + tmp = (__force type_t)cpu_to_be16(MLX5_GET(typ, p, fld)); \ + break; \ + case sizeof(u32): \ + tmp = (__force type_t)cpu_to_be32(MLX5_GET(typ, p, fld)); \ + break; \ + case sizeof(u64): \ + tmp = (__force type_t)MLX5_GET64_BE(typ, p, fld); \ + break; \ + } \ + tmp; \ + }) + enum { MLX5_MAX_COMMANDS = 32, MLX5_CMD_DATA_BLOCK_SIZE = 512, -- cgit v0.10.2 From 1015c2e8ca2b94d8964f8ab30d925b6f678fd9d2 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Sun, 21 Feb 2016 16:27:16 +0200 Subject: IB/mlx5: Define interface bits for IPoIB offloads The HW can supply several offloads for UD QP, added offloads for checksumming for both TX and RX and LSO for TX. Two new bits were added in order to expose and enable these offloads: 1. HCA capability bit: declares the support for IPoIB basic offloads. 2. QPC bit which will be used in the QP creation flow, which set these abilities in the QP. Signed-off-by: Erez Shitrit Signed-off-by: Eran Ben Elisha Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9f404de..711c9dc 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -736,7 +736,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; - u8 reserved_at_200[0xe]; + u8 reserved_at_200[0x3]; + u8 ipoib_basic_offloads[0x1]; + u8 reserved_at_204[0xa]; u8 drain_sigerr[0x1]; u8 cmdif_checksum[0x2]; u8 sigerr_cqe[0x1]; @@ -1810,7 +1812,7 @@ struct mlx5_ifc_qpc_bits { u8 log_sq_size[0x4]; u8 reserved_at_55[0x6]; u8 rlky[0x1]; - u8 reserved_at_5c[0x4]; + u8 ulp_stateless_offload_mode[0x4]; u8 counter_set_id[0x8]; u8 uar_page[0x18]; -- cgit v0.10.2 From f031396531fe2b1a6ffb4fa5eceb9c1fa276869a Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Sun, 21 Feb 2016 16:27:17 +0200 Subject: IB/mlx5: Implement UD QP offloads for IPoIB in the TX flow In order to support LSO and CSUM in the TX flow the driver does the following: * LSO bit for the enum mlx5_ib_qp_flags was added, indicates QP that supports LSO offloads. * Enables the special offload when the QP is created, and enable the special work request id (IB_WR_LSO) when comes. * Calculates the size of the WQE according to the new WQE format that support these offloads. * Handles the new WQE format when arrived, sets the relevant fields, and copies the needed data. Signed-off-by: Erez Shitrit Signed-off-by: Eran Ben Elisha Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 03c418c..76b0939 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -504,6 +504,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, (MLX5_CAP_ETH(dev->mdev, csum_cap))) props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + props->device_cap_flags |= IB_DEVICE_UD_TSO; + } + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d2b9737..14396b0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -325,11 +325,12 @@ struct mlx5_ib_cq_buf { }; enum mlx5_ib_qp_flags { - MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, - MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, - MLX5_IB_QP_CROSS_CHANNEL = 1 << 2, - MLX5_IB_QP_MANAGED_SEND = 1 << 3, - MLX5_IB_QP_MANAGED_RECV = 1 << 4, + MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, + MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, + MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, }; struct mlx5_umr_wr { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 34cb8e8..baa8808 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -58,6 +58,7 @@ enum { static const u32 mlx5_ib_opcode[] = { [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_LSO] = MLX5_OPCODE_LSO, [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, @@ -72,6 +73,9 @@ static const u32 mlx5_ib_opcode[] = { [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, }; +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; static int is_qp0(enum ib_qp_type qp_type) { @@ -260,11 +264,11 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, return 0; } -static int sq_overhead(enum ib_qp_type qp_type) +static int sq_overhead(struct ib_qp_init_attr *attr) { int size = 0; - switch (qp_type) { + switch (attr->qp_type) { case IB_QPT_XRC_INI: size += sizeof(struct mlx5_wqe_xrc_seg); /* fall through */ @@ -287,6 +291,10 @@ static int sq_overhead(enum ib_qp_type qp_type) break; case IB_QPT_UD: + if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + size += sizeof(struct mlx5_wqe_eth_pad) + + sizeof(struct mlx5_wqe_eth_seg); + /* fall through */ case IB_QPT_SMI: case IB_QPT_GSI: size += sizeof(struct mlx5_wqe_ctrl_seg) + @@ -311,7 +319,7 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) int inl_size = 0; int size; - size = sq_overhead(attr->qp_type); + size = sq_overhead(attr); if (size < 0) return size; @@ -348,8 +356,8 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, return -EINVAL; } - qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - - sizeof(struct mlx5_wqe_inline_seg); + qp->max_inline_data = wqe_size - sq_overhead(attr) - + sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) @@ -783,7 +791,9 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, int err; uuari = &dev->mdev->priv.uuari; - if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_IPOIB_UD_LSO)) return -EINVAL; if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) @@ -1228,6 +1238,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) qp->flags |= MLX5_IB_QP_MANAGED_RECV; } + + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) + if (!MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n"); + return -EOPNOTSUPP; + } + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -1385,6 +1403,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, /* 0xffffff means we ask to work with cqe version 0 */ MLX5_SET(qpc, qpc, user_index, uidx); } + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); + qp->flags |= MLX5_IB_QP_LSO; + } if (init_attr->qp_type == IB_QPT_RAW_PACKET) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; @@ -2442,6 +2467,59 @@ static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, rseg->reserved = 0; } +static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, + struct ib_send_wr *wr, void *qend, + struct mlx5_ib_qp *qp, int *size) +{ + void *seg = eseg; + + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (wr->send_flags & IB_SEND_IP_CSUM) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + + seg += sizeof(struct mlx5_wqe_eth_seg); + *size += sizeof(struct mlx5_wqe_eth_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); + int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start); + u64 left, leftlen, copysz; + void *pdata = ud_wr->header; + + left = ud_wr->hlen; + eseg->mss = cpu_to_be16(ud_wr->mss); + eseg->inline_hdr_sz = cpu_to_be16(left); + + /* + * check if there is space till the end of queue, if yes, + * copy all in one shot, otherwise copy till the end of queue, + * rollback and than the copy the left + */ + leftlen = qend - (void *)eseg->inline_hdr_start; + copysz = min_t(u64, leftlen, left); + + memcpy(seg - size_of_inl_hdr_start, pdata, copysz); + + if (likely(copysz > size_of_inl_hdr_start)) { + seg += ALIGN(copysz - size_of_inl_hdr_start, 16); + *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16; + } + + if (unlikely(copysz < left)) { /* the last wqe in the queue */ + seg = mlx5_get_send_wqe(qp, 0); + left -= copysz; + pdata += copysz; + memcpy(seg, pdata, left); + seg += ALIGN(left, 16); + *size += ALIGN(left, 16) / 16; + } + } + + return seg; +} + static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, struct ib_send_wr *wr) { @@ -3373,7 +3451,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } break; - case IB_QPT_UD: case IB_QPT_SMI: case IB_QPT_GSI: set_datagram_seg(seg, wr); @@ -3382,7 +3459,29 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); break; + case IB_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); + seg += sizeof(struct mlx5_wqe_eth_pad); + size += sizeof(struct mlx5_wqe_eth_pad) / 16; + + seg = set_eth_seg(seg, wr, qend, qp, &size); + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + } + break; case MLX5_IB_QPT_REG_UMR: if (wr->opcode != MLX5_IB_WR_UMR) { err = -EINVAL; -- cgit v0.10.2 From c7ce833b364bc19ef51b3c973c94a863e4af0e06 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Sun, 21 Feb 2016 16:27:18 +0200 Subject: IB/mlx5: Add support for CSUM in RX flow The driver checks the csum from the HW when completion arrived and marks it in the wc->wc_flags field for the ulp drivers. These is for packets from type IB_WC_RECV only. Signed-off-by: Erez Shitrit Signed-off-by: Eran Ben Elisha Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index fd1de31..5ece9a8 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -207,7 +207,10 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, break; case MLX5_CQE_RESP_SEND: wc->opcode = IB_WC_RECV; - wc->wc_flags = 0; + wc->wc_flags = IB_WC_IP_CSUM_OK; + if (unlikely(!((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) + wc->wc_flags = 0; break; case MLX5_CQE_RESP_SEND_IMM: wc->opcode = IB_WC_RECV; -- cgit v0.10.2 From b11a4f9cde1c06e0073662882b60c1fb95a1d597 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:03 +0200 Subject: IB/mlx5: Add support for setting source QP number In order to create multiple GSI QPs, we need to set the source QP number to one on all these QPs. Add the necessary definitions and infrastructure to do that. Reviewed-by: Leon Romanovsky Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 14396b0..32699f9 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -165,6 +165,18 @@ struct mlx5_ib_flow_db { #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 +/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. + * + * These flags are intended for internal use by the mlx5_ib driver, and they + * rely on the range reserved for that use in the ib_qp_create_flags enum. + */ + +/* Create a UD QP whose source QP number is 1 */ +static inline enum ib_qp_create_flags mlx5_ib_create_qp_sqpn_qp1(void) +{ + return IB_QP_CREATE_RESERVED_START; +} + struct wr_list { u16 opcode; u16 next; @@ -331,6 +343,8 @@ enum mlx5_ib_qp_flags { MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, + /* QP uses 1 as its source QP number */ + MLX5_IB_QP_SQPN_QP1 = 1 << 6, }; struct mlx5_umr_wr { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index baa8808..794e760 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -793,7 +793,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, uuari = &dev->mdev->priv.uuari; if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | - IB_QP_CREATE_IPOIB_UD_LSO)) + IB_QP_CREATE_IPOIB_UD_LSO | + mlx5_ib_create_qp_sqpn_qp1())) return -EINVAL; if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) @@ -838,6 +839,11 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + if (init_attr->create_flags & mlx5_ib_create_qp_sqpn_qp1()) { + (*in)->ctx.deth_sqpn = cpu_to_be32(1); + qp->flags |= MLX5_IB_QP_SQPN_QP1; + } + mlx5_fill_page_array(&qp->buf, (*in)->pas); err = mlx5_db_alloc(dev->mdev, &qp->db); @@ -1289,6 +1295,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, ucmd.sq_wqe_count, max_wqes); return -EINVAL; } + if (init_attr->create_flags & + mlx5_ib_create_qp_sqpn_qp1()) { + mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n"); + return -EINVAL; + } err = create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, &inlen, base); if (err) @@ -2309,6 +2320,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->sq_crq_size |= cpu_to_be16(1 << 4); + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + context->deth_sqpn = cpu_to_be32(1); mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -3973,6 +3986,8 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; if (qp->flags & MLX5_IB_QP_MANAGED_RECV) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + qp_init_attr->create_flags |= mlx5_ib_create_qp_sqpn_qp1(); qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 711c9dc..72bba52 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -772,7 +772,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_22e[0x7]; u8 qkv[0x1]; u8 pkv[0x1]; - u8 reserved_at_237[0x4]; + u8 set_deth_sqpn[0x1]; + u8 reserved_at_239[0x3]; u8 xrc[0x1]; u8 ud[0x1]; u8 uc[0x1]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 5b8c89f..e5bbcf0 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -499,7 +499,8 @@ struct mlx5_qp_context { u8 reserved2[4]; __be32 next_send_psn; __be32 cqn_send; - u8 reserved3[8]; + __be32 deth_sqpn; + u8 reserved3[4]; __be32 last_acked_psn; __be32 ssn; __be32 params2; -- cgit v0.10.2 From 158abf862a2947bfac250a10e79ac20f5e6fea6c Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:04 +0200 Subject: IB/mlx5: Modify QP debugging prints Add debugging prints to the modify QP verb to help understand the cause a returned error. Reviewed-by: Leon Romanovsky Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 794e760..c8b12f9 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2197,8 +2197,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context = &in->ctx; err = to_mlx5_st(ibqp->qp_type); - if (err < 0) + if (err < 0) { + mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); goto out; + } context->flags = cpu_to_be32(err << 16); @@ -2418,30 +2420,45 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - ll)) + ll)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, ibqp->qp_type, attr_mask); goto out; + } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || - attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); goto out; + } if (attr_mask & IB_QP_PKEY_INDEX) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; if (attr->pkey_index >= - dev->mdev->port_caps[port - 1].pkey_table_len) + dev->mdev->port_caps[port - 1].pkey_table_len) { + mlx5_ib_dbg(dev, "invalid pkey index %d\n", + attr->pkey_index); goto out; + } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); goto out; + } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); goto out; + } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; -- cgit v0.10.2 From d16e91daf446c605a92112889552f9df757186bc Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:05 +0200 Subject: IB/mlx5: Add GSI QP wrapper mlx5 creates special GSI QPs that has limited ability to control the P_Key of transmitted packets. The sent P_Key is taken from the QP object, similarly to what happens with regular UD QPs. Create a software wrapper around GSI QPs that with the following patches will be able to emulate the functionality of a GSI QP including control of the P_Key per work request. Reviewed-by: Leon Romanovsky Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 27a7015..4e85188 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o -mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c new file mode 100644 index 0000000..7116554 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +struct mlx5_ib_gsi_qp { + struct ib_qp ibqp; + struct ib_qp *rx_qp; + u8 port_num; + struct ib_qp_cap cap; + enum ib_sig_type sq_sig_type; + /* Serialize qp state modifications */ + struct mutex mutex; +}; + +static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) +{ + return container_of(qp, struct mlx5_ib_gsi_qp, ibqp); +} + +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_gsi_qp *gsi; + struct ib_qp_init_attr hw_init_attr = *init_attr; + const u8 port_num = init_attr->port_num; + int ret; + + mlx5_ib_dbg(dev, "creating GSI QP\n"); + + if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) { + mlx5_ib_warn(dev, + "invalid port number %d during GSI QP creation\n", + port_num); + return ERR_PTR(-EINVAL); + } + + gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); + if (!gsi) + return ERR_PTR(-ENOMEM); + + mutex_init(&gsi->mutex); + + mutex_lock(&dev->devr.mutex); + + if (dev->devr.ports[port_num - 1].gsi) { + mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", + port_num); + ret = -EBUSY; + goto err_free; + } + + gsi->cap = init_attr->cap; + gsi->sq_sig_type = init_attr->sq_sig_type; + gsi->ibqp.qp_num = 1; + gsi->port_num = port_num; + + hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; + gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); + if (IS_ERR(gsi->rx_qp)) { + mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", + PTR_ERR(gsi->rx_qp)); + ret = PTR_ERR(gsi->rx_qp); + goto err_free; + } + + dev->devr.ports[init_attr->port_num - 1].gsi = gsi; + + mutex_unlock(&dev->devr.mutex); + + return &gsi->ibqp; + +err_free: + mutex_unlock(&dev->devr.mutex); + kfree(gsi); + return ERR_PTR(ret); +} + +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + const int port_num = gsi->port_num; + int ret; + + mlx5_ib_dbg(dev, "destroying GSI QP\n"); + + mutex_lock(&dev->devr.mutex); + ret = ib_destroy_qp(gsi->rx_qp); + if (ret) { + mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. error %d\n", + ret); + mutex_unlock(&dev->devr.mutex); + return ret; + } + dev->devr.ports[port_num - 1].gsi = NULL; + mutex_unlock(&dev->devr.mutex); + + kfree(gsi); + + return 0; +} + +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state); + + mutex_lock(&gsi->mutex); + ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); + mutex_unlock(&gsi->mutex); + + return ret; +} + +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mutex_lock(&gsi->mutex); + ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr); + qp_init_attr->cap = gsi->cap; + mutex_unlock(&gsi->mutex); + + return ret; +} + +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + + return ib_post_send(gsi->rx_qp, wr, bad_wr); +} + +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + + return ib_post_recv(gsi->rx_qp, wr, bad_wr); +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 76b0939..0b30dc5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1970,6 +1970,8 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) dev = container_of(devr, struct mlx5_ib_dev, devr); + mutex_init(&devr->mutex); + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 32699f9..c68a913 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -163,6 +163,11 @@ struct mlx5_ib_flow_db { #define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) #define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +/* + * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI + * creates the actual hardware QP. + */ +#define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. @@ -502,6 +507,12 @@ struct mlx5_mr_cache { unsigned long last_add; }; +struct mlx5_ib_gsi_qp; + +struct mlx5_ib_port_resources { + struct mlx5_ib_gsi_qp *gsi; +}; + struct mlx5_ib_resources { struct ib_cq *c0; struct ib_xrcd *x0; @@ -509,6 +520,9 @@ struct mlx5_ib_resources { struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; + struct mlx5_ib_port_resources ports[2]; + /* Protects changes to the port resources */ + struct mutex mutex; }; struct mlx5_roce { @@ -754,6 +768,20 @@ static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index); +/* GSI QP helper functions */ +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr); +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp); +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask); +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -773,7 +801,7 @@ static inline u8 convert_access(int acc) static inline int is_qp1(enum ib_qp_type qp_type) { - return qp_type == IB_QPT_GSI; + return qp_type == MLX5_IB_QPT_HW_GSI; } #define MLX5_MAX_UMR_SHIFT 16 diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index c8b12f9..85cf9c4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -296,7 +296,7 @@ static int sq_overhead(struct ib_qp_init_attr *attr) sizeof(struct mlx5_wqe_eth_seg); /* fall through */ case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_datagram_seg); break; @@ -598,7 +598,7 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_XRC_INI: case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; case IB_QPT_SMI: return MLX5_QP_ST_QP0; - case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; case IB_QPT_RAW_PACKET: case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; @@ -1530,7 +1530,7 @@ static void get_cqs(struct mlx5_ib_qp *qp, break; case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -1693,7 +1693,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, case IB_QPT_UC: case IB_QPT_UD: case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: case MLX5_IB_QPT_REG_UMR: qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) @@ -1722,6 +1722,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, break; + case IB_QPT_GSI: + return mlx5_ib_gsi_create_qp(pd, init_attr); + case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: case IB_QPT_MAX: @@ -1740,6 +1743,9 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); + if (unlikely(qp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_destroy_qp(qp); + destroy_qp_common(dev, mqp); kfree(mqp); @@ -2220,7 +2226,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; } else if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { @@ -2403,11 +2409,18 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); + + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? + IB_QPT_GSI : ibqp->qp_type; + mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; @@ -2418,9 +2431,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } - if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && - !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - ll)) { + if (qp_type != MLX5_IB_QPT_REG_UMR && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); goto out; @@ -3304,13 +3316,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, { struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); - struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; struct mlx5_wqe_data_seg *dpseg; struct mlx5_wqe_xrc_seg *xrc; - struct mlx5_bf *bf = qp->bf; + struct mlx5_bf *bf; int uninitialized_var(size); - void *qend = qp->sq.qend; + void *qend; unsigned long flags; unsigned idx; int err = 0; @@ -3322,6 +3334,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, u8 next_fence = 0; u8 fence; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); + + qp = to_mqp(ibqp); + bf = qp->bf; + qend = qp->sq.qend; + spin_lock_irqsave(&qp->sq.lock, flags); for (nreq = 0; wr; nreq++, wr = wr->next) { @@ -3482,7 +3501,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: set_datagram_seg(seg, wr); seg += sizeof(struct mlx5_wqe_datagram_seg); size += sizeof(struct mlx5_wqe_datagram_seg) / 16; @@ -3631,6 +3650,9 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int ind; int i; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); + spin_lock_irqsave(&qp->rq.lock, flags); ind = qp->rq.head & (qp->rq.wqe_cnt - 1); @@ -3951,6 +3973,10 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int err = 0; u8 raw_packet_qp_state; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, + qp_init_attr); + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* * Wait for any outstanding page faults, in case the user frees memory -- cgit v0.10.2 From ebab41cff4db96c42dfc9939d1c1715496bcf961 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:06 +0200 Subject: IB/mlx5: Create multiple transmission GSI QPs In order to send GSI MADs on different P_Keys, mlx5 needs different QPs to be created, each with a different P_Key set when the QP is modified to the INIT state. Create QPs for each non-zero P_Key in the P_Key table. Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 7116554..91bd20e 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -40,6 +40,12 @@ struct mlx5_ib_gsi_qp { enum ib_sig_type sq_sig_type; /* Serialize qp state modifications */ struct mutex mutex; + int num_qps; + /* Protects access to the tx_qps. Post send operations synchronize + * with tx_qp creation in setup_qp(). + */ + spinlock_t lock; + struct ib_qp **tx_qps; }; static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) @@ -47,6 +53,11 @@ static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) return container_of(qp, struct mlx5_ib_gsi_qp, ibqp); } +static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); +} + struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr) { @@ -54,6 +65,8 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct mlx5_ib_gsi_qp *gsi; struct ib_qp_init_attr hw_init_attr = *init_attr; const u8 port_num = init_attr->port_num; + const int num_pkeys = pd->device->attrs.max_pkeys; + const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? num_pkeys : 0; int ret; mlx5_ib_dbg(dev, "creating GSI QP\n"); @@ -69,6 +82,12 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, if (!gsi) return ERR_PTR(-ENOMEM); + gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL); + if (!gsi->tx_qps) { + ret = -ENOMEM; + goto err_free; + } + mutex_init(&gsi->mutex); mutex_lock(&dev->devr.mutex); @@ -77,8 +96,10 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", port_num); ret = -EBUSY; - goto err_free; + goto err_free_tx; } + gsi->num_qps = num_qps; + spin_lock_init(&gsi->lock); gsi->cap = init_attr->cap; gsi->sq_sig_type = init_attr->sq_sig_type; @@ -91,7 +112,7 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", PTR_ERR(gsi->rx_qp)); ret = PTR_ERR(gsi->rx_qp); - goto err_free; + goto err_free_tx; } dev->devr.ports[init_attr->port_num - 1].gsi = gsi; @@ -100,8 +121,10 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, return &gsi->ibqp; -err_free: +err_free_tx: mutex_unlock(&dev->devr.mutex); + kfree(gsi->tx_qps); +err_free: kfree(gsi); return ERR_PTR(ret); } @@ -111,6 +134,7 @@ int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); const int port_num = gsi->port_num; + int qp_index; int ret; mlx5_ib_dbg(dev, "destroying GSI QP\n"); @@ -125,12 +149,143 @@ int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) } dev->devr.ports[port_num - 1].gsi = NULL; mutex_unlock(&dev->devr.mutex); + gsi->rx_qp = NULL; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) { + if (!gsi->tx_qps[qp_index]) + continue; + WARN_ON_ONCE(ib_destroy_qp(gsi->tx_qps[qp_index])); + gsi->tx_qps[qp_index] = NULL; + } + kfree(gsi->tx_qps); kfree(gsi); return 0; } +static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_pd *pd = gsi->rx_qp->pd; + struct ib_qp_init_attr init_attr = { + .event_handler = gsi->rx_qp->event_handler, + .qp_context = gsi->rx_qp->qp_context, + .send_cq = gsi->rx_qp->send_cq, + .recv_cq = gsi->rx_qp->recv_cq, + .cap = { + .max_send_wr = gsi->cap.max_send_wr, + .max_send_sge = gsi->cap.max_send_sge, + .max_inline_data = gsi->cap.max_inline_data, + }, + .sq_sig_type = gsi->sq_sig_type, + .qp_type = IB_QPT_UD, + .create_flags = mlx5_ib_create_qp_sqpn_qp1(), + }; + + return ib_create_qp(pd, &init_attr); +} + +static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, + u16 qp_index) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct ib_qp_attr attr; + int mask; + int ret; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = qp_index; + attr.qkey = IB_QP1_QKEY; + attr.port_num = gsi->port_num; + ret = ib_modify_qp(qp, &attr, mask); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to INIT: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTR: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTS: %d\n", + qp->qp_num, ret); + return ret; + } + + return 0; +} + +static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) +{ + struct ib_device *device = gsi->rx_qp->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_qp *qp; + unsigned long flags; + u16 pkey; + int ret; + + ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); + if (ret) { + mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", + gsi->port_num, qp_index); + return; + } + + if (!pkey) { + mlx5_ib_dbg(dev, "invalid P_Key at port %d, index %d. Skipping.\n", + gsi->port_num, qp_index); + return; + } + + spin_lock_irqsave(&gsi->lock, flags); + qp = gsi->tx_qps[qp_index]; + spin_unlock_irqrestore(&gsi->lock, flags); + if (qp) { + mlx5_ib_dbg(dev, "already existing GSI TX QP at port %d, index %d. Skipping\n", + gsi->port_num, qp_index); + return; + } + + qp = create_gsi_ud_qp(gsi); + if (IS_ERR(qp)) { + mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", + PTR_ERR(qp)); + return; + } + + ret = modify_to_rts(gsi, qp, qp_index); + if (ret) + goto err_destroy_qp; + + spin_lock_irqsave(&gsi->lock, flags); + WARN_ON_ONCE(gsi->tx_qps[qp_index]); + gsi->tx_qps[qp_index] = qp; + spin_unlock_irqrestore(&gsi->lock, flags); + + return; + +err_destroy_qp: + WARN_ON_ONCE(qp); +} + +static void setup_qps(struct mlx5_ib_gsi_qp *gsi) +{ + u16 qp_index; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) + setup_qp(gsi, qp_index); +} + int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask) { @@ -142,6 +297,15 @@ int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, mutex_lock(&gsi->mutex); ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); + if (ret) { + mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret); + goto unlock; + } + + if (to_mqp(gsi->rx_qp)->state == IB_QPS_RTS) + setup_qps(gsi); + +unlock: mutex_unlock(&gsi->mutex); return ret; -- cgit v0.10.2 From 7722f47e71e58592a2ba4437d27c802ba1c64e08 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:07 +0200 Subject: IB/mlx5: Create GSI transmission QPs when P_Key table is changed Whenever the P_Key table is changed, we create the required GSI transmission QPs on-demand. Reviewed-by: Leon Romanovsky Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 91bd20e..1648f53 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -341,3 +341,13 @@ int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, return ib_post_recv(gsi->rx_qp, wr, bad_wr); } + +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi) +{ + if (!gsi) + return; + + mutex_lock(&gsi->mutex); + setup_qps(gsi); + mutex_unlock(&gsi->mutex); +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0b30dc5..d4224fa 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1721,6 +1721,17 @@ static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_reg_pages, }; +static void pkey_change_handler(struct work_struct *work) +{ + struct mlx5_ib_port_resources *ports = + container_of(work, struct mlx5_ib_port_resources, + pkey_change_work); + + mutex_lock(&ports->devr->mutex); + mlx5_ib_gsi_pkey_change(ports->gsi); + mutex_unlock(&ports->devr->mutex); +} + static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { @@ -1757,6 +1768,8 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; port = (u8)param; + + schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: @@ -1966,6 +1979,7 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; + int port; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); @@ -2059,6 +2073,12 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + devr->ports[port].devr = devr; + } + return 0; error5: @@ -2077,12 +2097,20 @@ error0: static void destroy_dev_resources(struct mlx5_ib_resources *devr) { + struct mlx5_ib_dev *dev = + container_of(devr, struct mlx5_ib_dev, devr); + int port; + mlx5_ib_destroy_srq(devr->s1); mlx5_ib_destroy_srq(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0); mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); + + /* Make sure no change P_Key work items are still executing */ + for (port = 0; port < dev->num_ports; ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); } static u32 get_core_cap_flags(struct ib_device *ibdev) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index c68a913..a8fc345 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -510,7 +510,9 @@ struct mlx5_mr_cache { struct mlx5_ib_gsi_qp; struct mlx5_ib_port_resources { + struct mlx5_ib_resources *devr; struct mlx5_ib_gsi_qp *gsi; + struct work_struct pkey_change_work; }; struct mlx5_ib_resources { @@ -781,6 +783,7 @@ int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr); int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); static inline void init_query_mad(struct ib_smp *mad) { -- cgit v0.10.2 From 25361e02c44873a17e0148d9d5c42fa2e938a019 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:08 +0200 Subject: IB/mlx5: Generate completions in software The GSI QP emulation requires also emulating completions for transmitted MADs. The CQ on which these completions are generated can also be used by the hardware, and the MAD layer is free to use any CQ of the device for the GSI QP. Add a method for generating software completions to each mlx5 CQ. Software completions are polled first, and generate calls to the completion handler callback if necessary. Reviewed-by: Leon Romanovsky Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 5ece9a8..2a9ad84 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -568,18 +568,44 @@ repoll: return 0; } +static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_ib_wc *soft_wc, *next; + int npolled = 0; + + list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) { + if (npolled >= num_entries) + break; + + mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", + cq->mcq.cqn); + + wc[npolled++] = soft_wc->wc; + list_del(&soft_wc->list); + kfree(soft_wc); + } + + return npolled; +} + int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mlx5_ib_cq *cq = to_mcq(ibcq); struct mlx5_ib_qp *cur_qp = NULL; unsigned long flags; + int soft_polled = 0; int npolled; int err = 0; spin_lock_irqsave(&cq->lock, flags); - for (npolled = 0; npolled < num_entries; npolled++) { - err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc); + + for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled); if (err) break; } @@ -590,7 +616,7 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) spin_unlock_irqrestore(&cq->lock, flags); if (err == 0 || err == -EAGAIN) - return npolled; + return soft_polled + npolled; else return err; } @@ -598,16 +624,27 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + struct mlx5_ib_cq *cq = to_mcq(ibcq); void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, irq_flags); + if (cq->notify_flags != IB_CQ_NEXT_COMP) + cq->notify_flags = flags & IB_CQ_SOLICITED_MASK; - mlx5_cq_arm(&to_mcq(ibcq)->mcq, + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !list_empty(&cq->wc_list)) + ret = 1; + spin_unlock_irqrestore(&cq->lock, irq_flags); + + mlx5_cq_arm(&cq->mcq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, uar_page, MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), to_mcq(ibcq)->mcq.cons_index); - return 0; + return ret; } static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, @@ -760,6 +797,14 @@ static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) mlx5_db_free(dev->mdev, &cq->db); } +static void notify_soft_wc_handler(struct work_struct *work) +{ + struct mlx5_ib_cq *cq = container_of(work, struct mlx5_ib_cq, + notify_work); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, @@ -810,6 +855,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, &index, &inlen); if (err) goto err_create; + + INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } cq->cqe_size = cqe_size; @@ -835,6 +882,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; + INIT_LIST_HEAD(&cq->wc_list); + if (context) if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { err = -EFAULT; @@ -1222,3 +1271,27 @@ int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) cq = to_mcq(ibcq); return cq->cqe_size; } + +/* Called from atomic context */ +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) +{ + struct mlx5_ib_wc *soft_wc; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + unsigned long flags; + + soft_wc = kmalloc(sizeof(*soft_wc), GFP_ATOMIC); + if (!soft_wc) + return -ENOMEM; + + soft_wc->wc = *wc; + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&soft_wc->list, &cq->wc_list); + if (cq->notify_flags == IB_CQ_NEXT_COMP || + wc->status != IB_WC_SUCCESS) { + cq->notify_flags = 0; + schedule_work(&cq->notify_work); + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a8fc345..0142efb 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -393,6 +393,14 @@ struct mlx5_ib_cq { struct ib_umem *resize_umem; int cqe_size; u32 create_flags; + struct list_head wc_list; + enum ib_cq_notify_flags notify_flags; + struct work_struct notify_work; +}; + +struct mlx5_ib_wc { + struct ib_wc wc; + struct list_head list; }; struct mlx5_ib_srq { @@ -785,6 +793,8 @@ int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; -- cgit v0.10.2 From ea6dc2036224aaee887f391a1ee8833bea18c68b Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:09 +0200 Subject: IB/mlx5: Reorder GSI completions The emulated GSI QP's send completions are generated by multiple hardware QPs, so their completions could arrive out of order with respect to the order their work request were submitted. Reorder the completions by keeping a list of the posted work request and their completions. A newly received completion from the hardware updates the list and marks its work request as completed. However, the completions are only reported to the client according to the list order. In order to support that, create a new private CQ to handle the hardware completions. Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 1648f53..8d04062 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -32,6 +32,13 @@ #include "mlx5_ib.h" +struct mlx5_ib_gsi_wr { + struct ib_cqe cqe; + struct ib_wc wc; + int send_flags; + bool completed:1; +}; + struct mlx5_ib_gsi_qp { struct ib_qp ibqp; struct ib_qp *rx_qp; @@ -40,9 +47,13 @@ struct mlx5_ib_gsi_qp { enum ib_sig_type sq_sig_type; /* Serialize qp state modifications */ struct mutex mutex; + struct ib_cq *cq; + struct mlx5_ib_gsi_wr *outstanding_wrs; + u32 outstanding_pi, outstanding_ci; int num_qps; /* Protects access to the tx_qps. Post send operations synchronize - * with tx_qp creation in setup_qp(). + * with tx_qp creation in setup_qp(). Also protects the + * outstanding_wrs array and indices. */ spinlock_t lock; struct ib_qp **tx_qps; @@ -58,6 +69,57 @@ static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); } +static u32 next_outstanding(struct mlx5_ib_gsi_qp *gsi, u32 index) +{ + return ++index % gsi->cap.max_send_wr; +} + +#define for_each_outstanding_wr(gsi, index) \ + for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; \ + index = next_outstanding(gsi, index)) + +/* Call with gsi->lock locked */ +static void generate_completions(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_cq *gsi_cq = gsi->ibqp.send_cq; + struct mlx5_ib_gsi_wr *wr; + u32 index; + + for_each_outstanding_wr(gsi, index) { + wr = &gsi->outstanding_wrs[index]; + + if (!wr->completed) + break; + + if (gsi->sq_sig_type == IB_SIGNAL_ALL_WR || + wr->send_flags & IB_SEND_SIGNALED) + WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc)); + + wr->completed = false; + } + + gsi->outstanding_ci = index; +} + +static void handle_single_completion(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_gsi_qp *gsi = cq->cq_context; + struct mlx5_ib_gsi_wr *wr = + container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe); + u64 wr_id; + unsigned long flags; + + spin_lock_irqsave(&gsi->lock, flags); + wr->completed = true; + wr_id = wr->wc.wr_id; + wr->wc = *wc; + wr->wc.wr_id = wr_id; + wr->wc.qp = &gsi->ibqp; + + generate_completions(gsi); + spin_unlock_irqrestore(&gsi->lock, flags); +} + struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr) { @@ -88,6 +150,14 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, goto err_free; } + gsi->outstanding_wrs = kcalloc(init_attr->cap.max_send_wr, + sizeof(*gsi->outstanding_wrs), + GFP_KERNEL); + if (!gsi->outstanding_wrs) { + ret = -ENOMEM; + goto err_free_tx; + } + mutex_init(&gsi->mutex); mutex_lock(&dev->devr.mutex); @@ -96,7 +166,7 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", port_num); ret = -EBUSY; - goto err_free_tx; + goto err_free_wrs; } gsi->num_qps = num_qps; spin_lock_init(&gsi->lock); @@ -106,13 +176,23 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, gsi->ibqp.qp_num = 1; gsi->port_num = port_num; + gsi->cq = ib_alloc_cq(pd->device, gsi, init_attr->cap.max_send_wr, 0, + IB_POLL_SOFTIRQ); + if (IS_ERR(gsi->cq)) { + mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", + PTR_ERR(gsi->cq)); + ret = PTR_ERR(gsi->cq); + goto err_free_wrs; + } + hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; + hw_init_attr.send_cq = gsi->cq; gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); if (IS_ERR(gsi->rx_qp)) { mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", PTR_ERR(gsi->rx_qp)); ret = PTR_ERR(gsi->rx_qp); - goto err_free_tx; + goto err_destroy_cq; } dev->devr.ports[init_attr->port_num - 1].gsi = gsi; @@ -121,8 +201,12 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, return &gsi->ibqp; -err_free_tx: +err_destroy_cq: + ib_free_cq(gsi->cq); +err_free_wrs: mutex_unlock(&dev->devr.mutex); + kfree(gsi->outstanding_wrs); +err_free_tx: kfree(gsi->tx_qps); err_free: kfree(gsi); @@ -158,6 +242,9 @@ int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) gsi->tx_qps[qp_index] = NULL; } + ib_free_cq(gsi->cq); + + kfree(gsi->outstanding_wrs); kfree(gsi->tx_qps); kfree(gsi); @@ -170,7 +257,7 @@ static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) struct ib_qp_init_attr init_attr = { .event_handler = gsi->rx_qp->event_handler, .qp_context = gsi->rx_qp->qp_context, - .send_cq = gsi->rx_qp->send_cq, + .send_cq = gsi->cq, .recv_cq = gsi->rx_qp->recv_cq, .cap = { .max_send_wr = gsi->cap.max_send_wr, @@ -326,12 +413,69 @@ int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, return ret; } +/* Call with gsi->lock locked */ +static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr, struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_gsi_wr *gsi_wr; + + if (gsi->outstanding_pi == gsi->outstanding_ci + gsi->cap.max_send_wr) { + mlx5_ib_warn(dev, "no available GSI work request.\n"); + return -ENOMEM; + } + + gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi]; + gsi->outstanding_pi = next_outstanding(gsi, gsi->outstanding_pi); + + if (!wc) { + memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc)); + gsi_wr->wc.pkey_index = wr->pkey_index; + gsi_wr->wc.wr_id = wr->wr.wr_id; + } else { + gsi_wr->wc = *wc; + gsi_wr->completed = true; + } + + gsi_wr->cqe.done = &handle_single_completion; + wr->wr.wr_cqe = &gsi_wr->cqe; + + return 0; +} + int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ib_ud_wr cur_wr = *ud_wr(wr); + + cur_wr.wr.next = NULL; - return ib_post_send(gsi->rx_qp, wr, bad_wr); + spin_lock_irqsave(&gsi->lock, flags); + ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL); + if (ret) + goto err; + + ret = ib_post_send(gsi->rx_qp, &cur_wr.wr, bad_wr); + if (ret) { + /* Undo the effect of adding the outstanding wr */ + gsi->outstanding_pi = (gsi->outstanding_pi - 1) % + gsi->cap.max_send_wr; + goto err; + } + spin_unlock_irqrestore(&gsi->lock, flags); + } + + return 0; + +err: + spin_unlock_irqrestore(&gsi->lock, flags); + *bad_wr = wr; + return ret; } int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, -- cgit v0.10.2 From 83cae2aff53960ab6cf5bb82654201ce43b77fb6 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:10 +0200 Subject: IB/mlx5: Pick the right GSI transmission QP for sending Pick the QP to use according to the wr.ud.pkey_index field in the work request. If the QP doesn't exist, it means the P_Key is zero and the packet would have been dropped, so just generate a completion and move on. Reviewed-by: Leon Romanovsky Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 8d04062..938f6dd 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -443,10 +443,47 @@ static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, return 0; } +/* Call with gsi->lock locked */ +static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr) +{ + struct ib_wc wc = { + { .wr_id = wr->wr.wr_id }, + .status = IB_WC_SUCCESS, + .opcode = IB_WC_SEND, + .qp = &gsi->ibqp, + }; + int ret; + + ret = mlx5_ib_add_outstanding_wr(gsi, wr, &wc); + if (ret) + return ret; + + generate_completions(gsi); + + return 0; +} + +/* Call with gsi->lock locked */ +static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + int qp_index = wr->pkey_index; + + if (!mlx5_ib_deth_sqpn_cap(dev)) + return gsi->rx_qp; + + if (qp_index >= gsi->num_qps) + return NULL; + + return gsi->tx_qps[qp_index]; +} + int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + struct ib_qp *tx_qp; unsigned long flags; int ret; @@ -456,11 +493,20 @@ int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, cur_wr.wr.next = NULL; spin_lock_irqsave(&gsi->lock, flags); + tx_qp = get_tx_qp(gsi, &cur_wr); + if (!tx_qp) { + ret = mlx5_ib_gsi_silent_drop(gsi, &cur_wr); + if (ret) + goto err; + spin_unlock_irqrestore(&gsi->lock, flags); + continue; + } + ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL); if (ret) goto err; - ret = ib_post_send(gsi->rx_qp, &cur_wr.wr, bad_wr); + ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr); if (ret) { /* Undo the effect of adding the outstanding wr */ gsi->outstanding_pi = (gsi->outstanding_pi - 1) % -- cgit v0.10.2 From ebe6ccc53ff06a3782b95547eecb393222de057f Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:11 +0200 Subject: IB/mlx5: Eliminate GSI RX QP's send buffers Now that the transmission of GSI MADs is done with the special transmission QPs, eliminate the send buffers in the GSI receive QP. Reviewed-by: Leon Romanovsky Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 938f6dd..53e03c8 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -187,6 +187,11 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; hw_init_attr.send_cq = gsi->cq; + if (num_qps) { + hw_init_attr.cap.max_send_wr = 0; + hw_init_attr.cap.max_send_sge = 0; + hw_init_attr.cap.max_inline_data = 0; + } gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); if (IS_ERR(gsi->rx_qp)) { mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", -- cgit v0.10.2 From 84424a7fc793979da12992cfe5c2f5f73a3e8725 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 29 Feb 2016 15:45:12 +0200 Subject: IB/cma: Print warning on different inner and header P_Keys Commit 4c21b5bcef73 ("IB/cma: Add net_dev and private data checks to RDMA CM") added checks for incoming RDMA CM requests that they can be matched to a netdev based on the P_Key in the BTH of the request. This behavior was reverted in commit ab3964ad2acf ("IB/cma: Use inner P_Key to determine netdev"), since the mlx5 and ipath drivers didn't send the correct value in the BTH P_Key. Since the ipath driver was removed, and the mlx5 driver can now send GSI packets on different P_Keys, we could revert the patch to let the rdma_cm module look on the BTH P_Key when deciding to what netdev a packet belongs. However, that still breaks compatibility with the older drivers. Change the behavior to print a warning when receiving a request that has a different BTH P_Key and inner payload P_Key. In the future, after users have seen the warnings and upgraded their setups, remove the warning and block these requests. Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9729639..7eace1f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1206,6 +1206,10 @@ static int cma_save_req_info(const struct ib_cm_event *ib_event, req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); + if (req->pkey != req_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; @@ -1213,6 +1217,10 @@ static int cma_save_req_info(const struct ib_cm_event *ib_event, req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; + if (req->pkey != sidr_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; -- cgit v0.10.2 From 395a8e4c32ea2d032cf803f52f2e00983f91722d Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Mon, 29 Feb 2016 16:46:50 +0200 Subject: IB/mlx5: Refactoring register MR code In order to add re-registration of memory region, some logic was extracted to separate functions: - ODP related logic. - Some of the UMR WQE preparation code. - DMA mapping. - Umem creation. - Creating MKey using FW interface. - MR fields assignments after successful creation. Signed-off-by: Noa Osherovich Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 6000f7a..9d6dade 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -77,6 +77,34 @@ static int order2idx(struct mlx5_ib_dev *dev, int order) return order - cache->ent[0].order; } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static void update_odp_mr(struct mlx5_ib_mr *mr) +{ + if (mr->umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +} +#endif + static void reg_mr_callback(int status, void *context) { struct mlx5_ib_mr *mr = context; @@ -693,10 +721,40 @@ static int use_umr(int order) return order <= MLX5_MAX_UMR_SHIFT; } -static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, - struct ib_sge *sg, u64 dma, int n, u32 key, - int page_shift, u64 virt_addr, u64 len, - int access_flags) +static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int npages, int page_shift, int *size, + __be64 **mr_pas, dma_addr_t *dma) +{ + __be64 *pas; + struct device *ddev = dev->ib_dev.dma_device; + + /* + * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. + */ + *size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); + *mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!(*mr_pas)) + return -ENOMEM; + + pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, *size - npages * sizeof(u64)); + + *dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, *dma)) { + kfree(*mr_pas); + return -ENOMEM; + } + + return 0; +} + +static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_umr_wr *umrwr = umr_wr(wr); @@ -706,7 +764,6 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, sg->lkey = dev->umrc.pd->local_dma_lkey; wr->next = NULL; - wr->send_flags = 0; wr->sg_list = sg; if (n) wr->num_sge = 1; @@ -718,6 +775,19 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, umrwr->npages = n; umrwr->page_shift = page_shift; umrwr->mkey = key; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift); + + wr->send_flags = 0; + umrwr->target.virt_addr = virt_addr; umrwr->length = len; umrwr->access_flags = access_flags; @@ -734,6 +804,31 @@ static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, umrwr->mkey = key; } +static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length, + int access_flags, int *npages, + int *page_shift, int *ncont, int *order) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length, + access_flags, 0); + if (IS_ERR(umem)) { + mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } + + mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order); + if (!*npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + ib_umem_release(umem); + return ERR_PTR(-EINVAL); + } + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + *npages, *ncont, *order, *page_shift); + + return umem; +} + void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) { struct mlx5_ib_umr_context *context; @@ -770,7 +865,6 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, struct ib_sge sg; int size; __be64 *mr_pas; - __be64 *pas; dma_addr_t dma; int err = 0; int i; @@ -790,26 +884,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (!mr) return ERR_PTR(-EAGAIN); - /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. - * To avoid copying garbage after the pas array, we allocate - * a little more. */ - size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); - mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); - if (!mr_pas) { - err = -ENOMEM; + err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas, + &dma); + if (err) goto free_mr; - } - - pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); - mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); - /* Clear padding after the actual pages. */ - memset(pas + npages, 0, size - npages * sizeof(u64)); - - dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); - if (dma_mapping_error(ddev, dma)) { - err = -ENOMEM; - goto free_pas; - } memset(&umrwr, 0, sizeof(umrwr)); umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; @@ -840,7 +918,6 @@ unmap_dma: up(&umrc->sem); dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); -free_pas: kfree(mr_pas); free_mr: @@ -974,10 +1051,14 @@ free_pas: } #endif -static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, - u64 length, struct ib_umem *umem, - int npages, int page_shift, - int access_flags) +/* + * If ibmr is NULL it will be allocated by reg_create. + * Else, the given ibmr will be used. + */ +static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, + u64 virt_addr, u64 length, + struct ib_umem *umem, int npages, + int page_shift, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_create_mkey_mbox_in *in; @@ -986,7 +1067,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, int err; bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -1032,11 +1113,22 @@ err_2: kvfree(in); err_1: - kfree(mr); + if (!ibmr) + kfree(mr); return ERR_PTR(err); } +static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + int npages, u64 length, int access_flags) +{ + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.length = length; +} + struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -1052,22 +1144,11 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); - umem = ib_umem_get(pd->uobject->context, start, length, access_flags, - 0); - if (IS_ERR(umem)) { - mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); - return (void *)umem; - } + umem = mr_umem_get(pd, start, length, access_flags, &npages, + &page_shift, &ncont, &order); - mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); - if (!npages) { - mlx5_ib_warn(dev, "avoid zero region\n"); - err = -EINVAL; - goto error; - } - - mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", - npages, ncont, order, page_shift); + if (IS_ERR(umem)) + return (void *)umem; if (use_umr(order)) { mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, @@ -1083,8 +1164,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } if (!mr) - mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, - access_flags); + mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, + page_shift, access_flags); if (IS_ERR(mr)) { err = PTR_ERR(mr); @@ -1094,34 +1175,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); mr->umem = umem; - mr->npages = npages; - atomic_add(npages, &dev->mdev->priv.reg_pages); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + set_mr_fileds(dev, mr, npages, length, access_flags); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (umem->odp_data) { - /* - * This barrier prevents the compiler from moving the - * setting of umem->odp_data->private to point to our - * MR, before reg_umr finished, to ensure that the MR - * initialization have finished before starting to - * handle invalidations. - */ - smp_wmb(); - mr->umem->odp_data->private = mr; - /* - * Make sure we will see the new - * umem->odp_data->private value in the invalidation - * routines, before we can get page faults on the - * MR. Page faults can happen once we put the MR in - * the tree, below this line. Without the barrier, - * there can be a fault handling and an invalidation - * before umem->odp_data->private == mr is visible to - * the invalidation handler. - */ - smp_wmb(); - } + update_odp_mr(mr); #endif return &mr->ibmr; -- cgit v0.10.2 From 56e11d628c5d0553d9fc2ca1855144970e6b9eb6 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Mon, 29 Feb 2016 16:46:51 +0200 Subject: IB/mlx5: Added support for re-registration of MRs This patch adds support for re-registration of memory regions in MLX5. The functionality is basically the same as deregister followed by register, but attempts to reuse the existing resources as much as possible. Original memory keys are kept if possible, saving the need to communicate new ones to remote peers. Signed-off-by: Noa Osherovich Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d4224fa..16f7d0b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2233,6 +2233,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | @@ -2293,6 +2294,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 0142efb..f84ec2b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -162,6 +162,11 @@ struct mlx5_ib_flow_db { #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START #define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) #define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) + +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_PD (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_ACCESS IB_SEND_RESERVED_END + #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 /* * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI @@ -453,6 +458,7 @@ struct mlx5_ib_mr { struct mlx5_core_sig_ctx *sig; int live; void *descs_alloc; + int access_flags; /* Needed for rereg MR */ }; struct mlx5_ib_umr_context { @@ -689,6 +695,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_udata *udata); int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, int zap); +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int access_flags, + struct ib_pd *pd, struct ib_udata *udata); int mlx5_ib_dereg_mr(struct ib_mr *ibmr); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9d6dade..cf26cd1 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -77,6 +77,12 @@ static int order2idx(struct mlx5_ib_dev *dev, int order) return order - cache->ent[0].order; } +static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) +{ + return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >= + length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1)); +} + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING static void update_odp_mr(struct mlx5_ib_mr *mr) { @@ -1127,6 +1133,7 @@ static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, mr->ibmr.lkey = mr->mmr.key; mr->ibmr.rkey = mr->mmr.key; mr->ibmr.length = length; + mr->access_flags = access_flags; } struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, @@ -1222,6 +1229,167 @@ error: return err; } +static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, + u64 length, int npages, int page_shift, int order, + int access_flags, int flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr *bad; + struct mlx5_umr_wr umrwr = {}; + struct ib_sge sg; + struct umr_common *umrc = &dev->umrc; + dma_addr_t dma = 0; + __be64 *mr_pas = NULL; + int size; + int err; + + umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; + + if (flags & IB_MR_REREG_TRANS) { + err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size, + &mr_pas, &dma); + if (err) + return err; + + umrwr.target.virt_addr = virt_addr; + umrwr.length = length; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + } + + prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmr.key, + page_shift); + + if (flags & IB_MR_REREG_PD) { + umrwr.pd = pd; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD; + } + + if (flags & IB_MR_REREG_ACCESS) { + umrwr.access_flags = access_flags; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS; + } + + mlx5_ib_init_umr_context(&umr_context); + + /* post send request to UMR QP */ + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr.wr, &bad); + + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; + } + } + + up(&umrc->sem); + if (flags & IB_MR_REREG_TRANS) { + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + kfree(mr_pas); + } + return err; +} + +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int new_access_flags, + struct ib_pd *new_pd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); + struct mlx5_ib_mr *mr = to_mmr(ib_mr); + struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd; + int access_flags = flags & IB_MR_REREG_ACCESS ? + new_access_flags : + mr->access_flags; + u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address; + u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length; + int page_shift = 0; + int npages = 0; + int ncont = 0; + int order = 0; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + + if (flags != IB_MR_REREG_PD) { + /* + * Replace umem. This needs to be done whether or not UMR is + * used. + */ + flags |= IB_MR_REREG_TRANS; + ib_umem_release(mr->umem); + mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages, + &page_shift, &ncont, &order); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + mr->umem = NULL; + return err; + } + } + + if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { + /* + * UMR can't be used - MKey needs to be replaced. + */ + if (mr->umred) { + err = unreg_umr(dev, mr); + if (err) + mlx5_ib_warn(dev, "Failed to unregister MR\n"); + } else { + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "Failed to destroy MKey\n"); + } + if (err) + return err; + + mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, + page_shift, access_flags); + + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->umred = 0; + } else { + /* + * Send a UMR WQE + */ + err = rereg_umr(pd, mr, addr, len, npages, page_shift, + order, access_flags, flags); + if (err) { + mlx5_ib_warn(dev, "Failed to rereg UMR\n"); + return err; + } + } + + if (flags & IB_MR_REREG_PD) { + ib_mr->pd = pd; + mr->mmr.pd = to_mpd(pd)->pdn; + } + + if (flags & IB_MR_REREG_ACCESS) + mr->access_flags = access_flags; + + if (flags & IB_MR_REREG_TRANS) { + atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); + set_mr_fileds(dev, mr, npages, len, access_flags); + mr->mmr.iova = addr; + mr->mmr.size = len; + } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif + + return 0; +} + static int mlx5_alloc_priv_descs(struct ib_device *device, struct mlx5_ib_mr *mr, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 85cf9c4..295eb2a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2678,6 +2678,44 @@ static __be64 get_umr_update_mtt_mask(void) return cpu_to_be64(result); } +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, struct ib_send_wr *wr) { @@ -2696,9 +2734,15 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, umr->mkey_mask = get_umr_update_mtt_mask(); umr->bsf_octowords = get_klm_octo(umrwr->target.offset); umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; - } else { - umr->mkey_mask = get_umr_reg_mr_mask(); } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS) + umr->mkey_mask |= get_umr_update_access_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD) + umr->mkey_mask |= get_umr_update_pd_mask(); + if (!umr->mkey_mask) + umr->mkey_mask = get_umr_reg_mr_mask(); } else { umr->mkey_mask = get_umr_unreg_mr_mask(); } @@ -2750,7 +2794,8 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w seg->flags = convert_access(umrwr->access_flags); if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { - seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); } seg->len = cpu_to_be64(umrwr->length); -- cgit v0.10.2 From a606b0f6691daf861482f8b77326f672238ffbfd Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 29 Feb 2016 18:05:28 +0200 Subject: net/mlx5: Refactor mlx5_core_mr to mkey Mlx5's mkey mechanism is also used for memory windows. The current code base uses MR (memory region) naming, which is inaccurate. Changing MR to mkey in order to represent its different usages more accurately. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 2a9ad84..a00ba44 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -434,7 +434,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_core_qp *mqp; struct mlx5_ib_wq *wq; struct mlx5_sig_err_cqe *sig_err_cqe; - struct mlx5_core_mr *mmr; + struct mlx5_core_mkey *mmkey; struct mlx5_ib_mr *mr; uint8_t opcode; uint32_t qpn; @@ -539,17 +539,17 @@ repoll: case MLX5_CQE_SIG_ERR: sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; - read_lock(&dev->mdev->priv.mr_table.lock); - mmr = __mlx5_mr_lookup(dev->mdev, - mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); - if (unlikely(!mmr)) { - read_unlock(&dev->mdev->priv.mr_table.lock); + read_lock(&dev->mdev->priv.mkey_table.lock); + mmkey = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmkey)) { + read_unlock(&dev->mdev->priv.mkey_table.lock); mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); return -EINVAL; } - mr = to_mibmr(mmr); + mr = to_mibmr(mmkey); get_sig_err_item(sig_err_cqe, &mr->sig->err_item); mr->sig->sig_err_exists = true; mr->sig->sigerr_count++; @@ -561,7 +561,7 @@ repoll: mr->sig->err_item.expected, mr->sig->err_item.actual); - read_unlock(&dev->mdev->priv.mr_table.lock); + read_unlock(&dev->mdev->priv.mkey_table.lock); goto repoll; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index f84ec2b..4167d67 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -446,7 +446,7 @@ struct mlx5_ib_mr { int ndescs; int max_descs; int desc_size; - struct mlx5_core_mr mmr; + struct mlx5_core_mkey mmkey; struct ib_umem *umem; struct mlx5_shared_mr_info *smr_info; struct list_head list; @@ -603,9 +603,9 @@ static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } -static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mkey *mmkey) { - return container_of(mmr, struct mlx5_ib_mr, mmr); + return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index cf26cd1..399e2b5 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -57,7 +57,7 @@ static int clean_mr(struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* Wait until all page fault handlers using the mr complete. */ @@ -120,7 +120,7 @@ static void reg_mr_callback(int status, void *context) struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; unsigned long flags; - struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; int err; spin_lock_irqsave(&ent->lock, flags); @@ -147,7 +147,7 @@ static void reg_mr_callback(int status, void *context) spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); key = dev->mdev->priv.mkey_key++; spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); - mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + mr->mmkey.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; cache->last_add = jiffies; @@ -158,10 +158,10 @@ static void reg_mr_callback(int status, void *context) spin_unlock_irqrestore(&ent->lock, flags); write_lock_irqsave(&table->lock, flags); - err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), - &mr->mmr); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey); if (err) - pr_err("Error inserting to mr tree. 0x%x\n", -err); + pr_err("Error inserting to mkey tree. 0x%x\n", -err); write_unlock_irqrestore(&table->lock, flags); } @@ -202,7 +202,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) spin_lock_irq(&ent->lock); ent->pending++; spin_unlock_irq(&ent->lock); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in), reg_mr_callback, mr, &mr->out); if (err) { @@ -691,14 +691,14 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL, NULL); if (err) goto err_in; kfree(in); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; return &mr->ibmr; @@ -897,7 +897,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, memset(&umrwr, 0, sizeof(umrwr)); umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; - prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmr.key, + prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, page_shift, virt_addr, len, access_flags); mlx5_ib_init_umr_context(&umr_context); @@ -914,9 +914,9 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, } } - mr->mmr.iova = virt_addr; - mr->mmr.size = len; - mr->mmr.pd = to_mpd(pd)->pdn; + mr->mmkey.iova = virt_addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; mr->live = 1; @@ -1027,7 +1027,7 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, wr.wr.opcode = MLX5_IB_WR_UMR; wr.npages = sg.length / sizeof(u64); wr.page_shift = PAGE_SHIFT; - wr.mkey = mr->mmr.key; + wr.mkey = mr->mmkey.key; wr.target.offset = start_page_index; mlx5_ib_init_umr_context(&umr_context); @@ -1100,7 +1100,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen, NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); @@ -1111,7 +1111,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, mr->live = 1; kvfree(in); - mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); return mr; @@ -1130,8 +1130,8 @@ static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, { mr->npages = npages; atomic_add(npages, &dev->mdev->priv.reg_pages); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; mr->ibmr.length = length; mr->access_flags = access_flags; } @@ -1179,7 +1179,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto error; } - mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); mr->umem = umem; set_mr_fileds(dev, mr, npages, length, access_flags); @@ -1205,7 +1205,7 @@ static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) memset(&umrwr.wr, 0, sizeof(umrwr)); umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; - prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmr.key); + prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); @@ -1259,7 +1259,7 @@ static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; } - prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmr.key, + prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, page_shift); if (flags & IB_MR_REREG_PD) { @@ -1371,7 +1371,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, if (flags & IB_MR_REREG_PD) { ib_mr->pd = pd; - mr->mmr.pd = to_mpd(pd)->pdn; + mr->mmkey.pd = to_mpd(pd)->pdn; } if (flags & IB_MR_REREG_ACCESS) @@ -1380,8 +1380,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, if (flags & IB_MR_REREG_TRANS) { atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); set_mr_fileds(dev, mr, npages, len, access_flags); - mr->mmr.iova = addr; - mr->mmr.size = len; + mr->mmkey.iova = addr; + mr->mmkey.size = len; } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING update_odp_mr(mr); @@ -1461,7 +1461,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) err = destroy_mkey(dev, mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", - mr->mmr.key, err); + mr->mmkey.key, err); return err; } } else { @@ -1587,13 +1587,13 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, } in->seg.flags = MLX5_PERM_UMR_EN | access_mode; - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL, NULL); if (err) goto err_destroy_psv; - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; kfree(in); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index b8d7636..34e79e7 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -142,13 +142,13 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, u32 key) { u32 base_key = mlx5_base_mkey(key); - struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); - struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); + struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key); + struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - if (!mmr || mmr->key != key || !mr->live) + if (!mmkey || mmkey->key != key || !mr->live) return NULL; - return container_of(mmr, struct mlx5_ib_mr, mmr); + return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, @@ -232,7 +232,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, io_virt += pfault->mpfault.bytes_committed; bcnt -= pfault->mpfault.bytes_committed; - start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; + start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT; if (mr->umem->writable) access_mask |= ODP_WRITE_ALLOWED_BIT; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index aac071a..6ef0bfd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -515,7 +515,7 @@ struct mlx5e_priv { struct mlx5_uar cq_uar; u32 pdn; u32 tdn; - struct mlx5_core_mr mr; + struct mlx5_core_mkey mkey; struct mlx5e_rq drop_rq; struct mlx5e_channel **channel; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d4e1c30..43a1489 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -982,7 +982,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; - c->mkey_be = cpu_to_be32(priv->mr.key); + c->mkey_be = cpu_to_be32(priv->mkey.key); c->num_tc = priv->params.num_tc; mlx5e_build_channeltc_to_txq_map(priv, ix); @@ -2194,7 +2194,7 @@ static void mlx5e_build_netdev(struct net_device *netdev) } static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, - struct mlx5_core_mr *mr) + struct mlx5_core_mkey *mkey) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5_create_mkey_mbox_in *in; @@ -2210,7 +2210,7 @@ static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, in->seg.flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - err = mlx5_core_create_mkey(mdev, mr, in, sizeof(*in), NULL, NULL, + err = mlx5_core_create_mkey(mdev, mkey, in, sizeof(*in), NULL, NULL, NULL); kvfree(in); @@ -2259,7 +2259,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) goto err_dealloc_pd; } - err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr); + err = mlx5e_create_mkey(priv, priv->pdn, &priv->mkey); if (err) { mlx5_core_err(mdev, "create mkey failed, %d\n", err); goto err_dealloc_transport_domain; @@ -2333,7 +2333,7 @@ err_destroy_tises: mlx5e_destroy_tises(priv); err_destroy_mkey: - mlx5_core_destroy_mkey(mdev, &priv->mr); + mlx5_core_destroy_mkey(mdev, &priv->mkey); err_dealloc_transport_domain: mlx5_core_dealloc_transport_domain(mdev, priv->tdn); @@ -2367,7 +2367,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) mlx5e_destroy_rqt(priv, MLX5E_INDIRECTION_RQT); mlx5e_close_drop_rq(priv); mlx5e_destroy_tises(priv); - mlx5_core_destroy_mkey(priv->mdev, &priv->mr); + mlx5_core_destroy_mkey(priv->mdev, &priv->mkey); mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 1545a94..0916bbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1117,7 +1117,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_cq_table(dev); mlx5_init_qp_table(dev); mlx5_init_srq_table(dev); - mlx5_init_mr_table(dev); + mlx5_init_mkey_table(dev); err = mlx5_init_fs(dev); if (err) { @@ -1164,7 +1164,7 @@ err_sriov: err_reg_dev: mlx5_cleanup_fs(dev); err_fs: - mlx5_cleanup_mr_table(dev); + mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cleanup_cq_table(dev); @@ -1237,7 +1237,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) #endif mlx5_cleanup_fs(dev); - mlx5_cleanup_mr_table(dev); + mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cleanup_cq_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 6fa22b5..77a72939 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -36,25 +36,26 @@ #include #include "mlx5_core.h" -void mlx5_init_mr_table(struct mlx5_core_dev *dev) +void mlx5_init_mkey_table(struct mlx5_core_dev *dev) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; memset(table, 0, sizeof(*table)); rwlock_init(&table->lock); INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); } -void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev) +void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev) { } -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, struct mlx5_create_mkey_mbox_in *in, int inlen, mlx5_cmd_cbk_t callback, void *context, struct mlx5_create_mkey_mbox_out *out) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; struct mlx5_create_mkey_mbox_out lout; int err; u8 key; @@ -83,34 +84,35 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, return mlx5_cmd_status_to_err(&lout.hdr); } - mr->iova = be64_to_cpu(in->seg.start_addr); - mr->size = be64_to_cpu(in->seg.len); - mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; - mr->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; + mkey->iova = be64_to_cpu(in->seg.start_addr); + mkey->size = be64_to_cpu(in->seg.len); + mkey->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; + mkey->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", - be32_to_cpu(lout.mkey), key, mr->key); + be32_to_cpu(lout.mkey), key, mkey->key); - /* connect to MR tree */ + /* connect to mkey tree */ write_lock_irq(&table->lock); - err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->key), mr); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), mkey); write_unlock_irq(&table->lock); if (err) { - mlx5_core_warn(dev, "failed radix tree insert of mr 0x%x, %d\n", - mlx5_base_mkey(mr->key), err); - mlx5_core_destroy_mkey(dev, mr); + mlx5_core_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n", + mlx5_base_mkey(mkey->key), err); + mlx5_core_destroy_mkey(dev, mkey); } return err; } EXPORT_SYMBOL(mlx5_core_create_mkey); -int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; struct mlx5_destroy_mkey_mbox_in in; struct mlx5_destroy_mkey_mbox_out out; - struct mlx5_core_mr *deleted_mr; + struct mlx5_core_mkey *deleted_mkey; unsigned long flags; int err; @@ -118,16 +120,16 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) memset(&out, 0, sizeof(out)); write_lock_irqsave(&table->lock, flags); - deleted_mr = radix_tree_delete(&table->tree, mlx5_base_mkey(mr->key)); + deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key)); write_unlock_irqrestore(&table->lock, flags); - if (!deleted_mr) { - mlx5_core_warn(dev, "failed radix tree delete of mr 0x%x\n", - mlx5_base_mkey(mr->key)); + if (!deleted_mkey) { + mlx5_core_warn(dev, "failed radix tree delete of mkey 0x%x\n", + mlx5_base_mkey(mkey->key)); return -ENOENT; } in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_MKEY); - in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key)); + in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mkey->key)); err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); if (err) return err; @@ -139,7 +141,7 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) } EXPORT_SYMBOL(mlx5_core_destroy_mkey); -int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_query_mkey_mbox_out *out, int outlen) { struct mlx5_query_mkey_mbox_in in; @@ -149,7 +151,7 @@ int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, memset(out, 0, outlen); in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_MKEY); - in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key)); + in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mkey->key)); err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); if (err) return err; @@ -161,7 +163,7 @@ int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, } EXPORT_SYMBOL(mlx5_core_query_mkey); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey) { struct mlx5_query_special_ctxs_mbox_in in; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8edcd08..9108904 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -338,7 +338,7 @@ struct mlx5_core_sig_ctx { u32 sigerr_count; }; -struct mlx5_core_mr { +struct mlx5_core_mkey { u64 iova; u64 size; u32 key; @@ -426,7 +426,7 @@ struct mlx5_srq_table { struct radix_tree_root tree; }; -struct mlx5_mr_table { +struct mlx5_mkey_table { /* protect radix tree */ rwlock_t lock; @@ -484,9 +484,9 @@ struct mlx5_priv { struct mlx5_cq_table cq_table; /* end: cq staff */ - /* start: mr staff */ - struct mlx5_mr_table mr_table; - /* end: mr staff */ + /* start: mkey staff */ + struct mlx5_mkey_table mkey_table; + /* end: mkey staff */ /* start: alloc staff */ /* protect buffer alocation according to numa node */ @@ -739,16 +739,18 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, struct mlx5_query_srq_mbox_out *out); int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq); -void mlx5_init_mr_table(struct mlx5_core_dev *dev); -void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev); -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +void mlx5_init_mkey_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev); +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, struct mlx5_create_mkey_mbox_in *in, int inlen, mlx5_cmd_cbk_t callback, void *context, struct mlx5_create_mkey_mbox_out *out); -int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr); -int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey); +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_query_mkey_mbox_out *out, int outlen); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index e5bbcf0..cf031a3 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -622,9 +622,9 @@ static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); } -static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key) +static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key) { - return radix_tree_lookup(&dev->priv.mr_table.tree, key); + return radix_tree_lookup(&dev->priv.mkey_table.tree, key); } struct mlx5_page_fault_resume_mbox_in { -- cgit v0.10.2 From b2a239df4e65fe35240ddf3e5f9f31335c90589b Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 29 Feb 2016 18:05:29 +0200 Subject: IB/core: Add vendor's specific data to alloc mw Passing udata to the vendor's driver in order to pass data from the user-space driver to the kernel-space driver. This data will be used in downstream patches. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6ffc9c4..2bf751e 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1174,6 +1174,7 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; + struct ib_udata udata; int ret; if (out_len < sizeof(resp)) @@ -1195,7 +1196,12 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, goto err_free; } - mw = pd->device->alloc_mw(pd, cmd.mw_type); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long)cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); if (IS_ERR(mw)) { ret = PTR_ERR(mw); goto err_put; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 2734820..42a7b89 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -657,7 +657,8 @@ err: return ERR_PTR(err); } -static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct iwch_dev *rhp; struct iwch_pd *php; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index fb2de75..423a3a9 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -961,7 +961,8 @@ int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents); int c4iw_dealloc_mw(struct ib_mw *mw); -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 7849890..766d39c 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "iw_cxgb4.h" @@ -552,7 +553,8 @@ err: return ERR_PTR(err); } -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct c4iw_dev *rhp; struct c4iw_pd *php; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 52ce7b0..1eca01c 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -711,7 +711,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); -struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 242b94e..ce0b5aa 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -32,6 +32,7 @@ */ #include +#include #include "mlx4_ib.h" @@ -334,7 +335,8 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr) return 0; } -struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_mw *mw; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 8c4daf7..5af19b4 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -56,7 +56,8 @@ static int nes_dereg_mr(struct ib_mr *ib_mr); /** * nes_alloc_mw */ -static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type) +static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type, + struct ib_udata *udata) { struct nes_pd *nespd = to_nespd(ibpd); struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 284b00c..3f79070 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1808,7 +1808,8 @@ struct ib_device { struct scatterlist *sg, int sg_nents); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, - enum ib_mw_type type); + enum ib_mw_type type, + struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, -- cgit v0.10.2 From d2370e0a573e5c5ea9c96373558727abb3ea71f7 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 29 Feb 2016 18:05:30 +0200 Subject: IB/mlx5: Add memory windows allocation support This patch adds user-space support for memory windows allocation and deallocation. It also exposes the supported types via query_device_caps verb. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Tested-by: Max Gurtovoy Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 16f7d0b..4d9b7cc 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -487,6 +487,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; + if (MLX5_CAP_GEN(mdev, imaicl)) { + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_WINDOW_TYPE_2B; + props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; @@ -2306,6 +2311,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) mlx5_ib_internal_fill_odp_caps(dev); + if (MLX5_CAP_GEN(mdev, imaicl)) { + dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; + dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 4167d67..648d2e2 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -43,6 +43,7 @@ #include #include #include +#include #define mlx5_ib_dbg(dev, format, arg...) \ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ @@ -461,6 +462,11 @@ struct mlx5_ib_mr { int access_flags; /* Needed for rereg MR */ }; +struct mlx5_ib_mw { + struct ib_mw ibmw; + struct mlx5_core_mkey mmkey; +}; + struct mlx5_ib_umr_context { enum ib_wc_status status; struct completion done; @@ -633,6 +639,11 @@ static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx5_ib_mr, ibmr); } +static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx5_ib_mw, ibmw); +} + struct mlx5_ib_ah { struct ib_ah ibah; struct mlx5_av av; @@ -693,6 +704,9 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +int mlx5_ib_dealloc_mw(struct ib_mw *mw); int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, int zap); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 399e2b5..70a047d 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -40,6 +40,7 @@ #include #include #include "mlx5_ib.h" +#include "user.h" enum { MAX_PENDING_REG_MR = 8, @@ -1620,6 +1621,88 @@ err_free: return ERR_PTR(err); } +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in = NULL; + struct mlx5_ib_mw *mw = NULL; + int ndescs; + int err; + struct mlx5_ib_alloc_mw req = {}; + struct { + __u32 comp_mask; + __u32 response_length; + } resp = {}; + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return ERR_PTR(err); + + if (req.comp_mask || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + udata->inlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + + ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!mw || !in) { + err = -ENOMEM; + goto free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_KLM | + MLX5_PERM_LOCAL_READ; + if (type == IB_MW_TYPE_2) + in->seg.flags_pd |= cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + + err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto free; + + mw->ibmw.rkey = mw->mmkey.key; + + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); + if (resp.response_length) { + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); + goto free; + } + } + + kfree(in); + return &mw->ibmw; + +free: + kfree(mw); + kfree(in); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_mw(struct ib_mw *mw) +{ + struct mlx5_ib_mw *mmw = to_mmw(mw); + int err; + + err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev, + &mmw->mmkey); + if (!err) + kfree(mmw); + return err; +} + int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status) { diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index b94a554..61bc308 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -152,6 +152,13 @@ struct mlx5_ib_create_qp_resp { __u32 uuar_index; }; +struct mlx5_ib_alloc_mw { + __u32 comp_mask; + __u8 num_klms; + __u8 reserved1; + __u16 reserved2; +}; + static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, struct mlx5_ib_create_qp *ucmd, int inlen, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 72bba52..3044cfa 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -769,7 +769,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cd[0x1]; u8 reserved_at_22c[0x1]; u8 apm[0x1]; - u8 reserved_at_22e[0x7]; + u8 reserved_at_22e[0x2]; + u8 imaicl[0x1]; + u8 reserved_at_231[0x4]; u8 qkv[0x1]; u8 pkv[0x1]; u8 set_deth_sqpn[0x1]; -- cgit v0.10.2 From 0ca4c39f32cd3fad57c18cd8df49d6b4e7bc2411 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 26 Dec 2015 18:18:18 +0100 Subject: IB/ocrdma: Delete unnecessary variable initialisations in 11 functions The variable "status" will be set to an appropriate value a bit later. Thus let us omit the explicit initialisation at the beginning. Signed-off-by: Markus Elfring Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index e3c4f17..797362a 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -74,7 +74,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, struct ib_ah_attr *attr, union ib_gid *sgid, int pdid, bool *isvlan, u16 vlan_tag) { - int status = 0; + int status; struct ocrdma_eth_vlan eth; struct ocrdma_grh grh; int eth_sz; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 2cfbf15..9b8ff26 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1113,7 +1113,7 @@ mbx_err: static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe, void *payload_va) { - int status = 0; + int status; struct ocrdma_mbx_rsp *rsp = payload_va; if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> @@ -2893,7 +2893,7 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq) static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype, struct ocrdma_dcbx_cfg *dcbxcfg) { - int status = 0; + int status; dma_addr_t pa; struct ocrdma_mqe cmd; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 255f774..8bef09a 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -610,7 +610,7 @@ static char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) static void ocrdma_update_stats(struct ocrdma_dev *dev) { ulong now = jiffies, secs; - int status = 0; + int status; struct ocrdma_rdma_stats_resp *rdma_stats = (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; @@ -641,7 +641,7 @@ static ssize_t ocrdma_dbgfs_ops_write(struct file *filp, { char tmp_str[32]; long reset; - int status = 0; + int status; struct ocrdma_stats *pstats = filp->private_data; struct ocrdma_dev *dev = pstats->dev; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 4df3f13..4a4c8d6 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -419,7 +419,7 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, struct ib_udata *udata) { struct ocrdma_pd *pd = NULL; - int status = 0; + int status; pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (!pd) @@ -468,7 +468,7 @@ static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) { - int status = 0; + int status; if (dev->pd_mgr->pd_prealloc_valid) status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled); @@ -596,7 +596,7 @@ map_err: int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) { - int status = 0; + int status; struct ocrdma_mm *mm, *tmp; struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx); struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device); @@ -623,7 +623,7 @@ int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT; u64 unmapped_db = (u64) dev->nic_info.unmapped_db; unsigned long len = (vma->vm_end - vma->vm_start); - int status = 0; + int status; bool found; if (vma->vm_start & (PAGE_SIZE - 1)) @@ -1285,7 +1285,7 @@ static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, struct ib_udata *udata, int dpp_offset, int dpp_credit_lmt, int srq) { - int status = 0; + int status; u64 usr_db; struct ocrdma_create_qp_uresp uresp; struct ocrdma_pd *pd = qp->pd; @@ -1949,7 +1949,7 @@ int ocrdma_modify_srq(struct ib_srq *ibsrq, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata) { - int status = 0; + int status; struct ocrdma_srq *srq; srq = get_ocrdma_srq(ibsrq); -- cgit v0.10.2 From d1c95b0e6526fc2a1841cc0b6b3bcb46c31cc038 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 26 Dec 2015 18:28:35 +0100 Subject: IB/ocrdma: Skip using unneeded intermediate variable Return zero at the end without using the local variable "status". Signed-off-by: Markus Elfring Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 9b8ff26..16740dc 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -2141,7 +2141,6 @@ int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, enum ib_qp_state *old_ib_state) { unsigned long flags; - int status = 0; enum ocrdma_qp_state new_state; new_state = get_ocrdma_qp_state(new_ib_state); @@ -2166,7 +2165,7 @@ int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, qp->state = new_state; spin_unlock_irqrestore(&qp->q_lock, flags); - return status; + return 0; } static u32 ocrdma_set_create_qp_mbx_access_flags(struct ocrdma_qp *qp) -- cgit v0.10.2 From 95f60bb8118c1fc368d7414409d555f050aea7f2 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 26 Dec 2015 18:40:43 +0100 Subject: IB/ocrdma: Skip using unneeded intermediate variable Return the value from a call of the ocrdma_mbx_modify_qp() function without using an extra assignment for the local variable "status". Signed-off-by: Markus Elfring Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 4a4c8d6..a8496a1 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1494,9 +1494,7 @@ int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, */ if (status < 0) return status; - status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask); - - return status; + return ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask); } int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, -- cgit v0.10.2 From add08d765e942eab8eb15a592baeb372a3dd6831 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 09:38:22 +0100 Subject: IB/mlx5: Convert UMR CQ to new CQ API Simplifies the code, and makes it more fair vs other users by using a softirq for polling. Signed-off-by: Christoph Hellwig Reviewed-by: Haggai Eran Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4d9b7cc..63c3d21 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1861,7 +1861,7 @@ static void destroy_umrc_res(struct mlx5_ib_dev *dev) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); mlx5_ib_destroy_qp(dev->umrc.qp); - ib_destroy_cq(dev->umrc.cq); + ib_free_cq(dev->umrc.cq); ib_dealloc_pd(dev->umrc.pd); } @@ -1876,7 +1876,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev) struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - struct ib_cq_init_attr cq_attr = {}; int ret; attr = kzalloc(sizeof(*attr), GFP_KERNEL); @@ -1893,15 +1892,12 @@ static int create_umr_res(struct mlx5_ib_dev *dev) goto error_0; } - cq_attr.cqe = 128; - cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, - &cq_attr); + cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); goto error_2; } - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); init_attr->send_cq = cq; init_attr->recv_cq = cq; @@ -1968,7 +1964,7 @@ error_4: mlx5_ib_destroy_qp(qp); error_3: - ib_destroy_cq(cq); + ib_free_cq(cq); error_2: ib_dealloc_pd(pd); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 648d2e2..3c02b3c 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -468,16 +468,11 @@ struct mlx5_ib_mw { }; struct mlx5_ib_umr_context { + struct ib_cqe cqe; enum ib_wc_status status; struct completion done; }; -static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) -{ - context->status = -1; - init_completion(&context->done); -} - struct umr_common { struct ib_pd *pd; struct ib_cq *cq; @@ -762,7 +757,6 @@ int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); -void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 70a047d..dd92314 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -836,26 +836,20 @@ static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length, return umem; } -void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) { - struct mlx5_ib_umr_context *context; - struct ib_wc wc; - int err; + struct mlx5_ib_umr_context *context = + container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); - while (1) { - err = ib_poll_cq(cq, 1, &wc); - if (err < 0) { - pr_warn("poll cq error %d\n", err); - return; - } - if (err == 0) - break; + context->status = wc->status; + complete(&context->done); +} - context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; - context->status = wc.status; - complete(&context->done); - } - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->cqe.done = mlx5_ib_umr_done; + context->status = -1; + init_completion(&context->done); } static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, @@ -896,12 +890,13 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (err) goto free_mr; + mlx5_ib_init_umr_context(&umr_context); + memset(&umrwr, 0, sizeof(umrwr)); - umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; + umrwr.wr.wr_cqe = &umr_context.cqe; prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, page_shift, virt_addr, len, access_flags); - mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { @@ -1013,8 +1008,10 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + mlx5_ib_init_umr_context(&umr_context); + memset(&wr, 0, sizeof(wr)); - wr.wr.wr_id = (u64)(unsigned long)&umr_context; + wr.wr.wr_cqe = &umr_context.cqe; sg.addr = dma; sg.length = ALIGN(npages * sizeof(u64), @@ -1031,7 +1028,6 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, wr.mkey = mr->mmkey.key; wr.target.offset = start_page_index; - mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); err = ib_post_send(umrc->qp, &wr.wr, &bad); if (err) { @@ -1204,11 +1200,12 @@ static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) struct ib_send_wr *bad; int err; + mlx5_ib_init_umr_context(&umr_context); + memset(&umrwr.wr, 0, sizeof(umrwr)); - umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; + umrwr.wr.wr_cqe = &umr_context.cqe; prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); - mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { @@ -1246,7 +1243,9 @@ static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, int size; int err; - umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; + mlx5_ib_init_umr_context(&umr_context); + + umrwr.wr.wr_cqe = &umr_context.cqe; umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; if (flags & IB_MR_REREG_TRANS) { @@ -1273,8 +1272,6 @@ static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS; } - mlx5_ib_init_umr_context(&umr_context); - /* post send request to UMR QP */ down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); -- cgit v0.10.2 From 0025b0bdeae7c13b8ab1dce64b0108ed9c071e2e Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 3 Mar 2016 11:23:37 -0500 Subject: IB/mlx5: Make coding style more consistent These three related functions can't agree whether to put the umrwr on the stack dirty and then memset it, or to initialize it on the stack. Make them all agree. Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index dd92314..628f4350 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -860,7 +860,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, struct device *ddev = dev->ib_dev.dma_device; struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; - struct mlx5_umr_wr umrwr; + struct mlx5_umr_wr umrwr = {}; struct ib_send_wr *bad; struct mlx5_ib_mr *mr; struct ib_sge sg; @@ -892,7 +892,6 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, mlx5_ib_init_umr_context(&umr_context); - memset(&umrwr, 0, sizeof(umrwr)); umrwr.wr.wr_cqe = &umr_context.cqe; prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, page_shift, virt_addr, len, access_flags); @@ -1196,13 +1195,12 @@ static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; - struct mlx5_umr_wr umrwr; + struct mlx5_umr_wr umrwr = {}; struct ib_send_wr *bad; int err; mlx5_ib_init_umr_context(&umr_context); - memset(&umrwr.wr, 0, sizeof(umrwr)); umrwr.wr.wr_cqe = &umr_context.cqe; prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); -- cgit v0.10.2 From 911f4331bc87f4589b9096f4fb24b335d4c2967d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 3 Mar 2016 13:37:51 +0200 Subject: IB/mlx5: Expose correct max_fast_reg_page_list_len While documentation indicates that the number of translation entries per memory key is unlimited, in practice, we can only fit a finite amount of translation entries in a single registration wqe (which is log_max_klm_list_size). Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 63c3d21..55fa588 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -539,7 +539,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; - props->max_fast_reg_page_list_len = (unsigned int)-1; + props->max_fast_reg_page_list_len = + 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); -- cgit v0.10.2 From f5aa9159a418726d74b67c8815ffd2739afb4c7a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 29 Feb 2016 19:07:32 +0200 Subject: IB/core: Add arbitrary sg_list support Devices that are capable in registering SG lists with gaps can now expose it in the core to ULPs using a new device capability IB_DEVICE_SG_GAPS_REG (in a new field device_cap_flags_ex in the device attributes as we ran out of bits), and a new mr_type IB_MR_TYPE_SG_GAPS_REG which allocates a memory region which is capable of handling SG lists with gaps. Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 5af6d02..16f3fb1 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1567,6 +1567,8 @@ EXPORT_SYMBOL(ib_check_mr_status); * - The last sg element is allowed to have length less than page_size. * - If sg_nents total byte length exceeds the mr max_num_sge * page_size * then only max_num_sg entries will be mapped. + * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS_REG, non of these + * constraints holds and the page_size argument is ignored. * * Returns the number of sg elements that were mapped to the memory region. * diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3f79070..bcd5b24 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -212,6 +212,7 @@ enum ib_device_cap_flags { IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), IB_DEVICE_ON_DEMAND_PAGING = (1 << 31), + IB_DEVICE_SG_GAPS_REG = (1ULL << 32), }; enum ib_signature_prot_cap { @@ -662,10 +663,15 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); * @IB_MR_TYPE_SIGNATURE: memory region that is used for * signature operations (data-integrity * capable regions) + * @IB_MR_TYPE_SG_GAPS: memory region that is capable to + * register any arbitrary sg lists (without + * the normal mr constraints - see + * ib_map_mr_sg) */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SIGNATURE, + IB_MR_TYPE_SG_GAPS, }; /** -- cgit v0.10.2 From b005d316471374b1ff26df8c8460cc1ea9186647 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 29 Feb 2016 19:07:33 +0200 Subject: mlx5: Add arbitrary sg list support Allocate proper context for arbitrary scatterlist registration If ib_alloc_mr is called with IB_MR_MAP_ARB_SG, the driver allocate a private klm list instead of a private page list. Set the UMR wqe correctly when posting the fast registration. Also, expose device cap IB_DEVICE_MAP_ARB_SG according to the device id (until we have a FW bit that correctly exposes it). Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 55fa588..7e89a54 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -491,6 +491,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_WINDOW_TYPE_2B; props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + /* We support 'Gappy' memory registration too */ + props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 3c02b3c..60b8962 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -447,6 +447,7 @@ struct mlx5_ib_mr { int ndescs; int max_descs; int desc_size; + int access_mode; struct mlx5_core_mkey mmkey; struct ib_umem *umem; struct mlx5_shared_mr_info *smr_info; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 628f4350..4d5bff1 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1521,8 +1521,8 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_create_mkey_mbox_in *in; struct mlx5_ib_mr *mr; - int access_mode, err; - int ndescs = roundup(max_num_sg, 4); + int ndescs = ALIGN(max_num_sg, 4); + int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) @@ -1540,7 +1540,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); if (mr_type == IB_MR_TYPE_MEM_REG) { - access_mode = MLX5_ACCESS_MODE_MTT; + mr->access_mode = MLX5_ACCESS_MODE_MTT; in->seg.log2_page_size = PAGE_SHIFT; err = mlx5_alloc_priv_descs(pd->device, mr, @@ -1550,6 +1550,15 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, mr->desc_size = sizeof(u64); mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SG_GAPS) { + mr->access_mode = MLX5_ACCESS_MODE_KLM; + + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_klm)); + if (err) + goto err_free_in; + mr->desc_size = sizeof(struct mlx5_klm); + mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SIGNATURE) { u32 psv_index[2]; @@ -1568,7 +1577,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, if (err) goto err_free_sig; - access_mode = MLX5_ACCESS_MODE_KLM; + mr->access_mode = MLX5_ACCESS_MODE_KLM; mr->sig->psv_memory.psv_idx = psv_index[0]; mr->sig->psv_wire.psv_idx = psv_index[1]; @@ -1582,7 +1591,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, goto err_free_in; } - in->seg.flags = MLX5_PERM_UMR_EN | access_mode; + in->seg.flags = MLX5_PERM_UMR_EN | mr->access_mode; err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL, NULL); if (err) @@ -1739,6 +1748,32 @@ done: return ret; } +static int +mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, + struct scatterlist *sgl, + unsigned short sg_nents) +{ + struct scatterlist *sg = sgl; + struct mlx5_klm *klms = mr->descs; + u32 lkey = mr->ibmr.pd->local_dma_lkey; + int i; + + mr->ibmr.iova = sg_dma_address(sg); + mr->ibmr.length = 0; + mr->ndescs = sg_nents; + + for_each_sg(sgl, sg, sg_nents, i) { + if (unlikely(i > mr->max_descs)) + break; + klms[i].va = cpu_to_be64(sg_dma_address(sg)); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg)); + klms[i].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg); + } + + return i; +} + static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); @@ -1766,7 +1801,10 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, mr->desc_size * mr->max_descs, DMA_TO_DEVICE); - n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); + if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents); + else + n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, mr->desc_size * mr->max_descs, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 295eb2a..8dee8bc 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2629,6 +2629,11 @@ static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, int ndescs = mr->ndescs; memset(umr, 0, sizeof(*umr)); + + if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + umr->flags = MLX5_UMR_CHECK_NOT_FREE; umr->klm_octowords = get_klm_octo(ndescs); umr->mkey_mask = frwr_mkey_mask(); @@ -2767,13 +2772,19 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, int ndescs = ALIGN(mr->ndescs, 8) >> 1; memset(seg, 0, sizeof(*seg)); - seg->flags = get_umr_flags(access) | MLX5_ACCESS_MODE_MTT; + + if (mr->access_mode == MLX5_ACCESS_MODE_MTT) + seg->log2_page_size = ilog2(mr->ibmr.page_size); + else if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + seg->flags = get_umr_flags(access) | mr->access_mode; seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); seg->start_addr = cpu_to_be64(mr->ibmr.iova); seg->len = cpu_to_be64(mr->ibmr.length); seg->xlt_oct_size = cpu_to_be32(ndescs); - seg->log2_page_size = ilog2(mr->ibmr.page_size); } static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) -- cgit v0.10.2 From 318d311e8f016dbbf22160d7b1c19a290a95ad9d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 29 Feb 2016 19:07:34 +0200 Subject: iser: Accept arbitrary sg lists mapping if the device supports it If the device support arbitrary sg list mapping (device cap IB_DEVICE_SG_GAPS_REG set) we allocate the memory regions with IB_MR_TYPE_SG_GAPS and allow the block layer to pass us gaps by skip setting the queue virt_boundary. Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index c827c93..80b6bed 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -969,7 +969,16 @@ static umode_t iser_attr_is_visible(int param_type, int param) static int iscsi_iser_slave_alloc(struct scsi_device *sdev) { - blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K); + struct iscsi_session *session; + struct iser_conn *iser_conn; + struct ib_device *ib_dev; + + session = starget_to_session(scsi_target(sdev))->dd_data; + iser_conn = session->leadconn->dd_data; + ib_dev = iser_conn->ib_conn.device->ib_device; + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K); return 0; } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 40c0f49..f21bdcc 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -252,14 +252,21 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn) } static int -iser_alloc_reg_res(struct ib_device *ib_device, +iser_alloc_reg_res(struct iser_device *device, struct ib_pd *pd, struct iser_reg_resources *res, unsigned int size) { + struct ib_device *ib_dev = device->ib_device; + enum ib_mr_type mr_type; int ret; - res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, size); + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + res->mr = ib_alloc_mr(pd, mr_type, size); if (IS_ERR(res->mr)) { ret = PTR_ERR(res->mr); iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); @@ -277,7 +284,7 @@ iser_free_reg_res(struct iser_reg_resources *rsc) } static int -iser_alloc_pi_ctx(struct ib_device *ib_device, +iser_alloc_pi_ctx(struct iser_device *device, struct ib_pd *pd, struct iser_fr_desc *desc, unsigned int size) @@ -291,7 +298,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, pi_ctx = desc->pi_ctx; - ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc, size); + ret = iser_alloc_reg_res(device, pd, &pi_ctx->rsc, size); if (ret) { iser_err("failed to allocate reg_resources\n"); goto alloc_reg_res_err; @@ -324,7 +331,7 @@ iser_free_pi_ctx(struct iser_pi_context *pi_ctx) } static struct iser_fr_desc * -iser_create_fastreg_desc(struct ib_device *ib_device, +iser_create_fastreg_desc(struct iser_device *device, struct ib_pd *pd, bool pi_enable, unsigned int size) @@ -336,12 +343,12 @@ iser_create_fastreg_desc(struct ib_device *ib_device, if (!desc) return ERR_PTR(-ENOMEM); - ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc, size); + ret = iser_alloc_reg_res(device, pd, &desc->rsc, size); if (ret) goto reg_res_alloc_failure; if (pi_enable) { - ret = iser_alloc_pi_ctx(ib_device, pd, desc, size); + ret = iser_alloc_pi_ctx(device, pd, desc, size); if (ret) goto pi_ctx_alloc_failure; } @@ -374,7 +381,7 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, spin_lock_init(&fr_pool->lock); fr_pool->size = 0; for (i = 0; i < cmds_max; i++) { - desc = iser_create_fastreg_desc(device->ib_device, device->pd, + desc = iser_create_fastreg_desc(device, device->pd, ib_conn->pi_support, size); if (IS_ERR(desc)) { ret = PTR_ERR(desc); -- cgit v0.10.2 From 153fefbf34e5079c2dd69490f5f23373758d2e9c Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 7 Mar 2016 18:51:45 +0200 Subject: net/mlx5_core: Create anchor of last flow table Create an empty flow table in the end of NIC rx namesapce. Adding this flow table simplify the implementation of "forward to next prio" rules. Signed-off-by: Maor Gottlieb Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 6f68dba..a2781ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -77,6 +77,9 @@ #define KERNEL_NUM_PRIOS 1 #define KENREL_MIN_LEVEL 2 +#define ANCHOR_MAX_FT 1 +#define ANCHOR_NUM_PRIOS 1 +#define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1) struct node_caps { size_t arr_sz; long *caps; @@ -92,7 +95,7 @@ static struct init_tree_node { int max_ft; } root_fs = { .type = FS_TYPE_NAMESPACE, - .ar_size = 3, + .ar_size = 4, .children = (struct init_tree_node[]) { ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, FS_REQUIRED_CAPS(FS_CAP(flow_table_properties_nic_receive.flow_modify_en), @@ -108,6 +111,8 @@ static struct init_tree_node { FS_CAP(flow_table_properties_nic_receive.identified_miss_table_mode), FS_CAP(flow_table_properties_nic_receive.flow_table_modify)), ADD_NS(ADD_MULTIPLE_PRIO(LEFTOVERS_NUM_PRIOS, LEFTOVERS_MAX_FT))), + ADD_PRIO(0, ANCHOR_MIN_LEVEL, 0, {}, + ADD_NS(ADD_MULTIPLE_PRIO(ANCHOR_NUM_PRIOS, ANCHOR_MAX_FT))), } }; @@ -1126,6 +1131,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, case MLX5_FLOW_NAMESPACE_BYPASS: case MLX5_FLOW_NAMESPACE_KERNEL: case MLX5_FLOW_NAMESPACE_LEFTOVERS: + case MLX5_FLOW_NAMESPACE_ANCHOR: prio = type; break; case MLX5_FLOW_NAMESPACE_FDB: @@ -1351,6 +1357,25 @@ static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns) } } +#define ANCHOR_PRIO 0 +#define ANCHOR_SIZE 1 +static int create_anchor_flow_table(struct mlx5_core_dev + *dev) +{ + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_flow_table *ft; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ANCHOR); + if (!ns) + return -EINVAL; + ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE); + if (IS_ERR(ft)) { + mlx5_core_err(dev, "Failed to create last anchor flow table"); + return PTR_ERR(ft); + } + return 0; +} + static int init_root_ns(struct mlx5_core_dev *dev) { @@ -1363,6 +1388,9 @@ static int init_root_ns(struct mlx5_core_dev *dev) set_prio_attrs(dev->priv.root_ns); + if (create_anchor_flow_table(dev)) + goto cleanup; + return 0; cleanup: @@ -1392,6 +1420,15 @@ static void cleanup_single_prio_root_ns(struct mlx5_core_dev *dev, root_ns = NULL; } +static void destroy_flow_tables(struct fs_prio *prio) +{ + struct mlx5_flow_table *iter; + struct mlx5_flow_table *tmp; + + fs_for_each_ft_safe(iter, tmp, prio) + mlx5_destroy_flow_table(iter); +} + static void cleanup_root_ns(struct mlx5_core_dev *dev) { struct mlx5_flow_root_namespace *root_ns = dev->priv.root_ns; @@ -1420,6 +1457,7 @@ static void cleanup_root_ns(struct mlx5_core_dev *dev) list); fs_get_obj(obj_iter_prio2, iter_prio2); + destroy_flow_tables(obj_iter_prio2); if (tree_remove_node(iter_prio2)) { mlx5_core_warn(dev, "Priority %d wasn't destroyed, refcount > 1\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 00245fd..574a903 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -142,6 +142,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev); #define fs_list_for_each_entry(pos, root) \ list_for_each_entry(pos, root, node.list) +#define fs_list_for_each_entry_safe(pos, tmp, root) \ + list_for_each_entry_safe(pos, tmp, root, node.list) + #define fs_for_each_ns_or_ft_reverse(pos, prio) \ list_for_each_entry_reverse(pos, &(prio)->node.children, list) @@ -157,6 +160,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev); #define fs_for_each_ft(pos, prio) \ fs_list_for_each_entry(pos, &(prio)->node.children) +#define fs_for_each_ft_safe(pos, tmp, prio) \ + fs_list_for_each_entry_safe(pos, tmp, &(prio)->node.children) + #define fs_for_each_fg(pos, ft) \ fs_list_for_each_entry(pos, &(ft)->node.children) diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8230caa..72adf53 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -52,6 +52,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, MLX5_FLOW_NAMESPACE_KERNEL, MLX5_FLOW_NAMESPACE_LEFTOVERS, + MLX5_FLOW_NAMESPACE_ANCHOR, MLX5_FLOW_NAMESPACE_FDB, }; -- cgit v0.10.2 From b3638e1a76648dbd482cc5a8f27eb6948cc3bc86 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 7 Mar 2016 18:51:46 +0200 Subject: net/mlx5_core: Introduce forward to next priority action Add support to create flow rule that forward packets to the first flow table in the next priority (next priority could be the first priority in the next namespace or the next priority in the same namespace). This feature could be used for DONT_TRAP rules or rules that only want to mark the packet with flow tag. In order to do it optimally, each flow table has list of all rules that point to this flow table, when a flow table is destroyed/created, we update the list head correspondingly. This kind of rule is created when destination is NULL and action is MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO. Signed-off-by: Maor Gottlieb Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index a2781ee..bf34467 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -201,8 +201,10 @@ static void tree_put_node(struct fs_node *node) static int tree_remove_node(struct fs_node *node) { - if (atomic_read(&node->refcount) > 1) - return -EPERM; + if (atomic_read(&node->refcount) > 1) { + atomic_dec(&node->refcount); + return -EEXIST; + } tree_put_node(node); return 0; } @@ -365,6 +367,11 @@ static void del_rule(struct fs_node *node) memcpy(match_value, fte->val, sizeof(fte->val)); fs_get_obj(ft, fg->node.parent); list_del(&rule->node.list); + if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + mutex_lock(&rule->dest_attr.ft->lock); + list_del(&rule->next_ft); + mutex_unlock(&rule->dest_attr.ft->lock); + } fte->dests_size--; if (fte->dests_size) { err = mlx5_cmd_update_fte(dev, ft, @@ -470,6 +477,8 @@ static struct mlx5_flow_table *alloc_flow_table(int level, int max_fte, ft->node.type = FS_TYPE_FLOW_TABLE; ft->type = table_type; ft->max_fte = max_fte; + INIT_LIST_HEAD(&ft->fwd_rules); + mutex_init(&ft->lock); return ft; } @@ -606,9 +615,63 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio return err; } +static int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct fs_fte *fte; + int err = 0; + + fs_get_obj(fte, rule->node.parent); + if (!(fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return -EINVAL; + lock_ref_node(&fte->node); + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + + memcpy(&rule->dest_attr, dest, sizeof(*dest)); + err = mlx5_cmd_update_fte(get_dev(&ft->node), + ft, fg->id, fte); + unlock_ref_node(&fte->node); + + return err; +} + +/* Modify/set FWD rules that point on old_next_ft to point on new_next_ft */ +static int connect_fwd_rules(struct mlx5_core_dev *dev, + struct mlx5_flow_table *new_next_ft, + struct mlx5_flow_table *old_next_ft) +{ + struct mlx5_flow_destination dest; + struct mlx5_flow_rule *iter; + int err = 0; + + /* new_next_ft and old_next_ft could be NULL only + * when we create/destroy the anchor flow table. + */ + if (!new_next_ft || !old_next_ft) + return 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = new_next_ft; + + mutex_lock(&old_next_ft->lock); + list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules); + mutex_unlock(&old_next_ft->lock); + list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) { + err = mlx5_modify_rule_destination(iter, &dest); + if (err) + pr_err("mlx5_core: failed to modify rule to point on flow table %d\n", + new_next_ft->id); + } + return 0; +} + static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, struct fs_prio *prio) { + struct mlx5_flow_table *next_ft; int err = 0; /* Connect_prev_fts and update_root_ft_create are mutually exclusive */ @@ -617,6 +680,11 @@ static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table err = connect_prev_fts(dev, ft, prio); if (err) return err; + + next_ft = find_next_chained_ft(prio); + err = connect_fwd_rules(dev, ft, next_ft); + if (err) + return err; } if (MLX5_CAP_FLOWTABLE(dev, @@ -767,6 +835,7 @@ static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest) if (!rule) return NULL; + INIT_LIST_HEAD(&rule->next_ft); rule->node.type = FS_TYPE_FLOW_DEST; memcpy(&rule->dest_attr, dest, sizeof(*dest)); @@ -787,9 +856,14 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte, return ERR_PTR(-ENOMEM); fs_get_obj(ft, fg->node.parent); - /* Add dest to dests list- added as first element after the head */ + /* Add dest to dests list- we need flow tables to be in the + * end of the list for forward to next prio rules. + */ tree_init_node(&rule->node, 1, del_rule); - list_add_tail(&rule->node.list, &fte->node.children); + if (dest && dest->type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + list_add(&rule->node.list, &fte->node.children); + else + list_add_tail(&rule->node.list, &fte->node.children); fte->dests_size++; if (fte->dests_size == 1) err = mlx5_cmd_create_fte(get_dev(&ft->node), @@ -908,6 +982,25 @@ out: return fg; } +static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_rule *rule; + + list_for_each_entry(rule, &fte->node.children, node.list) { + if (rule->dest_attr.type == dest->type) { + if ((dest->type == MLX5_FLOW_DESTINATION_TYPE_VPORT && + dest->vport_num == rule->dest_attr.vport_num) || + (dest->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + dest->ft == rule->dest_attr.ft) || + (dest->type == MLX5_FLOW_DESTINATION_TYPE_TIR && + dest->tir_num == rule->dest_attr.tir_num)) + return rule; + } + } + return NULL; +} + static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg, u32 *match_value, u8 action, @@ -924,6 +1017,13 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg, nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD); if (compare_match_value(&fg->mask, match_value, &fte->val) && action == fte->action && flow_tag == fte->flow_tag) { + rule = find_flow_rule(fte, dest); + if (rule) { + atomic_inc(&rule->node.refcount); + unlock_ref_node(&fte->node); + unlock_ref_node(&fg->node); + return rule; + } rule = add_rule_fte(fte, fg, dest); unlock_ref_node(&fte->node); if (IS_ERR(rule)) @@ -989,14 +1089,14 @@ static struct mlx5_flow_rule *add_rule_to_auto_fg(struct mlx5_flow_table *ft, return rule; } -struct mlx5_flow_rule * -mlx5_add_flow_rule(struct mlx5_flow_table *ft, - u8 match_criteria_enable, - u32 *match_criteria, - u32 *match_value, - u32 action, - u32 flow_tag, - struct mlx5_flow_destination *dest) +static struct mlx5_flow_rule * +_mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest) { struct mlx5_flow_group *g; struct mlx5_flow_rule *rule; @@ -1019,6 +1119,63 @@ unlock: unlock_ref_node(&ft->node); return rule; } + +static bool fwd_next_prio_supported(struct mlx5_flow_table *ft) +{ + return ((ft->type == FS_FT_NIC_RX) && + (MLX5_CAP_FLOWTABLE(get_dev(&ft->node), nic_rx_multi_path_tirs))); +} + +struct mlx5_flow_rule * +mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + struct mlx5_flow_destination gen_dest; + struct mlx5_flow_table *next_ft = NULL; + struct mlx5_flow_rule *rule = NULL; + u32 sw_action = action; + struct fs_prio *prio; + + fs_get_obj(prio, ft->node.parent); + if (action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (!fwd_next_prio_supported(ft)) + return ERR_PTR(-EOPNOTSUPP); + if (dest) + return ERR_PTR(-EINVAL); + mutex_lock(&root->chain_lock); + next_ft = find_next_chained_ft(prio); + if (next_ft) { + gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + gen_dest.ft = next_ft; + dest = &gen_dest; + action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } else { + mutex_unlock(&root->chain_lock); + return ERR_PTR(-EOPNOTSUPP); + } + } + + rule = _mlx5_add_flow_rule(ft, match_criteria_enable, match_criteria, + match_value, action, flow_tag, dest); + + if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (!IS_ERR_OR_NULL(rule) && + (list_empty(&rule->next_ft))) { + mutex_lock(&next_ft->lock); + list_add(&rule->next_ft, &next_ft->fwd_rules); + mutex_unlock(&next_ft->lock); + rule->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + } + mutex_unlock(&root->chain_lock); + } + return rule; +} EXPORT_SYMBOL(mlx5_add_flow_rule); void mlx5_del_flow_rule(struct mlx5_flow_rule *rule) @@ -1082,6 +1239,10 @@ static int disconnect_flow_table(struct mlx5_flow_table *ft) return 0; next_ft = find_next_chained_ft(prio); + err = connect_fwd_rules(dev, next_ft, ft); + if (err) + return err; + err = connect_prev_fts(dev, next_ft, prio); if (err) mlx5_core_warn(dev, "Failed to disconnect flow table %d\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 574a903..f37a624 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -68,6 +68,11 @@ struct fs_node { struct mlx5_flow_rule { struct fs_node node; struct mlx5_flow_destination dest_attr; + /* next_ft should be accessed under chain_lock and only of + * destination type is FWD_NEXT_fT. + */ + struct list_head next_ft; + u32 sw_action; }; /* Type of children is mlx5_flow_group */ @@ -82,6 +87,10 @@ struct mlx5_flow_table { unsigned int required_groups; unsigned int num_groups; } autogroup; + /* Protect fwd_rules */ + struct mutex lock; + /* FWD rules that point on this flow table */ + struct list_head fwd_rules; }; /* Type of children is mlx5_flow_rule */ diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 72adf53..8dec550 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -38,6 +38,10 @@ #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 +enum { + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, +}; + #define LEFTOVERS_RULE_NUM 2 static inline void build_leftovers_ft_param(int *priority, int *n_ent, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 51f1e54..5f70f36 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -458,7 +458,8 @@ struct mlx5_ifc_ads_bits { }; struct mlx5_ifc_flow_table_nic_cap_bits { - u8 reserved_at_0[0x200]; + u8 nic_rx_multi_path_tirs[0x1]; + u8 reserved_at_1[0x1ff]; struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; -- cgit v0.10.2 From 35d1901134e97cf95c0ab6ef70f5aead6cb34e9e Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 7 Mar 2016 18:51:47 +0200 Subject: IB/mlx5: Add support for don't trap rules Each bypass flow steering priority will be split into two priorities: 1. Priority for don't trap rules. 2. Priority for normal rules. When user creates a flow using IB_FLOW_ATTR_FLAGS_DONT_TRAP flag, the driver creates two flow rules, one used for receiving the traffic and the other one for forwarding the packet to continue matching in lower or equal priorities. Signed-off-by: Maor Gottlieb Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 03c418c..5863644 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1369,11 +1369,20 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) return 0; } +static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) +{ + priority *= 2; + if (!dont_trap) + priority++; + return priority; +} + #define MLX5_FS_MAX_TYPES 10 #define MLX5_FS_MAX_ENTRIES 32000UL static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, struct ib_flow_attr *flow_attr) { + bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_table *ft; @@ -1383,10 +1392,12 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, int err = 0; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - if (flow_is_multicast_only(flow_attr)) + if (flow_is_multicast_only(flow_attr) && + !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else - priority = flow_attr->priority; + priority = ib_prio_to_core_prio(flow_attr->priority, + dont_trap); ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); num_entries = MLX5_FS_MAX_ENTRIES; @@ -1434,6 +1445,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, unsigned int spec_index; u32 *match_c; u32 *match_v; + u32 action; int err = 0; if (!is_valid_attr(flow_attr)) @@ -1459,9 +1471,11 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, /* Outer header support only */ match_criteria_enable = (!outer_header_zero(match_c)) << 0; + action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; handler->rule = mlx5_add_flow_rule(ft, match_criteria_enable, match_c, match_v, - MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + action, MLX5_FS_DEFAULT_FLOW_TAG, dst); @@ -1481,6 +1495,29 @@ free: return err ? ERR_PTR(err) : handler; } +static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_dst = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); + if (!IS_ERR(handler)) { + handler_dst = create_flow_rule(dev, ft_prio, + flow_attr, dst); + if (IS_ERR(handler_dst)) { + mlx5_del_flow_rule(handler->rule); + kfree(handler); + handler = handler_dst; + } else { + list_add(&handler_dst->list, &handler->list); + } + } + + return handler; +} enum { LEFTOVERS_MC, LEFTOVERS_UC, @@ -1558,7 +1595,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, if (domain != IB_FLOW_DOMAIN_USER || flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || - flow_attr->flags) + (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) return ERR_PTR(-EINVAL); dst = kzalloc(sizeof(*dst), GFP_KERNEL); @@ -1577,8 +1614,13 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, dst->tir_num = to_mqp(qp)->raw_packet_qp.rq.tirn; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - handler = create_flow_rule(dev, ft_prio, flow_attr, - dst); + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { + handler = create_dont_trap_rule(dev, ft_prio, + flow_attr, dst); + } else { + handler = create_flow_rule(dev, ft_prio, flow_attr, + dst); + } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { handler = create_leftovers_rule(dev, ft_prio, flow_attr, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d2b9737..bd84b1f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -126,7 +126,7 @@ struct mlx5_ib_pd { }; #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) -#define MLX5_IB_FLOW_LAST_PRIO (MLX5_IB_FLOW_MCAST_PRIO - 1) +#define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) #if (MLX5_IB_FLOW_LAST_PRIO <= 0) #error "Invalid number of bypass priorities" #endif diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 987764a..2bf222e 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1294,6 +1294,11 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } -#define MLX5_BY_PASS_NUM_PRIOS 9 +#define MLX5_BY_PASS_NUM_REGULAR_PRIOS 8 +#define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 8 +#define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1 +#define MLX5_BY_PASS_NUM_PRIOS (MLX5_BY_PASS_NUM_REGULAR_PRIOS +\ + MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS +\ + MLX5_BY_PASS_NUM_MULTICAST_PRIOS) #endif /* MLX5_DEVICE_H */ -- cgit v0.10.2