diff options
Diffstat (limited to 'drivers/infiniband')
59 files changed, 5276 insertions, 547 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 66b36de..994decc 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -1,4 +1,5 @@ menu "InfiniBand support" + depends on HAS_IOMEM config INFINIBAND depends on PCI || BROKEN @@ -29,6 +30,11 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from <http://www.openib.org>. +config INFINIBAND_USER_MEM + bool + depends on INFINIBAND_USER_ACCESS != n + default y + config INFINIBAND_ADDR_TRANS bool depends on INFINIBAND && INET @@ -40,6 +46,8 @@ source "drivers/infiniband/hw/ehca/Kconfig" source "drivers/infiniband/hw/amso1100/Kconfig" source "drivers/infiniband/hw/cxgb3/Kconfig" +source "drivers/infiniband/hw/mlx4/Kconfig" + source "drivers/infiniband/ulp/ipoib/Kconfig" source "drivers/infiniband/ulp/srp/Kconfig" diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index da2066c..75f325e 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_IPATH) += hw/ipath/ obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/ obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/ obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/ +obj-$(CONFIG_MLX4_INFINIBAND) += hw/mlx4/ obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 189e5d4..cb1ab3e 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ device.o fmr_pool.o cache.o +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_mad-y := mad.o smi.o agent.o mad_rmpp.o @@ -28,5 +29,4 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o -ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_mem.o \ - uverbs_marshall.o +ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 558c9a0..e85f701 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -38,6 +38,7 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/workqueue.h> #include <rdma/ib_cache.h> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index eff591d..40c004a 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -306,7 +306,9 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) do { spin_lock_irqsave(&cm.lock, flags); ret = idr_get_new_above(&cm.local_id_table, cm_id_priv, - next_id++, &id); + next_id, &id); + if (!ret) + next_id = ((unsigned) id + 1) & MAX_ID_MASK; spin_unlock_irqrestore(&cm.lock, flags); } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) ); @@ -1295,26 +1297,29 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; - /* Check for duplicate REQ and stale connections. */ + /* Check for possible duplicate REQ. */ spin_lock_irqsave(&cm.lock, flags); timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info); - if (!timewait_info) - timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); - if (timewait_info) { cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, timewait_info->work.remote_id); - cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irqrestore(&cm.lock, flags); if (cur_cm_id_priv) { cm_dup_req_handler(work, cur_cm_id_priv); cm_deref_id(cur_cm_id_priv); - } else - cm_issue_rej(work->port, work->mad_recv_wc, - IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, - NULL, 0); - listen_cm_id_priv = NULL; - goto out; + } + return NULL; + } + + /* Check for stale connections. */ + timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); + if (timewait_info) { + cm_cleanup_timewait(cm_id_priv->timewait_info); + spin_unlock_irqrestore(&cm.lock, flags); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, + NULL, 0); + return NULL; } /* Find matching listen request. */ diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index fde92ce..32a0e66 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -346,12 +346,33 @@ static void cma_deref_id(struct rdma_id_private *id_priv) complete(&id_priv->comp); } -static void cma_release_remove(struct rdma_id_private *id_priv) +static int cma_disable_remove(struct rdma_id_private *id_priv, + enum cma_state state) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == state) { + atomic_inc(&id_priv->dev_remove); + ret = 0; + } else + ret = -EINVAL; + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static void cma_enable_remove(struct rdma_id_private *id_priv) { if (atomic_dec_and_test(&id_priv->dev_remove)) wake_up(&id_priv->wait_remove); } +static int cma_has_cm_dev(struct rdma_id_private *id_priv) +{ + return (id_priv->id.device && id_priv->cm_id.ib); +} + struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps) { @@ -884,9 +905,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) struct rdma_cm_event event; int ret = 0; - atomic_inc(&id_priv->dev_remove); - if (!cma_comp(id_priv, CMA_CONNECT)) - goto out; + if (cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; memset(&event, 0, sizeof event); switch (ib_event->event) { @@ -942,12 +962,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); rdma_destroy_id(&id_priv->id); return ret; } out: - cma_release_remove(id_priv); + cma_enable_remove(id_priv); return ret; } @@ -1057,11 +1077,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) int offset, ret; listen_id = cm_id->context; - atomic_inc(&listen_id->dev_remove); - if (!cma_comp(listen_id, CMA_LISTEN)) { - ret = -ECONNABORTED; - goto out; - } + if (cma_disable_remove(listen_id, CMA_LISTEN)) + return -ECONNABORTED; memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); @@ -1101,11 +1118,11 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) release_conn_id: cma_exch(conn_id, CMA_DESTROYING); - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(&conn_id->id); out: - cma_release_remove(listen_id); + cma_enable_remove(listen_id); return ret; } @@ -1171,9 +1188,10 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr_in *sin; int ret = 0; - memset(&event, 0, sizeof event); - atomic_inc(&id_priv->dev_remove); + if (cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; + memset(&event, 0, sizeof event); switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; @@ -1214,12 +1232,12 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); rdma_destroy_id(&id_priv->id); return ret; } - cma_release_remove(id_priv); + cma_enable_remove(id_priv); return ret; } @@ -1234,11 +1252,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, int ret; listen_id = cm_id->context; - atomic_inc(&listen_id->dev_remove); - if (!cma_comp(listen_id, CMA_LISTEN)) { - ret = -ECONNABORTED; - goto out; - } + if (cma_disable_remove(listen_id, CMA_LISTEN)) + return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, @@ -1255,13 +1270,13 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, dev = ip_dev_find(iw_event->local_addr.sin_addr.s_addr); if (!dev) { ret = -EADDRNOTAVAIL; - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(new_cm_id); goto out; } ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); if (ret) { - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(new_cm_id); goto out; } @@ -1270,7 +1285,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, ret = cma_acquire_dev(conn_id); mutex_unlock(&lock); if (ret) { - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(new_cm_id); goto out; } @@ -1293,14 +1308,14 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; cma_exch(conn_id, CMA_DESTROYING); - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(&conn_id->id); } out: if (dev) dev_put(dev); - cma_release_remove(listen_id); + cma_enable_remove(listen_id); return ret; } @@ -1519,7 +1534,7 @@ static void cma_work_handler(struct work_struct *_work) destroy = 1; } out: - cma_release_remove(id_priv); + cma_enable_remove(id_priv); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); @@ -1711,13 +1726,13 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: - cma_release_remove(id_priv); + cma_enable_remove(id_priv); cma_deref_id(id_priv); } @@ -2042,11 +2057,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; - memset(&event, 0, sizeof event); - atomic_inc(&id_priv->dev_remove); - if (!cma_comp(id_priv, CMA_CONNECT)) - goto out; + if (cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; + memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; @@ -2084,12 +2098,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); rdma_destroy_id(&id_priv->id); return ret; } out: - cma_release_remove(id_priv); + cma_enable_remove(id_priv); return ret; } @@ -2413,7 +2427,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, CMA_CONNECT)) + if (!cma_has_cm_dev(id_priv)) return -EINVAL; switch (id->device->node_type) { @@ -2435,7 +2449,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, CMA_CONNECT)) + if (!cma_has_cm_dev(id_priv)) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { @@ -2466,8 +2480,7 @@ int rdma_disconnect(struct rdma_cm_id *id) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, CMA_CONNECT) && - !cma_comp(id_priv, CMA_DISCONNECT)) + if (!cma_has_cm_dev(id_priv)) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { @@ -2499,10 +2512,9 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) int ret; id_priv = mc->id_priv; - atomic_inc(&id_priv->dev_remove); - if (!cma_comp(id_priv, CMA_ADDR_BOUND) && - !cma_comp(id_priv, CMA_ADDR_RESOLVED)) - goto out; + if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) && + cma_disable_remove(id_priv, CMA_ADDR_RESOLVED)) + return 0; if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, @@ -2524,12 +2536,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); rdma_destroy_id(&id_priv->id); return 0; } -out: - cma_release_remove(id_priv); + + cma_enable_remove(id_priv); return 0; } @@ -2761,8 +2773,8 @@ static int cma_init(void) int ret; get_random_bytes(&next_port, sizeof next_port); - next_port = (next_port % (sysctl_local_port_range[1] - - sysctl_local_port_range[0])) + + next_port = ((unsigned int) next_port % + (sysctl_local_port_range[1] - sysctl_local_port_range[0])) + sysctl_local_port_range[0]; cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7fabb42..3ada17c 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -40,6 +40,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/mutex.h> +#include <linux/workqueue.h> #include "core_priv.h" @@ -149,6 +150,18 @@ static int alloc_name(char *name) return 0; } +static int start_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; +} + + +static int end_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? + 0 : device->phys_port_cnt; +} + /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate @@ -208,6 +221,45 @@ static int add_client_context(struct ib_device *device, struct ib_client *client return 0; } +static int read_port_table_lengths(struct ib_device *device) +{ + struct ib_port_attr *tprops = NULL; + int num_ports, ret = -ENOMEM; + u8 port_index; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + goto out; + + num_ports = end_port(device) - start_port(device) + 1; + + device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, + GFP_KERNEL); + device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, + GFP_KERNEL); + if (!device->pkey_tbl_len || !device->gid_tbl_len) + goto err; + + for (port_index = 0; port_index < num_ports; ++port_index) { + ret = ib_query_port(device, port_index + start_port(device), + tprops); + if (ret) + goto err; + device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; + device->gid_tbl_len[port_index] = tprops->gid_tbl_len; + } + + ret = 0; + goto out; + +err: + kfree(device->gid_tbl_len); + kfree(device->pkey_tbl_len); +out: + kfree(tprops); + return ret; +} + /** * ib_register_device - Register an IB device with IB core * @device:Device to register @@ -239,10 +291,19 @@ int ib_register_device(struct ib_device *device) spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); + ret = read_port_table_lengths(device); + if (ret) { + printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", + device->name); + goto out; + } + ret = ib_device_register_sysfs(device); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", device->name); + kfree(device->gid_tbl_len); + kfree(device->pkey_tbl_len); goto out; } @@ -284,6 +345,9 @@ void ib_unregister_device(struct ib_device *device) list_del(&device->core_list); + kfree(device->gid_tbl_len); + kfree(device->pkey_tbl_len); + mutex_unlock(&device_mutex); spin_lock_irqsave(&device->client_data_lock, flags); @@ -506,10 +570,7 @@ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { - if (device->node_type == RDMA_NODE_IB_SWITCH) { - if (port_num) - return -EINVAL; - } else if (port_num < 1 || port_num > device->phys_port_cnt) + if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; return device->query_port(device, port_num, port_attr); @@ -581,10 +642,7 @@ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { - if (device->node_type == RDMA_NODE_IB_SWITCH) { - if (port_num) - return -EINVAL; - } else if (port_num < 1 || port_num > device->phys_port_cnt) + if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, @@ -592,6 +650,68 @@ int ib_modify_port(struct ib_device *device, } EXPORT_SYMBOL(ib_modify_port); +/** + * ib_find_gid - Returns the port number and GID table index where + * a specified GID value occurs. + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value was found. + * @index: The index into the GID table where the GID was found. This + * parameter may be NULL. + */ +int ib_find_gid(struct ib_device *device, union ib_gid *gid, + u8 *port_num, u16 *index) +{ + union ib_gid tmp_gid; + int ret, port, i; + + for (port = start_port(device); port <= end_port(device); ++port) { + for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { + ret = ib_query_gid(device, port, i, &tmp_gid); + if (ret) + return ret; + if (!memcmp(&tmp_gid, gid, sizeof *gid)) { + *port_num = port; + if (index) + *index = i; + return 0; + } + } + } + + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_gid); + +/** + * ib_find_pkey - Returns the PKey table index where a specified + * PKey value occurs. + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the PKey table where the PKey was found. + */ +int ib_find_pkey(struct ib_device *device, + u8 port_num, u16 pkey, u16 *index) +{ + int ret, i; + u16 tmp_pkey; + + for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { + ret = ib_query_pkey(device, port_num, i, &tmp_pkey); + if (ret) + return ret; + + if (pkey == tmp_pkey) { + *index = i; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_pkey); + static int __init ib_core_init(void) { int ret; @@ -613,6 +733,8 @@ static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); ib_sysfs_cleanup(); + /* Make sure that any pending umem accounting work is done. */ + flush_scheduled_work(); } module_init(ib_core_init); diff --git a/drivers/infiniband/core/uverbs_mem.c b/drivers/infiniband/core/umem.c index c95fe95..d40652a 100644 --- a/drivers/infiniband/core/uverbs_mem.c +++ b/drivers/infiniband/core/umem.c @@ -36,16 +36,10 @@ #include <linux/mm.h> #include <linux/dma-mapping.h> +#include <linux/sched.h> #include "uverbs.h" -struct ib_umem_account_work { - struct work_struct work; - struct mm_struct *mm; - unsigned long diff; -}; - - static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { struct ib_umem_chunk *chunk, *tmp; @@ -64,35 +58,56 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d } } -int ib_umem_get(struct ib_device *dev, struct ib_umem *mem, - void *addr, size_t size, int write) +/** + * ib_umem_get - Pin and DMA map userspace memory. + * @context: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + */ +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access) { + struct ib_umem *umem; struct page **page_list; struct ib_umem_chunk *chunk; unsigned long locked; unsigned long lock_limit; unsigned long cur_base; unsigned long npages; - int ret = 0; + int ret; int off; int i; if (!can_do_mlock()) - return -EPERM; + return ERR_PTR(-EPERM); - page_list = (struct page **) __get_free_page(GFP_KERNEL); - if (!page_list) - return -ENOMEM; + umem = kmalloc(sizeof *umem, GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); - mem->user_base = (unsigned long) addr; - mem->length = size; - mem->offset = (unsigned long) addr & ~PAGE_MASK; - mem->page_size = PAGE_SIZE; - mem->writable = write; + umem->context = context; + umem->length = size; + umem->offset = addr & ~PAGE_MASK; + umem->page_size = PAGE_SIZE; + /* + * We ask for writable memory if any access flags other than + * "remote read" are set. "Local write" and "remote write" + * obviously require write access. "Remote atomic" can do + * things like fetch and add, which will modify memory, and + * "MW bind" can change permissions by binding a window. + */ + umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); - INIT_LIST_HEAD(&mem->chunk_list); + INIT_LIST_HEAD(&umem->chunk_list); - npages = PAGE_ALIGN(size + mem->offset) >> PAGE_SHIFT; + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) { + kfree(umem); + return ERR_PTR(-ENOMEM); + } + + npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; down_write(¤t->mm->mmap_sem); @@ -104,13 +119,13 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem, goto out; } - cur_base = (unsigned long) addr & PAGE_MASK; + cur_base = addr & PAGE_MASK; while (npages) { ret = get_user_pages(current, current->mm, cur_base, min_t(int, npages, PAGE_SIZE / sizeof (struct page *)), - 1, !write, page_list, NULL); + 1, !umem->writable, page_list, NULL); if (ret < 0) goto out; @@ -136,7 +151,7 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem, chunk->page_list[i].length = PAGE_SIZE; } - chunk->nmap = ib_dma_map_sg(dev, + chunk->nmap = ib_dma_map_sg(context->device, &chunk->page_list[0], chunk->nents, DMA_BIDIRECTIONAL); @@ -151,75 +166,98 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem, ret -= chunk->nents; off += chunk->nents; - list_add_tail(&chunk->list, &mem->chunk_list); + list_add_tail(&chunk->list, &umem->chunk_list); } ret = 0; } out: - if (ret < 0) - __ib_umem_release(dev, mem, 0); - else + if (ret < 0) { + __ib_umem_release(context->device, umem, 0); + kfree(umem); + } else current->mm->locked_vm = locked; up_write(¤t->mm->mmap_sem); free_page((unsigned long) page_list); - return ret; + return ret < 0 ? ERR_PTR(ret) : umem; } +EXPORT_SYMBOL(ib_umem_get); -void ib_umem_release(struct ib_device *dev, struct ib_umem *umem) +static void ib_umem_account(struct work_struct *work) { - __ib_umem_release(dev, umem, 1); - - down_write(¤t->mm->mmap_sem); - current->mm->locked_vm -= - PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; - up_write(¤t->mm->mmap_sem); -} + struct ib_umem *umem = container_of(work, struct ib_umem, work); -static void ib_umem_account(struct work_struct *_work) -{ - struct ib_umem_account_work *work = - container_of(_work, struct ib_umem_account_work, work); - - down_write(&work->mm->mmap_sem); - work->mm->locked_vm -= work->diff; - up_write(&work->mm->mmap_sem); - mmput(work->mm); - kfree(work); + down_write(&umem->mm->mmap_sem); + umem->mm->locked_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); } -void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem) +/** + * ib_umem_release - release memory pinned with ib_umem_get + * @umem: umem struct to release + */ +void ib_umem_release(struct ib_umem *umem) { - struct ib_umem_account_work *work; + struct ib_ucontext *context = umem->context; struct mm_struct *mm; + unsigned long diff; - __ib_umem_release(dev, umem, 1); + __ib_umem_release(umem->context->device, umem, 1); mm = get_task_mm(current); - if (!mm) + if (!mm) { + kfree(umem); return; + } + + diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; /* * We may be called with the mm's mmap_sem already held. This * can happen when a userspace munmap() is the call that drops * the last reference to our file and calls our release * method. If there are memory regions to destroy, we'll end - * up here and not be able to take the mmap_sem. Therefore we - * defer the vm_locked accounting to the system workqueue. + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. */ + if (context->closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_account); + umem->mm = mm; + umem->diff = diff; + + schedule_work(&umem->work); + return; + } + } else + down_write(&mm->mmap_sem); - work = kmalloc(sizeof *work, GFP_KERNEL); - if (!work) { - mmput(mm); - return; - } + current->mm->locked_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); + kfree(umem); +} +EXPORT_SYMBOL(ib_umem_release); + +int ib_umem_page_count(struct ib_umem *umem) +{ + struct ib_umem_chunk *chunk; + int shift; + int i; + int n; + + shift = ilog2(umem->page_size); - INIT_WORK(&work->work, ib_umem_account); - work->mm = mm; - work->diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; + n = 0; + list_for_each_entry(chunk, &umem->chunk_list, list) + for (i = 0; i < chunk->nmap; ++i) + n += sg_dma_len(&chunk->page_list[i]) >> shift; - schedule_work(&work->work); + return n; } +EXPORT_SYMBOL(ib_umem_page_count); diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 102a59c..c33546f 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -45,6 +45,7 @@ #include <linux/completion.h> #include <rdma/ib_verbs.h> +#include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> /* @@ -163,11 +164,6 @@ void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); -int ib_umem_get(struct ib_device *dev, struct ib_umem *mem, - void *addr, size_t size, int write); -void ib_umem_release(struct ib_device *dev, struct ib_umem *umem); -void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem); - #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ const char __user *buf, int in_len, \ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index bab6676..01d7008 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * Copyright (c) 2006 Mellanox Technologies. All rights reserved. * @@ -295,6 +295,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->qp_list); INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); + ucontext->closing = 0; resp.num_comp_vectors = file->device->num_comp_vectors; @@ -573,7 +574,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, struct ib_uverbs_reg_mr cmd; struct ib_uverbs_reg_mr_resp resp; struct ib_udata udata; - struct ib_umem_object *obj; + struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mr *mr; int ret; @@ -599,35 +600,21 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE)) return -EINVAL; - obj = kmalloc(sizeof *obj, GFP_KERNEL); - if (!obj) + uobj = kmalloc(sizeof *uobj, GFP_KERNEL); + if (!uobj) return -ENOMEM; - init_uobj(&obj->uobject, 0, file->ucontext, &mr_lock_key); - down_write(&obj->uobject.mutex); - - /* - * We ask for writable memory if any access flags other than - * "remote read" are set. "Local write" and "remote write" - * obviously require write access. "Remote atomic" can do - * things like fetch and add, which will modify memory, and - * "MW bind" can change permissions by binding a window. - */ - ret = ib_umem_get(file->device->ib_dev, &obj->umem, - (void *) (unsigned long) cmd.start, cmd.length, - !!(cmd.access_flags & ~IB_ACCESS_REMOTE_READ)); - if (ret) - goto err_free; - - obj->umem.virt_base = cmd.hca_va; + init_uobj(uobj, 0, file->ucontext, &mr_lock_key); + down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; - goto err_release; + goto err_free; } - mr = pd->device->reg_user_mr(pd, &obj->umem, cmd.access_flags, &udata); + mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, + cmd.access_flags, &udata); if (IS_ERR(mr)) { ret = PTR_ERR(mr); goto err_put; @@ -635,19 +622,19 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, mr->device = pd->device; mr->pd = pd; - mr->uobject = &obj->uobject; + mr->uobject = uobj; atomic_inc(&pd->usecnt); atomic_set(&mr->usecnt, 0); - obj->uobject.object = mr; - ret = idr_add_uobj(&ib_uverbs_mr_idr, &obj->uobject); + uobj->object = mr; + ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); if (ret) goto err_unreg; memset(&resp, 0, sizeof resp); resp.lkey = mr->lkey; resp.rkey = mr->rkey; - resp.mr_handle = obj->uobject.id; + resp.mr_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { @@ -658,17 +645,17 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, put_pd_read(pd); mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->mr_list); + list_add_tail(&uobj->list, &file->ucontext->mr_list); mutex_unlock(&file->mutex); - obj->uobject.live = 1; + uobj->live = 1; - up_write(&obj->uobject.mutex); + up_write(&uobj->mutex); return in_len; err_copy: - idr_remove_uobj(&ib_uverbs_mr_idr, &obj->uobject); + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); err_unreg: ib_dereg_mr(mr); @@ -676,11 +663,8 @@ err_unreg: err_put: put_pd_read(pd); -err_release: - ib_umem_release(file->device->ib_dev, &obj->umem); - err_free: - put_uobj_write(&obj->uobject); + put_uobj_write(uobj); return ret; } @@ -691,7 +675,6 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, struct ib_uverbs_dereg_mr cmd; struct ib_mr *mr; struct ib_uobject *uobj; - struct ib_umem_object *memobj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -701,8 +684,7 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, if (!uobj) return -EINVAL; - memobj = container_of(uobj, struct ib_umem_object, uobject); - mr = uobj->object; + mr = uobj->object; ret = ib_dereg_mr(mr); if (!ret) @@ -719,8 +701,6 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, list_del(&uobj->list); mutex_unlock(&file->mutex); - ib_umem_release(file->device->ib_dev, &memobj->umem); - put_uobj(uobj); return in_len; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index d44e547..14d7ccd 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -183,6 +183,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, if (!context) return 0; + context->closing = 1; + list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) { struct ib_ah *ah = uobj->object; @@ -230,16 +232,10 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { struct ib_mr *mr = uobj->object; - struct ib_device *mrdev = mr->device; - struct ib_umem_object *memobj; idr_remove_uobj(&ib_uverbs_mr_idr, uobj); ib_dereg_mr(mr); - - memobj = container_of(uobj, struct ib_umem_object, uobject); - ib_umem_release_on_close(mrdev, &memobj->umem); - - kfree(memobj); + kfree(uobj); } list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) { @@ -906,7 +902,6 @@ static void __exit ib_uverbs_cleanup(void) unregister_filesystem(&uverbs_event_fs); class_destroy(uverbs_class); unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); - flush_scheduled_work(); idr_destroy(&ib_uverbs_pd_idr); idr_destroy(&ib_uverbs_mr_idr); idr_destroy(&ib_uverbs_mw_idr); diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c index 58bc272..0aecea6 100644 --- a/drivers/infiniband/hw/amso1100/c2.c +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -672,7 +672,7 @@ static int c2_up(struct net_device *netdev) * rdma interface. */ in_dev = in_dev_get(netdev); - in_dev->cnf.arp_ignore = 1; + IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1); in_dev_put(in_dev); return 0; diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index 1091662..997cf15 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -56,6 +56,7 @@ #include <asm/byteorder.h> #include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> #include "c2.h" #include "c2_provider.h" @@ -396,6 +397,7 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, } mr->pd = to_c2pd(ib_pd); + mr->umem = NULL; pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, " "*iova_start %llx, first pa %llx, last pa %llx\n", __FUNCTION__, page_shift, pbl_depth, total_len, @@ -428,8 +430,8 @@ static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc) return c2_reg_phys_mr(pd, &bl, 1, acc, &kva); } -static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, - int acc, struct ib_udata *udata) +static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) { u64 *pages; u64 kva = 0; @@ -441,15 +443,23 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, struct c2_mr *c2mr; pr_debug("%s:%u\n", __FUNCTION__, __LINE__); - shift = ffs(region->page_size) - 1; c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL); if (!c2mr) return ERR_PTR(-ENOMEM); c2mr->pd = c2pd; + c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc); + if (IS_ERR(c2mr->umem)) { + err = PTR_ERR(c2mr->umem); + kfree(c2mr); + return ERR_PTR(err); + } + + shift = ffs(c2mr->umem->page_size) - 1; + n = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) + list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) n += chunk->nents; pages = kmalloc(n * sizeof(u64), GFP_KERNEL); @@ -459,35 +469,34 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, } i = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) { + list_for_each_entry(chunk, &c2mr->umem->chunk_list, list) { for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; for (k = 0; k < len; ++k) { pages[i++] = sg_dma_address(&chunk->page_list[j]) + - (region->page_size * k); + (c2mr->umem->page_size * k); } } } - kva = (u64)region->virt_base; + kva = virt; err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), pages, - region->page_size, + c2mr->umem->page_size, i, - region->length, - region->offset, + length, + c2mr->umem->offset, &kva, c2_convert_access(acc), c2mr); kfree(pages); - if (err) { - kfree(c2mr); - return ERR_PTR(err); - } + if (err) + goto err; return &c2mr->ibmr; err: + ib_umem_release(c2mr->umem); kfree(c2mr); return ERR_PTR(err); } @@ -502,8 +511,11 @@ static int c2_dereg_mr(struct ib_mr *ib_mr) err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey); if (err) pr_debug("c2_stag_dealloc failed: %d\n", err); - else + else { + if (mr->umem) + ib_umem_release(mr->umem); kfree(mr); + } return err; } diff --git a/drivers/infiniband/hw/amso1100/c2_provider.h b/drivers/infiniband/hw/amso1100/c2_provider.h index fc90622..1076df2 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.h +++ b/drivers/infiniband/hw/amso1100/c2_provider.h @@ -73,6 +73,7 @@ struct c2_pd { struct c2_mr { struct ib_mr ibmr; struct c2_pd *pd; + struct ib_umem *umem; }; struct c2_av; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index a891493..e7c2c39 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -47,6 +47,7 @@ #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> #include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> #include "cxio_hal.h" @@ -443,6 +444,8 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) remove_handle(rhp, &rhp->mmidr, mmid); if (mhp->kva) kfree((void *) (unsigned long) mhp->kva); + if (mhp->umem) + ib_umem_release(mhp->umem); PDBG("%s mmid 0x%x ptr %p\n", __FUNCTION__, mmid, mhp); kfree(mhp); return 0; @@ -577,8 +580,8 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr, } -static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, - int acc, struct ib_udata *udata) +static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) { __be64 *pages; int shift, n, len; @@ -591,7 +594,6 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, struct iwch_reg_user_mr_resp uresp; PDBG("%s ib_pd %p\n", __FUNCTION__, pd); - shift = ffs(region->page_size) - 1; php = to_iwch_pd(pd); rhp = php->rhp; @@ -599,8 +601,17 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, if (!mhp) return ERR_PTR(-ENOMEM); + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc); + if (IS_ERR(mhp->umem)) { + err = PTR_ERR(mhp->umem); + kfree(mhp); + return ERR_PTR(err); + } + + shift = ffs(mhp->umem->page_size) - 1; + n = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) + list_for_each_entry(chunk, &mhp->umem->chunk_list, list) n += chunk->nents; pages = kmalloc(n * sizeof(u64), GFP_KERNEL); @@ -611,13 +622,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, i = n = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) + list_for_each_entry(chunk, &mhp->umem->chunk_list, list) for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; for (k = 0; k < len; ++k) { pages[i++] = cpu_to_be64(sg_dma_address( &chunk->page_list[j]) + - region->page_size * k); + mhp->umem->page_size * k); } } @@ -625,9 +636,9 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); - mhp->attr.va_fbo = region->virt_base; + mhp->attr.va_fbo = virt; mhp->attr.page_size = shift - 12; - mhp->attr.len = (u32) region->length; + mhp->attr.len = (u32) length; mhp->attr.pbl_size = i; err = iwch_register_mem(rhp, php, mhp, shift, pages); kfree(pages); @@ -650,6 +661,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, return &mhp->ibmr; err: + ib_umem_release(mhp->umem); kfree(mhp); return ERR_PTR(err); } diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index 93bcc56..48833f3 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -73,6 +73,7 @@ struct tpt_attributes { struct iwch_mr { struct ib_mr ibmr; + struct ib_umem *umem; struct iwch_dev *rhp; u64 kva; struct tpt_attributes attr; diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 10fb8fb..1d286d3 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -176,6 +176,7 @@ struct ehca_mr { struct ib_mr ib_mr; /* must always be first in ehca_mr */ struct ib_fmr ib_fmr; /* must always be first in ehca_mr */ } ib; + struct ib_umem *umem; spinlock_t mrlock; enum ehca_mr_flag flags; @@ -276,6 +277,7 @@ void ehca_cleanup_mrmw_cache(void); extern spinlock_t ehca_qp_idr_lock; extern spinlock_t ehca_cq_idr_lock; +extern spinlock_t hcall_lock; extern struct idr ehca_qp_idr; extern struct idr ehca_cq_idr; diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index f284be1..100329b 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -517,12 +517,11 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq) else { struct ehca_cq *cq = eq->eqe_cache[i].cq; comp_event_callback(cq); - spin_lock_irqsave(&ehca_cq_idr_lock, flags); + spin_lock(&ehca_cq_idr_lock); cq->nr_events--; if (!cq->nr_events) wake_up(&cq->wait_completion); - spin_unlock_irqrestore(&ehca_cq_idr_lock, - flags); + spin_unlock(&ehca_cq_idr_lock); } } else { ehca_dbg(&shca->ib_device, "Got non completion event"); @@ -711,6 +710,7 @@ static void destroy_comp_task(struct ehca_comp_pool *pool, kthread_stop(task); } +#ifdef CONFIG_HOTPLUG_CPU static void take_over_work(struct ehca_comp_pool *pool, int cpu) { @@ -735,7 +735,6 @@ static void take_over_work(struct ehca_comp_pool *pool, } -#ifdef CONFIG_HOTPLUG_CPU static int comp_pool_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -745,6 +744,7 @@ static int comp_pool_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu); if(!create_comp_task(pool, cpu)) { ehca_gen_err("Can't create comp_task for cpu: %x", cpu); @@ -752,24 +752,29 @@ static int comp_pool_callback(struct notifier_block *nfb, } break; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu); cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); kthread_bind(cct->task, any_online_cpu(cpu_online_map)); destroy_comp_task(pool, cpu); break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu); cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); kthread_bind(cct->task, cpu); wake_up_process(cct->task); break; case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu); break; case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu); destroy_comp_task(pool, cpu); take_over_work(pool, cpu); diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h index e14b029..37e7fe0 100644 --- a/drivers/infiniband/hw/ehca/ehca_iverbs.h +++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h @@ -78,8 +78,7 @@ struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, int num_phys_buf, int mr_access_flags, u64 *iova_start); -struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, - struct ib_umem *region, +struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int mr_access_flags, struct ib_udata *udata); int ehca_rereg_phys_mr(struct ib_mr *mr, diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 2d37054..c3f99f3 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -52,7 +52,7 @@ MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>"); MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver"); -MODULE_VERSION("SVNEHCA_0022"); +MODULE_VERSION("SVNEHCA_0023"); int ehca_open_aqp1 = 0; int ehca_debug_level = 0; @@ -62,7 +62,7 @@ int ehca_use_hp_mr = 0; int ehca_port_act_time = 30; int ehca_poll_all_eqs = 1; int ehca_static_rate = -1; -int ehca_scaling_code = 1; +int ehca_scaling_code = 0; module_param_named(open_aqp1, ehca_open_aqp1, int, 0); module_param_named(debug_level, ehca_debug_level, int, 0); @@ -98,6 +98,7 @@ MODULE_PARM_DESC(scaling_code, spinlock_t ehca_qp_idr_lock; spinlock_t ehca_cq_idr_lock; +spinlock_t hcall_lock; DEFINE_IDR(ehca_qp_idr); DEFINE_IDR(ehca_cq_idr); @@ -453,15 +454,14 @@ static ssize_t ehca_store_debug_level(struct device_driver *ddp, DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR, ehca_show_debug_level, ehca_store_debug_level); -void ehca_create_driver_sysfs(struct ibmebus_driver *drv) -{ - driver_create_file(&drv->driver, &driver_attr_debug_level); -} +static struct attribute *ehca_drv_attrs[] = { + &driver_attr_debug_level.attr, + NULL +}; -void ehca_remove_driver_sysfs(struct ibmebus_driver *drv) -{ - driver_remove_file(&drv->driver, &driver_attr_debug_level); -} +static struct attribute_group ehca_drv_attr_grp = { + .attrs = ehca_drv_attrs +}; #define EHCA_RESOURCE_ATTR(name) \ static ssize_t ehca_show_##name(struct device *dev, \ @@ -523,44 +523,28 @@ static ssize_t ehca_show_adapter_handle(struct device *dev, } static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL); +static struct attribute *ehca_dev_attrs[] = { + &dev_attr_adapter_handle.attr, + &dev_attr_num_ports.attr, + &dev_attr_hw_ver.attr, + &dev_attr_max_eq.attr, + &dev_attr_cur_eq.attr, + &dev_attr_max_cq.attr, + &dev_attr_cur_cq.attr, + &dev_attr_max_qp.attr, + &dev_attr_cur_qp.attr, + &dev_attr_max_mr.attr, + &dev_attr_cur_mr.attr, + &dev_attr_max_mw.attr, + &dev_attr_cur_mw.attr, + &dev_attr_max_pd.attr, + &dev_attr_max_ah.attr, + NULL +}; -void ehca_create_device_sysfs(struct ibmebus_dev *dev) -{ - device_create_file(&dev->ofdev.dev, &dev_attr_adapter_handle); - device_create_file(&dev->ofdev.dev, &dev_attr_num_ports); - device_create_file(&dev->ofdev.dev, &dev_attr_hw_ver); - device_create_file(&dev->ofdev.dev, &dev_attr_max_eq); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_eq); - device_create_file(&dev->ofdev.dev, &dev_attr_max_cq); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_cq); - device_create_file(&dev->ofdev.dev, &dev_attr_max_qp); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_qp); - device_create_file(&dev->ofdev.dev, &dev_attr_max_mr); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_mr); - device_create_file(&dev->ofdev.dev, &dev_attr_max_mw); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_mw); - device_create_file(&dev->ofdev.dev, &dev_attr_max_pd); - device_create_file(&dev->ofdev.dev, &dev_attr_max_ah); -} - -void ehca_remove_device_sysfs(struct ibmebus_dev *dev) -{ - device_remove_file(&dev->ofdev.dev, &dev_attr_adapter_handle); - device_remove_file(&dev->ofdev.dev, &dev_attr_num_ports); - device_remove_file(&dev->ofdev.dev, &dev_attr_hw_ver); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_eq); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_eq); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_cq); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_cq); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_qp); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_qp); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_mr); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_mr); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_mw); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_mw); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_pd); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_ah); -} +static struct attribute_group ehca_dev_attr_grp = { + .attrs = ehca_dev_attrs +}; static int __devinit ehca_probe(struct ibmebus_dev *dev, const struct of_device_id *id) @@ -570,7 +554,7 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev, struct ib_pd *ibpd; int ret; - handle = get_property(dev->ofdev.node, "ibm,hca-handle", NULL); + handle = of_get_property(dev->ofdev.node, "ibm,hca-handle", NULL); if (!handle) { ehca_gen_err("Cannot get eHCA handle for adapter: %s.", dev->ofdev.node->full_name); @@ -668,7 +652,10 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev, } } - ehca_create_device_sysfs(dev); + ret = sysfs_create_group(&dev->ofdev.dev.kobj, &ehca_dev_attr_grp); + if (ret) /* only complain; we can live without attributes */ + ehca_err(&shca->ib_device, + "Cannot create device attributes ret=%d", ret); spin_lock(&shca_list_lock); list_add(&shca->shca_list, &shca_list); @@ -720,7 +707,7 @@ static int __devexit ehca_remove(struct ibmebus_dev *dev) struct ehca_shca *shca = dev->ofdev.dev.driver_data; int ret; - ehca_remove_device_sysfs(dev); + sysfs_remove_group(&dev->ofdev.dev.kobj, &ehca_dev_attr_grp); if (ehca_open_aqp1 == 1) { int i; @@ -812,11 +799,12 @@ int __init ehca_module_init(void) int ret; printk(KERN_INFO "eHCA Infiniband Device Driver " - "(Rel.: SVNEHCA_0022)\n"); + "(Rel.: SVNEHCA_0023)\n"); idr_init(&ehca_qp_idr); idr_init(&ehca_cq_idr); spin_lock_init(&ehca_qp_idr_lock); spin_lock_init(&ehca_cq_idr_lock); + spin_lock_init(&hcall_lock); INIT_LIST_HEAD(&shca_list); spin_lock_init(&shca_list_lock); @@ -838,7 +826,9 @@ int __init ehca_module_init(void) goto module_init2; } - ehca_create_driver_sysfs(&ehca_driver); + ret = sysfs_create_group(&ehca_driver.driver.kobj, &ehca_drv_attr_grp); + if (ret) /* only complain; we can live without attributes */ + ehca_gen_err("Cannot create driver attributes ret=%d", ret); if (ehca_poll_all_eqs != 1) { ehca_gen_err("WARNING!!!"); @@ -865,7 +855,7 @@ void __exit ehca_module_exit(void) if (ehca_poll_all_eqs == 1) del_timer_sync(&poll_eqs_timer); - ehca_remove_driver_sysfs(&ehca_driver); + sysfs_remove_group(&ehca_driver.driver.kobj, &ehca_drv_attr_grp); ibmebus_unregister_driver(&ehca_driver); ehca_destroy_slab_caches(); diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index d22ab56..add79bd 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -39,6 +39,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include <rdma/ib_umem.h> + #include <asm/current.h> #include "ehca_iverbs.h" @@ -238,10 +240,8 @@ reg_phys_mr_exit0: /*----------------------------------------------------------------------*/ -struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, - struct ib_umem *region, - int mr_access_flags, - struct ib_udata *udata) +struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, + int mr_access_flags, struct ib_udata *udata) { struct ib_mr *ib_mr; struct ehca_mr *e_mr; @@ -257,11 +257,7 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, ehca_gen_err("bad pd=%p", pd); return ERR_PTR(-EFAULT); } - if (!region) { - ehca_err(pd->device, "bad input values: region=%p", region); - ib_mr = ERR_PTR(-EINVAL); - goto reg_user_mr_exit0; - } + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && @@ -275,17 +271,10 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, ib_mr = ERR_PTR(-EINVAL); goto reg_user_mr_exit0; } - if (region->page_size != PAGE_SIZE) { - ehca_err(pd->device, "page size not supported, " - "region->page_size=%x", region->page_size); - ib_mr = ERR_PTR(-EINVAL); - goto reg_user_mr_exit0; - } - if ((region->length == 0) || - ((region->virt_base + region->length) < region->virt_base)) { + if (length == 0 || virt + length < virt) { ehca_err(pd->device, "bad input values: length=%lx " - "virt_base=%lx", region->length, region->virt_base); + "virt_base=%lx", length, virt); ib_mr = ERR_PTR(-EINVAL); goto reg_user_mr_exit0; } @@ -297,40 +286,55 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, goto reg_user_mr_exit0; } + e_mr->umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags); + if (IS_ERR(e_mr->umem)) { + ib_mr = (void *) e_mr->umem; + goto reg_user_mr_exit1; + } + + if (e_mr->umem->page_size != PAGE_SIZE) { + ehca_err(pd->device, "page size not supported, " + "e_mr->umem->page_size=%x", e_mr->umem->page_size); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit2; + } + /* determine number of MR pages */ - num_pages_mr = (((region->virt_base % PAGE_SIZE) + region->length + - PAGE_SIZE - 1) / PAGE_SIZE); - num_pages_4k = (((region->virt_base % EHCA_PAGESIZE) + region->length + - EHCA_PAGESIZE - 1) / EHCA_PAGESIZE); + num_pages_mr = (((virt % PAGE_SIZE) + length + PAGE_SIZE - 1) / + PAGE_SIZE); + num_pages_4k = (((virt % EHCA_PAGESIZE) + length + EHCA_PAGESIZE - 1) / + EHCA_PAGESIZE); /* register MR on HCA */ pginfo.type = EHCA_MR_PGI_USER; pginfo.num_pages = num_pages_mr; pginfo.num_4k = num_pages_4k; - pginfo.region = region; - pginfo.next_4k = region->offset / EHCA_PAGESIZE; + pginfo.region = e_mr->umem; + pginfo.next_4k = e_mr->umem->offset / EHCA_PAGESIZE; pginfo.next_chunk = list_prepare_entry(pginfo.next_chunk, - (®ion->chunk_list), + (&e_mr->umem->chunk_list), list); - ret = ehca_reg_mr(shca, e_mr, (u64*)region->virt_base, - region->length, mr_access_flags, e_pd, &pginfo, - &e_mr->ib.ib_mr.lkey, &e_mr->ib.ib_mr.rkey); + ret = ehca_reg_mr(shca, e_mr, (u64*) virt, length, mr_access_flags, e_pd, + &pginfo, &e_mr->ib.ib_mr.lkey, &e_mr->ib.ib_mr.rkey); if (ret) { ib_mr = ERR_PTR(ret); - goto reg_user_mr_exit1; + goto reg_user_mr_exit2; } /* successful registration of all pages */ return &e_mr->ib.ib_mr; +reg_user_mr_exit2: + ib_umem_release(e_mr->umem); reg_user_mr_exit1: ehca_mr_delete(e_mr); reg_user_mr_exit0: if (IS_ERR(ib_mr)) - ehca_err(pd->device, "rc=%lx pd=%p region=%p mr_access_flags=%x" + ehca_err(pd->device, "rc=%lx pd=%p mr_access_flags=%x" " udata=%p", - PTR_ERR(ib_mr), pd, region, mr_access_flags, udata); + PTR_ERR(ib_mr), pd, mr_access_flags, udata); return ib_mr; } /* end ehca_reg_user_mr() */ @@ -596,6 +600,9 @@ int ehca_dereg_mr(struct ib_mr *mr) goto dereg_mr_exit0; } + if (e_mr->umem) + ib_umem_release(e_mr->umem); + /* successful deregistration */ ehca_mr_delete(e_mr); @@ -2043,13 +2050,10 @@ int ehca_mrmw_map_hrc_alloc(const u64 hipz_rc) switch (hipz_rc) { case H_SUCCESS: /* successful completion */ return 0; - case H_ADAPTER_PARM: /* invalid adapter handle */ - case H_RT_PARM: /* invalid resource type */ case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */ - case H_MLENGTH_PARM: /* invalid memory length */ - case H_MEM_ACCESS_PARM: /* invalid access controls */ case H_CONSTRAINED: /* resource constraint */ - return -EINVAL; + case H_NO_MEM: + return -ENOMEM; case H_BUSY: /* long busy */ return -EBUSY; default: diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index df0516f..b5bc787 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -523,6 +523,8 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd, goto create_qp_exit1; } + my_qp->ib_qp.qp_num = my_qp->real_qp_num; + switch (init_attr->qp_type) { case IB_QPT_RC: if (isdaqp == 0) { @@ -568,7 +570,7 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd, parms.act_nr_recv_wqes = init_attr->cap.max_recv_wr; parms.act_nr_send_sges = init_attr->cap.max_send_sge; parms.act_nr_recv_sges = init_attr->cap.max_recv_sge; - my_qp->real_qp_num = + my_qp->ib_qp.qp_num = (init_attr->qp_type == IB_QPT_SMI) ? 0 : 1; } @@ -595,7 +597,6 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd, my_qp->ib_qp.recv_cq = init_attr->recv_cq; my_qp->ib_qp.send_cq = init_attr->send_cq; - my_qp->ib_qp.qp_num = my_qp->real_qp_num; my_qp->ib_qp.qp_type = init_attr->qp_type; my_qp->qp_type = init_attr->qp_type; @@ -968,17 +969,21 @@ static int internal_modify_qp(struct ib_qp *ibqp, ((ehca_mult - 1) / ah_mult) : 0; else mqpcb->max_static_rate = 0; - update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1); /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + + /* * only if GRH is TRUE we might consider SOURCE_GID_IDX * and DEST_GID otherwise phype will return H_ATTR_PARM!!! */ if (attr->ah_attr.ah_flags == IB_AH_GRH) { - mqpcb->send_grh_flag = 1 << 31; - update_mask |= - EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + mqpcb->send_grh_flag = 1; + mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index; update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1); diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c index b564fcd..5766ae3 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.c +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -154,7 +154,8 @@ static long ehca_plpar_hcall9(unsigned long opcode, unsigned long arg9) { long ret; - int i, sleep_msecs; + int i, sleep_msecs, lock_is_set = 0; + unsigned long flags; ehca_gen_dbg("opcode=%lx arg1=%lx arg2=%lx arg3=%lx arg4=%lx " "arg5=%lx arg6=%lx arg7=%lx arg8=%lx arg9=%lx", @@ -162,10 +163,18 @@ static long ehca_plpar_hcall9(unsigned long opcode, arg8, arg9); for (i = 0; i < 5; i++) { + if ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5)) { + spin_lock_irqsave(&hcall_lock, flags); + lock_is_set = 1; + } + ret = plpar_hcall9(opcode, outs, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); + if (lock_is_set) + spin_unlock_irqrestore(&hcall_lock, flags); + if (H_IS_LONG_BUSY(ret)) { sleep_msecs = get_longbusy_msecs(ret); msleep_interruptible(sleep_msecs); @@ -193,11 +202,11 @@ static long ehca_plpar_hcall9(unsigned long opcode, opcode, ret, outs[0], outs[1], outs[2], outs[3], outs[4], outs[5], outs[6], outs[7], outs[8]); return ret; - } return H_BUSY; } + u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, struct ehca_pfeq *pfeq, const u32 neq_control, @@ -322,7 +331,7 @@ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, 0); qp->ipz_qp_handle.handle = outs[0]; qp->real_qp_num = (u32)outs[1]; - parms->act_nr_send_sges = + parms->act_nr_send_wqes = (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]); parms->act_nr_recv_wqes = (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]); diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 036ed1e..ebd5c7b 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -523,7 +523,7 @@ static int ipathfs_fill_super(struct super_block *sb, void *data, int ret; static struct tree_descr files[] = { - [1] = {"atomic_stats", &atomic_stats_ops, S_IRUGO}, + [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO}, {""}, }; diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index 1b9c308..4e2e3df 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -747,7 +747,6 @@ static void ipath_pe_quiet_serdes(struct ipath_devdata *dd) static int ipath_pe_intconfig(struct ipath_devdata *dd) { - u64 val; u32 chiprev; /* @@ -760,9 +759,9 @@ static int ipath_pe_intconfig(struct ipath_devdata *dd) if ((chiprev & INFINIPATH_R_CHIPREVMINOR_MASK) > 1) { /* Rev2+ reports extra errors via internal GPIO pins */ dd->ipath_flags |= IPATH_GPIO_ERRINTRS; - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask); - val |= IPATH_GPIO_ERRINTR_MASK; - ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val); + dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); } return 0; } diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 45d0331..a90d3b5 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -1056,7 +1056,7 @@ irqreturn_t ipath_intr(int irq, void *data) gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT); chk0rcv = 1; } - if (unlikely(gpiostatus)) { + if (gpiostatus) { /* * Some unexpected bits remain. If they could have * caused the interrupt, complain and clear. @@ -1065,9 +1065,8 @@ irqreturn_t ipath_intr(int irq, void *data) * GPIO interrupts, possibly on a "three strikes" * basis. */ - u32 mask; - mask = ipath_read_kreg32( - dd, dd->ipath_kregs->kr_gpio_mask); + const u32 mask = (u32) dd->ipath_gpio_mask; + if (mask & gpiostatus) { ipath_dbg("Unexpected GPIO IRQ bits %x\n", gpiostatus & mask); diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index e900c25..12194f3 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -397,6 +397,8 @@ struct ipath_devdata { unsigned long ipath_pioavailshadow[8]; /* shadow of kr_gpio_out, for rmw ops */ u64 ipath_gpio_out; + /* shadow the gpio mask register */ + u64 ipath_gpio_mask; /* kr_revision shadow */ u64 ipath_revision; /* diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index 31e7073..bdeef8d 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include <rdma/ib_umem.h> #include <rdma/ib_pack.h> #include <rdma/ib_smi.h> @@ -147,6 +148,7 @@ struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, mr->mr.offset = 0; mr->mr.access_flags = acc; mr->mr.max_segs = num_phys_buf; + mr->umem = NULL; m = 0; n = 0; @@ -170,46 +172,56 @@ bail: /** * ipath_reg_user_mr - register a userspace memory region * @pd: protection domain for this memory region - * @region: the user memory region + * @start: starting userspace address + * @length: length of region to register + * @virt_addr: virtual address to use (from HCA's point of view) * @mr_access_flags: access flags for this memory region * @udata: unused by the InfiniPath driver * * Returns the memory region on success, otherwise returns an errno. */ -struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, - int mr_access_flags, struct ib_udata *udata) +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) { struct ipath_mr *mr; + struct ib_umem *umem; struct ib_umem_chunk *chunk; int n, m, i; struct ib_mr *ret; - if (region->length == 0) { + if (length == 0) { ret = ERR_PTR(-EINVAL); goto bail; } + umem = ib_umem_get(pd->uobject->context, start, length, mr_access_flags); + if (IS_ERR(umem)) + return (void *) umem; + n = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) + list_for_each_entry(chunk, &umem->chunk_list, list) n += chunk->nents; mr = alloc_mr(n, &to_idev(pd->device)->lk_table); if (!mr) { ret = ERR_PTR(-ENOMEM); + ib_umem_release(umem); goto bail; } mr->mr.pd = pd; - mr->mr.user_base = region->user_base; - mr->mr.iova = region->virt_base; - mr->mr.length = region->length; - mr->mr.offset = region->offset; + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = umem->offset; mr->mr.access_flags = mr_access_flags; mr->mr.max_segs = n; + mr->umem = umem; m = 0; n = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) { + list_for_each_entry(chunk, &umem->chunk_list, list) { for (i = 0; i < chunk->nents; i++) { void *vaddr; @@ -219,7 +231,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, goto bail; } mr->mr.map[m]->segs[n].vaddr = vaddr; - mr->mr.map[m]->segs[n].length = region->page_size; + mr->mr.map[m]->segs[n].length = umem->page_size; n++; if (n == IPATH_SEGSZ) { m++; @@ -253,6 +265,10 @@ int ipath_dereg_mr(struct ib_mr *ibmr) i--; kfree(mr->mr.map[i]); } + + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); return 0; } diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 12933e7..bb70845 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -1387,13 +1387,12 @@ static int enable_timer(struct ipath_devdata *dd) * processing. */ if (dd->ipath_flags & IPATH_GPIO_INTR) { - u64 val; ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect, 0x2074076542310ULL); /* Enable GPIO bit 2 interrupt */ - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask); - val |= (u64) (1 << IPATH_GPIO_PORT0_BIT); - ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val); + dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); } init_timer(&dd->verbs_timer); @@ -1412,8 +1411,9 @@ static int disable_timer(struct ipath_devdata *dd) u64 val; /* Disable GPIO bit 2 interrupt */ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask); - val &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); - ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val); + dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); /* * We might want to undo changes to debugportselect, * but how? diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index 7064fc2..088b837 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -251,6 +251,7 @@ struct ipath_sge { /* Memory region */ struct ipath_mr { struct ib_mr ibmr; + struct ib_umem *umem; struct ipath_mregion mr; /* must be last */ }; @@ -751,8 +752,8 @@ struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, u64 *iova_start); -struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, - int mr_access_flags, +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, struct ib_udata *udata); int ipath_dereg_mr(struct ib_mr *ibmr); diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c index 085e28b..dd691cf 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -165,10 +165,9 @@ static int ipath_mcast_add(struct ipath_ibdev *dev, { struct rb_node **n = &mcast_tree.rb_node; struct rb_node *pn = NULL; - unsigned long flags; int ret; - spin_lock_irqsave(&mcast_lock, flags); + spin_lock_irq(&mcast_lock); while (*n) { struct ipath_mcast *tmcast; @@ -228,7 +227,7 @@ static int ipath_mcast_add(struct ipath_ibdev *dev, ret = 0; bail: - spin_unlock_irqrestore(&mcast_lock, flags); + spin_unlock_irq(&mcast_lock); return ret; } @@ -289,17 +288,16 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct ipath_mcast *mcast = NULL; struct ipath_mcast_qp *p, *tmp; struct rb_node *n; - unsigned long flags; int last = 0; int ret; - spin_lock_irqsave(&mcast_lock, flags); + spin_lock_irq(&mcast_lock); /* Find the GID in the mcast table. */ n = mcast_tree.rb_node; while (1) { if (n == NULL) { - spin_unlock_irqrestore(&mcast_lock, flags); + spin_unlock_irq(&mcast_lock); ret = -EINVAL; goto bail; } @@ -334,7 +332,7 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) break; } - spin_unlock_irqrestore(&mcast_lock, flags); + spin_unlock_irq(&mcast_lock); if (p) { /* @@ -348,9 +346,9 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) atomic_dec(&mcast->refcount); wait_event(mcast->wait, !atomic_read(&mcast->refcount)); ipath_mcast_free(mcast); - spin_lock(&dev->n_mcast_grps_lock); + spin_lock_irq(&dev->n_mcast_grps_lock); dev->n_mcast_grps_allocated--; - spin_unlock(&dev->n_mcast_grps_lock); + spin_unlock_irq(&dev->n_mcast_grps_lock); } ret = 0; diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig new file mode 100644 index 0000000..b8912cd --- /dev/null +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -0,0 +1,9 @@ +config MLX4_INFINIBAND + tristate "Mellanox ConnectX HCA support" + depends on INFINIBAND + select MLX4_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox ConnectX PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile new file mode 100644 index 0000000..70f09c7 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o + +mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c new file mode 100644 index 0000000..c75ac94 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx4_ib.h" + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx4_dev *dev = to_mdev(pd->device)->dev; + struct mlx4_ib_ah *ah; + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + memset(&ah->av, 0, sizeof ah->av); + + ah->av.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); + ah->av.g_slid = ah_attr->src_path_bits; + ah->av.dlid = cpu_to_be16(ah_attr->dlid); + if (ah_attr->static_rate) { + ah->av.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; + while (ah->av.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.stat_rate & dev->caps.stat_rate_support)) + --ah->av.stat_rate; + } + ah->av.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); + if (ah_attr->ah_flags & IB_AH_GRH) { + ah->av.g_slid |= 0x80; + ah->av.gid_index = ah_attr->grh.sgid_index; + ah->av.hop_limit = ah_attr->grh.hop_limit; + ah->av.sl_tclass_flowlabel |= + cpu_to_be32((ah_attr->grh.traffic_class << 20) | + ah_attr->grh.flow_label); + memcpy(ah->av.dgid, ah_attr->grh.dgid.raw, 16); + } + + return &ah->ibah; +} + +int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx4_ib_ah *ah = to_mah(ibah); + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->dlid = be16_to_cpu(ah->av.dlid); + ah_attr->sl = be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28; + ah_attr->port_num = be32_to_cpu(ah->av.port_pd) >> 24; + if (ah->av.stat_rate) + ah_attr->static_rate = ah->av.stat_rate - MLX4_STAT_RATE_OFFSET; + ah_attr->src_path_bits = ah->av.g_slid & 0x7F; + + if (mlx4_ib_ah_grh_present(ah)) { + ah_attr->ah_flags = IB_AH_GRH; + + ah_attr->grh.traffic_class = + be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20; + ah_attr->grh.flow_label = + be32_to_cpu(ah->av.sl_tclass_flowlabel) & 0xfffff; + ah_attr->grh.hop_limit = ah->av.hop_limit; + ah_attr->grh.sgid_index = ah->av.gid_index; + memcpy(ah_attr->grh.dgid.raw, ah->av.dgid, 16); + } + + return 0; +} + +int mlx4_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c new file mode 100644 index 0000000..660b27a --- /dev/null +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -0,0 +1,530 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mlx4/cq.h> +#include <linux/mlx4/qp.h> + +#include "mlx4_ib.h" +#include "user.h" + +static void mlx4_ib_cq_comp(struct mlx4_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_cq *ibcq; + + if (type != MLX4_EVENT_TYPE_CQ_ERROR) { + printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + "on CQ %06x\n", type, cq->cqn); + return; + } + + ibcq = &to_mibcq(cq)->ibcq; + if (ibcq->event_handler) { + event.device = ibcq->device; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) +{ + int offset = n * sizeof (struct mlx4_cqe); + + if (buf->buf.nbufs == 1) + return buf->buf.u.direct.buf + offset; + else + return buf->buf.u.page_list[offset >> PAGE_SHIFT].buf + + (offset & (PAGE_SIZE - 1)); +} + +static void *get_cqe(struct mlx4_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n); +} + +static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) +{ + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); + + return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; +} + +static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_cq *cq; + struct mlx4_uar *uar; + int buf_size; + int err; + + if (entries < 1 || entries > dev->dev->caps.max_cqes) + return ERR_PTR(-EINVAL); + + cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + entries = roundup_pow_of_two(entries + 1); + cq->ibcq.cqe = entries - 1; + buf_size = entries * sizeof (struct mlx4_cqe); + spin_lock_init(&cq->lock); + + if (context) { + struct mlx4_ib_create_cq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_cq; + } + + cq->umem = ib_umem_get(context, ucmd.buf_addr, buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(cq->umem)) { + err = PTR_ERR(cq->umem); + goto err_cq; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(cq->umem), + ilog2(cq->umem->page_size), &cq->buf.mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &cq->buf.mtt, cq->umem); + if (err) + goto err_mtt; + + err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_mtt; + + uar = &to_mucontext(context)->uar; + } else { + err = mlx4_ib_db_alloc(dev, &cq->db, 1); + if (err) + goto err_cq; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &cq->buf.buf)) { + err = -ENOMEM; + goto err_db; + } + + err = mlx4_mtt_init(dev->dev, cq->buf.buf.npages, cq->buf.buf.page_shift, + &cq->buf.mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &cq->buf.mtt, &cq->buf.buf); + if (err) + goto err_mtt; + + uar = &dev->priv_uar; + } + + err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, + cq->db.dma, &cq->mcq); + if (err) + goto err_dbmap; + + cq->mcq.comp = mlx4_ib_cq_comp; + cq->mcq.event = mlx4_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) { + err = -EFAULT; + goto err_dbmap; + } + + return &cq->ibcq; + +err_dbmap: + if (context) + mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); + +err_buf: + if (context) + ib_umem_release(cq->umem); + else + mlx4_buf_free(dev->dev, entries * sizeof (struct mlx4_cqe), + &cq->buf.buf); + +err_db: + if (!context) + mlx4_ib_db_free(dev, &cq->db); + +err_cq: + kfree(cq); + + return ERR_PTR(err); +} + +int mlx4_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx4_ib_dev *dev = to_mdev(cq->device); + struct mlx4_ib_cq *mcq = to_mcq(cq); + + mlx4_cq_free(dev->dev, &mcq->mcq); + mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt); + + if (cq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); + ib_umem_release(mcq->umem); + } else { + mlx4_buf_free(dev->dev, (cq->cqe + 1) * sizeof (struct mlx4_cqe), + &mcq->buf.buf); + mlx4_ib_db_free(dev, &mcq->db); + } + + kfree(mcq); + + return 0; +} + +static void dump_cqe(void *cqe) +{ + __be32 *buf = cqe; + + printk(KERN_DEBUG "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]), + be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]), + be32_to_cpu(buf[6]), be32_to_cpu(buf[7])); +} + +static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe, + struct ib_wc *wc) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) { + printk(KERN_DEBUG "local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index), + cqe->vendor_err_syndrome, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + dump_cqe(cqe); + } + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_syndrome; +} + +static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, + struct mlx4_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx4_cqe *cqe; + struct mlx4_qp *mqp; + struct mlx4_ib_wq *wq; + struct mlx4_ib_srq *srq; + int is_send; + int is_error; + u16 wqe_ctr; + + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + ++cq->mcq.cons_index; + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR; + + if (!*cur_qp || + (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev, + be32_to_cpu(cqe->my_qpn)); + if (unlikely(!mqp)) { + printk(KERN_WARNING "CQ %06x with entry for unknown QPN %06x\n", + cq->mcq.cqn, be32_to_cpu(cqe->my_qpn) & 0xffffff); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wq->tail += (u16) (wqe_ctr - (u16) wq->tail); + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + + if (unlikely(is_error)) { + mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + return 0; + } + + wc->status = IB_WC_SUCCESS; + + if (is_send) { + wc->wc_flags = 0; + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX4_OPCODE_SEND: + wc->opcode = IB_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IB_WC_SEND; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + } + } else { + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + } + + wc->slid = be16_to_cpu(cqe->rlid); + wc->sl = cqe->sl >> 4; + wc->src_qp = be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff; + wc->dlid_path_bits = (be32_to_cpu(cqe->g_mlpath_rqpn) >> 24) & 0x7f; + wc->wc_flags |= be32_to_cpu(cqe->g_mlpath_rqpn) & 0x80000000 ? + IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) >> 16; + } + + return 0; +} + +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + int err = 0; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; ++npolled) { + err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + if (npolled) + mlx4_cq_set_ci(&cq->mcq); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + mlx4_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT, + to_mdev(ibcq->device)->uar_map, + MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock)); + + return 0; +} + +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + u32 prod_index; + int nfreed = 0; + struct mlx4_cqe *cqe, *dest; + u8 owner_bit; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + if ((be32_to_cpu(cqe->my_qpn) & 0xffffff) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx4_cq_set_ci(&cq->mcq); + } +} + +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + spin_lock_irq(&cq->lock); + __mlx4_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c new file mode 100644 index 0000000..1c36087 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/slab.h> + +#include "mlx4_ib.h" + +struct mlx4_ib_db_pgdir { + struct list_head list; + DECLARE_BITMAP(order0, MLX4_IB_DB_PER_PAGE); + DECLARE_BITMAP(order1, MLX4_IB_DB_PER_PAGE / 2); + unsigned long *bits[2]; + __be32 *db_page; + dma_addr_t db_dma; +}; + +static struct mlx4_ib_db_pgdir *mlx4_ib_alloc_db_pgdir(struct mlx4_ib_dev *dev) +{ + struct mlx4_ib_db_pgdir *pgdir; + + pgdir = kzalloc(sizeof *pgdir, GFP_KERNEL); + if (!pgdir) + return NULL; + + bitmap_fill(pgdir->order1, MLX4_IB_DB_PER_PAGE / 2); + pgdir->bits[0] = pgdir->order0; + pgdir->bits[1] = pgdir->order1; + pgdir->db_page = dma_alloc_coherent(dev->ib_dev.dma_device, + PAGE_SIZE, &pgdir->db_dma, + GFP_KERNEL); + if (!pgdir->db_page) { + kfree(pgdir); + return NULL; + } + + return pgdir; +} + +static int mlx4_ib_alloc_db_from_pgdir(struct mlx4_ib_db_pgdir *pgdir, + struct mlx4_ib_db *db, int order) +{ + int o; + int i; + + for (o = order; o <= 1; ++o) { + i = find_first_bit(pgdir->bits[o], MLX4_IB_DB_PER_PAGE >> o); + if (i < MLX4_IB_DB_PER_PAGE >> o) + goto found; + } + + return -ENOMEM; + +found: + clear_bit(i, pgdir->bits[o]); + + i <<= o; + + if (o > order) + set_bit(i ^ 1, pgdir->bits[order]); + + db->u.pgdir = pgdir; + db->index = i; + db->db = pgdir->db_page + db->index; + db->dma = pgdir->db_dma + db->index * 4; + db->order = order; + + return 0; +} + +int mlx4_ib_db_alloc(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db, int order) +{ + struct mlx4_ib_db_pgdir *pgdir; + int ret = 0; + + mutex_lock(&dev->pgdir_mutex); + + list_for_each_entry(pgdir, &dev->pgdir_list, list) + if (!mlx4_ib_alloc_db_from_pgdir(pgdir, db, order)) + goto out; + + pgdir = mlx4_ib_alloc_db_pgdir(dev); + if (!pgdir) { + ret = -ENOMEM; + goto out; + } + + list_add(&pgdir->list, &dev->pgdir_list); + + /* This should never fail -- we just allocated an empty page: */ + WARN_ON(mlx4_ib_alloc_db_from_pgdir(pgdir, db, order)); + +out: + mutex_unlock(&dev->pgdir_mutex); + + return ret; +} + +void mlx4_ib_db_free(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db) +{ + int o; + int i; + + mutex_lock(&dev->pgdir_mutex); + + o = db->order; + i = db->index; + + if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) { + clear_bit(i ^ 1, db->u.pgdir->order0); + ++o; + } + + i >>= o; + set_bit(i, db->u.pgdir->bits[o]); + + if (bitmap_full(db->u.pgdir->order1, MLX4_IB_DB_PER_PAGE / 2)) { + dma_free_coherent(dev->ib_dev.dma_device, PAGE_SIZE, + db->u.pgdir->db_page, db->u.pgdir->db_dma); + list_del(&db->u.pgdir->list); + kfree(db->u.pgdir); + } + + mutex_unlock(&dev->pgdir_mutex); +} + +struct mlx4_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_ib_db *db) +{ + struct mlx4_ib_user_db_page *page; + struct ib_umem_chunk *chunk; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof *page, GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list); + db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_ib_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c new file mode 100644 index 0000000..3330917 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> + +#include <linux/mlx4/cmd.h> + +#include "mlx4_ib.h" + +enum { + MLX4_IB_VENDOR_CLASS1 = 0x9, + MLX4_IB_VENDOR_CLASS2 = 0xa +}; + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + + inmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + if (in_wc) { + struct { + __be32 my_qpn; + u32 reserved1; + __be32 rqpn; + u8 sl; + u8 g_path; + u16 reserved2[2]; + __be16 pkey; + u32 reserved3[11]; + u8 grh[40]; + } *ext_info; + + memset(inbox + 256, 0, 256); + ext_info = inbox + 256; + + ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num); + ext_info->rqpn = cpu_to_be32(in_wc->src_qp); + ext_info->sl = in_wc->sl << 4; + ext_info->g_path = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); + ext_info->pkey = cpu_to_be16(in_wc->pkey_index); + + if (in_grh) + memcpy(ext_info->grh, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= in_wc->slid << 16; + } + + err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, + in_modifier, op_modifier, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); + + if (!err); + memcpy(response_mad, outmailbox->buf, 256); + + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + mlx4_free_cmd_mailbox(dev->dev, outmailbox); + + return err; +} + +static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct ib_ah_attr ah_attr; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = lid; + ah_attr.sl = sl; + ah_attr.port_num = port_num; + + new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock(&dev->sm_lock); + if (dev->sm_ah[port_num - 1]) + ib_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock(&dev->sm_lock); +} + +/* + * Snoop SM MADs for port info and P_Key table sets, so we can + * synthesize LID change and P_Key change events. + */ +static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad) +{ + struct ib_event event; + + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_SET) { + if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { + struct ib_port_info *pinfo = + (struct ib_port_info *) ((struct ib_smp *) mad)->data; + + update_sm_ah(to_mdev(ibdev), port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + event.device = ibdev; + event.element.port_num = port_num; + + if(pinfo->clientrereg_resv_subnetto & 0x80) + event.event = IB_EVENT_CLIENT_REREGISTER; + else + event.event = IB_EVENT_LID_CHANGE; + + ib_dispatch_event(&event); + } + + if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) { + event.device = ibdev; + event.event = IB_EVENT_PKEY_CHANGE; + event.element.port_num = port_num; + ib_dispatch_event(&event); + } + } +} + +static void node_desc_override(struct ib_device *dev, + struct ib_mad *mad) +{ + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && + mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { + spin_lock(&to_mdev(dev)->sm_lock); + memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64); + spin_unlock(&to_mdev(dev)->sm_lock); + } +} + +static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad) +{ + int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; + int ret; + + if (agent) { + send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + /* + * We rely here on the fact that MLX QPs don't use the + * address handle after the send is posted (this is + * wrong following the IB spec strictly, but we know + * it's OK for our devices). + */ + spin_lock(&dev->sm_lock); + memcpy(send_buf->mad, mad, sizeof *mad); + if ((send_buf->ah = dev->sm_ah[port_num - 1])) + ret = ib_post_send_mad(send_buf, NULL); + else + ret = -EINVAL; + spin_unlock(&dev->sm_lock); + + if (ret) + ib_free_send_mad(send_buf); + } +} + +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { + forward_trap(to_mdev(ibdev), port_num, in_mad); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* + * Don't process SMInfo queries or vendor-specific + * MADs -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO || + ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) == + IB_SMP_ATTR_VENDOR_MASK)) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else + return IB_MAD_RESULT_SUCCESS; + + err = mlx4_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + if (!out_mad->mad_hdr.status) { + smp_snoop(ibdev, port_num, in_mad); + node_desc_override(ibdev, out_mad); + } + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + int ret; + + for (p = 0; p < dev->dev->caps.num_ports; ++p) + for (q = 0; q <= 1; ++q) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } + + return 0; + +err: + for (p = 0; p < dev->dev->caps.num_ports; ++p) + for (q = 0; q <= 1; ++q) + if (dev->send_agent[p][q]) + ib_unregister_mad_agent(dev->send_agent[p][q]); + + return ret; +} + +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + + for (p = 0; p < dev->dev->caps.num_ports; ++p) { + for (q = 0; q <= 1; ++q) { + agent = dev->send_agent[p][q]; + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } + + if (dev->sm_ah[p]) + ib_destroy_ah(dev->sm_ah[p]); + } +} diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c new file mode 100644 index 0000000..c591616 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/main.c @@ -0,0 +1,658 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> + +#include <rdma/ib_smi.h> +#include <rdma/ib_user_verbs.h> + +#include <linux/mlx4/driver.h> +#include <linux/mlx4/cmd.h> + +#include "mlx4_ib.h" +#include "user.h" + +#define DRV_NAME "mlx4_ib" +#define DRV_VERSION "0.01" +#define DRV_RELDATE "May 1, 2006" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static const char mlx4_ib_version[] __devinitdata = + DRV_NAME ": Mellanox ConnectX InfiniBand driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static int mlx4_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof *props); + + props->fw_ver = dev->dev->caps.fw_ver; + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) + props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; + + props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30)); + props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = dev->dev->caps.page_size_cap; + props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; + props->max_qp_wr = dev->dev->caps.max_wqes; + props->max_sge = min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg); + props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; + props->max_cqe = dev->dev->caps.max_cqes; + props->max_mr = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws; + props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; + props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; + props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; + props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; + props->max_srq_sge = dev->dev->caps.max_srq_sge; + props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; + props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->max_pkeys = dev->dev->caps.pkey_table_len[1]; + props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; + props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = (1 << (32 - ilog2(dev->dev->caps.num_mpts))) - 1; + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof *props); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); + props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + spin_lock(&to_mdev(ibdev)->sm_lock); + memcpy(ibdev->node_desc, props->node_desc, 64); + spin_unlock(&to_mdev(ibdev)->sm_lock); + } + + return 0; +} + +static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memset(mailbox->buf, 0, 256); + + if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; + ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); + } else { + ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; + ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); + } + + err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B); + + mlx4_free_cmd_mailbox(dev->dev, mailbox); + return err; +} + +static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct ib_port_attr attr; + u32 cap_mask; + int err; + + mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); + + err = mlx4_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx4_SET_PORT(to_mdev(ibdev), port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); + +out: + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp resp; + int err; + + resp.qp_tab_size = dev->dev->caps.num_qps; + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); + if (err) { + kfree(context); + return ERR_PTR(err); + } + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + err = ib_copy_to_udata(udata, &resp, sizeof resp); + if (err) { + mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); + kfree(context); + return ERR_PTR(-EFAULT); + } + + return &context->ibucontext; +} + +static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); + + mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); + kfree(context); + + return 0; +} + +static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct mlx4_ib_dev *dev = to_mdev(context->device); + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_pgoff == 0) { + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { + /* FIXME want pgprot_writecombine() for BlueFlame pages */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn + + dev->dev->caps.num_uars, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + } else + return -EINVAL; + + return 0; +} + +static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_pd *pd; + int err; + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) + if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { + mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + + return &pd->ibpd; +} + +static int mlx4_ib_dealloc_pd(struct ib_pd *pd) +{ + mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); + kfree(pd); + + return 0; +} + +static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return mlx4_multicast_attach(to_mdev(ibqp->device)->dev, + &to_mqp(ibqp)->mqp, gid->raw); +} + +static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return mlx4_multicast_detach(to_mdev(ibqp->device)->dev, + &to_mqp(ibqp)->mqp, gid->raw); +} + +static int init_node_data(struct mlx4_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static void *mlx4_ib_add(struct mlx4_dev *dev) +{ + struct mlx4_ib_dev *ibdev; + + ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); + if (!ibdev) { + dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); + return NULL; + } + + if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) + goto err_dealloc; + + if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) + goto err_pd; + + ibdev->uar_map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + if (!ibdev->uar_map) + goto err_uar; + MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); + + INIT_LIST_HEAD(&ibdev->pgdir_list); + mutex_init(&ibdev->pgdir_mutex); + + ibdev->dev = dev; + + strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + ibdev->ib_dev.owner = THIS_MODULE; + ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; + ibdev->ib_dev.phys_port_cnt = dev->caps.num_ports; + ibdev->ib_dev.num_comp_vectors = 1; + ibdev->ib_dev.dma_device = &dev->pdev->dev; + + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + ibdev->ib_dev.query_device = mlx4_ib_query_device; + ibdev->ib_dev.query_port = mlx4_ib_query_port; + ibdev->ib_dev.query_gid = mlx4_ib_query_gid; + ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; + ibdev->ib_dev.modify_device = mlx4_ib_modify_device; + ibdev->ib_dev.modify_port = mlx4_ib_modify_port; + ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; + ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; + ibdev->ib_dev.mmap = mlx4_ib_mmap; + ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; + ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; + ibdev->ib_dev.create_ah = mlx4_ib_create_ah; + ibdev->ib_dev.query_ah = mlx4_ib_query_ah; + ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; + ibdev->ib_dev.create_srq = mlx4_ib_create_srq; + ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; + ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; + ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; + ibdev->ib_dev.create_qp = mlx4_ib_create_qp; + ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; + ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; + ibdev->ib_dev.post_send = mlx4_ib_post_send; + ibdev->ib_dev.post_recv = mlx4_ib_post_recv; + ibdev->ib_dev.create_cq = mlx4_ib_create_cq; + ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; + ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; + ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; + ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; + ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; + ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; + ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; + ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; + ibdev->ib_dev.process_mad = mlx4_ib_process_mad; + + if (init_node_data(ibdev)) + goto err_map; + + spin_lock_init(&ibdev->sm_lock); + mutex_init(&ibdev->cap_mask_mutex); + + if (ib_register_device(&ibdev->ib_dev)) + goto err_map; + + if (mlx4_ib_mad_init(ibdev)) + goto err_reg; + + return ibdev; + +err_reg: + ib_unregister_device(&ibdev->ib_dev); + +err_map: + iounmap(ibdev->uar_map); + +err_uar: + mlx4_uar_free(dev, &ibdev->priv_uar); + +err_pd: + mlx4_pd_free(dev, ibdev->priv_pdn); + +err_dealloc: + ib_dealloc_device(&ibdev->ib_dev); + + return NULL; +} + +static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) +{ + struct mlx4_ib_dev *ibdev = ibdev_ptr; + int p; + + for (p = 1; p <= dev->caps.num_ports; ++p) + mlx4_CLOSE_PORT(dev, p); + + mlx4_ib_mad_cleanup(ibdev); + ib_unregister_device(&ibdev->ib_dev); + iounmap(ibdev->uar_map); + mlx4_uar_free(dev, &ibdev->priv_uar); + mlx4_pd_free(dev, ibdev->priv_pdn); + ib_dealloc_device(&ibdev->ib_dev); +} + +static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, + enum mlx4_dev_event event, int subtype, + int port) +{ + struct ib_event ibev; + + switch (event) { + case MLX4_EVENT_TYPE_PORT_CHANGE: + ibev.event = subtype == MLX4_PORT_CHANGE_SUBTYPE_ACTIVE ? + IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + break; + + case MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR: + ibev.event = IB_EVENT_DEVICE_FATAL; + break; + + default: + return; + } + + ibev.device = ibdev_ptr; + ibev.element.port_num = port; + + ib_dispatch_event(&ibev); +} + +static struct mlx4_interface mlx4_ib_interface = { + .add = mlx4_ib_add, + .remove = mlx4_ib_remove, + .event = mlx4_ib_event +}; + +static int __init mlx4_ib_init(void) +{ + return mlx4_register_interface(&mlx4_ib_interface); +} + +static void __exit mlx4_ib_cleanup(void) +{ + mlx4_unregister_interface(&mlx4_ib_interface); +} + +module_init(mlx4_ib_init); +module_exit(mlx4_ib_cleanup); diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h new file mode 100644 index 0000000..24ccadd --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_IB_H +#define MLX4_IB_H + +#include <linux/compiler.h> +#include <linux/list.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_umem.h> + +#include <linux/mlx4/device.h> +#include <linux/mlx4/doorbell.h> + +enum { + MLX4_IB_DB_PER_PAGE = PAGE_SIZE / 4 +}; + +struct mlx4_ib_db_pgdir; +struct mlx4_ib_user_db_page; + +struct mlx4_ib_db { + __be32 *db; + union { + struct mlx4_ib_db_pgdir *pgdir; + struct mlx4_ib_user_db_page *user_page; + } u; + dma_addr_t dma; + int index; + int order; +}; + +struct mlx4_ib_ucontext { + struct ib_ucontext ibucontext; + struct mlx4_uar uar; + struct list_head db_page_list; + struct mutex db_page_mutex; +}; + +struct mlx4_ib_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +struct mlx4_ib_cq_buf { + struct mlx4_buf buf; + struct mlx4_mtt mtt; +}; + +struct mlx4_ib_cq { + struct ib_cq ibcq; + struct mlx4_cq mcq; + struct mlx4_ib_cq_buf buf; + struct mlx4_ib_db db; + spinlock_t lock; + struct ib_umem *umem; +}; + +struct mlx4_ib_mr { + struct ib_mr ibmr; + struct mlx4_mr mmr; + struct ib_umem *umem; +}; + +struct mlx4_ib_wq { + u64 *wrid; + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; +}; + +struct mlx4_ib_qp { + struct ib_qp ibqp; + struct mlx4_qp mqp; + struct mlx4_buf buf; + + struct mlx4_ib_db db; + struct mlx4_ib_wq rq; + + u32 doorbell_qpn; + __be32 sq_signal_bits; + int sq_spare_wqes; + struct mlx4_ib_wq sq; + + struct ib_umem *umem; + struct mlx4_mtt mtt; + int buf_size; + struct mutex mutex; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 sq_no_prefetch; + u8 state; +}; + +struct mlx4_ib_srq { + struct ib_srq ibsrq; + struct mlx4_srq msrq; + struct mlx4_buf buf; + struct mlx4_ib_db db; + u64 *wrid; + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + struct mlx4_mtt mtt; + struct mutex mutex; +}; + +struct mlx4_ib_ah { + struct ib_ah ibah; + struct mlx4_av av; +}; + +struct mlx4_ib_dev { + struct ib_device ib_dev; + struct mlx4_dev *dev; + void __iomem *uar_map; + + struct list_head pgdir_list; + struct mutex pgdir_mutex; + + struct mlx4_uar priv_uar; + u32 priv_pdn; + MLX4_DECLARE_DOORBELL_LOCK(uar_lock); + + struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; + struct ib_ah *sm_ah[MLX4_MAX_PORTS]; + spinlock_t sm_lock; + + struct mutex cap_mask_mutex; +}; + +static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx4_ib_dev, ib_dev); +} + +static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext); +} + +static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx4_ib_pd, ibpd); +} + +static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx4_ib_cq, ibcq); +} + +static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq) +{ + return container_of(mcq, struct mlx4_ib_cq, mcq); +} + +static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx4_ib_mr, ibmr); +} + +static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx4_ib_qp, ibqp); +} + +static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_qp, mqp); +} + +static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx4_ib_srq, ibsrq); +} + +static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq) +{ + return container_of(msrq, struct mlx4_ib_srq, msrq); +} + +static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx4_ib_ah, ibah); +} + +int mlx4_ib_db_alloc(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db, int order); +void mlx4_ib_db_free(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db); +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_ib_db *db); +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_ib_db *db); + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem); +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx4_ib_dereg_mr(struct ib_mr *mr); + +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx4_ib_destroy_cq(struct ib_cq *cq); +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx4_ib_destroy_ah(struct ib_ah *ah); + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx4_ib_destroy_srq(struct ib_srq *srq); +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_qp(struct ib_qp *qp); +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); + +static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) +{ + return !!(ah->av.g_slid & 0x80); +} + +#endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c new file mode 100644 index 0000000..85ae906 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx4_ib.h" + +static u32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | + MLX4_PERM_LOCAL_READ; +} + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx4_ib_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0, + ~0ull, convert_access(acc), 0, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem) +{ + u64 *pages; + struct ib_umem_chunk *chunk; + int i, j, k; + int n; + int len; + int err = 0; + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) + return -ENOMEM; + + i = n = 0; + + list_for_each_entry(chunk, &umem->chunk_list, list) + for (j = 0; j < chunk->nmap; ++j) { + len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(&chunk->page_list[j]) + + umem->page_size * k; + /* + * Be friendly to WRITE_MTT firmware + * command, and pass it chunks of + * appropriate size. + */ + if (i == PAGE_SIZE / sizeof (u64) - 2) { + err = mlx4_write_mtt(dev->dev, mtt, n, + i, pages); + if (err) + goto out; + n += i; + i = 0; + } + } + } + + if (i) + err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); + +out: + free_page((unsigned long) pages); + return err; +} + +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int shift; + int err; + int n; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(pd->uobject->context, start, length, access_flags); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err_free; + } + + n = ib_umem_page_count(mr->umem); + shift = ilog2(mr->umem->page_size); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, + convert_access(access_flags), n, shift, &mr->mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); + if (err) + goto err_mr; + + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + + return &mr->ibmr; + +err_mr: + mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx4_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + + mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + + return 0; +} diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c new file mode 100644 index 0000000..28a08bd --- /dev/null +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -0,0 +1,1457 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_cache.h> +#include <rdma/ib_pack.h> + +#include <linux/mlx4/qp.h> + +#include "mlx4_ib.h" +#include "user.h" + +enum { + MLX4_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f +}; + +enum { + /* + * Largest possible UD header: send with GRH and immediate data. + */ + MLX4_IB_UD_HEADER_SIZE = 72 +}; + +struct mlx4_ib_sqp { + struct mlx4_ib_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; +}; + +static const __be32 mlx4_ib_opcode[] = { + [IB_WR_SEND] = __constant_cpu_to_be32(MLX4_OPCODE_SEND), + [IB_WR_SEND_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM), + [IB_WR_RDMA_WRITE] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), + [IB_WR_RDMA_WRITE_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), + [IB_WR_RDMA_READ] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ), + [IB_WR_ATOMIC_CMP_AND_SWP] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), + [IB_WR_ATOMIC_FETCH_AND_ADD] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), +}; + +static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_sqp, qp); +} + +static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + return qp->mqp.qpn >= dev->dev->caps.sqp_start && + qp->mqp.qpn <= dev->dev->caps.sqp_start + 3; +} + +static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + return qp->mqp.qpn >= dev->dev->caps.sqp_start && + qp->mqp.qpn <= dev->dev->caps.sqp_start + 1; +} + +static void *get_wqe(struct mlx4_ib_qp *qp, int offset) +{ + if (qp->buf.nbufs == 1) + return qp->buf.u.direct.buf + offset; + else + return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf + + (offset & (PAGE_SIZE - 1)); +} + +static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with 0xffffffff, except for + * the very first chunk of the WQE. + */ +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + u32 *wqe = get_send_wqe(qp, n); + int i; + + for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16) + wqe[i] = 0xffffffff; +} + +static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + struct ib_event event; + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + "on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int send_wqe_overhead(enum ib_qp_type type) +{ + /* + * UD WQEs must have a datagram segment. + * RC and UC WQEs might have a remote address segment. + * MLX WQEs need two extra inline data segments (for the UD + * header and space for the ICRC). + */ + switch (type) { + case IB_QPT_UD: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg); + case IB_QPT_UC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case IB_QPT_RC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case IB_QPT_SMI: + case IB_QPT_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_UD_HEADER_SIZE + + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, + MLX4_INLINE_ALIGN) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) + + ALIGN(4 + + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)); + default: + return sizeof (struct mlx4_wqe_ctrl_seg); + } +} + +static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + int is_user, int has_srq, struct mlx4_ib_qp *qp) +{ + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->dev->caps.max_wqes || + cap->max_recv_sge > dev->dev->caps.max_rq_sg) + return -EINVAL; + + if (has_srq) { + /* QPs attached to an SRQ should have no RQ */ + if (cap->max_recv_wr) + return -EINVAL; + + qp->rq.wqe_cnt = qp->rq.max_gs = 0; + } else { + /* HW requires >= 1 RQ entry with >= 1 gather entry */ + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + return -EINVAL; + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); + qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + } + + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + + return 0; +} + +static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + enum ib_qp_type type, struct mlx4_ib_qp *qp) +{ + /* Sanity check SQ size before proceeding */ + if (cap->max_send_wr > dev->dev->caps.max_wqes || + cap->max_send_sge > dev->dev->caps.max_sq_sg || + cap->max_inline_data + send_wqe_overhead(type) + + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if ((type == IB_QPT_SMI || type == IB_QPT_GSI) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) + return -EINVAL; + + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge * + sizeof (struct mlx4_wqe_data_seg), + cap->max_inline_data + + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type))); + qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) / + sizeof (struct mlx4_wqe_data_seg); + + /* + * We need to leave 2 KB + 1 WQE of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes); + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes; + cap->max_send_sge = qp->sq.max_gs; + /* We don't support inline sends for kernel QPs (yet) */ + cap->max_inline_data = 0; + + return 0; +} + +static int set_user_sq_size(struct mlx4_ib_qp *qp, + struct mlx4_ib_create_qp *ucmd) +{ + qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; + qp->sq.wqe_shift = ucmd->log_sq_stride; + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + + return 0; +} + +static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp) +{ + int err; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + qp->state = IB_QPS_RESET; + qp->atomic_rd_en = 0; + qp->resp_depth = 0; + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp); + if (err) + goto err; + + if (pd->uobject) { + struct mlx4_ib_create_qp ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err; + } + + qp->sq_no_prefetch = ucmd.sq_no_prefetch; + + err = set_user_sq_size(qp, &ucmd); + if (err) + goto err; + + qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + qp->buf_size, 0); + if (IS_ERR(qp->umem)) { + err = PTR_ERR(qp->umem); + goto err; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem), + ilog2(qp->umem->page_size), &qp->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); + if (err) + goto err_mtt; + + if (!init_attr->srq) { + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &qp->db); + if (err) + goto err_mtt; + } + } else { + qp->sq_no_prefetch = 0; + + err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp); + if (err) + goto err; + + if (!init_attr->srq) { + err = mlx4_ib_db_alloc(dev, &qp->db, 0); + if (err) + goto err; + + *qp->db.db = 0; + } + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { + err = -ENOMEM; + goto err_db; + } + + err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, + &qp->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + if (err) + goto err_mtt; + + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->rq.wrid) { + err = -ENOMEM; + goto err_wrid; + } + } + + err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp); + if (err) + goto err_wrid; + + /* + * Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + else + qp->sq_signal_bits = 0; + + qp->mqp.event = mlx4_ib_qp_event; + + return 0; + +err_wrid: + if (pd->uobject && !init_attr->srq) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); + else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + } + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(qp->umem); + else + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + +err_db: + if (!pd->uobject && !init_attr->srq) + mlx4_ib_db_free(dev, &qp->db); + +err: + return err; +} + +static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX4_QP_STATE_RST; + case IB_QPS_INIT: return MLX4_QP_STATE_INIT; + case IB_QPS_RTR: return MLX4_QP_STATE_RTR; + case IB_QPS_RTS: return MLX4_QP_STATE_RTS; + case IB_QPS_SQD: return MLX4_QP_STATE_SQD; + case IB_QPS_SQE: return MLX4_QP_STATE_SQER; + case IB_QPS_ERR: return MLX4_QP_STATE_ERR; + default: return -1; + } +} + +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) +{ + if (send_cq == recv_cq) + spin_lock_irq(&send_cq->lock); + else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) +{ + if (send_cq == recv_cq) + spin_unlock_irq(&send_cq->lock); + else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} + +static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + int is_user) +{ + struct mlx4_ib_cq *send_cq, *recv_cq; + + if (qp->state != IB_QPS_RESET) + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + + send_cq = to_mcq(qp->ibqp.send_cq); + recv_cq = to_mcq(qp->ibqp.recv_cq); + + mlx4_ib_lock_cqs(send_cq, recv_cq); + + if (!is_user) { + __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL); + if (send_cq != recv_cq) + __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + + mlx4_ib_unlock_cqs(send_cq, recv_cq); + + mlx4_qp_free(dev->dev, &qp->mqp); + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + + if (is_user) { + if (!qp->ibqp.srq) + mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), + &qp->db); + ib_umem_release(qp->umem); + } else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + if (!qp->ibqp.srq) + mlx4_ib_db_free(dev, &qp->db); + } +} + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_sqp *sqp; + struct mlx4_ib_qp *qp; + int err; + + switch (init_attr->qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + { + qp = kmalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, 0, qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibqp.qp_num = qp->mqp.qpn; + + break; + } + case IB_QPT_SMI: + case IB_QPT_GSI: + { + /* Userspace is not allowed to create special QPs: */ + if (pd->uobject) + return ERR_PTR(-EINVAL); + + sqp = kmalloc(sizeof *sqp, GFP_KERNEL); + if (!sqp) + return ERR_PTR(-ENOMEM); + + qp = &sqp->qp; + + err = create_qp_common(dev, pd, init_attr, udata, + dev->dev->caps.sqp_start + + (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) + + init_attr->port_num - 1, + qp); + if (err) { + kfree(sqp); + return ERR_PTR(err); + } + + qp->port = init_attr->port_num; + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; + + break; + } + default: + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +int mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_ib_qp *mqp = to_mqp(qp); + + if (is_qp0(dev, mqp)) + mlx4_CLOSE_PORT(dev->dev, mqp->port); + + destroy_qp_common(dev, mqp, !!qp->pd->uobject); + + if (is_sqp(dev, mqp)) + kfree(to_msqp(mqp)); + else + kfree(mqp); + + return 0; +} + +static int to_mlx4_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX4_QP_ST_RC; + case IB_QPT_UC: return MLX4_QP_ST_UC; + case IB_QPT_UD: return MLX4_QP_ST_UD; + case IB_QPT_SMI: + case IB_QPT_GSI: return MLX4_QP_ST_MLX; + default: return -1; + } +} + +static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + u32 hw_access_flags = 0; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX4_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= MLX4_QP_BIT_RAE; + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX4_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr, + int attr_mask) +{ + if (attr_mask & IB_QP_PKEY_INDEX) + sqp->pkey_index = attr->pkey_index; + if (attr_mask & IB_QP_QKEY) + sqp->qkey = attr->qkey; + if (attr_mask & IB_QP_SQ_PSN) + sqp->send_psn = attr->sq_psn; +} + +static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) +{ + path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); +} + +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, + struct mlx4_qp_path *path, u8 port) +{ + path->grh_mylmc = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + if (ah->static_rate) { + path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET; + while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << path->static_rate & dev->dev->caps.stat_rate_support)) + --path->static_rate; + } else + path->static_rate = 0; + path->counter_index = 0xff; + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) { + printk(KERN_ERR "sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1); + return -1; + } + + path->grh_mylmc |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 0xf) << 2); + + return 0; +} + +static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_qp_context *context; + enum mlx4_qp_optpar optpar = 0; + int sqd_event; + int err = -EINVAL; + + context = kzalloc(sizeof *context, GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | + (to_mlx4_st(ibqp->qp_type) << 16)); + context->flags |= cpu_to_be32(1 << 8); /* DE? */ + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + else { + optpar |= MLX4_QP_OPTPAR_PM_STATE; + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || + ibqp->qp_type == IB_QPT_UD) + context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; + else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { + printk(KERN_ERR "path MTU (%u) is invalid\n", + attr->path_mtu); + return -EINVAL; + } + context->mtu_msgmax = (attr->path_mtu << 5) | 31; + } + + if (qp->rq.wqe_cnt) + context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; + context->rq_size_stride |= qp->rq.wqe_shift - 4; + + if (qp->sq.wqe_cnt) + context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; + context->sq_size_stride |= qp->sq.wqe_shift - 4; + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_size_stride |= !!qp->sq_no_prefetch << 7; + + if (qp->ibqp.uobject) + context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index); + else + context->usr_page = cpu_to_be32(dev->priv_uar.index); + + if (attr_mask & IB_QP_DEST_QPN) + context->remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PORT) { + if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD && + !(attr_mask & IB_QP_AV)) { + mlx4_set_sched(&context->pri_path, attr->port_num); + optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE; + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + context->pri_path.pkey_index = attr->pkey_index; + optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; + } + + if (attr_mask & IB_QP_AV) { + if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) { + err = -EINVAL; + goto out; + } + + optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX4_QP_OPTPAR_SCHED_QUEUE); + } + + if (attr_mask & IB_QP_TIMEOUT) { + context->pri_path.ackto = attr->timeout << 3; + optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num == 0 || + attr->alt_port_num > dev->dev->caps.num_ports) + return -EINVAL; + + if (attr->alt_pkey_index >= + dev->dev->caps.pkey_table_len[attr->alt_port_num]) + return -EINVAL; + + if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + attr->alt_port_num)) + return -EINVAL; + + context->alt_path.pkey_index = attr->alt_pkey_index; + context->alt_path.ackto = attr->alt_timeout << 3; + optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; + } + + context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pdn); + context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) { + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + optpar |= MLX4_QP_OPTPAR_RNR_RETRY; + } + + if (attr_mask & IB_QP_RETRY_CNT) { + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + optpar |= MLX4_QP_OPTPAR_RETRY_COUNT; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_SRA_MAX; + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_RRA_MAX; + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask); + optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; + } + + if (ibqp->srq) + context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT; + } + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn); + + if (attr_mask & IB_QP_QKEY) { + context->qkey = cpu_to_be32(attr->qkey); + optpar |= MLX4_QP_OPTPAR_Q_KEY; + } + + if (ibqp->srq) + context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); + + if (!ibqp->srq && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_INIT && + new_state == IB_QPS_RTR && + (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || + ibqp->qp_type == IB_QPT_UD)) { + context->pri_path.sched_queue = (qp->port - 1) << 6; + if (is_qp0(dev, qp)) + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; + else + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; + } + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + /* + * Before passing a kernel QP to the HW, make sure that the + * ownership bits of the send queue are set and the SQ + * headroom is stamped so that the hardware doesn't start + * processing stale work requests. + */ + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = cpu_to_be32(1 << 31); + + stamp_send_wqe(qp, i); + } + } + + err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state), + to_mlx4_state(new_state), context, optpar, + sqd_event, &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + if (is_sqp(dev, qp)) + store_sqp_attrs(to_msqp(qp), attr, attr_mask); + + /* + * If we moved QP0 to RTR, bring the IB link up; if we moved + * QP0 to RESET or ERROR, bring the link back down. + */ + if (is_qp0(dev, qp)) { + if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) + if (mlx4_INIT_PORT(dev->dev, qp->port)) + printk(KERN_WARNING "INIT_PORT failed for port %d\n", + qp->port); + + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx4_CLOSE_PORT(dev->dev, qp->port); + } + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq): NULL); + if (ibqp->send_cq != ibqp->recv_cq) + mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + if (!ibqp->srq) + *qp->db.db = 0; + } + +out: + kfree(context); + return err; +} + +static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 }; +static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), +}; + +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) + goto out; + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) { + goto out; + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) { + err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr, + mlx4_ib_qp_attr_mask_table[ibqp->qp_type], + IB_QPS_RESET, IB_QPS_INIT); + if (err) + goto out; + cur_state = IB_QPS_INIT; + } + + err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe) +{ + struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + + send_size = 0; + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->ud_header); + + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.g_slid & 0x7f); + if (mlx4_ib_ah_grh_present(ah)) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.grh.flow_label = + ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.hop_limit; + ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24, + ah->av.gid_index, &sqp->ud_header.grh.source_gid); + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.dgid, 16); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->imm_data; + break; + default: + return -EINVAL; + } + + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? + sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + printk(KERN_ERR "built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + printk(" [%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + printk("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + return ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); +} + +static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + unsigned cur; + struct mlx4_ib_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *ctrl; + unsigned long flags; + int nreq; + int err = 0; + int ind; + int size; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->sq.head; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IB_SEND_SIGNALED ? + cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) | + qp->sq_signal_bits; + + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + ctrl->imm = wr->imm_data; + else + ctrl->imm = 0; + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (ibqp->qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + ((struct mlx4_wqe_raddr_seg *) wqe)->raddr = + cpu_to_be64(wr->wr.atomic.remote_addr); + ((struct mlx4_wqe_raddr_seg *) wqe)->rkey = + cpu_to_be32(wr->wr.atomic.rkey); + ((struct mlx4_wqe_raddr_seg *) wqe)->reserved = 0; + + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + ((struct mlx4_wqe_atomic_seg *) wqe)->swap_add = + cpu_to_be64(wr->wr.atomic.swap); + ((struct mlx4_wqe_atomic_seg *) wqe)->compare = + cpu_to_be64(wr->wr.atomic.compare_add); + } else { + ((struct mlx4_wqe_atomic_seg *) wqe)->swap_add = + cpu_to_be64(wr->wr.atomic.compare_add); + ((struct mlx4_wqe_atomic_seg *) wqe)->compare = 0; + } + + wqe += sizeof (struct mlx4_wqe_atomic_seg); + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ((struct mlx4_wqe_raddr_seg *) wqe)->raddr = + cpu_to_be64(wr->wr.rdma.remote_addr); + ((struct mlx4_wqe_raddr_seg *) wqe)->rkey = + cpu_to_be32(wr->wr.rdma.rkey); + ((struct mlx4_wqe_raddr_seg *) wqe)->reserved = 0; + + wqe += sizeof (struct mlx4_wqe_raddr_seg); + size += sizeof (struct mlx4_wqe_raddr_seg) / 16; + + break; + + default: + /* No extra segments required for sends */ + break; + } + break; + + case IB_QPT_UD: + memcpy(((struct mlx4_wqe_datagram_seg *) wqe)->av, + &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + ((struct mlx4_wqe_datagram_seg *) wqe)->dqpn = + cpu_to_be32(wr->wr.ud.remote_qpn); + ((struct mlx4_wqe_datagram_seg *) wqe)->qkey = + cpu_to_be32(wr->wr.ud.remote_qkey); + + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + + case IB_QPT_SMI: + case IB_QPT_GSI: + err = build_mlx_header(to_msqp(qp), wr, ctrl); + if (err < 0) { + *bad_wr = wr; + goto out; + } + wqe += err; + size += err / 16; + + err = 0; + break; + + default: + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + ((struct mlx4_wqe_data_seg *) wqe)->byte_count = + cpu_to_be32(wr->sg_list[i].length); + ((struct mlx4_wqe_data_seg *) wqe)->lkey = + cpu_to_be32(wr->sg_list[i].lkey); + ((struct mlx4_wqe_data_seg *) wqe)->addr = + cpu_to_be64(wr->sg_list[i].addr); + + wqe += sizeof (struct mlx4_wqe_data_seg); + size += sizeof (struct mlx4_wqe_data_seg) / 16; + } + + /* Add one more inline data segment for ICRC for MLX sends */ + if (qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) { + ((struct mlx4_wqe_inline_seg *) wqe)->byte_count = + cpu_to_be32((1 << 31) | 4); + ((u32 *) wqe)[1] = 0; + wqe += sizeof (struct mlx4_wqe_data_seg); + size += sizeof (struct mlx4_wqe_data_seg) / 16; + } + + ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { + err = -EINVAL; + goto out; + } + + ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + */ + if (wr->next) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); + + ++ind; + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + writel(qp->doorbell_qpn, + to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); + + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & + (qp->sq.wqe_cnt - 1)); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.send_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c new file mode 100644 index 0000000..12fac1c --- /dev/null +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mlx4/qp.h> +#include <linux/mlx4/srq.h> + +#include "mlx4_ib.h" +#include "user.h" + +static void *get_wqe(struct mlx4_ib_srq *srq, int n) +{ + int offset = n << srq->msrq.wqe_shift; + + if (srq->buf.nbufs == 1) + return srq->buf.u.direct.buf + offset; + else + return srq->buf.u.page_list[offset >> PAGE_SHIFT].buf + + (offset & (PAGE_SIZE - 1)); +} + +static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX4_EVENT_TYPE_SRQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + "on SRQ %06x\n", type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_srq *srq; + struct mlx4_wqe_srq_next_seg *next; + int desc_size; + int buf_size; + int err; + int i; + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || + init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) + return ERR_PTR(-EINVAL); + + srq = kmalloc(sizeof *srq, GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = max(32UL, + roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) + + srq->msrq.max_gs * + sizeof (struct mlx4_wqe_data_seg))); + srq->msrq.wqe_shift = ilog2(desc_size); + + buf_size = srq->msrq.max * desc_size; + + if (pd->uobject) { + struct mlx4_ib_create_srq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_srq; + } + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + buf_size, 0); + if (IS_ERR(srq->umem)) { + err = PTR_ERR(srq->umem); + goto err_srq; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem), + ilog2(srq->umem->page_size), &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem); + if (err) + goto err_mtt; + + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) + goto err_mtt; + } else { + err = mlx4_ib_db_alloc(dev, &srq->db, 0); + if (err) + goto err_srq; + + *srq->db.db = 0; + + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + err = -ENOMEM; + goto err_db; + } + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift, + &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + if (err) + goto err_mtt; + + srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } + } + + err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, &srq->mtt, + srq->db.dma, &srq->msrq); + if (err) + goto err_wrid; + + srq->msrq.event = mlx4_ib_srq_event; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { + err = -EFAULT; + goto err_wrid; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_wrid: + if (pd->uobject) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + else + kfree(srq->wrid); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &srq->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(srq->umem); + else + mlx4_buf_free(dev->dev, buf_size, &srq->buf); + +err_db: + if (!pd->uobject) + mlx4_ib_db_free(dev, &srq->db); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs (yet?) */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx4_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx4_ib_dev *dev = to_mdev(srq->device); + struct mlx4_ib_srq *msrq = to_msrq(srq); + + mlx4_srq_free(dev->dev, &msrq->msrq); + mlx4_mtt_cleanup(dev->dev, &msrq->mtt); + + if (srq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + kfree(msrq->wrid); + mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, + &msrq->buf); + mlx4_ib_db_free(dev, &msrq->db); + } + + kfree(msrq); + + return 0; +} + +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) +{ + struct mlx4_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx4_wqe_data_seg *) (next + 1); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h new file mode 100644 index 0000000..e2d11be --- /dev/null +++ b/drivers/infiniband/hw/mlx4/user.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_IB_USER_H +#define MLX4_IB_USER_H + +#include <linux/types.h> + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX4_IB_UVERBS_ABI_VERSION 3 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx4_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +struct mlx4_ib_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_ib_resize_cq { + __u64 buf_addr; +}; + +struct mlx4_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved[5]; +}; + +#endif /* MLX4_IB_USER_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c index 27caf3b..4b111a8 100644 --- a/drivers/infiniband/hw/mthca/mthca_av.c +++ b/drivers/infiniband/hw/mthca/mthca_av.c @@ -279,6 +279,7 @@ int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, (be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff; header->grh.flow_label = ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff); + header->grh.hop_limit = ah->av->hop_limit; ib_get_cached_gid(&dev->ib_dev, be32_to_cpu(ah->av->port_pd) >> 24, ah->av->gid_index % dev->limits.gid_table_len, diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 7131446..f40558d 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -37,6 +37,7 @@ #include <linux/completion.h> #include <linux/pci.h> #include <linux/errno.h> +#include <linux/sched.h> #include <asm/io.h> #include <rdma/ib_mad.h> @@ -771,7 +772,7 @@ int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status) MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET); /* - * FW subminor version is at more signifant bits than minor + * FW subminor version is at more significant bits than minor * version, so swap here. */ dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) | diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index cf0868f..be6e1e0 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -37,6 +37,7 @@ */ #include <linux/hardirq.h> +#include <linux/sched.h> #include <asm/io.h> @@ -284,7 +285,7 @@ void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, { struct mthca_cqe *cqe; u32 prod_index; - int nfreed = 0; + int i, nfreed = 0; spin_lock_irq(&cq->lock); @@ -321,6 +322,8 @@ void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, } if (nfreed) { + for (i = 0; i < nfreed; ++i) + set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe)); wmb(); cq->cons_index += nfreed; update_cons_index(dev, cq, nfreed); diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 773145e..aa563e6 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -1250,12 +1250,14 @@ static void __mthca_remove_one(struct pci_dev *pdev) int __mthca_restart_one(struct pci_dev *pdev) { struct mthca_dev *mdev; + int hca_type; mdev = pci_get_drvdata(pdev); if (!mdev) return -ENODEV; + hca_type = mdev->hca_type; __mthca_remove_one(pdev); - return __mthca_init_one(pdev, mdev->hca_type); + return __mthca_init_one(pdev, hca_type); } static int __devinit mthca_init_one(struct pci_dev *pdev, diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 48f7c65..e61f3e6 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -36,6 +36,7 @@ #include <linux/mm.h> #include <linux/scatterlist.h> +#include <linux/sched.h> #include <asm/page.h> diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 1c05486..6bcde1c 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -37,6 +37,7 @@ */ #include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> #include <linux/mm.h> @@ -908,6 +909,8 @@ static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc) return ERR_PTR(err); } + mr->umem = NULL; + return &mr->ibmr; } @@ -1003,11 +1006,13 @@ static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, } kfree(page_list); + mr->umem = NULL; + return &mr->ibmr; } -static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, - int acc, struct ib_udata *udata) +static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) { struct mthca_dev *dev = to_mdev(pd->device); struct ib_umem_chunk *chunk; @@ -1018,20 +1023,26 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, int err = 0; int write_mtt_size; - shift = ffs(region->page_size) - 1; - mr = kmalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); + mr->umem = ib_umem_get(pd->uobject->context, start, length, acc); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err; + } + + shift = ffs(mr->umem->page_size) - 1; + n = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) + list_for_each_entry(chunk, &mr->umem->chunk_list, list) n += chunk->nents; mr->mtt = mthca_alloc_mtt(dev, n); if (IS_ERR(mr->mtt)) { err = PTR_ERR(mr->mtt); - goto err; + goto err_umem; } pages = (u64 *) __get_free_page(GFP_KERNEL); @@ -1044,12 +1055,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); - list_for_each_entry(chunk, ®ion->chunk_list, list) + list_for_each_entry(chunk, &mr->umem->chunk_list, list) for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; for (k = 0; k < len; ++k) { pages[i++] = sg_dma_address(&chunk->page_list[j]) + - region->page_size * k; + mr->umem->page_size * k; /* * Be friendly to write_mtt and pass it chunks * of appropriate size. @@ -1071,8 +1082,8 @@ mtt_done: if (err) goto err_mtt; - err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, region->virt_base, - region->length, convert_access(acc), mr); + err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length, + convert_access(acc), mr); if (err) goto err_mtt; @@ -1082,6 +1093,9 @@ mtt_done: err_mtt: mthca_free_mtt(dev, mr->mtt); +err_umem: + ib_umem_release(mr->umem); + err: kfree(mr); return ERR_PTR(err); @@ -1090,8 +1104,12 @@ err: static int mthca_dereg_mr(struct ib_mr *mr) { struct mthca_mr *mmr = to_mmr(mr); + mthca_free_mr(to_mdev(mr->device), mmr); + if (mmr->umem) + ib_umem_release(mmr->umem); kfree(mmr); + return 0; } diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 1d266ac..262616c 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -73,6 +73,7 @@ struct mthca_mtt; struct mthca_mr { struct ib_mr ibmr; + struct ib_umem *umem; struct mthca_mtt *mtt; }; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index fee60c8..eef415b 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -37,6 +37,7 @@ #include <linux/string.h> #include <linux/slab.h> +#include <linux/sched.h> #include <asm/io.h> @@ -295,7 +296,7 @@ static int to_mthca_st(int transport) } } -static void store_attrs(struct mthca_sqp *sqp, struct ib_qp_attr *attr, +static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr, int attr_mask) { if (attr_mask & IB_QP_PKEY_INDEX) @@ -327,7 +328,7 @@ static void init_port(struct mthca_dev *dev, int port) mthca_warn(dev, "INIT_IB returned status %02x.\n", status); } -static __be32 get_hw_access_flags(struct mthca_qp *qp, struct ib_qp_attr *attr, +static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr, int attr_mask) { u8 dest_rd_atomic; @@ -510,7 +511,7 @@ out: return err; } -static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah, +static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah, struct mthca_qp_path *path, u8 port) { path->g_mylmc = ah->src_path_bits & 0x7f; @@ -538,12 +539,12 @@ static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah, return 0; } -int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, - struct ib_udata *udata) +static int __mthca_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); - enum ib_qp_state cur_state, new_state; struct mthca_mailbox *mailbox; struct mthca_qp_param *qp_param; struct mthca_qp_context *qp_context; @@ -551,60 +552,6 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, u8 status; int err = -EINVAL; - mutex_lock(&qp->mutex); - - if (attr_mask & IB_QP_CUR_STATE) { - cur_state = attr->cur_qp_state; - } else { - spin_lock_irq(&qp->sq.lock); - spin_lock(&qp->rq.lock); - cur_state = qp->state; - spin_unlock(&qp->rq.lock); - spin_unlock_irq(&qp->sq.lock); - } - - new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { - mthca_dbg(dev, "Bad QP transition (transport %d) " - "%d->%d with attr 0x%08x\n", - qp->transport, cur_state, new_state, - attr_mask); - goto out; - } - - if (cur_state == new_state && cur_state == IB_QPS_RESET) { - err = 0; - goto out; - } - - if ((attr_mask & IB_QP_PKEY_INDEX) && - attr->pkey_index >= dev->limits.pkey_table_len) { - mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n", - attr->pkey_index, dev->limits.pkey_table_len-1); - goto out; - } - - if ((attr_mask & IB_QP_PORT) && - (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) { - mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num); - goto out; - } - - if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && - attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { - mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", - attr->max_rd_atomic, dev->limits.max_qp_init_rdma); - goto out; - } - - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && - attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) { - mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n", - attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift); - goto out; - } - mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) { err = PTR_ERR(mailbox); @@ -891,6 +838,98 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, out_mailbox: mthca_free_mailbox(dev, mailbox); +out: + return err; +} + +static const struct ib_qp_attr dummy_init_attr = { .port_num = 1 }; +static const int dummy_init_attr_mask[] = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), +}; + +int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + + mutex_lock(&qp->mutex); + if (attr_mask & IB_QP_CUR_STATE) { + cur_state = attr->cur_qp_state; + } else { + spin_lock_irq(&qp->sq.lock); + spin_lock(&qp->rq.lock); + cur_state = qp->state; + spin_unlock(&qp->rq.lock); + spin_unlock_irq(&qp->sq.lock); + } + + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { + mthca_dbg(dev, "Bad QP transition (transport %d) " + "%d->%d with attr 0x%08x\n", + qp->transport, cur_state, new_state, + attr_mask); + goto out; + } + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= dev->limits.pkey_table_len) { + mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n", + attr->pkey_index, dev->limits.pkey_table_len-1); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) { + mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num); + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { + mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", + attr->max_rd_atomic, dev->limits.max_qp_init_rdma); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) { + mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n", + attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) { + err = __mthca_modify_qp(ibqp, &dummy_init_attr, + dummy_init_attr_mask[ibqp->qp_type], + IB_QPS_RESET, IB_QPS_INIT); + if (err) + goto out; + cur_state = IB_QPS_INIT; + } + + err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); out: mutex_unlock(&qp->mutex); @@ -1862,6 +1901,7 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + qp->rq.next_ind = ind; qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; size0 = 0; } @@ -2244,10 +2284,10 @@ void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, struct mthca_next_seg *next; /* - * For SRQs, all WQEs generate a CQE, so we're always at the - * end of the doorbell chain. + * For SRQs, all receive WQEs generate a CQE, so we're always + * at the end of the doorbell chain. */ - if (qp->ibqp.srq) { + if (qp->ibqp.srq && !is_send) { *new_wqe = 0; return; } diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index 61974b0..b8f05a5 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -34,6 +34,7 @@ #include <linux/slab.h> #include <linux/string.h> +#include <linux/sched.h> #include <asm/io.h> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 87310ee..285c143 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -132,12 +132,46 @@ struct ipoib_cm_data { __be32 mtu; }; +/* + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the second option and wait for a completion on the + * same CQ before destroying QPs attached to our SRQ. + */ + +enum ipoib_cm_state { + IPOIB_CM_RX_LIVE, + IPOIB_CM_RX_ERROR, /* Ignored by stale task */ + IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ +}; + struct ipoib_cm_rx { struct ib_cm_id *id; struct ib_qp *qp; struct list_head list; struct net_device *dev; unsigned long jiffies; + enum ipoib_cm_state state; }; struct ipoib_cm_tx { @@ -165,10 +199,15 @@ struct ipoib_cm_dev_priv { struct ib_srq *srq; struct ipoib_cm_rx_buf *srq_ring; struct ib_cm_id *id; - struct list_head passive_ids; + struct list_head passive_ids; /* state: LIVE */ + struct list_head rx_error_list; /* state: ERROR */ + struct list_head rx_flush_list; /* state: FLUSH, drain not started */ + struct list_head rx_drain_list; /* state: FLUSH, drain started */ + struct list_head rx_reap_list; /* state: FLUSH, drain done */ struct work_struct start_task; struct work_struct reap_task; struct work_struct skb_task; + struct work_struct rx_reap_task; struct delayed_work stale_task; struct sk_buff_head skb_queue; struct list_head start_list; @@ -201,15 +240,17 @@ struct ipoib_dev_priv { struct list_head multicast_list; struct rb_root multicast_tree; - struct delayed_work pkey_task; + struct delayed_work pkey_poll_task; struct delayed_work mcast_task; struct work_struct flush_task; struct work_struct restart_task; struct delayed_work ah_reap_task; + struct work_struct pkey_event_task; struct ib_device *ca; u8 port; u16 pkey; + u16 pkey_index; struct ib_pd *pd; struct ib_mr *mr; struct ib_cq *cq; @@ -333,12 +374,13 @@ struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); void ipoib_ib_dev_flush(struct work_struct *work); +void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); int ipoib_ib_dev_open(struct net_device *dev); int ipoib_ib_dev_up(struct net_device *dev); int ipoib_ib_dev_down(struct net_device *dev, int flush); -int ipoib_ib_dev_stop(struct net_device *dev); +int ipoib_ib_dev_stop(struct net_device *dev, int flush); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct net_device *dev); @@ -386,6 +428,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct net_device *dev); +void ipoib_drain_cq(struct net_device *dev); #ifdef CONFIG_INFINIBAND_IPOIB_CM diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 785bc85..ea74d1e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -37,6 +37,7 @@ #include <net/dst.h> #include <net/icmp.h> #include <linux/icmpv6.h> +#include <linux/delay.h> #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA static int data_debug_level; @@ -55,11 +56,15 @@ MODULE_PARM_DESC(cm_data_debug_level, #define IPOIB_CM_RX_DELAY (3 * 256 * HZ) #define IPOIB_CM_RX_UPDATE_MASK (0x3) -struct ipoib_cm_id { - struct ib_cm_id *id; - int flags; - u32 remote_qpn; - u32 remote_mtu; +static struct ib_qp_attr ipoib_cm_err_attr = { + .qp_state = IB_QPS_ERR +}; + +#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff + +static struct ib_send_wr ipoib_cm_rx_drain_wr = { + .wr_id = IPOIB_CM_RX_DRAIN_WRID, + .opcode = IB_WR_SEND, }; static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, @@ -143,22 +148,61 @@ partial_error: ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); - for (; i >= 0; --i) - ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); + for (; i > 0; --i) + ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); dev_kfree_skb_any(skb); return NULL; } +static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv) +{ + struct ib_send_wr *bad_wr; + struct ipoib_cm_rx *p; + + /* We only reserved 1 extra slot in CQ for drain WRs, so + * make sure we have at most 1 outstanding WR. */ + if (list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) + return; + + /* + * QPs on flush list are error state. This way, a "flush + * error" WC will be immediately generated for each WR we post. + */ + p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) + ipoib_warn(priv, "failed to post drain wr\n"); + + list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); +} + +static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) +{ + struct ipoib_cm_rx *p = ctx; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + unsigned long flags; + + if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) + return; + + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_flush_list); + p->state = IPOIB_CM_RX_FLUSH; + ipoib_cm_start_rx_drain(priv); + spin_unlock_irqrestore(&priv->lock, flags); +} + static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, struct ipoib_cm_rx *p) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_init_attr attr = { - .send_cq = priv->cq, /* does not matter, we never send anything */ + .event_handler = ipoib_cm_rx_event_handler, + .send_cq = priv->cq, /* For drain WR */ .recv_cq = priv->cq, .srq = priv->cm.srq, - .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */ + .cap.max_send_wr = 1, /* For drain WR */ .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, @@ -198,6 +242,27 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev, ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); return ret; } + + /* + * Current Mellanox HCA firmware won't generate completions + * with error for drain WRs unless the QP has been moved to + * RTS first. This work-around leaves a window where a QP has + * moved to error asynchronously, but this will eventually get + * fixed in firmware, so let's not error out if modify QP + * fails. + */ + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return 0; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return 0; + } + return 0; } @@ -237,6 +302,11 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even return -ENOMEM; p->dev = dev; p->id = cm_id; + cm_id->context = p; + p->state = IPOIB_CM_RX_LIVE; + p->jiffies = jiffies; + INIT_LIST_HEAD(&p->list); + p->qp = ipoib_cm_create_rx_qp(dev, p); if (IS_ERR(p->qp)) { ret = PTR_ERR(p->qp); @@ -248,22 +318,24 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even if (ret) goto err_modify; + spin_lock_irq(&priv->lock); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + /* Add this entry to passive ids list head, but do not re-add it + * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ + p->jiffies = jiffies; + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irq(&priv->lock); + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); if (ret) { ipoib_warn(priv, "failed to send REP: %d\n", ret); - goto err_rep; + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); } - - cm_id->context = p; - p->jiffies = jiffies; - spin_lock_irq(&priv->lock); - list_add(&p->list, &priv->cm.passive_ids); - spin_unlock_irq(&priv->lock); - queue_delayed_work(ipoib_workqueue, - &priv->cm.stale_task, IPOIB_CM_RX_DELAY); return 0; -err_rep: err_modify: ib_destroy_qp(p->qp); err_qp: @@ -276,7 +348,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, { struct ipoib_cm_rx *p; struct ipoib_dev_priv *priv; - int ret; switch (event->event) { case IB_CM_REQ_RECEIVED: @@ -288,20 +359,9 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, case IB_CM_REJ_RECEIVED: p = cm_id->context; priv = netdev_priv(p->dev); - spin_lock_irq(&priv->lock); - if (list_empty(&p->list)) - ret = 0; /* Connection is going away already. */ - else { - list_del_init(&p->list); - ret = -ECONNRESET; - } - spin_unlock_irq(&priv->lock); - if (ret) { - ib_destroy_qp(p->qp); - kfree(p); - return ret; - } - return 0; + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + /* Fall through */ default: return 0; } @@ -353,8 +413,15 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { - ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", - wr_id, ipoib_recvq_size); + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) { + spin_lock_irqsave(&priv->lock, flags); + list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); + ipoib_cm_start_rx_drain(priv); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + spin_unlock_irqrestore(&priv->lock, flags); + } else + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); return; } @@ -373,13 +440,11 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { spin_lock_irqsave(&priv->lock, flags); p->jiffies = jiffies; - /* Move this entry to list head, but do - * not re-add it if it has been removed. */ - if (!list_empty(&p->list)) + /* Move this entry to list head, but do not re-add it + * if it has been moved out of list. */ + if (p->state == IPOIB_CM_RX_LIVE) list_move(&p->list, &priv->cm.passive_ids); spin_unlock_irqrestore(&priv->lock, flags); - queue_delayed_work(ipoib_workqueue, - &priv->cm.stale_task, IPOIB_CM_RX_DELAY); } } @@ -593,8 +658,7 @@ int ipoib_cm_dev_open(struct net_device *dev) if (IS_ERR(priv->cm.id)) { printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); ret = PTR_ERR(priv->cm.id); - priv->cm.id = NULL; - return ret; + goto err_cm; } ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), @@ -602,34 +666,76 @@ int ipoib_cm_dev_open(struct net_device *dev) if (ret) { printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, IPOIB_CM_IETF_ID | priv->qp->qp_num); - ib_destroy_cm_id(priv->cm.id); - priv->cm.id = NULL; - return ret; + goto err_listen; } + return 0; + +err_listen: + ib_destroy_cm_id(priv->cm.id); +err_cm: + priv->cm.id = NULL; + return ret; } void ipoib_cm_dev_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_cm_rx *p; + struct ipoib_cm_rx *p, *n; + unsigned long begin; + LIST_HEAD(list); + int ret; if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id) return; ib_destroy_cm_id(priv->cm.id); priv->cm.id = NULL; + spin_lock_irq(&priv->lock); while (!list_empty(&priv->cm.passive_ids)) { p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); - list_del_init(&p->list); + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + /* Wait for all RX to be drained */ + begin = jiffies; + + while (!list_empty(&priv->cm.rx_error_list) || + !list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "RX drain timing out\n"); + + /* + * assume the HW is wedged and just free up everything. + */ + list_splice_init(&priv->cm.rx_flush_list, &list); + list_splice_init(&priv->cm.rx_error_list, &list); + list_splice_init(&priv->cm.rx_drain_list, &list); + break; + } spin_unlock_irq(&priv->lock); + msleep(1); + ipoib_drain_cq(dev); + spin_lock_irq(&priv->lock); + } + + list_splice_init(&priv->cm.rx_reap_list, &list); + + spin_unlock_irq(&priv->lock); + + list_for_each_entry_safe(p, n, &list, list) { ib_destroy_cm_id(p->id); ib_destroy_qp(p->qp); kfree(p); - spin_lock_irq(&priv->lock); } - spin_unlock_irq(&priv->lock); cancel_delayed_work(&priv->cm.stale_task); } @@ -646,9 +752,9 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even p->mtu = be32_to_cpu(data->mtu); - if (p->mtu < priv->dev->mtu + IPOIB_ENCAP_LEN) { - ipoib_warn(priv, "Rejecting connection: mtu %d < device mtu %d + 4\n", - p->mtu, priv->dev->mtu); + if (p->mtu <= IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", + p->mtu, IPOIB_ENCAP_LEN); return -EINVAL; } @@ -1080,26 +1186,50 @@ void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, queue_work(ipoib_workqueue, &priv->cm.skb_task); } +static void ipoib_cm_rx_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.rx_reap_task); + struct ipoib_cm_rx *p, *n; + LIST_HEAD(list); + + spin_lock_irq(&priv->lock); + list_splice_init(&priv->cm.rx_reap_list, &list); + spin_unlock_irq(&priv->lock); + + list_for_each_entry_safe(p, n, &list, list) { + ib_destroy_cm_id(p->id); + ib_destroy_qp(p->qp); + kfree(p); + } +} + static void ipoib_cm_stale_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.stale_task.work); struct ipoib_cm_rx *p; + int ret; spin_lock_irq(&priv->lock); while (!list_empty(&priv->cm.passive_ids)) { - /* List if sorted by LRU, start from tail, + /* List is sorted by LRU, start from tail, * stop when we see a recently used entry */ p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) break; - list_del_init(&p->list); + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; spin_unlock_irq(&priv->lock); - ib_destroy_cm_id(p->id); - ib_destroy_qp(p->qp); - kfree(p); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); spin_lock_irq(&priv->lock); } + + if (!list_empty(&priv->cm.passive_ids)) + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); spin_unlock_irq(&priv->lock); } @@ -1161,9 +1291,14 @@ int ipoib_cm_dev_init(struct net_device *dev) INIT_LIST_HEAD(&priv->cm.passive_ids); INIT_LIST_HEAD(&priv->cm.reap_list); INIT_LIST_HEAD(&priv->cm.start_list); + INIT_LIST_HEAD(&priv->cm.rx_error_list); + INIT_LIST_HEAD(&priv->cm.rx_flush_list); + INIT_LIST_HEAD(&priv->cm.rx_drain_list); + INIT_LIST_HEAD(&priv->cm.rx_reap_list); INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); skb_queue_head_init(&priv->cm.skb_queue); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 68d72c6..8404f05 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -448,6 +448,13 @@ int ipoib_ib_dev_open(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { + ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + return -1; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ret = ipoib_init_qp(dev); if (ret) { ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); @@ -457,14 +464,14 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev); + ipoib_ib_dev_stop(dev, 1); return -1; } ret = ipoib_cm_dev_open(dev); if (ret) { - ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev); + ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); + ipoib_ib_dev_stop(dev, 1); return -1; } @@ -516,7 +523,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work(&priv->pkey_task); + cancel_delayed_work(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); if (flush) flush_workqueue(ipoib_workqueue); @@ -543,13 +550,30 @@ static int recvs_pending(struct net_device *dev) return pending; } -int ipoib_ib_dev_stop(struct net_device *dev) +void ipoib_drain_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, n; + do { + n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc); + for (i = 0; i < n; ++i) { + if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ) + ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); + else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) + ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_tx_wc(dev, priv->ibwc + i); + } + } while (n == IPOIB_NUM_WC); +} + +int ipoib_ib_dev_stop(struct net_device *dev, int flush) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; unsigned long begin; struct ipoib_tx_buf *tx_req; - int i, n; + int i; clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); netif_poll_disable(dev); @@ -604,17 +628,7 @@ int ipoib_ib_dev_stop(struct net_device *dev) goto timeout; } - do { - n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc); - for (i = 0; i < n; ++i) { - if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ) - ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); - else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) - ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); - else - ipoib_ib_handle_tx_wc(dev, priv->ibwc + i); - } - } while (n == IPOIB_NUM_WC); + ipoib_drain_cq(dev); msleep(1); } @@ -629,7 +643,8 @@ timeout: /* Wait for all AHs to be reaped */ set_bit(IPOIB_STOP_REAPER, &priv->flags); cancel_delayed_work(&priv->ah_reap_task); - flush_workqueue(ipoib_workqueue); + if (flush) + flush_workqueue(ipoib_workqueue); begin = jiffies; @@ -673,13 +688,24 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) return 0; } -void ipoib_ib_dev_flush(struct work_struct *work) +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event) { - struct ipoib_dev_priv *cpriv, *priv = - container_of(work, struct ipoib_dev_priv, flush_task); + struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; + u16 new_index; + + mutex_lock(&priv->vlan_mutex); + + /* + * Flush any child interfaces too -- they might be up even if + * the parent is down. + */ + list_for_each_entry(cpriv, &priv->child_intfs, list) + __ipoib_ib_dev_flush(cpriv, pkey_event); - if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) ) { + mutex_unlock(&priv->vlan_mutex); + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } @@ -689,10 +715,32 @@ void ipoib_ib_dev_flush(struct work_struct *work) return; } + if (pkey_event) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_ib_dev_down(dev, 0); + ipoib_pkey_dev_delay_open(dev); + return; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + /* restart QP only if P_Key index is changed */ + if (new_index == priv->pkey_index) { + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); + return; + } + priv->pkey_index = new_index; + } + ipoib_dbg(priv, "flushing\n"); ipoib_ib_dev_down(dev, 0); + if (pkey_event) { + ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_open(dev); + } + /* * The device could have been brought down between the start and when * we get here, don't bring it back up if it's not configured up @@ -701,14 +749,24 @@ void ipoib_ib_dev_flush(struct work_struct *work) ipoib_ib_dev_up(dev); ipoib_mcast_restart_task(&priv->restart_task); } +} - mutex_lock(&priv->vlan_mutex); +void ipoib_ib_dev_flush(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_task); - /* Flush any child interfaces too */ - list_for_each_entry(cpriv, &priv->child_intfs, list) - ipoib_ib_dev_flush(&cpriv->flush_task); + ipoib_dbg(priv, "Flushing %s\n", priv->dev->name); + __ipoib_ib_dev_flush(priv, 0); +} - mutex_unlock(&priv->vlan_mutex); +void ipoib_pkey_event(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, pkey_event_task); + + ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name); + __ipoib_ib_dev_flush(priv, 1); } void ipoib_ib_dev_cleanup(struct net_device *dev) @@ -736,7 +794,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) void ipoib_pkey_poll(struct work_struct *work) { struct ipoib_dev_priv *priv = - container_of(work, struct ipoib_dev_priv, pkey_task.work); + container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); struct net_device *dev = priv->dev; ipoib_pkey_dev_check_presence(dev); @@ -747,7 +805,7 @@ void ipoib_pkey_poll(struct work_struct *work) mutex_lock(&pkey_mutex); if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) queue_delayed_work(ipoib_workqueue, - &priv->pkey_task, + &priv->pkey_poll_task, HZ); mutex_unlock(&pkey_mutex); } @@ -766,7 +824,7 @@ int ipoib_pkey_dev_delay_open(struct net_device *dev) mutex_lock(&pkey_mutex); clear_bit(IPOIB_PKEY_STOP, &priv->flags); queue_delayed_work(ipoib_workqueue, - &priv->pkey_task, + &priv->pkey_poll_task, HZ); mutex_unlock(&pkey_mutex); return 1; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 0a428f2..894b1dcd 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -107,7 +107,7 @@ int ipoib_open(struct net_device *dev) return -EINVAL; if (ipoib_ib_dev_up(dev)) { - ipoib_ib_dev_stop(dev); + ipoib_ib_dev_stop(dev, 1); return -EINVAL; } @@ -152,7 +152,7 @@ static int ipoib_stop(struct net_device *dev) flush_workqueue(ipoib_workqueue); ipoib_ib_dev_down(dev, 1); - ipoib_ib_dev_stop(dev); + ipoib_ib_dev_stop(dev, 1); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; @@ -988,7 +988,8 @@ static void ipoib_setup(struct net_device *dev) INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); - INIT_DELAYED_WORK(&priv->pkey_task, ipoib_pkey_poll); + INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); + INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 54fbead..aae3670 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -524,7 +524,7 @@ void ipoib_mcast_join_task(struct work_struct *work) return; if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) - ipoib_warn(priv, "ib_gid_entry_get() failed\n"); + ipoib_warn(priv, "ib_query_gid() failed\n"); else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 5c3c6a4..982eb88 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -33,8 +33,6 @@ * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $ */ -#include <rdma/ib_cache.h> - #include "ipoib.h" int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid) @@ -49,7 +47,7 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid) if (!qp_attr) goto out; - if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = -ENXIO; goto out; @@ -94,26 +92,16 @@ int ipoib_init_qp(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; - u16 pkey_index; struct ib_qp_attr qp_attr; int attr_mask; - /* - * Search through the port P_Key table for the requested pkey value. - * The port has to be assigned to the respective IB partition in - * advance. - */ - ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index); - if (ret) { - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - return ret; - } - set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return -1; qp_attr.qp_state = IB_QPS_INIT; qp_attr.qkey = 0; qp_attr.port_num = priv->port; - qp_attr.pkey_index = pkey_index; + qp_attr.pkey_index = priv->pkey_index; attr_mask = IB_QP_QKEY | IB_QP_PORT | @@ -185,7 +173,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) size = ipoib_sendq_size + ipoib_recvq_size + 1; ret = ipoib_cm_dev_init(dev); if (!ret) - size += ipoib_recvq_size; + size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */; priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); if (IS_ERR(priv->cq)) { @@ -259,14 +247,18 @@ void ipoib_event(struct ib_event_handler *handler, struct ipoib_dev_priv *priv = container_of(handler, struct ipoib_dev_priv, event_handler); - if ((record->event == IB_EVENT_PORT_ERR || - record->event == IB_EVENT_PKEY_CHANGE || - record->event == IB_EVENT_PORT_ACTIVE || - record->event == IB_EVENT_LID_CHANGE || - record->event == IB_EVENT_SM_CHANGE || - record->event == IB_EVENT_CLIENT_REREGISTER) && - record->element.port_num == priv->port) { + if (record->element.port_num != priv->port) + return; + + if (record->event == IB_EVENT_PORT_ERR || + record->event == IB_EVENT_PORT_ACTIVE || + record->event == IB_EVENT_LID_CHANGE || + record->event == IB_EVENT_SM_CHANGE || + record->event == IB_EVENT_CLIENT_REREGISTER) { ipoib_dbg(priv, "Port state change event\n"); queue_work(ipoib_workqueue, &priv->flush_task); + } else if (record->event == IB_EVENT_PKEY_CHANGE) { + ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port); + queue_work(ipoib_workqueue, &priv->pkey_event_task); } } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 89d6008..3702e23 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -35,7 +35,6 @@ #include <asm/io.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/delay.h> #include <linux/version.h> |