From 498c43949c7b8f57e0afb8195019cf5a7ba72de0 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 20 Jul 2015 10:14:08 -0600 Subject: NVMe: Unify SQ entry writing and doorbell ringing This patch changes sq_cmd writers to instead create their command on the stack. __nvme_submit_cmd copies the sq entry to the queue and writes the doorbell. Signed-off-by: Jon Derrick Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d844ec4..e09ad6c 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -730,18 +730,16 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, struct nvme_iod *iod) { - struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + struct nvme_command cmnd; - memcpy(cmnd, req->cmd, sizeof(struct nvme_command)); - cmnd->rw.command_id = req->tag; + memcpy(&cmnd, req->cmd, sizeof(cmnd)); + cmnd.rw.command_id = req->tag; if (req->nr_phys_segments) { - cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); } - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); + __nvme_submit_cmd(nvmeq, &cmnd); } /* @@ -754,45 +752,41 @@ static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, { struct nvme_dsm_range *range = (struct nvme_dsm_range *)iod_list(iod)[0]; - struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + struct nvme_command cmnd; range->cattr = cpu_to_le32(0); range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.command_id = req->tag; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd->dsm.nr = 0; - cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + memset(&cmnd, 0, sizeof(cmnd)); + cmnd.dsm.opcode = nvme_cmd_dsm; + cmnd.dsm.command_id = req->tag; + cmnd.dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd.dsm.nr = 0; + cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); + __nvme_submit_cmd(nvmeq, &cmnd); } static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { - struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + struct nvme_command cmnd; - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.command_id = cmdid; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); + memset(&cmnd, 0, sizeof(cmnd)); + cmnd.common.opcode = nvme_cmd_flush; + cmnd.common.command_id = cmdid; + cmnd.common.nsid = cpu_to_le32(ns->ns_id); - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); + __nvme_submit_cmd(nvmeq, &cmnd); } static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct nvme_ns *ns) { struct request *req = iod_get_private(iod); - struct nvme_command *cmnd; + struct nvme_command cmnd; u16 control = 0; u32 dsmgmt = 0; @@ -804,19 +798,17 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - cmnd = 
&nvmeq->sq_cmds[nvmeq->sq_tail]; - memset(cmnd, 0, sizeof(*cmnd)); - - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd->rw.command_id = req->tag; - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + memset(&cmnd, 0, sizeof(cmnd)); + cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd.rw.command_id = req->tag; + cmnd.rw.nsid = cpu_to_le32(ns->ns_id); + cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); + cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); if (blk_integrity_rq(req)) { - cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + cmnd.rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: control |= NVME_RW_PRINFO_PRCHK_GUARD; @@ -825,19 +817,17 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; - cmnd->rw.reftag = cpu_to_le32( + cmnd.rw.reftag = cpu_to_le32( nvme_block_nr(ns, blk_rq_pos(req))); break; } } else if (ns->ms) control |= NVME_RW_PRINFO_PRACT; - cmnd->rw.control = cpu_to_le16(control); - cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + cmnd.rw.control = cpu_to_le16(control); + cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt); - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); + __nvme_submit_cmd(nvmeq, &cmnd); return 0; } -- cgit v0.10.2 From 8ffaadf7429270914b8f146ec13cf305e01df20d Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 20 Jul 2015 10:14:09 -0600 Subject: NVMe: Use CMB for the IO SQes if available Some controllers have a controller-side memory buffer available for use for submissions, completions, lists, or data. If a CMB is available, the entire CMB will be ioremapped and the driver will attempt to map the IO SQes onto it. The queues will be shrunk as needed. The CMB will not be used if the queue depth would shrink below a threshold at which it may perform worse than a larger queue in system memory.
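For reference, a minimal user-space sketch of how the CMBSZ/CMBLOC register fields decode into a buffer size and offset, mirroring what nvme_map_cmb() computes in the diff below. The macros are the ones this patch adds to include/linux/nvme.h; the register values are made-up examples, not from any real controller:

#include <stdint.h>
#include <stdio.h>

#define NVME_CMB_BIR(cmbloc)	((cmbloc) & 0x7)
#define NVME_CMB_OFST(cmbloc)	(((cmbloc) >> 12) & 0xfffff)
#define NVME_CMB_SZ(cmbsz)	(((cmbsz) >> 12) & 0xfffff)
#define NVME_CMB_SZU(cmbsz)	(((cmbsz) >> 8) & 0xf)

int main(void)
{
	uint32_t cmbsz  = (4096u << 12) | (0 << 8) | 0x1; /* SZ=4096 units, SZU=0 (4K), SQS set */
	uint32_t cmbloc = (16u << 12) | 2;                /* OFST=16 units, BAR indicator 2 */

	uint64_t szu    = 1ULL << (12 + 4 * NVME_CMB_SZU(cmbsz)); /* size unit: 4K here */
	uint64_t size   = szu * NVME_CMB_SZ(cmbsz);               /* 16 MiB */
	uint64_t offset = szu * NVME_CMB_OFST(cmbloc);            /* 64 KiB into the BAR */

	printf("BAR %u, offset %llu, size %llu\n", NVME_CMB_BIR(cmbloc),
	       (unsigned long long)offset, (unsigned long long)size);
	return 0;
}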
Signed-off-by: Jon Derrick Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e09ad6c..82b4ffb 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -72,6 +72,10 @@ module_param(nvme_char_major, int, 0); static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); +static bool use_cmb_sqes = true; +module_param(use_cmb_sqes, bool, 0644); +MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); + static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; @@ -103,6 +107,7 @@ struct nvme_queue { char irqname[24]; /* nvme4294967295-65535\0 */ spinlock_t q_lock; struct nvme_command *sq_cmds; + struct nvme_command __iomem *sq_cmds_io; volatile struct nvme_completion *cqes; struct blk_mq_tags **tags; dma_addr_t sq_dma_addr; @@ -383,7 +388,11 @@ static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { u16 tail = nvmeq->sq_tail; - memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + if (nvmeq->sq_cmds_io) + memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd)); + else + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + if (++tail == nvmeq->q_depth) tail = 0; writel(tail, nvmeq->q_db); @@ -1364,7 +1373,8 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) { dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + if (nvmeq->sq_cmds) + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); kfree(nvmeq); } @@ -1437,6 +1447,46 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid) spin_unlock_irq(&nvmeq->q_lock); } +static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, + int entry_size) +{ + int q_depth = dev->q_depth; + unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size); + + if (q_size_aligned * nr_io_queues > dev->cmb_size) { + q_depth = rounddown(dev->cmb_size / nr_io_queues, + dev->page_size) / entry_size; + + /* + * Ensure the reduced q_depth is above some threshold where it + * would be better to map queues in system memory with the + * original depth + */ + if (q_depth < 64) + return -ENOMEM; + } + + return q_depth; +} + +static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, + int qid, int depth) +{ + if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { + unsigned offset = (qid - 1) * + roundup(SQ_SIZE(depth), dev->page_size); + nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; + nvmeq->sq_cmds_io = dev->cmb + offset; + } else { + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + return -ENOMEM; + } + + return 0; +} + static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) { @@ -1449,9 +1499,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq->cqes) goto free_nvmeq; - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - if (!nvmeq->sq_cmds) + if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) goto free_cqdma; nvmeq->q_dmadev = dev->dev; @@ -2149,6 +2197,58 @@ static int set_queue_count(struct nvme_dev *dev, int count) return min(result & 0xffff, result >> 16) + 1; } +static void __iomem *nvme_map_cmb(struct nvme_dev *dev) +{ + u64 szu, size, offset; + u32 cmbloc; + 
resource_size_t bar_size; + struct pci_dev *pdev = to_pci_dev(dev->dev); + void __iomem *cmb; + dma_addr_t dma_addr; + + if (!use_cmb_sqes) + return NULL; + + dev->cmbsz = readl(&dev->bar->cmbsz); + if (!(NVME_CMB_SZ(dev->cmbsz))) + return NULL; + + cmbloc = readl(&dev->bar->cmbloc); + + szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); + size = szu * NVME_CMB_SZ(dev->cmbsz); + offset = szu * NVME_CMB_OFST(cmbloc); + bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc)); + + if (offset > bar_size) + return NULL; + + /* + * Controllers may support a CMB size larger than their BAR, + * for example, due to being behind a bridge. Reduce the CMB to + * the reported size of the BAR + */ + if (size > bar_size - offset) + size = bar_size - offset; + + dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset; + cmb = ioremap_wc(dma_addr, size); + if (!cmb) + return NULL; + + dev->cmb_dma_addr = dma_addr; + dev->cmb_size = size; + return cmb; +} + +static inline void nvme_release_cmb(struct nvme_dev *dev) +{ + if (dev->cmb) { + iounmap(dev->cmb); + dev->cmb = NULL; + } +} + static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) { return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); @@ -2167,6 +2267,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result < nr_io_queues) nr_io_queues = result; + if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { + result = nvme_cmb_qdepth(dev, nr_io_queues, + sizeof(struct nvme_command)); + if (result > 0) + dev->q_depth = result; + else + nvme_release_cmb(dev); + } + size = db_bar_size(dev, nr_io_queues); if (size > 8192) { iounmap(dev->bar); @@ -2430,6 +2539,8 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); dev->dbs = ((void __iomem *)dev->bar) + 4096; + if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) + dev->cmb = nvme_map_cmb(dev); return 0; @@ -3135,6 +3246,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove_admin(dev); device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); + nvme_release_cmb(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c0d94ed..fa3fe16 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -32,6 +32,8 @@ struct nvme_bar { __u32 aqa; /* Admin Queue Attributes */ __u64 asq; /* Admin SQ Base Address */ __u64 acq; /* Admin CQ Base Address */ + __u32 cmbloc; /* Controller Memory Buffer Location */ + __u32 cmbsz; /* Controller Memory Buffer Size */ }; #define NVME_CAP_MQES(cap) ((cap) & 0xffff) @@ -40,6 +42,17 @@ struct nvme_bar { #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) +#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) +#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) +#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) +#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) + +#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) +#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) +#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) +#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) +#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) + enum { NVME_CC_ENABLE = 1 << 0, NVME_CC_CSS_NVM = 0 << 4, @@ -100,6 +113,10 @@ struct nvme_dev { u32 max_hw_sectors; u32 stripe_size; u32 page_size; + void __iomem *cmb; + dma_addr_t cmb_dma_addr; + u64 cmb_size; + u32 cmbsz; u16 oncs; u16 abort_limit; u8 event_limit; -- cgit v0.10.2 From 
c45f5c9943ce0b16b299b543c2aae12408039027 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 21 Jul 2015 15:08:13 -0600 Subject: nvme: Fixes u64 division which breaks i386 builds Uses div_u64 for u64 division and round_down, a bitwise operation, instead of rounddown, which uses a modulus. Signed-off-by: Jon Derrick Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 82b4ffb..666e994 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1454,8 +1454,9 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size); if (q_size_aligned * nr_io_queues > dev->cmb_size) { - q_depth = rounddown(dev->cmb_size / nr_io_queues, - dev->page_size) / entry_size; + u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); + mem_per_q = round_down(mem_per_q, dev->page_size); + q_depth = div_u64(mem_per_q, entry_size); /* * Ensure the reduced q_depth is above some threshold where it -- cgit v0.10.2 From 7e2893a16d3e71035a38122a77bc55848a29f0e4 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:00 +0200 Subject: nbd: Fix timeout detection At the moment the nbd timeout just detects hanging TCP operations. This is not enough to detect a hanging or bad connection as expected of a timeout. This patch redesigns the timeout detection to include some more cases. The timeout is now in relation to replies from the server. If the server does not send replies within the timeout the connection will be shut down. The patch adds a continuous timer 'timeout_timer' that is set up in one of two cases: - The request list is empty and we are sending the first request out to the server. We want to have a reply within the given timeout, otherwise we consider the connection to be dead. - A server response was received. This means the server is still communicating with us. The timer is reset to the timeout value. The timer is not stopped if the list becomes empty. It will just trigger a timeout which will directly leave the handling routine again as the request list is empty. The whole patch does not use any additional explicit locking. The list_empty() calls are safe to be used concurrently. The timer is locked internally as we just use mod_timer and del_timer_sync(). The patch is based on the idea of Michal Belczyk with a previous different implementation.
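Condensed from the diff below, the timer life cycle looks like this (a sketch, not a standalone compilable unit; all names are the ones the patch uses):

/* 1. First request while the queue is empty: arm the timer. */
if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
	mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

/* 2. Every reply received from the server: push the deadline out again. */
if (!send && nbd->xmit_timeout)
	mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

/* 3. Timer fires: if requests are still outstanding, the connection is
 * considered dead and the receiver and sender tasks are killed; an empty
 * request list just means the device is idle, so the handler returns. */

/* 4. Socket shutdown: stop the timer for good. */
del_timer_sync(&nbd->timeout_timer);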
Cc: Michal Belczyk Cc: Hermann Lauer Signed-off-by: Markus Pargmann Tested-by: Hermann Lauer Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f169faf..f3536e6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -59,6 +59,10 @@ struct nbd_device { pid_t pid; /* pid of nbd-client, if attached */ int xmit_timeout; int disconnect; /* a disconnect has been requested by user */ + + struct timer_list timeout_timer; + struct task_struct *task_recv; + struct task_struct *task_send; }; #define NBD_MAGIC 0x68797548 @@ -121,6 +125,7 @@ static void sock_shutdown(struct nbd_device *nbd, int lock) dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); kernel_sock_shutdown(nbd->sock, SHUT_RDWR); nbd->sock = NULL; + del_timer_sync(&nbd->timeout_timer); } if (lock) mutex_unlock(&nbd->tx_lock); @@ -128,11 +133,23 @@ static void sock_shutdown(struct nbd_device *nbd, int lock) static void nbd_xmit_timeout(unsigned long arg) { - struct task_struct *task = (struct task_struct *)arg; + struct nbd_device *nbd = (struct nbd_device *)arg; + struct task_struct *task; + + if (list_empty(&nbd->queue_head)) + return; + + nbd->disconnect = 1; + + task = READ_ONCE(nbd->task_recv); + if (task) + force_sig(SIGKILL, task); - printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n", - task->comm, task->pid); - force_sig(SIGKILL, task); + task = READ_ONCE(nbd->task_send); + if (task) + force_sig(SIGKILL, nbd->task_send); + + dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n"); } /* @@ -171,33 +188,12 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, msg.msg_controllen = 0; msg.msg_flags = msg_flags | MSG_NOSIGNAL; - if (send) { - struct timer_list ti; - - if (nbd->xmit_timeout) { - init_timer(&ti); - ti.function = nbd_xmit_timeout; - ti.data = (unsigned long)current; - ti.expires = jiffies + nbd->xmit_timeout; - add_timer(&ti); - } + if (send) result = kernel_sendmsg(sock, &msg, &iov, 1, size); - if (nbd->xmit_timeout) - del_timer_sync(&ti); - } else + else result = kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); - if (signal_pending(current)) { - siginfo_t info; - printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n", - task_pid_nr(current), current->comm, - dequeue_signal_lock(current, &current->blocked, &info)); - result = -EINTR; - sock_shutdown(nbd, !send); - break; - } - if (result <= 0) { if (result == 0) result = -EPIPE; /* short read */ @@ -210,6 +206,9 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, sigprocmask(SIG_SETMASK, &oldset, NULL); tsk_restore_flags(current, pflags, PF_MEMALLOC); + if (!send && nbd->xmit_timeout) + mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); + return result; } @@ -415,12 +414,26 @@ static int nbd_do_it(struct nbd_device *nbd) return ret; } + nbd->task_recv = current; + while ((req = nbd_read_stat(nbd)) != NULL) nbd_end_request(nbd, req); + nbd->task_recv = NULL; + + if (signal_pending(current)) { + siginfo_t info; + + ret = dequeue_signal_lock(current, &current->blocked, &info); + dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", + task_pid_nr(current), current->comm, ret); + sock_shutdown(nbd, 1); + ret = -ETIMEDOUT; + } + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); nbd->pid = 0; - return 0; + return ret; } static void nbd_clear_que(struct nbd_device *nbd) @@ -482,6 +495,9 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) nbd->active_req = req; + if 
(nbd->xmit_timeout && list_empty_careful(&nbd->queue_head)) + mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); + if (nbd_send_req(nbd, req) != 0) { dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; @@ -508,6 +524,8 @@ static int nbd_thread(void *data) struct nbd_device *nbd = data; struct request *req; + nbd->task_send = current; + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ @@ -515,6 +533,18 @@ static int nbd_thread(void *data) kthread_should_stop() || !list_empty(&nbd->waiting_queue)); + if (signal_pending(current)) { + siginfo_t info; + int ret; + + ret = dequeue_signal_lock(current, &current->blocked, + &info); + dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", + task_pid_nr(current), current->comm, ret); + sock_shutdown(nbd, 1); + break; + } + /* extract request */ if (list_empty(&nbd->waiting_queue)) continue; @@ -528,6 +558,9 @@ static int nbd_thread(void *data) /* handle request */ nbd_handle_req(nbd, req); } + + nbd->task_send = NULL; + return 0; } @@ -648,6 +681,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_TIMEOUT: nbd->xmit_timeout = arg * HZ; + if (arg) + mod_timer(&nbd->timeout_timer, + jiffies + nbd->xmit_timeout); + else + del_timer_sync(&nbd->timeout_timer); + return 0; case NBD_SET_FLAGS: @@ -842,6 +881,9 @@ static int __init nbd_init(void) spin_lock_init(&nbd_dev[i].queue_lock); INIT_LIST_HEAD(&nbd_dev[i].queue_head); mutex_init(&nbd_dev[i].tx_lock); + init_timer(&nbd_dev[i].timeout_timer); + nbd_dev[i].timeout_timer.function = nbd_xmit_timeout; + nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; init_waitqueue_head(&nbd_dev[i].active_wq); init_waitqueue_head(&nbd_dev[i].waiting_wq); nbd_dev[i].blksize = 1024; -- cgit v0.10.2 From 36e47bee7c9bc7771aad6e972dea92e2324338a4 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:01 +0200 Subject: nbd: sock_shutdown, remove conditional lock Move the conditional lock from sock_shutdown into the surrounding code.
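The net effect, condensed from the diff below, is that callers which need the lock now take it explicitly around the call instead of passing a flag:

/* Before: sock_shutdown(nbd, 1) took tx_lock itself when asked to.
 * After: the caller holds the lock across the call. */
mutex_lock(&nbd->tx_lock);
sock_shutdown(nbd);
mutex_unlock(&nbd->tx_lock);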
Signed-off-by: Markus Pargmann Acked-by: Pavel Machek Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f3536e6..bf0de30 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -117,18 +117,14 @@ static void nbd_end_request(struct nbd_device *nbd, struct request *req) /* * Forcibly shutdown the socket causing all listeners to error */ -static void sock_shutdown(struct nbd_device *nbd, int lock) +static void sock_shutdown(struct nbd_device *nbd) { - if (lock) - mutex_lock(&nbd->tx_lock); if (nbd->sock) { dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); kernel_sock_shutdown(nbd->sock, SHUT_RDWR); nbd->sock = NULL; del_timer_sync(&nbd->timeout_timer); } - if (lock) - mutex_unlock(&nbd->tx_lock); } static void nbd_xmit_timeout(unsigned long arg) @@ -427,7 +423,9 @@ static int nbd_do_it(struct nbd_device *nbd) ret = dequeue_signal_lock(current, &current->blocked, &info); dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", task_pid_nr(current), current->comm, ret); - sock_shutdown(nbd, 1); + mutex_lock(&nbd->tx_lock); + sock_shutdown(nbd); + mutex_unlock(&nbd->tx_lock); ret = -ETIMEDOUT; } @@ -541,7 +539,9 @@ static int nbd_thread(void *data) &info); dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", task_pid_nr(current), current->comm, ret); - sock_shutdown(nbd, 1); + mutex_lock(&nbd->tx_lock); + sock_shutdown(nbd); + mutex_unlock(&nbd->tx_lock); break; } @@ -735,7 +735,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, mutex_lock(&nbd->tx_lock); if (error) return error; - sock_shutdown(nbd, 0); + sock_shutdown(nbd); sock = nbd->sock; nbd->sock = NULL; nbd_clear_que(nbd); -- cgit v0.10.2 From 260bbce403e2ac601b422fd926f48b9924051f92 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:02 +0200 Subject: nbd: restructure sock_shutdown This patch restructures sock_shutdown to avoid having the main code path in an if block. Signed-off-by: Markus Pargmann Acked-by: Pavel Machek Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index bf0de30..137b45f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -119,12 +119,13 @@ static void nbd_end_request(struct nbd_device *nbd, struct request *req) */ static void sock_shutdown(struct nbd_device *nbd) { - if (nbd->sock) { - dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); - nbd->sock = NULL; - del_timer_sync(&nbd->timeout_timer); - } + if (!nbd->sock) + return; + + dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); + kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + nbd->sock = NULL; + del_timer_sync(&nbd->timeout_timer); } static void nbd_xmit_timeout(unsigned long arg) -- cgit v0.10.2 From 193918307f4e66eb6a811f30795991c6f7680b34 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:03 +0200 Subject: nbd: Remove 'harderror' and propagate error properly Instead of a variable 'harderror' we can simply try to correctly propagate errors to userspace. This patch removes the harderror variable and passes errors through error pointers and nbd_do_it back to userspace.
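The error-pointer idiom the patch switches to, in its general form: an errno value is encoded into an otherwise invalid pointer on the producer side and decoded back on the consumer side. This is a generic sketch of the kernel's ERR_PTR/IS_ERR/PTR_ERR helpers, not NBD code; get_foo, do_something and real_foo are illustrative names only:

/* producer */
static struct foo *get_foo(void)
{
	int err = do_something();	/* hypothetical helper */
	if (err < 0)
		return ERR_PTR(err);	/* encode the errno in the pointer */
	return real_foo;
}

/* consumer */
struct foo *f = get_foo();
if (IS_ERR(f))
	return PTR_ERR(f);		/* decode it back to an int */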
Signed-off-by: Markus Pargmann Acked-by: Pavel Machek Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 137b45f..4b317ae 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -41,7 +41,6 @@ struct nbd_device { int flags; - int harderror; /* Code of hard error */ struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; @@ -329,26 +328,24 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); - goto harderror; + return ERR_PTR(result); } if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", (unsigned long)ntohl(reply.magic)); - result = -EPROTO; - goto harderror; + return ERR_PTR(-EPROTO); } req = nbd_find_request(nbd, *(struct request **)reply.handle); if (IS_ERR(req)) { result = PTR_ERR(req); if (result != -ENOENT) - goto harderror; + return ERR_PTR(result); dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", reply.handle); - result = -EBADR; - goto harderror; + return ERR_PTR(-EBADR); } if (ntohl(reply.error)) { @@ -376,9 +373,6 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) } } return req; -harderror: - nbd->harderror = result; - return NULL; } static ssize_t pid_show(struct device *dev, @@ -413,8 +407,15 @@ static int nbd_do_it(struct nbd_device *nbd) nbd->task_recv = current; - while ((req = nbd_read_stat(nbd)) != NULL) + while (1) { + req = nbd_read_stat(nbd); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + nbd_end_request(nbd, req); + } nbd->task_recv = NULL; @@ -734,8 +735,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, kthread_stop(thread); mutex_lock(&nbd->tx_lock); - if (error) - return error; + sock_shutdown(nbd); sock = nbd->sock; nbd->sock = NULL; @@ -754,7 +754,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, blkdev_reread_part(bdev); if (nbd->disconnect) /* user requested, ignore socket errors */ return 0; - return nbd->harderror; + return error; } case NBD_CLEAR_QUE: -- cgit v0.10.2 From e78273c80b213806ec7c51176ec81e034fe5cb9f Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:04 +0200 Subject: nbd: Move clear queue debug message This message was logged as a warning without good reason. This patch moves it into nbd_clear_que and turns it into a debug message. Signed-off-by: Markus Pargmann Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4b317ae..a4f2c7b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -468,6 +468,7 @@ static void nbd_clear_que(struct nbd_device *nbd) req->errors++; nbd_end_request(nbd, req); } + dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); } @@ -740,7 +741,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, sock = nbd->sock; nbd->sock = NULL; nbd_clear_que(nbd); - dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); kill_bdev(bdev); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); set_device_ro(bdev, false); -- cgit v0.10.2 From 6521d39a64b3f9c3acb0fd25a34cfaf9a40e548e Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:05 +0200 Subject: nbd: Remove variable 'pid' This patch uses nbd->task_recv to determine the value of the previously used variable 'pid' for sysfs.
Signed-off-by: Markus Pargmann Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a4f2c7b..eeefa5c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -55,7 +55,6 @@ struct nbd_device { struct gendisk *disk; int blksize; loff_t bytesize; - pid_t pid; /* pid of nbd-client, if attached */ int xmit_timeout; int disconnect; /* a disconnect has been requested by user */ @@ -379,9 +378,9 @@ static ssize_t pid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); + struct nbd_device *nbd = (struct nbd_device *)disk->private_data; - return sprintf(buf, "%ld\n", - (long) ((struct nbd_device *)disk->private_data)->pid); + return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); } static struct device_attribute pid_attr = { @@ -397,16 +396,16 @@ static int nbd_do_it(struct nbd_device *nbd) BUG_ON(nbd->magic != NBD_MAGIC); sk_set_memalloc(nbd->sock->sk); - nbd->pid = task_pid_nr(current); + + nbd->task_recv = current; + ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); - nbd->pid = 0; + nbd->task_recv = NULL; return ret; } - nbd->task_recv = current; - while (1) { req = nbd_read_stat(nbd); if (IS_ERR(req)) { @@ -417,6 +416,8 @@ static int nbd_do_it(struct nbd_device *nbd) nbd_end_request(nbd, req); } + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); + nbd->task_recv = NULL; if (signal_pending(current)) { @@ -431,8 +432,6 @@ static int nbd_do_it(struct nbd_device *nbd) ret = -ETIMEDOUT; } - device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - nbd->pid = 0; return ret; } @@ -708,7 +707,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, struct socket *sock; int error; - if (nbd->pid) + if (nbd->task_recv) return -EBUSY; if (!nbd->sock) return -EINVAL; -- cgit v0.10.2 From 30d53d9c11b6c2f71253a2be582969d7e6fa7f10 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:06 +0200 Subject: nbd: Add debugfs entries Add some debugfs files that help to understand the internal state of NBD. This exports the different sizes, flags, tasks and so on. 
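With this in place, the per-device state becomes visible under debugfs. An illustrative session, assuming debugfs is mounted at /sys/kernel/debug and nbd0 is connected (the PIDs shown are made up; the file names come from the diff below):

# ls /sys/kernel/debug/nbd/nbd0
blocksize  flags  size_bytes  tasks  timeout
# cat /sys/kernel/debug/nbd/nbd0/tasks
recv: 1234
send: 1235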
Signed-off-by: Markus Pargmann Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index eeefa5c..6dd8305 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -61,8 +62,18 @@ struct nbd_device { struct timer_list timeout_timer; struct task_struct *task_recv; struct task_struct *task_send; + +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct dentry *dbg_dir; +#endif }; +#if IS_ENABLED(CONFIG_DEBUG_FS) +static struct dentry *nbd_dbg_dir; +#endif + +#define nbd_name(nbd) ((nbd)->disk->disk_name) + #define NBD_MAGIC 0x68797548 static unsigned int nbds_max = 16; @@ -609,6 +620,9 @@ static void do_nbd_request(struct request_queue *q) } } +static int nbd_dev_dbg_init(struct nbd_device *nbd); +static void nbd_dev_dbg_close(struct nbd_device *nbd); + /* Must be called with tx_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, @@ -725,13 +739,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, blk_queue_flush(nbd->disk->queue, 0); thread = kthread_run(nbd_thread, nbd, "%s", - nbd->disk->disk_name); + nbd_name(nbd)); if (IS_ERR(thread)) { mutex_lock(&nbd->tx_lock); return PTR_ERR(thread); } + nbd_dev_dbg_init(nbd); error = nbd_do_it(nbd); + nbd_dev_dbg_close(nbd); kthread_stop(thread); mutex_lock(&nbd->tx_lock); @@ -797,6 +813,161 @@ static const struct block_device_operations nbd_fops = .ioctl = nbd_ioctl, }; +#if IS_ENABLED(CONFIG_DEBUG_FS) + +static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) +{ + struct nbd_device *nbd = s->private; + + if (nbd->task_recv) + seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv)); + if (nbd->task_send) + seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send)); + + return 0; +} + +static int nbd_dbg_tasks_open(struct inode *inode, struct file *file) +{ + return single_open(file, nbd_dbg_tasks_show, inode->i_private); +} + +static const struct file_operations nbd_dbg_tasks_ops = { + .open = nbd_dbg_tasks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int nbd_dbg_flags_show(struct seq_file *s, void *unused) +{ + struct nbd_device *nbd = s->private; + u32 flags = nbd->flags; + + seq_printf(s, "Hex: 0x%08x\n\n", flags); + + seq_puts(s, "Known flags:\n"); + + if (flags & NBD_FLAG_HAS_FLAGS) + seq_puts(s, "NBD_FLAG_HAS_FLAGS\n"); + if (flags & NBD_FLAG_READ_ONLY) + seq_puts(s, "NBD_FLAG_READ_ONLY\n"); + if (flags & NBD_FLAG_SEND_FLUSH) + seq_puts(s, "NBD_FLAG_SEND_FLUSH\n"); + if (flags & NBD_FLAG_SEND_TRIM) + seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); + + return 0; +} + +static int nbd_dbg_flags_open(struct inode *inode, struct file *file) +{ + return single_open(file, nbd_dbg_flags_show, inode->i_private); +} + +static const struct file_operations nbd_dbg_flags_ops = { + .open = nbd_dbg_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int nbd_dev_dbg_init(struct nbd_device *nbd) +{ + struct dentry *dir; + struct dentry *f; + + dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir); + if (IS_ERR_OR_NULL(dir)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s' (%ld)\n", + nbd_name(nbd), PTR_ERR(dir)); + return PTR_ERR(dir); + } + nbd->dbg_dir = dir; + + f = debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'tasks', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + f = 
debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'size_bytes', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + f = debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'timeout', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + f = debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'blocksize', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + f = debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'flags', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + return 0; +} + +static void nbd_dev_dbg_close(struct nbd_device *nbd) +{ + debugfs_remove_recursive(nbd->dbg_dir); +} + +static int nbd_dbg_init(void) +{ + struct dentry *dbg_dir; + + dbg_dir = debugfs_create_dir("nbd", NULL); + if (IS_ERR(dbg_dir)) + return PTR_ERR(dbg_dir); + + nbd_dbg_dir = dbg_dir; + + return 0; +} + +static void nbd_dbg_close(void) +{ + debugfs_remove_recursive(nbd_dbg_dir); +} + +#else /* IS_ENABLED(CONFIG_DEBUG_FS) */ + +static int nbd_dev_dbg_init(struct nbd_device *nbd) +{ + return 0; +} + +static void nbd_dev_dbg_close(struct nbd_device *nbd) +{ +} + +static int nbd_dbg_init(void) +{ + return 0; +} + +static void nbd_dbg_close(void) +{ +} + +#endif + /* * And here should be modules and kernel interface * (Just smiley confuses emacs :-) @@ -874,6 +1045,8 @@ static int __init nbd_init(void) printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR); + nbd_dbg_init(); + for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].magic = NBD_MAGIC; @@ -910,6 +1083,9 @@ out: static void __exit nbd_cleanup(void) { int i; + + nbd_dbg_close(); + for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].magic = 0; -- cgit v0.10.2 From 696697cb50e4b57d4163d59a496ad9e52f6bff8a Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:07 +0200 Subject: nbd: Change 'disconnect' to be boolean Signed-off-by: Markus Pargmann Acked-by: Pavel Machek Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6dd8305..929217e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -57,7 +57,7 @@ struct nbd_device { int blksize; loff_t bytesize; int xmit_timeout; - int disconnect; /* a disconnect has been requested by user */ + bool disconnect; /* a disconnect has been requested by user */ struct timer_list timeout_timer; struct task_struct *task_recv; @@ -145,7 +145,7 @@ static void nbd_xmit_timeout(unsigned long arg) if (list_empty(&nbd->queue_head)) return; - nbd->disconnect = 1; + nbd->disconnect = true; task = READ_ONCE(nbd->task_recv); if (task) @@ -646,7 +646,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, if (!nbd->sock) return -EINVAL; - nbd->disconnect = 1; + nbd->disconnect = true; nbd_send_req(nbd, &sreq); return 0; @@ -674,7 +674,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd->sock = sock; if (max_part > 0) bdev->bd_invalidated = 1; - nbd->disconnect = 0; /* we're connected now */ + nbd->disconnect = false; /* we're connected now */ return 0; } return -EINVAL; -- cgit v0.10.2 From cad73b2703cff68b140d68221eeecd5a30322b44 Mon Sep 17 00:00:00 2001 From: 
Markus Pargmann Date: Mon, 17 Aug 2015 08:20:08 +0200 Subject: nbd: Rename functions for clearness of recv/send path This patch renames functions so that it is clear what the function does. Otherwise it is not directly understandable what, for example, 'do_it' means. Signed-off-by: Markus Pargmann Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 929217e..7a3eaaa 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -399,7 +399,7 @@ static struct device_attribute pid_attr = { .show = pid_show, }; -static int nbd_do_it(struct nbd_device *nbd) +static int nbd_thread_recv(struct nbd_device *nbd) { struct request *req; int ret; @@ -530,7 +530,7 @@ error_out: nbd_end_request(nbd, req); } -static int nbd_thread(void *data) +static int nbd_thread_send(void *data) { struct nbd_device *nbd = data; struct request *req; @@ -584,7 +584,7 @@ static int nbd_thread(void *data) * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } */ -static void do_nbd_request(struct request_queue *q) +static void nbd_request_handler(struct request_queue *q) __releases(q->queue_lock) __acquires(q->queue_lock) { struct request *req; @@ -738,7 +738,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, else blk_queue_flush(nbd->disk->queue, 0); - thread = kthread_run(nbd_thread, nbd, "%s", + thread = kthread_run(nbd_thread_send, nbd, "%s", nbd_name(nbd)); if (IS_ERR(thread)) { mutex_lock(&nbd->tx_lock); @@ -746,7 +746,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, } nbd_dev_dbg_init(nbd); - error = nbd_do_it(nbd); + error = nbd_thread_recv(nbd); nbd_dev_dbg_close(nbd); kthread_stop(thread); @@ -1021,7 +1021,7 @@ static int __init nbd_init(void) * every gendisk to have its very own request_queue struct. * These structs are big so we dynamically allocate them. */ - disk->queue = blk_init_queue(do_nbd_request, &nbd_lock); + disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock); if (!disk->queue) { put_disk(disk); goto out; -- cgit v0.10.2 From 22d109c1bb312dbf321f3d0ab1b0ed94f1a7e304 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 17 Aug 2015 08:20:09 +0200 Subject: nbd: flags is a u32 variable The flags variable is used as a u32 variable. This patch changes its type to u32. Signed-off-by: Markus Pargmann Signed-off-by: Jens Axboe diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7a3eaaa..293495a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -41,7 +41,7 @@ #include struct nbd_device { - int flags; + u32 flags; struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; -- cgit v0.10.2 From e824410ffcf4b245296b56c6fdf7b9797fce8c3e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 12 Aug 2015 16:17:54 -0600 Subject: NVMe: Set queue max segments This sets the queue's maximum segment count to match the device's capabilities. The default of 128 is usable until a device's transfer capability exceeds 512k, assuming a device page size of 4k. Many NVMe devices exceed that transfer limit, so this lets the block layer know what kind of commands to allow to form rather than unnecessarily splitting them. One additional segment is added to account for a transfer that may start in the middle of a page.
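Worked through with the numbers from the message (an illustrative calculation assuming a 4k device page size and 512-byte sectors):

/* max_hw_sectors for a 512k transfer limit: 512k / 512 = 1024 sectors */
segments = ((max_hw_sectors << 9) / page_size) + 1
         = ((1024 << 9) / 4096) + 1
         = (524288 / 4096) + 1
         = 128 + 1 = 129

So a device limited to 512k transfers gets 129 segments: 128 full pages plus one extra segment for a transfer that starts in the middle of a page.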
Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 666e994..2e16000 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2120,8 +2120,11 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) list_add_tail(&ns->list, &dev->namespaces); blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (dev->max_hw_sectors) + if (dev->max_hw_sectors) { blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + blk_queue_max_segments(ns->queue, + ((dev->max_hw_sectors << 9) / dev->page_size) + 1); + } if (dev->stripe_size) blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); if (dev->vwc & NVME_CTRL_VWC_PRESENT) -- cgit v0.10.2 From b2b1ec9b55ed0840956db15f823c4a73383c08be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 18 Aug 2015 10:13:41 -0600 Subject: NVMe: removed unused nn var from nvme_dev_add MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The logic in nvme_dev_add to enumerate namespaces was moved to nvme_dev_scan. When moved, the nn variable is no longer used. This patch removes it. Fixes: a5768aa ("NVMe: Automatic namespace rescan") Signed-off-by: Matias Bjørling Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2e16000..85cd33b 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2447,7 +2447,6 @@ static int nvme_dev_add(struct nvme_dev *dev) { struct pci_dev *pdev = to_pci_dev(dev->dev); int res; - unsigned nn; struct nvme_id_ctrl *ctrl; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; @@ -2457,7 +2456,6 @@ static int nvme_dev_add(struct nvme_dev *dev) return -EIO; } - nn = le32_to_cpup(&ctrl->nn); dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; dev->vwc = ctrl->vwc; -- cgit v0.10.2 From dfbac8c7ac5f58448b2216fe42ff52aaf175421d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 10 Aug 2015 15:20:40 -0600 Subject: NVMe: Add nvme subsystem reset support Controllers that are part of an NVMe subsystem may be reset by any other controller in the subsystem. If the device is capable of subsystem resets, this patch adds detection for such events and performs the appropriate controller initialization when a subsystem reset is detected. The register bit is a RW1C type, so the driver needs to write a 1 to the status bit to clear the subsystem reset occurred bit during initialization. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 85cd33b..e318a99 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1735,6 +1735,12 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) page_shift = dev_page_max; } + dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ? 
+ NVME_CAP_NSSRC(cap) : 0; + + if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, &dev->bar->csts); + result = nvme_disable_ctrl(dev, cap); if (result < 0) return result; @@ -2059,7 +2065,10 @@ static int nvme_kthread(void *data) spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { int i; - if (readl(&dev->bar->csts) & NVME_CSTS_CFS) { + u32 csts = readl(&dev->bar->csts); + + if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || + csts & NVME_CSTS_CFS) { if (work_busy(&dev->reset_work)) continue; list_del_init(&dev->node); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fa3fe16..d6b5600 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -39,6 +39,7 @@ struct nvme_bar { #define NVME_CAP_MQES(cap) ((cap) & 0xffff) #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) @@ -68,6 +69,7 @@ enum { NVME_CC_IOCQES = 4 << 20, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, + NVME_CSTS_NSSRO = 1 << 4, NVME_CSTS_SHST_NORMAL = 0 << 2, NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, @@ -110,6 +112,7 @@ struct nvme_dev { char serial[20]; char model[40]; char firmware_rev[8]; + bool subsystem; u32 max_hw_sectors; u32 stripe_size; u32 page_size; -- cgit v0.10.2 From 81f03fedcce7ee7e83c37237ecaa2f68aad236fd Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 10 Aug 2015 15:20:41 -0600 Subject: NVMe: Add nvme subsystem reset IOCTL Controllers can perform optional subsystem resets as introduced in NVMe 1.1. This patch adds an IOCTL to trigger the subsystem reset by writing "NVMe" to the NSSR register. 
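The magic value written to NSSR is simply the ASCII string "NVMe" packed into a 32-bit word, as a tiny self-check confirms:

#include <assert.h>

int main(void)
{
	/* 'N' = 0x4E, 'V' = 0x56, 'M' = 0x4D, 'e' = 0x65 */
	assert((('N' << 24) | ('V' << 16) | ('M' << 8) | 'e') == 0x4E564D65);
	return 0;
}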
Signed-off-by: Jon Derrick Acked-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e318a99..3a35c58 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1901,6 +1901,15 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, return status; } +static int nvme_subsys_reset(struct nvme_dev *dev) +{ + if (!dev->subsystem) + return -ENOTTY; + + writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */ + return 0; +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -2932,6 +2941,8 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) case NVME_IOCTL_RESET: dev_warn(dev->dev, "resetting controller\n"); return nvme_reset(dev); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_subsys_reset(dev); default: return -ENOTTY; } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d6b5600..b5812c3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -28,7 +28,7 @@ struct nvme_bar { __u32 cc; /* Controller Configuration */ __u32 rsvd1; /* Reserved */ __u32 csts; /* Controller Status */ - __u32 rsvd2; /* Reserved */ + __u32 nssr; /* Subsystem Reset */ __u32 aqa; /* Admin Queue Attributes */ __u64 asq; /* Admin SQ Base Address */ __u64 acq; /* Admin CQ Base Address */ diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 732b32e..8864194 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -584,5 +584,6 @@ struct nvme_passthru_cmd { #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) #define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) #define NVME_IOCTL_RESET _IO('N', 0x44) +#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) #endif /* _UAPI_LINUX_NVME_H */ -- cgit v0.10.2 From e3f879bf1ea3e03f433d292b0114807785f0754b Mon Sep 17 00:00:00 2001 From: Sunad Bhandary Date: Fri, 31 Jul 2015 18:56:58 +0530 Subject: NVMe: Remove unreachable code in nvme_abort_req Remove unreachable code from nvme_abort_req, as nvme_submit_cmd has no failure status to return.
Signed-off-by: Sunad Bhandary Acked-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3a35c58..caad00f 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -384,7 +384,8 @@ static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, * * Safe to use from interrupt context */ -static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static void __nvme_submit_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd) { u16 tail = nvmeq->sq_tail; @@ -397,18 +398,14 @@ static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) tail = 0; writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; - - return 0; } -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { unsigned long flags; - int ret; spin_lock_irqsave(&nvmeq->q_lock, flags); - ret = __nvme_submit_cmd(nvmeq, cmd); + __nvme_submit_cmd(nvmeq, cmd); spin_unlock_irqrestore(&nvmeq->q_lock, flags); - return ret; } static __le64 **iod_list(struct nvme_iod *iod) @@ -1079,7 +1076,8 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) c.common.command_id = req->tag; blk_mq_free_request(req); - return __nvme_submit_cmd(nvmeq, &c); + __nvme_submit_cmd(nvmeq, &c); + return 0; } static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, @@ -1102,7 +1100,8 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, cmd->common.command_id = req->tag; - return nvme_submit_cmd(nvmeq, cmd); + nvme_submit_cmd(nvmeq, cmd); + return 0; } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1314,12 +1313,7 @@ static void nvme_abort_req(struct request *req) dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, nvmeq->qid); - if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) { - dev_warn(nvmeq->q_dmadev, - "Could not abort I/O %d QID %d", - req->tag, nvmeq->qid); - blk_mq_free_request(abort_req); - } + nvme_submit_cmd(dev->queues[0], &cmd); } static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) -- cgit v0.10.2 From e19b127f5b76ec03b9c52b64f117dc75bb39eda1 Mon Sep 17 00:00:00 2001 From: Alok Pandey Date: Wed, 26 Aug 2015 08:56:14 -0600 Subject: NVMe: Using PRACT bit to generate and verify PI by controller This patch enables PRCHK and reftag support when the PRACT bit is set and block layer integrity is disabled.
Signed-off-by: Alok Pandey Reviewed-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index caad00f..8de3de0 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -813,8 +813,7 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - if (blk_integrity_rq(req)) { - cmnd.rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + if (ns->ms) { switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: control |= NVME_RW_PRINFO_PRCHK_GUARD; @@ -827,8 +826,12 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, nvme_block_nr(ns, blk_rq_pos(req))); break; } - } else if (ns->ms) - control |= NVME_RW_PRINFO_PRACT; + if (blk_integrity_rq(req)) + cmnd.rw.metadata = + cpu_to_le64(sg_dma_address(iod->meta_sg)); + else + control |= NVME_RW_PRINFO_PRACT; + } cmnd.rw.control = cpu_to_le16(control); cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -2037,7 +2040,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) !ns->ext) nvme_init_integrity(ns); - if (ns->ms && !blk_get_integrity(disk)) + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) set_capacity(disk, 0); else set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); -- cgit v0.10.2