From 788e15abbb9408c9399d7e3445ac9afb3b2fd7d6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 8 Apr 2016 16:09:10 -0600 Subject: NVMe: Always use MSI/MSI-x interrupts Multiple users have reported device initialization failure due to the driver not receiving legacy PCI interrupts. This is not unique to any particular controller, but has been observed on multiple platforms. There have been no issues reported or observed with message signaled interrupts, so this patch attempts to use MSI-x during initialization, falling back to MSI. If that also fails, legacy interrupts become the default. The setup_io_queues error handling had to change as a result: the admin queue's msix_entry used to be initialized to the legacy IRQ. The case where nr_io_queues is 0 would fail request_irq when setting up the admin queue's interrupt, since re-enabling MSI-x with 0 vectors fails, leaving the admin queue's msix_entry invalid. Instead, return success immediately. Reported-by: Tim Muhlemmer Reported-by: Jon Derrick Signed-off-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 24ccda3..94d45b0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1478,8 +1478,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result > 0) { dev_err(dev->ctrl.device, "Could not set queue count (%d)\n", result); - nr_io_queues = 0; - result = 0; + return 0; } if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { @@ -1513,7 +1512,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * If we enable msix early due to not intx, disable it again before * setting up the full range we need. */ - if (!pdev->irq) + if (pdev->msi_enabled) + pci_disable_msi(pdev); + else if (pdev->msix_enabled) pci_disable_msix(pdev); for (i = 0; i < nr_io_queues; i++) @@ -1696,7 +1697,6 @@ static int nvme_pci_enable(struct nvme_dev *dev) if (pci_enable_device_mem(pdev)) return result; - dev->entry[0].vector = pdev->irq; pci_set_master(pdev); if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && @@ -1709,13 +1709,18 @@ } /* - * Some devices don't advertse INTx interrupts, pre-enable a single - * MSIX vec for setup. We'll adjust this later. + * Some devices and/or platforms don't advertise or work with INTx + * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll + * adjust this later. */ - if (!pdev->irq) { - result = pci_enable_msix(pdev, dev->entry, 1); - if (result < 0) - goto disable; + if (pci_enable_msix(pdev, dev->entry, 1)) { + pci_enable_msi(pdev); + dev->entry[0].vector = pdev->irq; + } + + if (!dev->entry[0].vector) { + result = -ENODEV; + goto disable; } cap = lo_hi_readq(dev->bar + NVME_REG_CAP); -- cgit v0.10.2 From 2e39e0f608c130411f52c9fe5648dbcda5e28528 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Tue, 5 Apr 2016 10:32:04 -0700 Subject: nvme: add missing lock nesting notation When unloading the driver, nvme_disable_io_queues() calls nvme_delete_queue(), which sends an nvme_admin_delete_cq command to the admin SQ. So when the command completes, the lock acquired by nvme_irq() actually belongs to the admin queue, while the lock that nvme_del_cq_end() is trying to acquire belongs to an I/O queue, so it will not deadlock. This patch adds lock nesting notation to fix the following report.
[ 109.840952] ============================================= [ 109.846379] [ INFO: possible recursive locking detected ] [ 109.851806] 4.5.0+ #180 Tainted: G E [ 109.856533] --------------------------------------------- [ 109.861958] swapper/0/0 is trying to acquire lock: [ 109.866771] (&(&nvmeq->q_lock)->rlock){-.....}, at: [] nvme_del_cq_end+0x26/0x70 [nvme] [ 109.876535] [ 109.876535] but task is already holding lock: [ 109.882398] (&(&nvmeq->q_lock)->rlock){-.....}, at: [] nvme_irq+0x1b/0x50 [nvme] [ 109.891547] [ 109.891547] other info that might help us debug this: [ 109.898107] Possible unsafe locking scenario: [ 109.898107] [ 109.904056] CPU0 [ 109.906515] ---- [ 109.908974] lock(&(&nvmeq->q_lock)->rlock); [ 109.913381] lock(&(&nvmeq->q_lock)->rlock); [ 109.917787] [ 109.917787] *** DEADLOCK *** [ 109.917787] [ 109.923738] May be due to missing lock nesting notation [ 109.923738] [ 109.930558] 1 lock held by swapper/0/0: [ 109.934413] #0: (&(&nvmeq->q_lock)->rlock){-.....}, at: [] nvme_irq+0x1b/0x50 [nvme] [ 109.944010] [ 109.944010] stack backtrace: [ 109.948389] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G E 4.5.0+ #180 [ 109.955734] Hardware name: Dell Inc. OptiPlex 7010/0YXT71, BIOS A15 08/12/2013 [ 109.962989] 0000000000000000 ffff88011e203c38 ffffffff81383d9c ffffffff81c13540 [ 109.970478] ffffffff826711d0 ffff88011e203ce8 ffffffff810bb429 0000000000000046 [ 109.977964] 0000000000000046 0000000000000000 0000000000b2e597 ffffffff81f4cb00 [ 109.985453] Call Trace: [ 109.987911] [] dump_stack+0x85/0xc9 [ 109.993711] [] __lock_acquire+0x19b9/0x1c60 [ 109.999575] [] ? trace_hardirqs_off+0xd/0x10 [ 110.005524] [] ? complete+0x3d/0x50 [ 110.010688] [] lock_acquire+0x90/0xf0 [ 110.016029] [] ? nvme_del_cq_end+0x26/0x70 [nvme] [ 110.022418] [] _raw_spin_lock_irqsave+0x4b/0x60 [ 110.028632] [] ? nvme_del_cq_end+0x26/0x70 [nvme] [ 110.035019] [] nvme_del_cq_end+0x26/0x70 [nvme] [ 110.041232] [] blk_mq_end_request+0x35/0x60 [ 110.047095] [] nvme_complete_rq+0x68/0x190 [nvme] [ 110.053481] [] __blk_mq_complete_request+0x8f/0x130 [ 110.060043] [] blk_mq_complete_request+0x31/0x40 [ 110.066343] [] __nvme_process_cq+0x83/0x240 [nvme] [ 110.072818] [] nvme_irq+0x25/0x50 [nvme] [ 110.078419] [] handle_irq_event_percpu+0x36/0x110 [ 110.084804] [] handle_irq_event+0x37/0x60 [ 110.090491] [] handle_edge_irq+0x93/0x150 [ 110.096180] [] handle_irq+0xa6/0x130 [ 110.101431] [] do_IRQ+0x5e/0x120 [ 110.106333] [] common_interrupt+0x8c/0x8c Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lin Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 94d45b0..154194e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1592,7 +1592,13 @@ static void nvme_del_cq_end(struct request *req, int error) if (!error) { unsigned long flags; - spin_lock_irqsave(&nvmeq->q_lock, flags); + /* + * We might be called with the AQ q_lock held + * and the I/O queue q_lock should always + * nest inside the AQ one. + */ + spin_lock_irqsave_nested(&nvmeq->q_lock, flags, + SINGLE_DEPTH_NESTING); nvme_process_cq(nvmeq); spin_unlock_irqrestore(&nvmeq->q_lock, flags); } -- cgit v0.10.2 From 58b45602751ddf16e57170656670aa5a8f78eeca Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Tue, 22 Mar 2016 00:24:43 -0700 Subject: nvme: add helper nvme_map_len() The helper returns the number of bytes that need to be mapped using PRPs/SGL entries. 
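For a discard, that is just the 16-byte struct nvme_dsm_range descriptor, not the (typically much larger) number of bytes being discarded. A minimal sketch of the intended call site, simplified from the nvme_queue_rq() hunk below:

	unsigned map_len = nvme_map_len(req);	/* 16 for REQ_DISCARD, else blk_rq_bytes(req) */
	ret = nvme_init_iod(req, map_len, dev);	/* size the iod/SG allocation accordingly */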
Signed-off-by: Ming Lin Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f846da4..6376cd7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -173,6 +173,14 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } +static inline unsigned nvme_map_len(struct request *rq) +{ + if (rq->cmd_flags & REQ_DISCARD) + return sizeof(struct nvme_dsm_range); + else + return blk_rq_bytes(rq); +} + static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 154194e..d23ede7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -334,16 +334,11 @@ static __le64 **iod_list(struct request *req) return (__le64 **)(iod->sg + req->nr_phys_segments); } -static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) +static int nvme_init_iod(struct request *rq, unsigned size, + struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = rq->nr_phys_segments; - unsigned size; - - if (rq->cmd_flags & REQ_DISCARD) - size = sizeof(struct nvme_dsm_range); - else - size = blk_rq_bytes(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); @@ -637,6 +632,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; + unsigned map_len; int ret = BLK_MQ_RQ_QUEUE_OK; /* @@ -652,7 +648,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } } - ret = nvme_init_iod(req, dev); + map_len = nvme_map_len(req); + ret = nvme_init_iod(req, map_len, dev); if (ret) return ret; -- cgit v0.10.2 From 03b5929ebb20457e2fd13a701954efa2b2fb7ded Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Tue, 22 Mar 2016 00:24:45 -0700 Subject: nvme: rewrite discard support This rewrites nvme_setup_discard() to use blk_add_request_payload(). It allocates only the necessary amount (16 bytes) for the payload.
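Stripped of the surrounding driver code, the payload pattern the patch adopts looks roughly like this sketch (error handling trimmed; the real hunk follows below):

	struct nvme_dsm_range *range = kmalloc(sizeof(*range), GFP_ATOMIC);

	/* describe the region to discard in device LBA terms */
	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	/* attach the 16-byte descriptor as the request's data payload */
	req->completion_data = range;	/* freed again in nvme_free_iod() */
	blk_add_request_payload(req, virt_to_page(range),
				offset_in_page(range), sizeof(*range));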
Signed-off-by: Ming Lin Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d23ede7..0bf7f61 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -363,6 +363,9 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; + if (req->cmd_flags & REQ_DISCARD) + kfree(req->completion_data); + if (iod->npages == 0) dma_pool_free(dev->prp_small_pool, list[0], prp_dma); for (i = 0; i < iod->npages; i++) { @@ -524,7 +527,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, } static int nvme_map_data(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) + unsigned size, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; @@ -541,7 +544,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) goto out; - if (!nvme_setup_prps(dev, req, blk_rq_bytes(req))) + if (!nvme_setup_prps(dev, req, size)) goto out_unmap; ret = BLK_MQ_RQ_QUEUE_ERROR; @@ -590,35 +593,41 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) nvme_free_iod(dev, req); } -/* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. - */ -static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) +static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_dsm_range *range; + struct page *page; + int offset; + unsigned int nr_bytes = blk_rq_bytes(req); - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); + range = kmalloc(sizeof(*range), GFP_ATOMIC); if (!range) return BLK_MQ_RQ_QUEUE_BUSY; - iod_list(req)[0] = (__le64 *)range; - iod->npages = 0; range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift); range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); cmnd->dsm.nr = 0; cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - return BLK_MQ_RQ_QUEUE_OK; + + req->completion_data = range; + page = virt_to_page(range); + offset = offset_in_page(range); + blk_add_request_payload(req, page, offset, sizeof(*range)); + + /* + * we set __data_len back to the size of the area to be discarded + * on disk. This allows us to report completion on the full amount + * of blocks described by the request. 
+ */ + req->__data_len = nr_bytes; + + return 0; } /* @@ -653,19 +662,20 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; - if (req->cmd_flags & REQ_DISCARD) { - ret = nvme_setup_discard(nvmeq, ns, req, &cmnd); - } else { - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - memcpy(&cmnd, req->cmd, sizeof(cmnd)); - else if (req->cmd_flags & REQ_FLUSH) - nvme_setup_flush(ns, &cmnd); - else - nvme_setup_rw(ns, req, &cmnd); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + memcpy(&cmnd, req->cmd, sizeof(cmnd)); + else if (req->cmd_flags & REQ_FLUSH) + nvme_setup_flush(ns, &cmnd); + else if (req->cmd_flags & REQ_DISCARD) + ret = nvme_setup_discard(ns, req, &cmnd); + else + nvme_setup_rw(ns, req, &cmnd); - if (req->nr_phys_segments) - ret = nvme_map_data(dev, req, &cmnd); - } + if (ret) + goto out; + + if (req->nr_phys_segments) + ret = nvme_map_data(dev, req, map_len, &cmnd); if (ret) goto out; -- cgit v0.10.2 From 8093f7ca73c1633e458c16a74b51bcc3c94564c4 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Tue, 12 Apr 2016 13:10:14 -0600 Subject: nvme: add helper nvme_setup_cmd() This moves nvme_setup_{flush,discard,rw} calls into a common nvme_setup_cmd() helper. So we can eventually hide all the command setup in the core module and don't even need to update the fabrics drivers for any specific command type. Signed-off-by: Ming Lin Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 643f457..6631f38 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -138,6 +138,111 @@ struct request *nvme_alloc_request(struct request_queue *q, } EXPORT_SYMBOL_GPL(nvme_alloc_request); +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); +} + +static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + struct nvme_dsm_range *range; + struct page *page; + int offset; + unsigned int nr_bytes = blk_rq_bytes(req); + + range = kmalloc(sizeof(*range), GFP_ATOMIC); + if (!range) + return BLK_MQ_RQ_QUEUE_BUSY; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + req->completion_data = range; + page = virt_to_page(range); + offset = offset_in_page(range); + blk_add_request_payload(req, page, offset, sizeof(*range)); + + /* + * we set __data_len back to the size of the area to be discarded + * on disk. This allows us to report completion on the full amount + * of blocks described by the request. + */ + req->__data_len = nr_bytes; + + return 0; +} + +static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (ns->ms) { + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + if (!blk_integrity_rq(req)) + control |= NVME_RW_PRINFO_PRACT; + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); +} + +int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmd) +{ + int ret = 0; + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + memcpy(cmd, req->cmd, sizeof(*cmd)); + else if (req->cmd_flags & REQ_FLUSH) + nvme_setup_flush(ns, cmd); + else if (req->cmd_flags & REQ_DISCARD) + ret = nvme_setup_discard(ns, req, cmd); + else + nvme_setup_rw(ns, req, cmd); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_setup_cmd); + /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6376cd7..8e8fae8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -181,57 +181,6 @@ static inline unsigned nvme_map_len(struct request *rq) return blk_rq_bytes(rq); } -static inline void nvme_setup_flush(struct nvme_ns *ns, - struct nvme_command *cmnd) -{ - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); -} - -static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd) -{ - u16 control = 0; - u32 dsmgmt = 0; - - if (req->cmd_flags & REQ_FUA) - control |= NVME_RW_FUA; - if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - if (req->cmd_flags & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); - cmnd->rw.command_id = req->tag; - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - - if (ns->ms) { - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; - cmnd->rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); - break; - } - if (!blk_integrity_rq(req)) - control |= NVME_RW_PRINFO_PRACT; - } - - cmnd->rw.control = cpu_to_le16(control); - cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); -} - - static inline int nvme_error_status(u16 status) { switch (status & 0x7ff) { @@ -269,6 +218,8 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); void nvme_requeue_req(struct request *req); +int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0bf7f61..008c9ee 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -593,43 +593,6 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) nvme_free_iod(dev, req); } -static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd) -{ - struct nvme_dsm_range *range; - struct page *page; - int offset; - unsigned int nr_bytes = blk_rq_bytes(req); - - range = kmalloc(sizeof(*range), GFP_ATOMIC); - if (!range) - return BLK_MQ_RQ_QUEUE_BUSY; - - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd->dsm.nr = 0; - cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - req->completion_data = range; - page = virt_to_page(range); - offset = offset_in_page(range); - blk_add_request_payload(req, page, offset, sizeof(*range)); - - /* - * we set __data_len back to the size of the area to be discarded - * on disk. This allows us to report completion on the full amount - * of blocks described by the request. - */ - req->__data_len = nr_bytes; - - return 0; -} - /* * NOTE: ns is NULL when called on the admin queue. */ @@ -662,15 +625,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - memcpy(&cmnd, req->cmd, sizeof(cmnd)); - else if (req->cmd_flags & REQ_FLUSH) - nvme_setup_flush(ns, &cmnd); - else if (req->cmd_flags & REQ_DISCARD) - ret = nvme_setup_discard(ns, req, &cmnd); - else - nvme_setup_rw(ns, req, &cmnd); - + ret = nvme_setup_cmd(ns, req, &cmnd); if (ret) goto out; -- cgit v0.10.2 From 21f033f7c72e9505c46c6555b019b907dc39dfcd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Apr 2016 11:13:11 -0600 Subject: NVMe: Skip async events for degraded controllers If the controller is degraded, the driver should stay out of the way so the user can recover the drive. This patch skips driver initiated async event requests when the drive is in this state. 
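Returning to the nvme_setup_cmd() helper from the previous patch: the payoff is that a transport's queue_rq path reduces command construction to a single call. A hypothetical consumer (simplified sketch, not from this series):

	struct nvme_command cmnd;
	int ret;

	ret = nvme_setup_cmd(ns, req, &cmnd);	/* passthrough/flush/discard/rw dispatch */
	if (ret)
		return ret;
	/* transport-specific mapping and submission of &cmnd goes here */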
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 008c9ee..7508a0a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1855,8 +1855,16 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; - queue_work(nvme_workq, &dev->async_work); + /* + * A controller that can not execute IO typically requires user + * intervention to correct. For such degraded controllers, the driver + * should not submit commands the user did not request, so skip + * registering for asynchronous event notification on this condition. + */ + if (dev->online_queues > 1) { + dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; + queue_work(nvme_workq, &dev->async_work); + } mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); -- cgit v0.10.2 From 82b4552b91c40626a90a20291aab1137c638b512 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 12 Apr 2016 15:07:15 -0600 Subject: nvme: Use blk-mq helper for IO termination blk-mq offers a tagset iterator, so let's use that instead of nvme_clear_queues. Note: we renamed nvme_queue_cancel_ios to nvme_cancel_io, as there is no concept of a queue in this function anymore (we also lost the per-queue print). Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Acked-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7508a0a..bd0d394 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -965,16 +965,15 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_RESET_TIMER; } -static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) +static void nvme_cancel_io(struct request *req, void *data, bool reserved) { - struct nvme_queue *nvmeq = data; + struct nvme_dev *dev = data; int status; if (!blk_mq_request_started(req)) return; - dev_dbg_ratelimited(nvmeq->dev->ctrl.device, - "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); + dev_dbg_ratelimited(dev->ctrl.device, "Cancelling I/O %d", req->tag); status = NVME_SC_ABORT_REQ; if (blk_queue_dying(req->q)) @@ -1031,14 +1030,6 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) return 0; } -static void nvme_clear_queue(struct nvme_queue *nvmeq) -{ - spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->tags && *nvmeq->tags) - blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = dev->queues[0]; @@ -1765,8 +1756,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) } nvme_pci_disable(dev); - for (i = dev->queue_count - 1; i >= 0; i--) - nvme_clear_queue(dev->queues[i]); + blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_io, dev); + blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_io, dev); mutex_unlock(&dev->shutdown_lock); } -- cgit v0.10.2 From 6d125de40bbc5b29fd8e6dce74e1a661a085dab1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 10 Mar 2016 13:58:48 +0200 Subject: mtip32xx: Convert to use blk_mq_tagset_busy_iter There is only a single tags array anyway.
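The conversion pattern is identical in each of these drivers: a busy_tag_iter_fn callback plus one call per tag set. A hedged sketch with invented names:

	static void example_cancel(struct request *req, void *data, bool reserved)
	{
		struct nvme_dev *dev = data;	/* the cookie passed to the iterator */

		if (!blk_mq_request_started(req))
			return;
		/* fail, abort, or requeue req here */
	}

	/* runs the callback for every started request owned by the tag set */
	blk_mq_tagset_busy_iter(&dev->tagset, example_cancel, dev);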
Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 25824c1..404ae98 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3000,14 +3000,14 @@ restart_eh: "Completion workers still active!"); spin_lock(dd->queue->queue_lock); - blk_mq_all_tag_busy_iter(*dd->tags.tags, + blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd); spin_unlock(dd->queue->queue_lock); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags); if (mtip_device_reset(dd)) - blk_mq_all_tag_busy_iter(*dd->tags.tags, + blk_mq_tagset_busy_iter(&dd->tags, mtip_abort_cmd, dd); clear_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags); @@ -4174,7 +4174,7 @@ static int mtip_block_remove(struct driver_data *dd) blk_mq_freeze_queue_start(dd->queue); blk_mq_stop_hw_queues(dd->queue); - blk_mq_all_tag_busy_iter(dd->tags.tags[0], mtip_no_dev_cleanup, dd); + blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); /* * Delete our gendisk structure. This also removes the device -- cgit v0.10.2 From e8f1e1630b0a98685d1a3521e8aba0dc7e68082c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 10 Mar 2016 13:58:49 +0200 Subject: blk-mq: Make blk_mq_all_tag_busy_iter static There is no caller outside the blk-mq code, so we can make it static. Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2fd0428..56a0c37 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -464,15 +464,14 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, } } -void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, - void *priv) +static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, + busy_tag_iter_fn *fn, void *priv) { if (tags->nr_reserved_tags) bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, false); } -EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c808fec..2498fdf 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -238,8 +238,6 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, - void *priv); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); -- cgit v0.10.2 From eb310e23dbafa5f4010df37f35999622af2cf795 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:06:11 -0600 Subject: sd: switch to using blk_queue_write_cache() Switch to the newer interface instead of using blk_queue_flush() directly. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig Acked-by: Martin K.
Petersen diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 69b0a4a..428c03e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -137,15 +137,15 @@ static const char *sd_cache_types[] = { static void sd_set_flush_flag(struct scsi_disk *sdkp) { - unsigned flush = 0; + bool wc = false, fua = false; if (sdkp->WCE) { - flush |= REQ_FLUSH; + wc = true; if (sdkp->DPOFUA) - flush |= REQ_FUA; + fua = true; } - blk_queue_flush(sdkp->disk->queue, flush); + blk_queue_write_cache(sdkp->disk->queue, wc, fua); } static ssize_t -- cgit v0.10.2 From 7c88cb00f2a26637bade6c62a17d17f31a954e30 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2016 15:43:09 -0600 Subject: NVMe: switch to using blk_queue_write_cache() Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6631f38..4eb5759 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -999,6 +999,8 @@ EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { + bool vwc = false; + if (ctrl->max_hw_sectors) { u32 max_segments = (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; @@ -1008,9 +1010,10 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, } if (ctrl->stripe_size) blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); blk_queue_virt_boundary(q, ctrl->page_size - 1); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); } /* -- cgit v0.10.2 From fe8fb75e3a1f6f7fac8dc68024eb0062191fe424 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:09:01 -0600 Subject: drbd: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa20977..2ba1494 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2761,7 +2761,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig q->backing_dev_info.congested_data = device; blk_queue_make_request(q, drbd_make_request); - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(q, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); -- cgit v0.10.2 From 21d0727f639e4ba2bd194b2eb9b38ac840bbbc87 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:09:35 -0600 Subject: loop: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 423f4ca..7e5e27a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -937,7 +937,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_flush(lo->lo_queue, REQ_FLUSH); + blk_queue_write_cache(lo->lo_queue, true, false); loop_update_dio(lo); set_capacity(lo->lo_disk, size); -- cgit v0.10.2 From 17fe95f4605c3a2ec3df763876e3abf2eca790a7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2016 15:45:39 -0600 Subject: mtip32xx: remove call to blk_queue_flush() The driver calls it with 0 for flags, since it doesn't have a writeback cache. 
Just remove the call, as it's a no-op right now. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 404ae98..6053e46 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4023,12 +4023,6 @@ skip_create_disk: blk_queue_io_min(dd->queue, 4096); blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask); - /* - * write back cache is not supported in the device. FUA depends on - * write back cache support, hence setting flush support to zero. - */ - blk_queue_flush(dd->queue, 0); - /* Signal trim support */ if (dd->trim_supp == true) { set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags); -- cgit v0.10.2 From aafb1eecbb79ac135fa8028f961596333ecbb80b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:10:53 -0600 Subject: nbd: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 08afbc7..31e73a7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -693,9 +693,9 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev) if (nbd->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); if (nbd->flags & NBD_FLAG_SEND_FLUSH) - blk_queue_flush(nbd->disk->queue, REQ_FLUSH); + blk_queue_write_cache(nbd->disk->queue, true, false); else - blk_queue_flush(nbd->disk->queue, 0); + blk_queue_write_cache(nbd->disk->queue, false, false); } static int nbd_dev_dbg_init(struct nbd_device *nbd); -- cgit v0.10.2 From 177febc8430e0efa808caca721a59898c5422668 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:11:15 -0600 Subject: osdblk: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 1b709a4..c2854a2 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -437,7 +437,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); blk_queue_prep_rq(q, blk_queue_start_tag); - blk_queue_flush(q, REQ_FLUSH); + blk_queue_write_cache(q, true, false); disk->queue = q; -- cgit v0.10.2 From 6975f7327f62b7c545aeada57c40ba018d66c93c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:11:42 -0600 Subject: skd_main: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 9a9ec21..41aaae3 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4412,7 +4412,7 @@ static int skd_cons_disk(struct skd_device *skdev) disk->queue = q; q->queuedata = skdev; - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(q, true, true); blk_queue_max_segments(q, skdev->sgs_per_request); blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); -- cgit v0.10.2 From 12c95f137d09450b2c2e86fb12da5c7cf585b322 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:12:13 -0600 Subject: ps3disk: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index c120d70..4b7e405 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -468,7 +468,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_dma_alignment(queue, dev->blk_size-1); 
blk_queue_logical_block_size(queue, dev->blk_size); - blk_queue_flush(queue, REQ_FLUSH); + blk_queue_write_cache(queue, true, false); blk_queue_max_segments(queue, -1); blk_queue_max_segment_size(queue, dev->bounce_size); -- cgit v0.10.2 From ad9126ac723f9e8ed900194d226a0608ffeae45e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:12:58 -0600 Subject: virtio_blk: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 28cff0d..42758b5 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -493,11 +493,7 @@ static void virtblk_update_cache_mode(struct virtio_device *vdev) u8 writeback = virtblk_get_cache_mode(vdev); struct virtio_blk *vblk = vdev->priv; - if (writeback) - blk_queue_flush(vblk->disk->queue, REQ_FLUSH); - else - blk_queue_flush(vblk->disk->queue, 0); - + blk_queue_write_cache(vblk->disk->queue, writeback, false); revalidate_disk(vblk->disk); } -- cgit v0.10.2 From 84b4ff9ef22a97231e5d6aeca544a243d0ac5d81 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:13:22 -0600 Subject: bcache: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a296425..f5dbb4e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -816,7 +816,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); - blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + blk_queue_write_cache(q, true, true); return 0; } -- cgit v0.10.2 From 519a7e16f9bea99c77fbf549c4b3ef7916199a33 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:14:14 -0600 Subject: dm: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9e8f0b..4b1ffc0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1506,7 +1506,7 @@ static bool dm_table_supports_discards(struct dm_table *t) void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { - unsigned flush = 0; + bool wc = false, fua = false; /* * Copy table's limits to the DM device's request_queue @@ -1519,11 +1519,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); if (dm_table_supports_flush(t, REQ_FLUSH)) { - flush |= REQ_FLUSH; + wc = true; if (dm_table_supports_flush(t, REQ_FUA)) - flush |= REQ_FUA; + fua = true; } - blk_queue_flush(q, flush); + blk_queue_write_cache(q, wc, fua); if (!dm_table_discard_zeroes_data(t)) q->limits.discard_zeroes_data = 0; -- cgit v0.10.2 From bfd230ac4e592f3c03cabdf1e69f170a70aaeef7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:15:21 -0600 Subject: xen-blkfront: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 6405b65..ca13df8 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -998,7 +998,8 @@ static const char *flush_info(unsigned int feature_flush) static void xlvbd_flush(struct blkfront_info *info) { - blk_queue_flush(info->rq, info->feature_flush); + blk_queue_write_cache(info->rq, 
info->feature_flush & REQ_FLUSH, + info->feature_flush & REQ_FUA); pr_info("blkfront: %s: %s %s %s %s %s\n", info->gd->disk_name, flush_info(info->feature_flush), "persistent grants:", info->feature_persistent ? -- cgit v0.10.2 From a98239dec6b20cff7b10b670c5d650538f8c3bcb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:16:25 -0600 Subject: ide-disk: update to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 37a8a90..05dbcce 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -522,7 +522,7 @@ static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect) static void update_flush(ide_drive_t *drive) { u16 *id = drive->id; - unsigned flush = 0; + bool wc = false; if (drive->dev_flags & IDE_DFLAG_WCACHE) { unsigned long long capacity; @@ -546,12 +546,12 @@ static void update_flush(ide_drive_t *drive) drive->name, barrier ? "" : "not "); if (barrier) { - flush = REQ_FLUSH; + wc = true; blk_queue_prep_rq(drive->queue, idedisk_prep_fn); } } - blk_queue_flush(drive->queue, flush); + blk_queue_write_cache(drive->queue, wc, false); } ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE); -- cgit v0.10.2 From 56883a7ec85f5bc2da7eca67905de54a475daebc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:16:53 -0600 Subject: md: update to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/md/md.c b/drivers/md/md.c index 194580f..5d61e76 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5037,7 +5037,7 @@ static int md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(mddev->queue, true, true); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really * remove it now. 
-- cgit v0.10.2 From e9d5c746246c88a891b0ed21980535909cb39af7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:17:20 -0600 Subject: mmc/block: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 3bdbe50..32daf43 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -2286,7 +2286,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) || card->ext_csd.rel_sectors)) { md->flags |= MMC_BLK_REL_WR; - blk_queue_flush(md->queue.queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(md->queue.queue, true, true); } if (mmc_card_mmc(card) && -- cgit v0.10.2 From fec3ff5d1eef8d81b2922deb4eb5a5e84a93bf4c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:17:47 -0600 Subject: mtd: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index f470118..74ae243 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -409,7 +409,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) goto error3; if (tr->flush) - blk_queue_flush(new->rq, REQ_FLUSH); + blk_queue_write_cache(new->rq, true, false); new->rq->queuedata = new; blk_queue_logical_block_size(new->rq, tr->blksize); -- cgit v0.10.2 From f935a8ce0a60acf0a7fe4da8d2a1e5a70c598e55 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:19:17 -0600 Subject: um: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 39ba207..17e96dc 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -862,7 +862,7 @@ static int ubd_add(int n, char **error_out) goto out; } ubd_dev->queue->queuedata = ubd_dev; - blk_queue_flush(ubd_dev->queue, REQ_FLUSH); + blk_queue_write_cache(ubd_dev->queue, true, false); blk_queue_max_segments(ubd_dev->queue, MAX_SG); err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); -- cgit v0.10.2 From 2245f6de6c68b225986229a2de78c240536f7f38 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:19:30 -0600 Subject: block: kill blk_queue_flush() We don't have any drivers left using it, so kill it off. Update documentation to use the newer blk_queue_write_cache(). Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt index 83407d3..59e0516 100644 --- a/Documentation/block/writeback_cache_control.txt +++ b/Documentation/block/writeback_cache_control.txt @@ -71,7 +71,7 @@ requests that have a payload. For devices with volatile write caches the driver needs to tell the block layer that it supports flushing caches by doing: - blk_queue_flush(sdkp->disk->queue, REQ_FLUSH); + blk_queue_write_cache(sdkp->disk->queue, true, false); and handle empty REQ_FLUSH requests in its prep_fn/request_fn. Note that REQ_FLUSH requests with a payload are automatically turned into a sequence @@ -79,7 +79,7 @@ of an empty REQ_FLUSH request followed by the actual write by the block layer. 
For devices that also support the FUA bit the block layer needs to be told to pass through the REQ_FUA bit using: - blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(sdkp->disk->queue, true, true); and the driver must handle write requests that have the REQ_FUA bit set in prep_fn/request_fn. If the FUA bit is not natively supported the block diff --git a/block/blk-settings.c b/block/blk-settings.c index c903bee..80d9327 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -820,26 +820,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); -/** - * blk_queue_flush - configure queue's cache flush capability - * @q: the request queue for the device - * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA - * - * Tell block layer cache flush capability of @q. If it supports - * flushing, REQ_FLUSH should be set. If it supports bypassing - * write cache for individual writes, REQ_FUA should be set. - */ -void blk_queue_flush(struct request_queue *q, unsigned int flush) -{ - WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); - - if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) - flush &= ~REQ_FUA; - - q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); -} -EXPORT_SYMBOL_GPL(blk_queue_flush); - void blk_queue_flush_queueable(struct request_queue *q, bool queueable) { q->flush_not_queueable = !queueable; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba72687..f3f232f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1009,7 +1009,6 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); -extern void blk_queue_flush(struct request_queue *q, unsigned int flush); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); -- cgit v0.10.2 From 7e19793096994d43d213f440f4bbea926828a727 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2016 16:11:11 -0600 Subject: NVMe: silence warning about unused 'dev' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depending on options, we might not be using dev in nvme_cancel_io(): drivers/nvme/host/pci.c: In function ‘nvme_cancel_io’: drivers/nvme/host/pci.c:970:19: warning: unused variable ‘dev’ [-Wunused-variable] struct nvme_dev *dev = data; ^ So get rid of it, and just cast for the dev_dbg_ratelimited() call. 
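For reference, the long run of blk_queue_write_cache() conversions above all follow one mapping, sketched here outside any particular driver:

	/* was: blk_queue_flush(q, REQ_FLUSH);            -- volatile write cache, no FUA */
	blk_queue_write_cache(q, true, false);

	/* was: blk_queue_flush(q, REQ_FLUSH | REQ_FUA);  -- write cache plus native FUA */
	blk_queue_write_cache(q, true, true);

	/* was: blk_queue_flush(q, 0);                    -- no volatile write cache */
	blk_queue_write_cache(q, false, false);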
Fixes: 82b4552b91c4 ("nvme: Use blk-mq helper for IO termination") Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bd0d394..36b6cdf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -967,13 +967,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) static void nvme_cancel_io(struct request *req, void *data, bool reserved) { - struct nvme_dev *dev = data; int status; if (!blk_mq_request_started(req)) return; - dev_dbg_ratelimited(dev->ctrl.device, "Cancelling I/O %d", req->tag); + dev_dbg_ratelimited(((struct nvme_dev *) data)->ctrl.device, + "Cancelling I/O %d", req->tag); status = NVME_SC_ABORT_REQ; if (blk_queue_dying(req->q)) -- cgit v0.10.2 From c875a7093f0479215cf9bf51356d7638f2ec5746 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 13 Apr 2016 11:08:20 -0300 Subject: nvme: Avoid reset work on watchdog timer function during error recovery This patch adds a check to the nvme_watchdog_timer() function to avoid calling reset_work() while an error recovery process is ongoing on the controller. The check is made by looking at the pci_channel_offline() result. If we don't check for this in nvme_watchdog_timer(), the error recovery mechanism can't recover well, because reset_work() won't be able to do its job (since we're in the middle of an error) and so the controller is removed from the system before the error recovery mechanism can perform a slot reset (which would allow the adapter to recover). This patch also splits the huge condition expression in nvme_watchdog_timer() into an auxiliary function to make the code more readable. Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Guilherme G. Piccoli Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 36b6cdf..ff3c8d7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1303,22 +1303,44 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset ongoing, we shouldn't reset again. */ + if (work_busy(&dev->reset_work)) + return false; + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. + */ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return false; + + return true; +} + static void nvme_watchdog_timer(unsigned long data) { struct nvme_dev *dev = (struct nvme_dev *)data; u32 csts = readl(dev->bar + NVME_REG_CSTS); - /* - * Skip controllers currently under reset. - */ - if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) && - ((csts & NVME_CSTS_CFS) || - (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) { - if (queue_work(nvme_workq, &dev->reset_work)) { + /* Skip controllers under certain specific conditions.
*/ + if (nvme_should_reset(dev, csts)) { + if (queue_work(nvme_workq, &dev->reset_work)) dev_warn(dev->dev, "Failed status: 0x%x, reset controller.\n", csts); - } return; } -- cgit v0.10.2 From c888a8f95ae5b1067855235b3b71c1ebccf504f5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Apr 2016 13:33:19 -0600 Subject: block: kill off q->flush_flags Now that we converted everything to the newer block write cache interface, kill off the queue flush_flags and queueable flush entries. Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index c502277..2475b1c7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1964,7 +1964,8 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. */ - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && + !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); if (!nr_sectors) { err = 0; diff --git a/block/blk-flush.c b/block/blk-flush.c index 9c423e5..b1c91d2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,17 +95,18 @@ enum { static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq); -static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) +static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { unsigned int policy = 0; if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; - if (fflags & REQ_FLUSH) { + if (fflags & (1UL << QUEUE_FLAG_WC)) { if (rq->cmd_flags & REQ_FLUSH) policy |= REQ_FSEQ_PREFLUSH; - if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && + (rq->cmd_flags & REQ_FUA)) policy |= REQ_FSEQ_POSTFLUSH; } return policy; @@ -384,7 +385,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) void blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned long fflags = q->queue_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); @@ -393,7 +394,7 @@ void blk_insert_flush(struct request *rq) * REQ_FLUSH and FUA for the driver. 
*/ rq->cmd_flags &= ~REQ_FLUSH; - if (!(fflags & REQ_FUA)) + if (!(fflags & (1UL << QUEUE_FLAG_FUA))) rq->cmd_flags &= ~REQ_FUA; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index 80d9327..f679ae1 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -822,7 +822,12 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment); void blk_queue_flush_queueable(struct request_queue *q, bool queueable) { - q->flush_not_queueable = !queueable; + spin_lock_irq(q->queue_lock); + if (queueable) + clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + else + set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); @@ -837,16 +842,13 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { spin_lock_irq(q->queue_lock); - if (wc) { + if (wc) queue_flag_set(QUEUE_FLAG_WC, q); - q->flush_flags = REQ_FLUSH; - } else + else queue_flag_clear(QUEUE_FLAG_WC, q); - if (fua) { - if (wc) - q->flush_flags |= REQ_FUA; + if (fua) queue_flag_set(QUEUE_FLAG_FUA, q); - } else + else queue_flag_clear(QUEUE_FLAG_FUA, q); spin_unlock_irq(q->queue_lock); } diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 26aa080..3355f1c 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -477,7 +477,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->type |= VDISK_REMOVABLE; q = bdev_get_queue(bdev); - if (q && q->flush_flags) + if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) vbd->flush_support = true; if (q && blk_queue_secdiscard(q)) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4b1ffc0..626a5ec 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1348,13 +1348,13 @@ static void dm_table_verify_integrity(struct dm_table *t) static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - unsigned flush = (*(unsigned *)data); + unsigned long flush = (unsigned long) data; struct request_queue *q = bdev_get_queue(dev->bdev); - return q && (q->flush_flags & flush); + return q && (q->queue_flags & flush); } -static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) { struct dm_target *ti; unsigned i = 0; @@ -1375,7 +1375,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) return true; if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_flush_capable, &flush)) + ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) return true; } @@ -1518,9 +1518,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - if (dm_table_supports_flush(t, REQ_FLUSH)) { + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; - if (dm_table_supports_flush(t, REQ_FUA)) + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) fua = true; } blk_queue_write_cache(q, wc, fua); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9531f5f..26f1497 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1188,6 +1188,7 @@ ioerr: int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { + struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; if (PAGE_SIZE != 4096) @@ -1197,7 +1198,7 @@ int r5l_init_log(struct r5conf *conf, struct 
md_rdev *rdev) return -ENOMEM; log->rdev = rdev; - log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0); + log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, sizeof(rdev->mddev->uuid)); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 026a758..7c4efb4 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -687,10 +687,10 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, * Force writethrough using WRITE_FUA if a volatile write cache * is not enabled, or if initiator set the Force Unit Access bit. */ - if (q->flush_flags & REQ_FUA) { + if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) { if (cmd->se_cmd_flags & SCF_FUA) rw = WRITE_FUA; - else if (!(q->flush_flags & REQ_FLUSH)) + else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) rw = WRITE_FUA; else rw = WRITE; @@ -836,7 +836,7 @@ static bool iblock_get_write_cache(struct se_device *dev) struct block_device *bd = ib_dev->ibd_bd; struct request_queue *q = bdev_get_queue(bd); - return q->flush_flags & REQ_FLUSH; + return test_bit(QUEUE_FLAG_WC, &q->queue_flags); } static const struct target_backend_ops iblock_ops = { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3f232f..57c0859 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -433,8 +433,6 @@ struct request_queue { /* * for flush operations */ - unsigned int flush_flags; - unsigned int flush_not_queueable:1; struct blk_flush_queue *fq; struct list_head requeue_list; @@ -493,6 +491,7 @@ struct request_queue { #define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ #define QUEUE_FLAG_WC 23 /* Write back caching */ #define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ +#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -1365,7 +1364,7 @@ static inline unsigned int block_size(struct block_device *bdev) static inline bool queue_flush_queueable(struct request_queue *q) { - return !q->flush_not_queueable; + return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); } typedef struct {struct page *v;} Sector; -- cgit v0.10.2 From 49bdedb36271fe6259dd251bb63c5879fa7834e1 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 25 Apr 2016 19:12:38 -0600 Subject: skd: remove broken discard support Simply creating a file system on an skd device, followed by mount and fstrim will result in errors in the logs and then a BUG(). Let's remove discard support from that driver. As far as I can tell, it hasn't worked right since it was merged. This patch also has a side-effect of cleaning up an unintentional shadowed declaration inside of skd_end_request. I tested to ensure that I can still do I/O to the device using xfstests ./check -g quick. I didn't do anything more extensive than that, though. 
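A practical consequence of the flush_flags removal in the previous patch: code that wants to know a device's cache capabilities now tests the queue flags directly, as the dm, raid5-cache and target hunks above do. A minimal sketch:

	struct request_queue *q = bdev_get_queue(bdev);
	bool wc  = test_bit(QUEUE_FLAG_WC,  &q->queue_flags);	/* volatile write cache present */
	bool fua = test_bit(QUEUE_FLAG_FUA, &q->queue_flags);	/* native FUA writes supported */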
Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 41aaae3..910e065 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -133,7 +133,6 @@ MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); #define SKD_TIMER_MINUTES(minutes) ((minutes) * (60)) #define INQ_STD_NBYTES 36 -#define SKD_DISCARD_CDB_LENGTH 24 enum skd_drvr_state { SKD_DRVR_STATE_LOAD, @@ -212,7 +211,6 @@ struct skd_request_context { struct request *req; u8 flush_cmd; - u8 discard_page; u32 timeout_stamp; u8 sg_data_dir; @@ -230,7 +228,6 @@ struct skd_request_context { }; #define SKD_DATA_DIR_HOST_TO_CARD 1 #define SKD_DATA_DIR_CARD_TO_HOST 2 -#define SKD_DATA_DIR_NONE 3 /* especially for DISCARD requests. */ struct skd_special_context { struct skd_request_context req; @@ -540,31 +537,6 @@ skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, scsi_req->cdb[9] = 0; } -static void -skd_prep_discard_cdb(struct skd_scsi_request *scsi_req, - struct skd_request_context *skreq, - struct page *page, - u32 lba, u32 count) -{ - char *buf; - unsigned long len; - struct request *req; - - buf = page_address(page); - len = SKD_DISCARD_CDB_LENGTH; - - scsi_req->cdb[0] = UNMAP; - scsi_req->cdb[8] = len; - - put_unaligned_be16(6 + 16, &buf[0]); - put_unaligned_be16(16, &buf[2]); - put_unaligned_be64(lba, &buf[8]); - put_unaligned_be32(count, &buf[16]); - - req = skreq->req; - blk_add_request_payload(req, page, 0, len); -} - static void skd_request_fn_not_online(struct request_queue *q); static void skd_request_fn(struct request_queue *q) @@ -575,7 +547,6 @@ static void skd_request_fn(struct request_queue *q) struct skd_request_context *skreq; struct request *req = NULL; struct skd_scsi_request *scsi_req; - struct page *page; unsigned long io_flags; int error; u32 lba; @@ -669,7 +640,6 @@ static void skd_request_fn(struct request_queue *q) skreq->flush_cmd = 0; skreq->n_sg = 0; skreq->sg_byte_count = 0; - skreq->discard_page = 0; /* * OK to now dequeue request from q. @@ -735,18 +705,7 @@ static void skd_request_fn(struct request_queue *q) else skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD; - if (io_flags & REQ_DISCARD) { - page = alloc_page(GFP_ATOMIC | __GFP_ZERO); - if (!page) { - pr_err("request_fn:Page allocation failed.\n"); - skd_end_request(skdev, skreq, -ENOMEM); - break; - } - skreq->discard_page = 1; - req->completion_data = page; - skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); - - } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { + if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { skd_prep_zerosize_flush_cdb(scsi_req, skreq); SKD_ASSERT(skreq->flush_cmd == 1); @@ -851,16 +810,6 @@ skip_sg: static void skd_end_request(struct skd_device *skdev, struct skd_request_context *skreq, int error) { - struct request *req = skreq->req; - unsigned int io_flags = req->cmd_flags; - - if ((io_flags & REQ_DISCARD) && - (skreq->discard_page == 1)) { - pr_debug("%s:%s:%d, free the page!", - skdev->name, __func__, __LINE__); - __free_page(req->completion_data); - } - if (unlikely(error)) { struct request *req = skreq->req; char *cmd = (rq_data_dir(req) == READ) ? "read" : "write"; @@ -4419,12 +4368,6 @@ static int skd_cons_disk(struct skd_device *skdev) /* set sysfs ptimal_io_size to 8K */ blk_queue_io_opt(q, 8192); - /* DISCARD Flag initialization. 
*/ - q->limits.discard_granularity = 8192; - q->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); - q->limits.discard_zeroes_data = 1; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); -- cgit v0.10.2 From 76e3914ae51714b0535c38d9472d89124e0b6b96 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 16 Apr 2016 14:57:58 -0400 Subject: nvme: fix cntlid type Controller IDs in NVMe are unsigned 16-bit types. In the Fabrics driver we actually pass ctrl->id by reference, so we need it to have the correct type. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8e8fae8..9b96e75 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -84,7 +84,7 @@ struct nvme_ctrl { char serial[20]; char model[40]; char firmware_rev[8]; - int cntlid; + u16 cntlid; u32 ctrl_config; -- cgit v0.10.2 From b31356dfde571d925768783f1bb63ca8e156d0b3 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 20 Apr 2016 10:04:32 +0800 Subject: NVMe: small typo in section BLK_DEV_NVME_SCSI of host/Kconfig "as well as" is mistyped as "as well a" in section "config BLK_DEV_NVME_SCSI". Signed-off-by: Wang Sheng-Hui Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index c894841..d296fc3 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -18,7 +18,7 @@ config BLK_DEV_NVME_SCSI depends on NVME_CORE ---help--- This adds support for the SG_IO ioctl on the NVMe character - and block devices nodes, as well a a translation for a small + and block devices nodes, as well as a translation for a small number of selected SCSI commands to NVMe commands to the NVMe driver. If you don't know what this means you probably want to say N here, unless you run a distro that abuses the SCSI -- cgit v0.10.2 From 3b24774e1fb90a40836e96e39a851a774679efff Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Apr 2016 15:51:18 -0600 Subject: NVMe: Fix check_flush_dependency warning If the controller fails and is degraded after a reset, we need to kill off all request queues before removing the inaccessible namespaces. This will prevent del_gendisk from syncing dirty data, which we can't do from a WQ_MEM_RECLAIM work queue. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ff3c8d7..cc46fdf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1887,6 +1887,7 @@ static void nvme_reset_work(struct work_struct *work) */ if (dev->online_queues < 2) { dev_warn(dev->ctrl.device, "IO queues not created\n"); + nvme_kill_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); } else { nvme_start_queues(&dev->ctrl); -- cgit v0.10.2 From 23bd63ceea30878758c303baaf9f8e28f299c578 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Thu, 28 Apr 2016 16:19:31 +0800 Subject: NVMe: nvme_core_exit() should do cleanup in the reverse order as nvme_core_init does nvme_core_init does: 1) register_blkdev 2) __register_chrdev 3) class_create nvme_core_exit should do cleanup in the reverse order.
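
Editor's note: this mirroring follows from the standard goto-unwind idiom. A minimal sketch with hypothetical names (not the actual nvme code): the exit path is exactly the error-unwind path, starting from the last successful step.

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/device.h>

static int example_major, example_char_major;
static const struct file_operations example_fops;
static struct class *example_class;

static int __init example_init(void)
{
	int ret;

	ret = register_blkdev(0, "example");		/* step 1 */
	if (ret < 0)
		return ret;
	example_major = ret;

	ret = __register_chrdev(0, 0, 1, "example", &example_fops); /* step 2 */
	if (ret < 0)
		goto out_unregister_blkdev;
	example_char_major = ret;

	example_class = class_create(THIS_MODULE, "example");	/* step 3 */
	if (IS_ERR(example_class)) {
		ret = PTR_ERR(example_class);
		goto out_unregister_chrdev;
	}
	return 0;

out_unregister_chrdev:
	__unregister_chrdev(example_char_major, 0, 1, "example");
out_unregister_blkdev:
	unregister_blkdev(example_major, "example");
	return ret;
}

static void __exit example_exit(void)
{
	/* tear down in the reverse order of example_init() */
	class_destroy(example_class);
	__unregister_chrdev(example_char_major, 0, 1, "example");
	unregister_blkdev(example_major, "example");
}

module_init(example_init);
module_exit(example_exit);
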
Signed-off-by: Wang Sheng-Hui Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4eb5759..20559ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1715,9 +1715,9 @@ int __init nvme_core_init(void) void nvme_core_exit(void) { - unregister_blkdev(nvme_major, "nvme"); class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_blkdev(nvme_major, "nvme"); } MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 04a934d4c7251e6458a7898c2b4d6c2da29b132c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Apr 2016 13:51:56 +0200 Subject: nvme: remove the io_incapable method It's unused since "NVMe: Move error handling to failed reset handler". Signed-off-by: Christoph Hellwig Acked-by: Jon Derrick Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9b96e75..41e922b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -136,7 +136,6 @@ struct nvme_ctrl_ops { int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); - bool (*io_incapable)(struct nvme_ctrl *ctrl); int (*reset_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl); }; @@ -150,17 +149,6 @@ static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) return val & NVME_CSTS_RDY; } -static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl) -{ - u32 val = 0; - - if (ctrl->ops->io_incapable(ctrl)) - return true; - if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) - return true; - return val & NVME_CSTS_CFS; -} - static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) { if (!ctrl->subsystem) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cc46fdf..321b8e0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1942,13 +1942,6 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) return 0; } -static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl) -{ - struct nvme_dev *dev = to_nvme_dev(ctrl); - - return !dev->bar || dev->online_queues < 2; -} - static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) { return nvme_reset(to_nvme_dev(ctrl)); @@ -1959,7 +1952,6 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, - .io_incapable = nvme_pci_io_incapable, .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, }; -- cgit v0.10.2 From bb8d261e088811ef2b564d745afcd1633428010a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Apr 2016 13:51:57 +0200 Subject: nvme: introduce a controller state machine Replace the adhoc flags in the PCI driver with a state machine in the core code. Based on code from Sagi Grimberg for the Fabrics driver. 
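
Editor's note, a usage sketch for the new state machine (a hypothetical caller; the actual wiring appears in the diff below): transitions are validated centrally under ctrl->lock, so callers simply attempt a transition and back off if it is not legal.

static void example_reset(struct nvme_ctrl *ctrl)
{
	/* only NEW or LIVE may enter RESETTING; DELETING wins any race */
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return;

	/* ... disable, re-enable and re-initialize the controller ... */

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
		dev_warn(ctrl->device, "failed to mark controller live\n");
}
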
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Acked-by Jon Derrick: Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 20559ad..bd8f598 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -58,6 +58,55 @@ static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state) +{ + enum nvme_ctrl_state old_state = ctrl->state; + bool changed = false; + + spin_lock_irq(&ctrl->lock); + switch (new_state) { + case NVME_CTRL_LIVE: + switch (old_state) { + case NVME_CTRL_RESETTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_RESETTING: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_DELETING: + switch (old_state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + default: + break; + } + spin_unlock_irq(&ctrl->lock); + + if (changed) + ctrl->state = new_state; + + return changed; +} +EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); + static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); @@ -1583,6 +1632,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, { int ret; + ctrl->state = NVME_CTRL_NEW; + spin_lock_init(&ctrl->lock); INIT_LIST_HEAD(&ctrl->namespaces); mutex_init(&ctrl->namespaces_mutex); kref_init(&ctrl->kref); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 41e922b..4135626 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -67,7 +67,16 @@ enum nvme_quirks { NVME_QUIRK_DISCARD_ZEROES = (1 << 2), }; +enum nvme_ctrl_state { + NVME_CTRL_NEW, + NVME_CTRL_LIVE, + NVME_CTRL_RESETTING, + NVME_CTRL_DELETING, +}; + struct nvme_ctrl { + enum nvme_ctrl_state state; + spinlock_t lock; const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; struct device *dev; @@ -187,6 +196,8 @@ static inline bool nvme_req_needs_retry(struct request *req, u16 status) (jiffies - req->start_time) < req->timeout; } +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state); int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 321b8e0..0cee236 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -102,11 +102,6 @@ struct nvme_dev { dma_addr_t cmb_dma_addr; u64 cmb_size; u32 cmbsz; - unsigned long flags; - -#define NVME_CTRL_RESETTING 0 -#define NVME_CTRL_REMOVING 1 - struct nvme_ctrl ctrl; struct completion ioq_wait; }; @@ -277,9 +272,8 @@ static void nvme_queue_scan(struct nvme_dev *dev) * Do not queue new scan work when a controller is reset during * removal. */ - if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) - return; - queue_work(nvme_workq, &dev->scan_work); + if (dev->ctrl.state != NVME_CTRL_DELETING) + queue_work(nvme_workq, &dev->scan_work); } static void nvme_complete_async_event(struct nvme_dev *dev, @@ -901,7 +895,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * cancellation error. All outstanding requests are completed on * shutdown, so we return BLK_EH_HANDLED. 
*/ - if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { + if (dev->ctrl.state == NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); @@ -1835,7 +1829,7 @@ static void nvme_reset_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); int result = -ENODEV; - if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) + if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) goto out; /* @@ -1845,7 +1839,8 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); - set_bit(NVME_CTRL_RESETTING, &dev->flags); + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) + goto out; result = nvme_pci_enable(dev); if (result) @@ -1894,7 +1889,10 @@ static void nvme_reset_work(struct work_struct *work) nvme_dev_add(dev); } - clear_bit(NVME_CTRL_RESETTING, &dev->flags); + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, "failed to mark controller live\n"); + goto out; + } return; out: @@ -2067,7 +2065,8 @@ static void nvme_remove(struct pci_dev *pdev) del_timer_sync(&dev->watchdog_timer); - set_bit(NVME_CTRL_REMOVING, &dev->flags); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + pci_set_drvdata(pdev, NULL); flush_work(&dev->async_work); flush_work(&dev->scan_work); -- cgit v0.10.2 From 92911a55d42084cd285250c275d9f238783638c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Apr 2016 13:51:58 +0200 Subject: nvme: tighten up state check for namespace scanning We only should be scanning namespaces if the controller is live. Currently we call the function just before setting it live, so fix the code up to move the call to nvme_queue_scan to just below the state change. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Acked-by Jon Derrick: Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0cee236..9b2deba 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -272,7 +272,7 @@ static void nvme_queue_scan(struct nvme_dev *dev) * Do not queue new scan work when a controller is reset during * removal. */ - if (dev->ctrl.state != NVME_CTRL_DELETING) + if (dev->ctrl.state == NVME_CTRL_LIVE) queue_work(nvme_workq, &dev->scan_work); } @@ -1659,7 +1659,6 @@ static int nvme_dev_add(struct nvme_dev *dev) nvme_free_queues(dev, dev->online_queues); } - nvme_queue_scan(dev); return 0; } @@ -1893,6 +1892,9 @@ static void nvme_reset_work(struct work_struct *work) dev_warn(dev->ctrl.device, "failed to mark controller live\n"); goto out; } + + if (dev->online_queues > 1) + nvme_queue_scan(dev); return; out: -- cgit v0.10.2 From 5955be2144b3b56182e2175e7e3d2ddf27fb485d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Apr 2016 13:51:59 +0200 Subject: nvme: move namespace scanning to core Move the scan work item and surrounding code to the common code. For now we need a new finish_scan method to allow the PCI driver to set the irq affinity hints, but I have plans in the works to obsolete this as well. Note that this moves the namespace scanning from nvme_wq to the system workqueue, but as we don't rely on namespace scanning to finish from reset or I/O this should be fine. 
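
Editor's note, a sketch of what a transport now provides (hypothetical driver; the PCI version is in the diff below): scanning itself lives in the core, and the transport only supplies a post_scan hook.

static void example_post_scan(struct nvme_ctrl *ctrl)
{
	/* e.g. set IRQ affinity hints now that queues and namespaces exist */
}

static const struct nvme_ctrl_ops example_ctrl_ops = {
	/* .reg_read32, .reg_write32, .reg_read64, .reset_ctrl, .free_ctrl ... */
	.post_scan	= example_post_scan,
};
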
Signed-off-by: Christoph Hellwig Acked-by Jon Derrick: Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bd8f598..899bb41 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1518,7 +1518,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) return ret; } -static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) +static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) { struct nvme_ns *ns, *next; unsigned i; @@ -1534,11 +1534,16 @@ static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) } } -void nvme_scan_namespaces(struct nvme_ctrl *ctrl) +static void nvme_scan_work(struct work_struct *work) { + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, scan_work); struct nvme_id_ctrl *id; unsigned nn; + if (ctrl->state != NVME_CTRL_LIVE) + return; + if (nvme_identify_ctrl(ctrl, &id)) return; @@ -1549,13 +1554,26 @@ void nvme_scan_namespaces(struct nvme_ctrl *ctrl) if (!nvme_scan_ns_list(ctrl, nn)) goto done; } - __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn)); + nvme_scan_ns_sequential(ctrl, nn); done: list_sort(NULL, &ctrl->namespaces, ns_cmp); mutex_unlock(&ctrl->namespaces_mutex); kfree(id); + + if (ctrl->ops->post_scan) + ctrl->ops->post_scan(ctrl); } -EXPORT_SYMBOL_GPL(nvme_scan_namespaces); + +void nvme_queue_scan(struct nvme_ctrl *ctrl) +{ + /* + * Do not queue new scan work when a controller is reset during + * removal. + */ + if (ctrl->state == NVME_CTRL_LIVE) + schedule_work(&ctrl->scan_work); +} +EXPORT_SYMBOL_GPL(nvme_queue_scan); void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { @@ -1597,6 +1615,9 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { + flush_work(&ctrl->scan_work); + nvme_remove_namespaces(ctrl); + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); spin_lock(&dev_list_lock); @@ -1640,6 +1661,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; + INIT_WORK(&ctrl->scan_work, nvme_scan_work); ret = nvme_set_instance(ctrl); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4135626..9b63e71 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -108,6 +108,7 @@ struct nvme_ctrl { u32 vs; bool subsystem; unsigned long quirks; + struct work_struct scan_work; }; /* @@ -147,6 +148,7 @@ struct nvme_ctrl_ops { int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); int (*reset_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl); + void (*post_scan)(struct nvme_ctrl *ctrl); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -207,7 +209,7 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); -void nvme_scan_namespaces(struct nvme_ctrl *ctrl); +void nvme_queue_scan(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9b2deba..15bc337 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -92,7 +92,6 @@ struct nvme_dev { struct msix_entry *entry; void __iomem *bar; struct work_struct reset_work; - struct work_struct scan_work; struct work_struct remove_work; struct work_struct async_work; struct timer_list watchdog_timer; @@ -266,16 +265,6 @@ static int 
nvme_init_request(void *data, struct request *req, return 0; } -static void nvme_queue_scan(struct nvme_dev *dev) -{ - /* - * Do not queue new scan work when a controller is reset during - * removal. - */ - if (dev->ctrl.state == NVME_CTRL_LIVE) - queue_work(nvme_workq, &dev->scan_work); -} - static void nvme_complete_async_event(struct nvme_dev *dev, struct nvme_completion *cqe) { @@ -293,7 +282,7 @@ static void nvme_complete_async_event(struct nvme_dev *dev, switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(dev->ctrl.device, "rescanning\n"); - nvme_queue_scan(dev); + nvme_queue_scan(&dev->ctrl); default: dev_warn(dev->ctrl.device, "async event result %08x\n", result); } @@ -1520,8 +1509,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return result; } -static void nvme_set_irq_hints(struct nvme_dev *dev) +static void nvme_pci_post_scan(struct nvme_ctrl *ctrl) { + struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq; int i; @@ -1536,16 +1526,6 @@ static void nvme_set_irq_hints(struct nvme_dev *dev) } } -static void nvme_dev_scan(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); - - if (!dev->tagset.tags) - return; - nvme_scan_namespaces(&dev->ctrl); - nvme_set_irq_hints(dev); -} - static void nvme_del_queue_end(struct request *req, int error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1894,7 +1874,7 @@ static void nvme_reset_work(struct work_struct *work) } if (dev->online_queues > 1) - nvme_queue_scan(dev); + nvme_queue_scan(&dev->ctrl); return; out: @@ -1954,6 +1934,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read64 = nvme_pci_reg_read64, .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, + .post_scan = nvme_pci_post_scan, }; static int nvme_dev_map(struct nvme_dev *dev) @@ -2005,7 +1986,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto free; - INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); INIT_WORK(&dev->async_work, nvme_async_event_work); @@ -2071,8 +2051,6 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->async_work); - flush_work(&dev->scan_work); - nvme_remove_namespaces(&dev->ctrl); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); flush_work(&dev->reset_work); -- cgit v0.10.2 From f866fc4282a81673ef973ad54c68235a3263b42e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Apr 2016 13:52:00 +0200 Subject: nvme: move AER handling to common code The transport driver still needs to do the actual submission, but all the higher level code can be shared. 
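
Editor's note, a transport-side sketch (hypothetical; EXAMPLE_AQ_BLKMQ_DEPTH is an assumed constant): the core owns the event accounting and calls ->submit_async_event() once per outstanding AER, so the transport only formats and submits the command, as the PCIe version in the diff below does.

static void example_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	/* reserve command ids above the blk-mq tag space, as PCIe does */
	c.common.command_id = EXAMPLE_AQ_BLKMQ_DEPTH + aer_idx;

	/* ... submit c on the transport's admin queue ... */
}
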
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 899bb41..3cf366a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1584,6 +1584,54 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static void nvme_async_event_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, async_event_work); + + spin_lock_irq(&ctrl->lock); + while (ctrl->event_limit > 0) { + int aer_idx = --ctrl->event_limit; + + spin_unlock_irq(&ctrl->lock); + ctrl->ops->submit_async_event(ctrl, aer_idx); + spin_lock_irq(&ctrl->lock); + } + spin_unlock_irq(&ctrl->lock); +} + +void nvme_complete_async_event(struct nvme_ctrl *ctrl, + struct nvme_completion *cqe) +{ + u16 status = le16_to_cpu(cqe->status) >> 1; + u32 result = le32_to_cpu(cqe->result); + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { + ++ctrl->event_limit; + schedule_work(&ctrl->async_event_work); + } + + if (status != NVME_SC_SUCCESS) + return; + + switch (result & 0xff07) { + case NVME_AER_NOTICE_NS_CHANGED: + dev_info(ctrl->device, "rescanning\n"); + nvme_queue_scan(ctrl); + break; + default: + dev_warn(ctrl->device, "async event result %08x\n", result); + } +} +EXPORT_SYMBOL_GPL(nvme_complete_async_event); + +void nvme_queue_async_events(struct nvme_ctrl *ctrl) +{ + ctrl->event_limit = NVME_NR_AERS; + schedule_work(&ctrl->async_event_work); +} +EXPORT_SYMBOL_GPL(nvme_queue_async_events); + static DEFINE_IDA(nvme_instance_ida); static int nvme_set_instance(struct nvme_ctrl *ctrl) @@ -1615,6 +1663,7 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { + flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); nvme_remove_namespaces(ctrl); @@ -1662,6 +1711,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->ops = ops; ctrl->quirks = quirks; INIT_WORK(&ctrl->scan_work, nvme_scan_work); + INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); ret = nvme_set_instance(ctrl); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9b63e71..631a11e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -109,6 +109,7 @@ struct nvme_ctrl { bool subsystem; unsigned long quirks; struct work_struct scan_work; + struct work_struct async_event_work; }; /* @@ -149,6 +150,7 @@ struct nvme_ctrl_ops { int (*reset_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*post_scan)(struct nvme_ctrl *ctrl); + void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -212,6 +214,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_queue_scan(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); +#define NVME_NR_AERS 1 +void nvme_complete_async_event(struct nvme_ctrl *ctrl, + struct nvme_completion *cqe); +void nvme_queue_async_events(struct nvme_ctrl *ctrl); + void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 15bc337..82a0fc2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -54,8 +54,7 @@ * We handle AEN commands ourselves and don't even let the * block layer know about them. 
*/ -#define NVME_NR_AEN_COMMANDS 1 -#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) +#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -93,7 +92,6 @@ struct nvme_dev { void __iomem *bar; struct work_struct reset_work; struct work_struct remove_work; - struct work_struct async_work; struct timer_list watchdog_timer; struct mutex shutdown_lock; bool subsystem; @@ -265,29 +263,6 @@ static int nvme_init_request(void *data, struct request *req, return 0; } -static void nvme_complete_async_event(struct nvme_dev *dev, - struct nvme_completion *cqe) -{ - u16 status = le16_to_cpu(cqe->status) >> 1; - u32 result = le32_to_cpu(cqe->result); - - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { - ++dev->ctrl.event_limit; - queue_work(nvme_workq, &dev->async_work); - } - - if (status != NVME_SC_SUCCESS) - return; - - switch (result & 0xff07) { - case NVME_AER_NOTICE_NS_CHANGED: - dev_info(dev->ctrl.device, "rescanning\n"); - nvme_queue_scan(&dev->ctrl); - default: - dev_warn(dev->ctrl.device, "async event result %08x\n", result); - } -} - /** * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use @@ -709,7 +684,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) */ if (unlikely(nvmeq->qid == 0 && cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { - nvme_complete_async_event(nvmeq->dev, &cqe); + nvme_complete_async_event(&nvmeq->dev->ctrl, &cqe); continue; } @@ -778,21 +753,18 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -static void nvme_async_event_work(struct work_struct *work) +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work); + struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; spin_lock_irq(&nvmeq->q_lock); - while (dev->ctrl.event_limit > 0) { - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + - --dev->ctrl.event_limit; - __nvme_submit_cmd(nvmeq, &c); - } + __nvme_submit_cmd(nvmeq, &c); spin_unlock_irq(&nvmeq->q_lock); } @@ -1848,10 +1820,8 @@ static void nvme_reset_work(struct work_struct *work) * should not submit commands the user did not request, so skip * registering for asynchronous event notification on this condition. 
*/ - if (dev->online_queues > 1) { - dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; - queue_work(nvme_workq, &dev->async_work); - } + if (dev->online_queues > 1) + nvme_queue_async_events(&dev->ctrl); mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); @@ -1935,6 +1905,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, .post_scan = nvme_pci_post_scan, + .submit_async_event = nvme_pci_submit_async_event, }; static int nvme_dev_map(struct nvme_dev *dev) @@ -1988,7 +1959,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - INIT_WORK(&dev->async_work, nvme_async_event_work); setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, (unsigned long)dev); mutex_init(&dev->shutdown_lock); @@ -2050,7 +2020,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); pci_set_drvdata(pdev, NULL); - flush_work(&dev->async_work); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); flush_work(&dev->reset_work); -- cgit v0.10.2 From 6904242db1ac07403c331b18796f6c2bf5382aec Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Mon, 25 Apr 2016 14:33:20 -0700 Subject: nvme: add helper nvme_cleanup_cmd() This hides command cleanup into nvme.h and fabrics drivers will also use it. Signed-off-by: Ming Lin Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 631a11e..114b928 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -182,6 +182,12 @@ static inline unsigned nvme_map_len(struct request *rq) return blk_rq_bytes(rq); } +static inline void nvme_cleanup_cmd(struct request *req) +{ + if (req->cmd_flags & REQ_DISCARD) + kfree(req->completion_data); +} + static inline int nvme_error_status(u16 status) { switch (status & 0x7ff) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 82a0fc2..077e9bf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -321,8 +321,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; - if (req->cmd_flags & REQ_DISCARD) - kfree(req->completion_data); + nvme_cleanup_cmd(req); if (iod->npages == 0) dma_pool_free(dev->prp_small_pool, list[0], prp_dma); -- cgit v0.10.2 From a5b714ad395803a6aa91793b9e52a81b176b8ba9 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 27 Apr 2016 20:10:16 +0800 Subject: NVMe: correct comment for offset enum of controller registers in nvme.h Section 3.1 gives the comment for the offset of controller registers in the specification 1.2a. Some are mis-copied in the header file nvme.h. Correct them. 
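
Editor's note, for orientation only: a hedged sketch of how these register offsets are consumed, assuming a mapped BAR (the function is hypothetical; the accessors match those used elsewhere in the driver).

static void example_dump_regs(void __iomem *bar)
{
	u64 cap  = lo_hi_readq(bar + NVME_REG_CAP);	/* Controller Capabilities */
	u32 vs   = readl(bar + NVME_REG_VS);		/* Version */
	u32 csts = readl(bar + NVME_REG_CSTS);		/* Controller Status */

	pr_info("cap %llx vs %x csts %x\n", (unsigned long long)cap, vs, csts);
}
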
Signed-off-by: Wang Sheng-Hui Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a55986f..7d51b29 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -21,13 +21,13 @@ enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ NVME_REG_VS = 0x0008, /* Version */ NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ - NVME_REG_INTMC = 0x0010, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ NVME_REG_CC = 0x0014, /* Controller Configuration */ NVME_REG_CSTS = 0x001c, /* Controller Status */ NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ - NVME_REG_ACQ = 0x0030, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ }; -- cgit v0.10.2 From 0bf77e9dbb5247ae159342db6f8fdb48aba24b56 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Mon, 25 Apr 2016 14:20:18 -0700 Subject: nvme: switch to RCU freeing the namespace Switch to RCU freeing the namespace structure so that nvme_start_queues, nvme_stop_queues and nvme_kill_queues would be able to get away with only a RCU read side critical section. Suggested-by: Christoph Hellwig Signed-off-by: Ming Lin Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimerg Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3cf366a..3428c02 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1429,7 +1429,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_revalidate_disk(ns->disk)) goto out_free_disk; - list_add_tail(&ns->list, &ctrl->namespaces); + list_add_tail_rcu(&ns->list, &ctrl->namespaces); kref_get(&ctrl->kref); if (ns->type == NVME_NS_LIGHTNVM) return; @@ -1467,6 +1467,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); mutex_unlock(&ns->ctrl->namespaces_mutex); + synchronize_rcu(); nvme_put_ns(ns); } @@ -1751,8 +1752,8 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; - mutex_lock(&ctrl->namespaces_mutex); - list_for_each_entry(ns, &ctrl->namespaces, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (!kref_get_unless_zero(&ns->kref)) continue; @@ -1769,7 +1770,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) nvme_put_ns(ns); } - mutex_unlock(&ctrl->namespaces_mutex); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nvme_kill_queues); @@ -1777,8 +1778,8 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; - mutex_lock(&ctrl->namespaces_mutex); - list_for_each_entry(ns, &ctrl->namespaces, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { spin_lock_irq(ns->queue->queue_lock); queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); spin_unlock_irq(ns->queue->queue_lock); @@ -1786,7 +1787,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl) blk_mq_cancel_requeue_work(ns->queue); blk_mq_stop_hw_queues(ns->queue); } - mutex_unlock(&ctrl->namespaces_mutex); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nvme_stop_queues); @@ -1794,13 +1795,13 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; - mutex_lock(&ctrl->namespaces_mutex); - list_for_each_entry(ns, &ctrl->namespaces, list) { + 
rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); blk_mq_start_stopped_hw_queues(ns->queue, true); blk_mq_kick_requeue_list(ns->queue); } - mutex_unlock(&ctrl->namespaces_mutex); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nvme_start_queues); -- cgit v0.10.2 From b7b9c2278752e37dc7ae918cda823aa2a078e03b Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Mon, 25 Apr 2016 14:20:19 -0700 Subject: nvme: fix nvme_ns_remove() deadlock On receipt of a namespace attribute changed AER, we acquire the namespace mutex lock before proceeding to scan and validate the namespace list. In case of namespace detach/delete command, nvme_ns_remove function deadlocks trying to acquire the already held lock. All callers, except nvme_remove_namespaces(), of nvme_ns_remove() already held namespaces_mutex. So we can simply fix the deadlock by not acquiring the mutex in nvme_ns_remove() and acquiring it in nvme_remove_namespaces(). Reported-by: Sunad Bhandary S Signed-off-by: Ming Lin Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimerg Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3428c02..2de248b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1452,6 +1452,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) static void nvme_ns_remove(struct nvme_ns *ns) { + lockdep_assert_held(&ns->ctrl->namespaces_mutex); + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; @@ -1464,9 +1466,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } - mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); - mutex_unlock(&ns->ctrl->namespaces_mutex); synchronize_rcu(); nvme_put_ns(ns); } @@ -1580,8 +1580,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { struct nvme_ns *ns, *next; + mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) nvme_ns_remove(ns); + mutex_unlock(&ctrl->namespaces_mutex); } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); -- cgit v0.10.2 From 87c32077819c695cbc5ab00226a28010cd5806c3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 8 Apr 2016 16:11:02 -0600 Subject: NVMe: Fix reset/remove race This fixes a scenario where device is present and being reset, but a request to unbind the driver occurs. A previous patch series addressing a device failure removal scenario flushed reset_work after controller disable to unblock reset_work waiting on a completion that wouldn't occur. This isn't safe as-is. The broken scenario can potentially be induced with: modprobe nvme && modprobe -r nvme To fix, the reset work is flushed immediately after setting the controller removing flag, and any subsequent reset will not proceed with controller initialization if the flag is set. The controller status must be polled while active, so the watchdog timer is also left active until the controller is disabled to cleanup requests that may be stuck during namespace removal. 
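
Editor's note on the RCU namespace-list conversion two patches above: it follows the usual reader/updater split. Schematically (names taken from the diffs, the arrangement itself is a hypothetical condensation):

static void example_for_each_ns(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	rcu_read_lock();
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
		if (!kref_get_unless_zero(&ns->kref))
			continue;	/* lost the race with the final put */
		/* ... operate on ns without holding namespaces_mutex ... */
		nvme_put_ns(ns);
	}
	rcu_read_unlock();
}

/* updater side, run under namespaces_mutex (per the deadlock fix above):
 *	list_del_init(&ns->list);
 *	...
 *	synchronize_rcu();	wait out readers before the final put
 *	nvme_put_ns(ns);
 */
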
[Fixes: ff23a2a15a2117245b4599c1352343c8b8fb4c43] Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 077e9bf..fb741d0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2014,11 +2014,10 @@ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - del_timer_sync(&dev->watchdog_timer); - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); pci_set_drvdata(pdev, NULL); + flush_work(&dev->reset_work); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); flush_work(&dev->reset_work); -- cgit v0.10.2 From 57aac2f1be4a0711b7f24f7d367e8672ebaa3844 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 6 May 2016 20:02:54 +0200 Subject: =?UTF-8?q?lightnvm:=20fix=20"warning:=20=E2=80=98ret=E2=80=99=20m?= =?UTF-8?q?ay=20be=20used=20uninitialized"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following warnings: drivers/lightnvm/sysblk.c:125:9: warning: ‘ret’ may be used uninitialized in this function drivers/lightnvm/sysblk.c:275:15: warning: ‘ret’ may be used uninitialized in this function In both cases, ret is only set from within a loop that may not be entered. Signed-off-by: Jeff Mahoney Reviewed-by: Johannes Thumshirn Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index 321de1f..b1e1404 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -122,7 +122,7 @@ static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, struct ppa_addr *ppas, nvm_bb_update_fn *fn) { struct ppa_addr dppa; - int i, ret; + int i, ret = 0; s->nr_ppas = 0; @@ -272,7 +272,7 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, { struct nvm_system_block nvmsb; void *buf; - int i, sect, ret, bufsz; + int i, sect, ret = 0, bufsz; struct ppa_addr *ppas; nvm_cpu_to_sysblk(&nvmsb, info); -- cgit v0.10.2 From ecfb40c6aa5691257054eac81bc8bdfd5442e8e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:02:55 +0200 Subject: lightnvm: handle submit_io failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The device ->submit_io() callback might fail to submit I/O to device. In that case, the nvm_submit_ppa function should not wait for completion. Instead return the ->submit_io() error. 
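
Editor's note, the shape of the lightnvm fix condensed (hedged pseudocode of the pattern, not the exact hunk): never block on a completion for an I/O that was never submitted; unwind and propagate the error instead.

	ret = dev->ops->submit_io(dev, &rqd);
	if (ret) {
		/* undo the setup and surface the transport error */
		nvm_free_rqd_ppalist(dev, &rqd);
		bio_put(bio);
		return ret;
	}
	/* only now is it safe to wait */
	wait_for_completion_io(&wait);
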
Reviewed by: Johannes Thumshirn Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0dc9a80..c2ef53a 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -351,6 +351,11 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, nvm_generic_to_addr_mode(dev, &rqd); ret = dev->ops->submit_io(dev, &rqd); + if (ret) { + nvm_free_rqd_ppalist(dev, &rqd); + bio_put(bio); + return ret; + } /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; -- cgit v0.10.2 From 1145e6351a9fefe0965df4c6dba2a04156dc47d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:02:56 +0200 Subject: lightnvm: implement nvm_submit_ppa_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvm_submit_ppa function assumes that users manage all plane blocks as a single block. Extend the API with nvm_submit_ppa_list to allow the user to send its own ppa list. If the user submits more than a single PPA, the user must take care to allocate and free the corresponding ppa list. Reviewed by: Johannes Thumshirn Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index c2ef53a..f4e04a5 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -322,11 +322,10 @@ static void nvm_end_io_sync(struct nvm_rq *rqd) complete(waiting); } -int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, - int opcode, int flags, void *buf, int len) +int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, + int flags, void *buf, int len) { DECLARE_COMPLETION_ONSTACK(wait); - struct nvm_rq rqd; struct bio *bio; int ret; unsigned long hang_check; @@ -335,24 +334,17 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, if (IS_ERR_OR_NULL(bio)) return -ENOMEM; - memset(&rqd, 0, sizeof(struct nvm_rq)); - ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas); - if (ret) { - bio_put(bio); - return ret; - } + nvm_generic_to_addr_mode(dev, rqd); - rqd.opcode = opcode; - rqd.bio = bio; - rqd.wait = &wait; - rqd.dev = dev; - rqd.end_io = nvm_end_io_sync; - rqd.flags = flags; - nvm_generic_to_addr_mode(dev, &rqd); + rqd->dev = dev; + rqd->opcode = opcode; + rqd->flags = flags; + rqd->bio = bio; + rqd->wait = &wait; + rqd->end_io = nvm_end_io_sync; - ret = dev->ops->submit_io(dev, &rqd); + ret = dev->ops->submit_io(dev, rqd); if (ret) { - nvm_free_rqd_ppalist(dev, &rqd); bio_put(bio); return ret; } @@ -364,9 +356,67 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, else wait_for_completion_io(&wait); + return rqd->error; +} + +/** + * nvm_submit_ppa_list - submit user-defined ppa list to device. The user must + * take to free ppa list if necessary. 
+ * @dev: device + * @ppa_list: user created ppa_list + * @nr_ppas: length of ppa_list + * @opcode: device opcode + * @flags: device flags + * @buf: data buffer + * @len: data buffer length + */ +int nvm_submit_ppa_list(struct nvm_dev *dev, struct ppa_addr *ppa_list, + int nr_ppas, int opcode, int flags, void *buf, int len) +{ + struct nvm_rq rqd; + + if (dev->ops->max_phys_sect < nr_ppas) + return -EINVAL; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + rqd.nr_pages = nr_ppas; + if (nr_ppas > 1) + rqd.ppa_list = ppa_list; + else + rqd.ppa_addr = ppa_list[0]; + + return __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len); +} +EXPORT_SYMBOL(nvm_submit_ppa_list); + +/** + * nvm_submit_ppa - submit PPAs to device. PPAs will automatically be unfolded + * as single, dual, quad plane PPAs depending on device type. + * @dev: device + * @ppa: user created ppa_list + * @nr_ppas: length of ppa_list + * @opcode: device opcode + * @flags: device flags + * @buf: data buffer + * @len: data buffer length + */ +int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, + int opcode, int flags, void *buf, int len) +{ + struct nvm_rq rqd; + int ret; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas); + if (ret) + return ret; + + ret = __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len); + nvm_free_rqd_ppalist(dev, &rqd); - return rqd.error; + return ret; } EXPORT_SYMBOL(nvm_submit_ppa); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cdcb2cc..38814e2 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -534,6 +534,8 @@ extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); +extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, + int, void *, int); /* sysblk.c */ #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ -- cgit v0.10.2 From 4891d120b9cd419f4350b11e1231083745dcdc8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:02:57 +0200 Subject: lightnvm: add fpg_size and pfpg_size to struct nvm_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flash page size (fpg) and size across planes (pfpg) are convenient to know when allocating buffer sizes. This has previously been a calculated in various places. Replace with the pre-calculated values. Reviewed by: Johannes Thumshirn Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index f4e04a5..652b8c7 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -476,6 +476,8 @@ static int nvm_core_init(struct nvm_dev *dev) dev->pgs_per_blk = grp->num_pg; dev->blks_per_lun = grp->num_blk; dev->nr_planes = grp->num_pln; + dev->fpg_size = grp->fpg_sz; + dev->pfpg_size = grp->fpg_sz * grp->num_pln; dev->sec_size = grp->csecs; dev->oob_size = grp->sos; dev->sec_per_pg = grp->fpg_sz / grp->csecs; diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index b1e1404..b6971f8 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -154,13 +154,12 @@ static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa, struct nvm_system_block *sblk) { struct nvm_system_block *cur; - int pg, cursz, ret, found = 0; + int pg, ret, found = 0; /* the full buffer for a flash page is allocated. 
Only the first of it * contains the system block information */ - cursz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; - cur = kmalloc(cursz, GFP_KERNEL); + cur = kmalloc(dev->pfpg_size, GFP_KERNEL); if (!cur) return -ENOMEM; @@ -169,7 +168,7 @@ static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa, ppa->g.pg = ppa_to_slc(dev, pg); ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE, - cur, cursz); + cur, dev->pfpg_size); if (ret) { if (ret == NVM_RSP_ERR_EMPTYPAGE) { pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n", @@ -272,14 +271,12 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, { struct nvm_system_block nvmsb; void *buf; - int i, sect, ret = 0, bufsz; + int i, sect, ret = 0; struct ppa_addr *ppas; nvm_cpu_to_sysblk(&nvmsb, info); - /* buffer for flash page */ - bufsz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; - buf = kzalloc(bufsz, GFP_KERNEL); + buf = kzalloc(dev->pfpg_size, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, &nvmsb, sizeof(struct nvm_system_block)); @@ -309,7 +306,7 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, } ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PWRITE, - NVM_IO_SLC_MODE, buf, bufsz); + NVM_IO_SLC_MODE, buf, dev->pfpg_size); if (ret) { pr_err("nvm: sysblk failed program (%u %u %u)\n", ppas[0].g.ch, @@ -319,7 +316,7 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, } ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PREAD, - NVM_IO_SLC_MODE, buf, bufsz); + NVM_IO_SLC_MODE, buf, dev->pfpg_size); if (ret) { pr_err("nvm: sysblk failed read (%u %u %u)\n", ppas[0].g.ch, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 38814e2..f7c607f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -323,6 +323,8 @@ struct nvm_dev { int sec_per_pg; /* only sectors for a single page */ int pgs_per_blk; int blks_per_lun; + int fpg_size; + int pfpg_size; /* size of buffer if all pages are to be read */ int sec_size; int oob_size; int mccap; -- cgit v0.10.2 From 22e8c9766a669d49cf3749d397082a5cd93374a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:02:58 +0200 Subject: lightnvm: move block fold outside of get_bb_tbl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get block table command returns a list of blocks and planes with their associated state. Users, such as gennvm and sysblk, manages all planes as a single virtual block. It was therefore natural to fold the bad block list before it is returned. However, to allow users, which manages on a per-plane block level, to also use the interface, the get_bb_tbl interface is changed to not fold by default and instead let the caller fold if necessary. Reviewed by: Johannes Thumshirn Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 652b8c7..4cadbe0 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -420,6 +420,41 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, } EXPORT_SYMBOL(nvm_submit_ppa); +/* + * folds a bad block list from its plane representation to its virtual + * block representation. The fold is done in place and reduced size is + * returned. + * + * If any of the planes status are bad or grown bad block, the virtual block + * is marked bad. If not bad, the first plane state acts as the block state. 
+ */ +int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) +{ + int blk, offset, pl, blktype; + + if (nr_blks != dev->blks_per_lun * dev->plane_mode) + return -EINVAL; + + for (blk = 0; blk < dev->blks_per_lun; blk++) { + offset = blk * dev->plane_mode; + blktype = blks[offset]; + + /* Bad blocks on any planes take precedence over other types */ + for (pl = 0; pl < dev->plane_mode; pl++) { + if (blks[offset + pl] & + (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { + blktype = blks[offset + pl]; + break; + } + } + + blks[blk] = blktype; + } + + return dev->blks_per_lun; +} +EXPORT_SYMBOL(nvm_bb_tbl_fold); + static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) { int i; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 72e124a..6096077 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -129,18 +129,21 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) return 0; } -static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, - void *private) +static int gennvm_block_bb(struct nvm_dev *dev, struct ppa_addr ppa, + u8 *blks, int nr_blks, void *private) { struct gen_nvm *gn = private; - struct nvm_dev *dev = gn->dev; struct gen_lun *lun; struct nvm_block *blk; int i; + nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); + if (nr_blks < 0) + return nr_blks; + lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun]; - for (i = 0; i < nr_blocks; i++) { + for (i = 0; i < nr_blks; i++) { if (blks[i] == 0) continue; @@ -250,8 +253,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa = generic_to_dev_addr(dev, ppa); ret = dev->ops->get_bb_tbl(dev, ppa, - dev->blks_per_lun, - gennvm_block_bb, gn); + gennvm_block_bb, gn); if (ret) pr_err("gennvm: could not read BB table\n"); } diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index b6971f8..7fce588 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -93,12 +93,16 @@ void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas); } -static int sysblk_get_host_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, - void *private) +static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa, + u8 *blks, int nr_blks, void *private) { struct sysblk_scan *s = private; int i, nr_sysblk = 0; + nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); + if (nr_blks < 0) + return nr_blks; + for (i = 0; i < nr_blks; i++) { if (blks[i] != NVM_BLK_T_HOST) continue; @@ -130,7 +134,7 @@ static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, dppa = generic_to_dev_addr(dev, ppas[i]); s->row = i; - ret = dev->ops->get_bb_tbl(dev, dppa, dev->blks_per_lun, fn, s); + ret = dev->ops->get_bb_tbl(dev, dppa, fn, s); if (ret) { pr_err("nvm: failed bb tbl for ppa (%u %u)\n", ppas[i].g.ch, @@ -235,13 +239,17 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type) return 0; } -static int sysblk_get_free_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, - void *private) +static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa, + u8 *blks, int nr_blks, void *private) { struct sysblk_scan *s = private; struct ppa_addr *sppa; int i, blkid = 0; + nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); + if (nr_blks < 0) + return nr_blks; + for (i = 0; i < nr_blks; i++) { if (blks[i] == NVM_BLK_T_HOST) return -EEXIST; @@ -578,13 +586,16 @@ static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun) 
BITS_PER_LONG; } -static int nvm_factory_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, - void *private) +static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa, + u8 *blks, int nr_blks, void *private) { struct factory_blks *f = private; - struct nvm_dev *dev = f->dev; int i, lunoff; + nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); + if (nr_blks < 0) + return nr_blks; + lunoff = factory_blk_offset(dev, ppa.g.ch, ppa.g.lun); /* non-set bits correspond to the block must be erased */ @@ -661,7 +672,7 @@ static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, dev_ppa = generic_to_dev_addr(dev, ppa); - ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun, fn, priv); + ret = dev->ops->get_bb_tbl(dev, dev_ppa, fn, priv); if (ret) pr_err("nvm: failed bb tbl for ch%u lun%u\n", ppa.g.ch, ppa.g.blk); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 9461dd6..d289980 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -387,41 +387,16 @@ out: return ret; } -static void nvme_nvm_bb_tbl_fold(struct nvm_dev *nvmdev, - int nr_dst_blks, u8 *dst_blks, - int nr_src_blks, u8 *src_blks) -{ - int blk, offset, pl, blktype; - - for (blk = 0; blk < nr_dst_blks; blk++) { - offset = blk * nvmdev->plane_mode; - blktype = src_blks[offset]; - - /* Bad blocks on any planes take precedence over other types */ - for (pl = 0; pl < nvmdev->plane_mode; pl++) { - if (src_blks[offset + pl] & - (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { - blktype = src_blks[offset + pl]; - break; - } - } - - dst_blks[blk] = blktype; - } -} - static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - int nr_dst_blks, nvm_bb_update_fn *update_bbtbl, - void *priv) + nvm_bb_update_fn *update_bbtbl, void *priv) { struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; - u8 *dst_blks = NULL; - int nr_src_blks = nr_dst_blks * nvmdev->plane_mode; - int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_src_blks; + int nr_blks = nvmdev->blks_per_lun * nvmdev->plane_mode; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; @@ -432,12 +407,6 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; - dst_blks = kzalloc(nr_dst_blks, GFP_KERNEL); - if (!dst_blks) { - ret = -ENOMEM; - goto out; - } - ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { @@ -459,21 +428,17 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, goto out; } - if (le32_to_cpu(bb_tbl->tblks) != nr_src_blks) { + if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { ret = -EINVAL; dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", - le32_to_cpu(bb_tbl->tblks), nr_src_blks); + le32_to_cpu(bb_tbl->tblks), nr_blks); goto out; } - nvme_nvm_bb_tbl_fold(nvmdev, nr_dst_blks, dst_blks, - nr_src_blks, bb_tbl->blk); - ppa = dev_to_generic_addr(nvmdev, ppa); - ret = update_bbtbl(ppa, nr_dst_blks, dst_blks, priv); + ret = update_bbtbl(nvmdev, ppa, bb_tbl->blk, nr_blks, priv); out: - kfree(dst_blks); kfree(bb_tbl); return ret; } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index f7c607f..dacaa28 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -41,11 +41,12 @@ struct nvm_id; struct nvm_dev; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef 
int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); +typedef int (nvm_bb_update_fn)(struct nvm_dev *, struct ppa_addr, u8 *, int, + void *); typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); @@ -538,6 +539,7 @@ extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); +extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); /* sysblk.c */ #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ -- cgit v0.10.2 From 7f7c5d03c0e8e177d1ed1c6412ae035e8eff35de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:02:59 +0200 Subject: lightnvm: avoid memory leak when lun_map kcalloc fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A memory leak occurs if the lower page table is initialized and the subsequent dev->lun_map allocation fails. Rearrange the initialization of the lower page table so that a dev->lun_map allocation failure is handled gracefully, without a memory leak. Reviewed-by: Johannes Thumshirn Move kfree of dev->lun_map to nvm_free() Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 4cadbe0..74fb049 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -504,6 +504,7 @@ static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; struct nvm_id_group *grp = &id->groups[0]; + int ret; /* device values */ dev->nr_chnls = grp->num_ch; @@ -522,33 +523,16 @@ static int nvm_core_init(struct nvm_dev *dev) dev->plane_mode = NVM_PLANE_SINGLE; dev->max_rq_size = dev->ops->max_phys_sect * dev->sec_size; - if (grp->mtype != 0) { - pr_err("nvm: memory type not supported\n"); - return -EINVAL; - } - - switch (grp->fmtype) { - case NVM_ID_FMTYPE_SLC: - if (nvm_init_slc_tbl(dev, grp)) - return -ENOMEM; - break; - case NVM_ID_FMTYPE_MLC: - if (nvm_init_mlc_tbl(dev, grp)) - return -ENOMEM; - break; - default: - pr_err("nvm: flash type not supported\n"); - return -EINVAL; - } - - if (!dev->lps_per_blk) - pr_info("nvm: lower page programming table missing\n"); - if (grp->mpos & 0x020202) dev->plane_mode = NVM_PLANE_DOUBLE; if (grp->mpos & 0x040404) dev->plane_mode = NVM_PLANE_QUAD; + if (grp->mtype != 0) { + pr_err("nvm: memory type not supported\n"); + return -EINVAL; + } + /* calculated values */ dev->sec_per_pl = dev->sec_per_pg * dev->nr_planes; dev->sec_per_blk = dev->sec_per_pl * dev->pgs_per_blk; @@ -560,11 +544,34 @@ static int nvm_core_init(struct nvm_dev *dev) sizeof(unsigned long), GFP_KERNEL); if (!dev->lun_map) return -ENOMEM; + + switch (grp->fmtype) { + case NVM_ID_FMTYPE_SLC: + if (nvm_init_slc_tbl(dev, grp)) { + ret = -ENOMEM; + goto err_fmtype; + } + break; + case NVM_ID_FMTYPE_MLC: + if (nvm_init_mlc_tbl(dev, grp)) { + ret = -ENOMEM; + goto err_fmtype; + } + break; + default: + pr_err("nvm: flash type not supported\n"); + ret = -EINVAL; + goto err_fmtype; + } + INIT_LIST_HEAD(&dev->online_targets); mutex_init(&dev->mlock); spin_lock_init(&dev->lock); return 0; +err_fmtype: +
kfree(dev->lun_map); + return ret; } static void nvm_free(struct nvm_dev *dev) @@ -576,6 +583,7 @@ static void nvm_free(struct nvm_dev *dev) dev->mt->unregister_mgr(dev); kfree(dev->lptbl); + kfree(dev->lun_map); } static int nvm_init(struct nvm_dev *dev) @@ -705,7 +713,6 @@ void nvm_unregister(char *disk_name) up_write(&nvm_lock); nvm_exit(dev); - kfree(dev->lun_map); kfree(dev); } EXPORT_SYMBOL(nvm_unregister); -- cgit v0.10.2 From 66e3d07f75c6472d7198920488330634c118b255 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Fri, 6 May 2016 20:03:00 +0200 Subject: lightnvm: calculate rrpc total blocks and sectors up front MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calculate rrpc total blocks and sectors up front, so that it makes sense to use them. For example, we use rrpc->nr_sects to calculate the rrpc area size, but that makes no sense if we don't initialize it up front, since it would be zero until rrpc luns init finishes. Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 3ab6495..c0e303c 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1207,10 +1207,6 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); spin_lock_init(&rlun->lock); - - rrpc->total_blocks += dev->blks_per_lun; - rrpc->nr_sects += dev->sec_per_lun; - } return 0; @@ -1388,6 +1384,8 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk, INIT_WORK(&rrpc->ws_requeue, rrpc_requeue); rrpc->nr_luns = lun_end - lun_begin + 1; + rrpc->total_blocks = (unsigned long)dev->blks_per_lun * rrpc->nr_luns; + rrpc->nr_sects = (unsigned long long)dev->sec_per_lun * rrpc->nr_luns; /* simple round-robin strategy */ atomic_set(&rrpc->next_lun, -1); -- cgit v0.10.2 From 909049a7199947abbc6a923e4cf5cff1857d4205 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Fri, 6 May 2016 20:03:01 +0200 Subject: lightnvm: store rrpc->soffset in device sector size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we mainly use soffset in device sector size, store this value in rrpc->soffset directly, instead of the offset in 512-byte sector size. This eliminates the "(ilog2(dev->sec_size) - 9)" calculation on each I/O. Signed-off-by: Wenwei Tao Updated patch description.
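For illustration, a minimal sketch of the conversion being hoisted out of the I/O path. The helper names and the 4096-byte sector size are hypothetical; only the shift expression comes from the driver:

	/*
	 * With dev->sec_size = 4096, ilog2(4096) - 9 = 3, i.e. eight
	 * 512-byte sectors per device sector. Storing soffset in device
	 * sectors means this shift happens once at init, not per I/O.
	 */
	static sector_t to_dev_sectors(struct nvm_dev *dev, sector_t sect_512)
	{
		return sect_512 >> (ilog2(dev->sec_size) - 9);
	}

	static sector_t to_512_sectors(struct nvm_dev *dev, sector_t sect_dev)
	{
		return sect_dev << (ilog2(dev->sec_size) - 9);
	}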
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index c0e303c..3143b98 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1039,11 +1039,8 @@ static int rrpc_map_init(struct rrpc *rrpc) { struct nvm_dev *dev = rrpc->dev; sector_t i; - u64 slba; int ret; - slba = rrpc->soffset >> (ilog2(dev->sec_size) - 9); - rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects); if (!rrpc->trans_map) return -ENOMEM; @@ -1065,8 +1062,8 @@ static int rrpc_map_init(struct rrpc *rrpc) return 0; /* Bring up the mapping table from device */ - ret = dev->ops->get_l2p_tbl(dev, slba, rrpc->nr_sects, rrpc_l2p_update, - rrpc); + ret = dev->ops->get_l2p_tbl(dev, rrpc->soffset, rrpc->nr_sects, + rrpc_l2p_update, rrpc); if (ret) { pr_err("nvm: rrpc: could not read L2P table.\n"); return -EINVAL; @@ -1220,18 +1217,24 @@ static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin) struct nvm_dev *dev = rrpc->dev; struct nvmm_type *mt = dev->mt; sector_t size = rrpc->nr_sects * dev->sec_size; + int ret; size >>= 9; - return mt->get_area(dev, begin, size); + ret = mt->get_area(dev, begin, size); + if (!ret) + *begin >>= (ilog2(dev->sec_size) - 9); + + return ret; } static void rrpc_area_free(struct rrpc *rrpc) { struct nvm_dev *dev = rrpc->dev; struct nvmm_type *mt = dev->mt; + sector_t begin = rrpc->soffset << (ilog2(dev->sec_size) - 9); - mt->put_area(dev, rrpc->soffset); + mt->put_area(dev, begin); } static void rrpc_free(struct rrpc *rrpc) -- cgit v0.10.2 From 6063fe399d0913e7bd4462650cd4f31b479a83c9 Mon Sep 17 00:00:00 2001 From: "Simon A. F. Lund" Date: Fri, 6 May 2016 20:03:02 +0200 Subject: lightnvm: rename nvm_targets to nvm_tgt_type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions nvm_register_target(), nvm_unregister_target() and the associated list refer to a target type that is being registered by a target type module. Rename nvm_*_targets() to nvm_*_tgt_type(), so that the intention is clear. This enables target instances to use the _nvm_*_targets() naming. Signed-off-by: Simon A. F.
Lund Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 74fb049..240b473 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -30,7 +30,7 @@ #include #include -static LIST_HEAD(nvm_targets); +static LIST_HEAD(nvm_tgt_types); static LIST_HEAD(nvm_mgrs); static LIST_HEAD(nvm_devices); static DECLARE_RWSEM(nvm_lock); @@ -39,14 +39,14 @@ static struct nvm_tgt_type *nvm_find_target_type(const char *name) { struct nvm_tgt_type *tt; - list_for_each_entry(tt, &nvm_targets, list) + list_for_each_entry(tt, &nvm_tgt_types, list) if (!strcmp(name, tt->name)) return tt; return NULL; } -int nvm_register_target(struct nvm_tgt_type *tt) +int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; @@ -54,14 +54,14 @@ int nvm_register_target(struct nvm_tgt_type *tt) if (nvm_find_target_type(tt->name)) ret = -EEXIST; else - list_add(&tt->list, &nvm_targets); + list_add(&tt->list, &nvm_tgt_types); up_write(&nvm_lock); return ret; } -EXPORT_SYMBOL(nvm_register_target); +EXPORT_SYMBOL(nvm_register_tgt_type); -void nvm_unregister_target(struct nvm_tgt_type *tt) +void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) { if (!tt) return; @@ -70,7 +70,7 @@ void nvm_unregister_target(struct nvm_tgt_type *tt) list_del(&tt->list); up_write(&nvm_lock); } -EXPORT_SYMBOL(nvm_unregister_target); +EXPORT_SYMBOL(nvm_unregister_tgt_type); void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, dma_addr_t *dma_handler) @@ -1020,7 +1020,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) info->version[2] = NVM_VERSION_PATCH; down_write(&nvm_lock); - list_for_each_entry(tt, &nvm_targets, list) { + list_for_each_entry(tt, &nvm_tgt_types, list) { struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; tgt->version[0] = tt->version[0]; diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 3143b98..c7fef71 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1469,12 +1469,12 @@ static struct nvm_tgt_type tt_rrpc = { static int __init rrpc_module_init(void) { - return nvm_register_target(&tt_rrpc); + return nvm_register_tgt_type(&tt_rrpc); } static void rrpc_module_exit(void) { - nvm_unregister_target(&tt_rrpc); + nvm_unregister_tgt_type(&tt_rrpc); } module_init(rrpc_module_init); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index dacaa28..497da91 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -453,8 +453,8 @@ struct nvm_tgt_type { struct list_head list; }; -extern int nvm_register_target(struct nvm_tgt_type *); -extern void nvm_unregister_target(struct nvm_tgt_type *); +extern int nvm_register_tgt_type(struct nvm_tgt_type *); +extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *); extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); -- cgit v0.10.2 From 6f8645cba54a4b2983bbd46f462b5891ffbd72fc Mon Sep 17 00:00:00 2001 From: "Simon A. F. Lund" Date: Fri, 6 May 2016 20:03:03 +0200 Subject: lightnvm: refactor dev->online_target to global nvm_targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A target name must be unique. However, a per-device registration of targets is maintained on a dev->online_targets list, with a per-device search for targets upon registration. 
This results in a name collision when two targets, with the same name, are created on two different devices, where the per-device list is not shared. Signed-off-by: Simon A. F. Lund Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 240b473..0296223 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -33,8 +33,20 @@ static LIST_HEAD(nvm_tgt_types); static LIST_HEAD(nvm_mgrs); static LIST_HEAD(nvm_devices); +static LIST_HEAD(nvm_targets); static DECLARE_RWSEM(nvm_lock); +static struct nvm_target *nvm_find_target(const char *name) +{ + struct nvm_target *tgt; + + list_for_each_entry(tgt, &nvm_targets, list) + if (!strcmp(name, tgt->disk->disk_name)) + return tgt; + + return NULL; +} + static struct nvm_tgt_type *nvm_find_target_type(const char *name) { struct nvm_tgt_type *tt; @@ -564,7 +576,6 @@ static int nvm_core_init(struct nvm_dev *dev) goto err_fmtype; } - INIT_LIST_HEAD(&dev->online_targets); mutex_init(&dev->mlock); spin_lock_init(&dev->lock); @@ -744,12 +755,11 @@ static int nvm_create_target(struct nvm_dev *dev, return -EINVAL; } - list_for_each_entry(t, &dev->online_targets, list) { - if (!strcmp(create->tgtname, t->disk->disk_name)) { - pr_err("nvm: target name already exists.\n"); - up_write(&nvm_lock); - return -EINVAL; - } + t = nvm_find_target(create->tgtname); + if (t) { + pr_err("nvm: target name already exists.\n"); + up_write(&nvm_lock); + return -EINVAL; } up_write(&nvm_lock); @@ -789,7 +799,7 @@ static int nvm_create_target(struct nvm_dev *dev, t->disk = tdisk; down_write(&nvm_lock); - list_add_tail(&t->list, &dev->online_targets); + list_add_tail(&t->list, &nvm_targets); up_write(&nvm_lock); return 0; @@ -852,26 +862,19 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) static int __nvm_configure_remove(struct nvm_ioctl_remove *remove) { - struct nvm_target *t = NULL; - struct nvm_dev *dev; - int ret = -1; + struct nvm_target *t; down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) - list_for_each_entry(t, &dev->online_targets, list) { - if (!strcmp(remove->tgtname, t->disk->disk_name)) { - nvm_remove_target(t); - ret = 0; - break; - } - } - up_write(&nvm_lock); - - if (ret) { + t = nvm_find_target(remove->tgtname); + if (!t) { pr_err("nvm: target \"%s\" doesn't exist.\n", remove->tgtname); + up_write(&nvm_lock); return -EINVAL; } + nvm_remove_target(t); + up_write(&nvm_lock); + return 0; } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 497da91..5eabdba 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -308,7 +308,6 @@ struct nvm_dev { struct nvm_dev_ops *ops; struct list_head devices; - struct list_head online_targets; /* Media manager */ struct nvmm_type *mt; -- cgit v0.10.2 From 5136061ce705210b501ed9ecd673a67b74ebe017 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:04 +0200 Subject: lightnvm: introduce nvm_for_each_lun_ppa() macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users that wish to iterate all luns on a device must create a struct ppa_addr and separate iterators for channels and luns. To set the iterators, two loops are required, one to iterate channels, and another to iterate luns. This leads to a decrease in readability. Introduce nvm_for_each_lun_ppa, which implements the nested loop and sets the ppa, channel, and lun variables for each loop body, eliminating the boilerplate code.
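For illustration, a minimal sketch of the macro in use; the pr_debug() body is hypothetical:

	struct ppa_addr ppa;
	int ch, lun;

	nvm_for_each_lun_ppa(dev, ppa, ch, lun) {
		/* ppa.g.ch and ppa.g.lun are set by the macro each iteration */
		pr_debug("nvm: lun ppa (%u %u)\n", ppa.g.ch, ppa.g.lun);
	}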
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index 7fce588..a48093d 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -631,33 +631,28 @@ static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, while (!done) { done = 1; - for (ch = 0; ch < dev->nr_chnls; ch++) { - for (lun = 0; lun < dev->luns_per_chnl; lun++) { - idx = factory_blk_offset(dev, ch, lun); - offset = &f->blks[idx]; - - blkid = find_first_zero_bit(offset, - dev->blks_per_lun); - if (blkid >= dev->blks_per_lun) - continue; - set_bit(blkid, offset); - - ppa.ppa = 0; - ppa.g.ch = ch; - ppa.g.lun = lun; - ppa.g.blk = blkid; - pr_debug("nvm: erase ppa (%u %u %u)\n", - ppa.g.ch, - ppa.g.lun, - ppa.g.blk); - - erase_list[ppa_cnt] = ppa; - ppa_cnt++; - done = 0; - - if (ppa_cnt == max_ppas) - return ppa_cnt; - } + nvm_for_each_lun_ppa(dev, ppa, ch, lun) { + idx = factory_blk_offset(dev, ch, lun); + offset = &f->blks[idx]; + + blkid = find_first_zero_bit(offset, + dev->blks_per_lun); + if (blkid >= dev->blks_per_lun) + continue; + set_bit(blkid, offset); + + ppa.g.blk = blkid; + pr_debug("nvm: erase ppa (%u %u %u)\n", + ppa.g.ch, + ppa.g.lun, + ppa.g.blk); + + erase_list[ppa_cnt] = ppa; + ppa_cnt++; + done = 0; + + if (ppa_cnt == max_ppas) + return ppa_cnt; } } @@ -684,17 +679,10 @@ static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f) int ch, lun, ret; struct ppa_addr ppa; - ppa.ppa = 0; - for (ch = 0; ch < dev->nr_chnls; ch++) { - for (lun = 0; lun < dev->luns_per_chnl; lun++) { - ppa.g.ch = ch; - ppa.g.lun = lun; - - ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, - f); - if (ret) - return ret; - } + nvm_for_each_lun_ppa(dev, ppa, ch, lun) { + ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, f); + if (ret) + return ret; } return 0; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 5eabdba..3f25635 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -559,6 +559,13 @@ extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); extern int nvm_dev_factory(struct nvm_dev *, int flags); + +#define nvm_for_each_lun_ppa(dev, ppa, chid, lunid) \ + for ((chid) = 0, (ppa).ppa = 0; (chid) < (dev)->nr_chnls; \ + (chid)++, (ppa).g.ch = (chid)) \ + for ((lunid) = 0; (lunid) < (dev)->luns_per_chnl; \ + (lunid)++, (ppa).g.lun = (lunid)) + #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v0.10.2 From e11903f5dfeb4f59fe93316d47f2ee5982e91e60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:05 +0200 Subject: lightnvm: refactor device ops->get_bb_tbl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The device ops->get_bb_tbl() takes a callback that allows the caller to use its own function to update its data structures when the table is returned. This makes it difficult to send parameters to the callback, and is usually circumvented with small private structures that carry both the caller's state and any flags needed to fulfill the update. Refactor ops->get_bb_tbl() to fill a data buffer with the status of the blocks returned, and let the user call the callback function manually. That will provide the necessary flags and data structures and simplify the logic around ops->get_bb_tbl().
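For illustration, a minimal sketch of the resulting calling convention, assuming a valid dev and a generic-format ppa; error handling is trimmed:

	int nr_blks = dev->blks_per_lun * dev->plane_mode;
	u8 *blks = kmalloc(nr_blks, GFP_KERNEL);
	int i, ret;

	ret = nvm_get_bb_tbl(dev, ppa, blks);	/* fills blks[] from device */
	if (!ret) {
		/* fold per-plane entries down to one status byte per block */
		nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
		for (i = 0; i < nr_blks; i++)
			if (blks[i] & (NVM_BLK_T_BAD | NVM_BLK_T_GRWN_BAD))
				pr_debug("nvm: blk %d bad\n", i);
	}
	kfree(blks);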
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0296223..e6d7a98 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -467,6 +467,14 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) } EXPORT_SYMBOL(nvm_bb_tbl_fold); +int nvm_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, u8 *blks) +{ + ppa = generic_to_dev_addr(dev, ppa); + + return dev->ops->get_bb_tbl(dev, ppa, blks); +} +EXPORT_SYMBOL(nvm_get_bb_tbl); + static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) { int i; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 6096077..9c6b141 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -129,10 +129,10 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) return 0; } -static int gennvm_block_bb(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, void *private) +static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa, + u8 *blks, int nr_blks) { - struct gen_nvm *gn = private; + struct nvm_dev *dev = gn->dev; struct gen_lun *lun; struct nvm_block *blk; int i; @@ -219,13 +219,21 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) struct gen_lun *lun; struct nvm_block *block; sector_t lun_iter, blk_iter, cur_block_id = 0; - int ret; + int ret, nr_blks; + u8 *blks; + + nr_blks = dev->blks_per_lun * dev->plane_mode; + blks = kmalloc(nr_blks, GFP_KERNEL); + if (!blks) + return -ENOMEM; gennvm_for_each_lun(gn, lun, lun_iter) { lun->vlun.blocks = vzalloc(sizeof(struct nvm_block) * dev->blks_per_lun); - if (!lun->vlun.blocks) + if (!lun->vlun.blocks) { + kfree(blks); return -ENOMEM; + } for (blk_iter = 0; blk_iter < dev->blks_per_lun; blk_iter++) { block = &lun->vlun.blocks[blk_iter]; @@ -250,12 +258,14 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa.ppa = 0; ppa.g.ch = lun->vlun.chnl_id; ppa.g.lun = lun->vlun.id; - ppa = generic_to_dev_addr(dev, ppa); - ret = dev->ops->get_bb_tbl(dev, ppa, - gennvm_block_bb, gn); + ret = nvm_get_bb_tbl(dev, ppa, blks); + if (ret) + pr_err("gennvm: could not get BB table\n"); + + ret = gennvm_block_bb(gn, ppa, blks, nr_blks); if (ret) - pr_err("gennvm: could not read BB table\n"); + pr_err("gennvm: BB table map failed\n"); } } @@ -268,6 +278,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } } + kfree(blks); return 0; } diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index a48093d..dc99c0a 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -93,10 +93,45 @@ void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas); } +static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa, + u8 *blks, int nr_blks, + struct sysblk_scan *s) +{ + struct ppa_addr *sppa; + int i, blkid = 0; + + nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); + if (nr_blks < 0) + return nr_blks; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] == NVM_BLK_T_HOST) + return -EEXIST; + + if (blks[i] != NVM_BLK_T_FREE) + continue; + + sppa = &s->ppas[scan_ppa_idx(s->row, blkid)]; + sppa->g.ch = ppa.g.ch; + sppa->g.lun = ppa.g.lun; + sppa->g.blk = i; + s->nr_ppas++; + blkid++; + + pr_debug("nvm: use (%u %u %u) as sysblk\n", + sppa->g.ch, sppa->g.lun, sppa->g.blk); + if (blkid > MAX_BLKS_PR_SYSBLK - 1) + return 0; + } + + pr_err("nvm: sysblk failed get sysblk\n"); + return -EINVAL; +} + static 
int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, void *private) + u8 *blks, int nr_blks, + struct sysblk_scan *s) { - struct sysblk_scan *s = private; int i, nr_sysblk = 0; nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); @@ -123,26 +158,42 @@ static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa, } static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, - struct ppa_addr *ppas, nvm_bb_update_fn *fn) + struct ppa_addr *ppas, int get_free) { - struct ppa_addr dppa; - int i, ret = 0; + int i, nr_blks, ret = 0; + u8 *blks; s->nr_ppas = 0; + nr_blks = dev->blks_per_lun * dev->plane_mode; + + blks = kmalloc(nr_blks, GFP_KERNEL); + if (!blks) + return -ENOMEM; for (i = 0; i < s->nr_rows; i++) { - dppa = generic_to_dev_addr(dev, ppas[i]); s->row = i; - ret = dev->ops->get_bb_tbl(dev, dppa, fn, s); + ret = nvm_get_bb_tbl(dev, ppas[i], blks); if (ret) { pr_err("nvm: failed bb tbl for ppa (%u %u)\n", ppas[i].g.ch, ppas[i].g.blk); - return ret; + goto err_get; } + + if (get_free) + ret = sysblk_get_free_blks(dev, ppas[i], blks, nr_blks, + s); + else + ret = sysblk_get_host_blks(dev, ppas[i], blks, nr_blks, + s); + + if (ret) + goto err_get; } +err_get: + kfree(blks); return ret; } @@ -239,41 +290,6 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type) return 0; } -static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, void *private) -{ - struct sysblk_scan *s = private; - struct ppa_addr *sppa; - int i, blkid = 0; - - nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); - if (nr_blks < 0) - return nr_blks; - - for (i = 0; i < nr_blks; i++) { - if (blks[i] == NVM_BLK_T_HOST) - return -EEXIST; - - if (blks[i] != NVM_BLK_T_FREE) - continue; - - sppa = &s->ppas[scan_ppa_idx(s->row, blkid)]; - sppa->g.ch = ppa.g.ch; - sppa->g.lun = ppa.g.lun; - sppa->g.blk = i; - s->nr_ppas++; - blkid++; - - pr_debug("nvm: use (%u %u %u) as sysblk\n", - sppa->g.ch, sppa->g.lun, sppa->g.blk); - if (blkid > MAX_BLKS_PR_SYSBLK - 1) - return 0; - } - - pr_err("nvm: sysblk failed get sysblk\n"); - return -EINVAL; -} - static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, struct sysblk_scan *s) { @@ -393,7 +409,7 @@ int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); if (ret) goto err_sysblk; @@ -453,7 +469,7 @@ int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new) nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); if (ret) goto err_sysblk; @@ -551,7 +567,7 @@ int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_free_blks); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 1); if (ret) goto err_mark; @@ -587,9 +603,9 @@ static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun) } static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, void *private) + u8 *blks, int nr_blks, + struct factory_blks *f) { - struct factory_blks *f = private; int i, lunoff; nr_blks = nvm_bb_tbl_fold(dev, blks, 
nr_blks); @@ -659,32 +675,29 @@ static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, return ppa_cnt; } -static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, - nvm_bb_update_fn *fn, void *priv) -{ - struct ppa_addr dev_ppa; - int ret; - - dev_ppa = generic_to_dev_addr(dev, ppa); - - ret = dev->ops->get_bb_tbl(dev, dev_ppa, fn, priv); - if (ret) - pr_err("nvm: failed bb tbl for ch%u lun%u\n", - ppa.g.ch, ppa.g.blk); - return ret; -} - static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f) { - int ch, lun, ret; struct ppa_addr ppa; + int ch, lun, nr_blks, ret; + u8 *blks; + + nr_blks = dev->blks_per_lun * dev->plane_mode; + blks = kmalloc(nr_blks, GFP_KERNEL); + if (!blks) + return -ENOMEM; nvm_for_each_lun_ppa(dev, ppa, ch, lun) { - ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, f); + ret = nvm_get_bb_tbl(dev, ppa, blks); + if (ret) + pr_err("nvm: failed bb tbl for ch%u lun%u\n", + ppa.g.ch, ppa.g.blk); + + ret = nvm_factory_blks(dev, ppa, blks, nr_blks, f); if (ret) return ret; } + kfree(blks); return 0; } @@ -722,8 +735,7 @@ int nvm_dev_factory(struct nvm_dev *dev, int flags) if (flags & NVM_FACTORY_RESET_HOST_BLKS) { nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, - sysblk_get_host_blks); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); if (!ret) ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_FREE); mutex_unlock(&dev->mlock); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index d289980..45e3511 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -388,7 +388,7 @@ out: } static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - nvm_bb_update_fn *update_bbtbl, void *priv) + u8 *blks) { struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; @@ -435,9 +435,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, goto out; } - ppa = dev_to_generic_addr(nvmdev, ppa); - ret = update_bbtbl(nvmdev, ppa, bb_tbl->blk, nr_blks, priv); - + memcpy(blks, bb_tbl->blk, nvmdev->blks_per_lun * nvmdev->plane_mode); out: kfree(bb_tbl); return ret; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 3f25635..16d4f2e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -41,13 +41,10 @@ struct nvm_id; struct nvm_dev; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(struct nvm_dev *, struct ppa_addr, u8 *, int, - void *); typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, - nvm_bb_update_fn *, void *); +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); @@ -539,6 +536,7 @@ extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); +extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); /* sysblk.c */ #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ -- cgit v0.10.2 From 6659d4d80c6fc6b4b5bc2e99988f32b1b3865f1d 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:06 +0200 Subject: lightnvm: remove struct factory_blks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that device ops->get_bb_tbl() no longer uses a callback, the struct factory_blks can be removed. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index dc99c0a..bca6902 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -582,29 +582,23 @@ err_mark: return ret; } -struct factory_blks { - struct nvm_dev *dev; - int flags; - unsigned long *blks; -}; - static int factory_nblks(int nblks) { /* Round up to nearest BITS_PER_LONG */ return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); } -static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun) +static unsigned int factory_blk_offset(struct nvm_dev *dev, struct ppa_addr ppa) { int nblks = factory_nblks(dev->blks_per_lun); - return ((ch * dev->luns_per_chnl * nblks) + (lun * nblks)) / + return ((ppa.g.ch * dev->luns_per_chnl * nblks) + (ppa.g.lun * nblks)) / BITS_PER_LONG; } static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa, u8 *blks, int nr_blks, - struct factory_blks *f) + unsigned long *blk_bitmap, int flags) { int i, lunoff; @@ -612,25 +606,25 @@ static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa, if (nr_blks < 0) return nr_blks; - lunoff = factory_blk_offset(dev, ppa.g.ch, ppa.g.lun); + lunoff = factory_blk_offset(dev, ppa); /* non-set bits correspond to the block must be erased */ for (i = 0; i < nr_blks; i++) { switch (blks[i]) { case NVM_BLK_T_FREE: - if (f->flags & NVM_FACTORY_ERASE_ONLY_USER) - set_bit(i, &f->blks[lunoff]); + if (flags & NVM_FACTORY_ERASE_ONLY_USER) + set_bit(i, &blk_bitmap[lunoff]); break; case NVM_BLK_T_HOST: - if (!(f->flags & NVM_FACTORY_RESET_HOST_BLKS)) - set_bit(i, &f->blks[lunoff]); + if (!(flags & NVM_FACTORY_RESET_HOST_BLKS)) + set_bit(i, &blk_bitmap[lunoff]); break; case NVM_BLK_T_GRWN_BAD: - if (!(f->flags & NVM_FACTORY_RESET_GRWN_BBLKS)) - set_bit(i, &f->blks[lunoff]); + if (!(flags & NVM_FACTORY_RESET_GRWN_BBLKS)) - set_bit(i, &blk_bitmap[lunoff]); + set_bit(i, &blk_bitmap[lunoff]); break; default: - set_bit(i, &f->blks[lunoff]); + set_bit(i, &blk_bitmap[lunoff]); break; } } @@ -639,7 +633,7 @@ static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa, } static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, - int max_ppas, struct factory_blks *f) + int max_ppas, unsigned long *blk_bitmap) { struct ppa_addr ppa; int ch, lun, blkid, idx, done = 0, ppa_cnt = 0; @@ -648,8 +642,8 @@ static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, while (!done) { done = 1; nvm_for_each_lun_ppa(dev, ppa, ch, lun) { - idx = factory_blk_offset(dev, ch, lun); - offset = &f->blks[idx]; + idx = factory_blk_offset(dev, ppa); + offset = &blk_bitmap[idx]; blkid = find_first_zero_bit(offset, dev->blks_per_lun); @@ -675,10 +669,11 @@ static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, return ppa_cnt; } -static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f) +static int nvm_fact_select_blks(struct nvm_dev *dev, unsigned long *blk_bitmap, + int flags) { struct ppa_addr ppa; - int ch, lun, nr_blks, ret; + int ch, lun, nr_blks, ret = 0; u8 *blks; nr_blks = dev->blks_per_lun * dev->plane_mode; @@ -692,43 +687,42 @@ static int nvm_fact_select_blks(struct nvm_dev *dev,
struct factory_blks *f) pr_err("nvm: failed bb tbl for ch%u lun%u\n", ppa.g.ch, ppa.g.blk); - ret = nvm_factory_blks(dev, ppa, blks, nr_blks, f); + ret = nvm_factory_blks(dev, ppa, blks, nr_blks, blk_bitmap, + flags); if (ret) - return ret; + break; } kfree(blks); - return 0; + return ret; } int nvm_dev_factory(struct nvm_dev *dev, int flags) { - struct factory_blks f; struct ppa_addr *ppas; int ppa_cnt, ret = -ENOMEM; int max_ppas = dev->ops->max_phys_sect / dev->nr_planes; struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; struct sysblk_scan s; + unsigned long *blk_bitmap; - f.blks = kzalloc(factory_nblks(dev->blks_per_lun) * dev->nr_luns, + blk_bitmap = kzalloc(factory_nblks(dev->blks_per_lun) * dev->nr_luns, GFP_KERNEL); - if (!f.blks) + if (!blk_bitmap) return ret; ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL); if (!ppas) goto err_blks; - f.dev = dev; - f.flags = flags; - /* create list of blks to be erased */ - ret = nvm_fact_select_blks(dev, &f); + ret = nvm_fact_select_blks(dev, blk_bitmap, flags); if (ret) goto err_ppas; /* continue to erase until list of blks until empty */ - while ((ppa_cnt = nvm_fact_get_blks(dev, ppas, max_ppas, &f)) > 0) + while ((ppa_cnt = + nvm_fact_get_blks(dev, ppas, max_ppas, blk_bitmap)) > 0) nvm_erase_ppa(dev, ppas, ppa_cnt); /* mark host reserved blocks free */ @@ -743,7 +737,7 @@ int nvm_dev_factory(struct nvm_dev *dev, int flags) err_ppas: kfree(ppas); err_blks: - kfree(f.blks); + kfree(blk_bitmap); return ret; } EXPORT_SYMBOL(nvm_dev_factory); -- cgit v0.10.2 From 5ebc7d9fe13ff9bd3622d0be3cd39c8751459be6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:07 +0200 Subject: lightnvm: make nvm_set_rqd_ppalist() aware of vblks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A virtual block enables a block to identify multiple physical blocks. This is useful for metadata where the device media supports multiple planes. In that case, a block with multiple planes can be managed as a single vblk, reducing the metadata required to one fourth. nvm_set_rqd_ppalist() takes care of expanding a ppa_list with vblks automatically. However, for some use-cases, where only a single physical block is required, the ppa_list should not be expanded. Therefore, add a vblk parameter to nvm_set_rqd_ppalist(), and only expand the ppa_list if vblk is set.
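To make the expansion concrete, a worked example with illustrative geometry (two planes, two input ppas):

	/*
	 * nr_ppas = 2, plane_mode = NVM_PLANE_DOUBLE:
	 * nvm_set_rqd_ppalist(dev, &rqd, ppas, 2, 1) sets nr_pages = 4 and
	 * builds a plane-major list:
	 *   ppa_list[0] = ppas[0] (pl 0), ppa_list[1] = ppas[1] (pl 0),
	 *   ppa_list[2] = ppas[0] (pl 1), ppa_list[3] = ppas[1] (pl 1)
	 * With vblk = 0 the two ppas are copied 1:1 and nr_pages stays 2.
	 */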
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index e6d7a98..de5db7b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -251,33 +251,36 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) EXPORT_SYMBOL(nvm_generic_to_addr_mode); int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, - struct ppa_addr *ppas, int nr_ppas) + struct ppa_addr *ppas, int nr_ppas, int vblk) { int i, plane_cnt, pl_idx; - if (dev->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_pages = 1; + if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { + rqd->nr_pages = nr_ppas; rqd->ppa_addr = ppas[0]; return 0; } - plane_cnt = dev->plane_mode; - rqd->nr_pages = plane_cnt * nr_ppas; - - if (dev->ops->max_phys_sect < rqd->nr_pages) - return -EINVAL; - + rqd->nr_pages = nr_ppas; rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); if (!rqd->ppa_list) { pr_err("nvm: failed to allocate dma memory\n"); return -ENOMEM; } - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + if (!vblk) { + for (i = 0; i < nr_ppas; i++) + rqd->ppa_list[i] = ppas[i]; + } else { + plane_cnt = dev->plane_mode; + rqd->nr_pages *= plane_cnt; + for (i = 0; i < nr_ppas; i++) { - ppas[i].g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i]; + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppas[i].g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i]; + } } } @@ -304,7 +307,7 @@ int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas) memset(&rqd, 0, sizeof(struct nvm_rq)); - ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas); + ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1); if (ret) return ret; @@ -420,7 +423,7 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, int ret; memset(&rqd, 0, sizeof(struct nvm_rq)); - ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas); + ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas, 1); if (ret) return ret; diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index bca6902..737fbc3 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -277,7 +277,7 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type) memset(&rqd, 0, sizeof(struct nvm_rq)); - nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas); + nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas, 1); nvm_generic_to_addr_mode(dev, &rqd); ret = dev->ops->set_bb_tbl(dev, &rqd, type); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 16d4f2e..9ae0b7c 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -526,7 +526,7 @@ extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, - struct ppa_addr *, int); + struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); -- cgit v0.10.2 From a63d5cf2031cc84443440caf32c175b3548ac6b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:08 +0200 Subject: lightnvm: move responsibility for bad blk mgmt to target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit We move the responsibility of managing the persistent bad block table to the target. The target may choose to mark a block bad or retry writing to it. Nevertheless, it should be the target that makes the decision and not the media manager. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 9c6b141..89b880a 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -419,6 +419,9 @@ static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, struct gen_lun *lun; struct nvm_block *blk; + pr_debug("gennvm: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n", + ppa->g.ch, ppa->g.lun, ppa->g.blk, ppa->g.pg, type); + if (unlikely(ppa->g.ch > dev->nr_chnls || ppa->g.lun > dev->luns_per_chnl || ppa->g.blk > dev->blks_per_lun)) { @@ -437,39 +440,33 @@ static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, blk->state = type; } -/* mark block bad. It is expected the target recover from the error. */ +/* + * mark block bad in gennvm. It is expected that the target recovers separately + */ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) { - int i; - - if (!dev->ops->set_bb_tbl) - return; - - if (dev->ops->set_bb_tbl(dev, rqd, 1)) - return; + int bit = -1; + int max_secs = dev->ops->max_phys_sect; + void *comp_bits = &rqd->ppa_status; nvm_addr_to_generic_mode(dev, rqd); /* look up blocks and mark them as bad */ - if (rqd->nr_pages > 1) - for (i = 0; i < rqd->nr_pages; i++) - gennvm_blk_set_type(dev, &rqd->ppa_list[i], - NVM_BLK_ST_BAD); - else + if (rqd->nr_pages == 1) { gennvm_blk_set_type(dev, &rqd->ppa_addr, NVM_BLK_ST_BAD); + return; + } + + while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs) + gennvm_blk_set_type(dev, &rqd->ppa_list[bit], NVM_BLK_ST_BAD); } static void gennvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_instance *ins = rqd->ins; - switch (rqd->error) { - case NVM_RSP_SUCCESS: - case NVM_RSP_ERR_EMPTYPAGE: - break; - case NVM_RSP_ERR_FAILWRITE: + if (rqd->error == NVM_RSP_ERR_FAILWRITE) gennvm_mark_blk_bad(rqd->dev, rqd); - } ins->tt->end_io(rqd); } -- cgit v0.10.2 From 00ee6cc3b74bdb4dfb71838046517beb6839647c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:09 +0200 Subject: lightnvm: refactor set_bb_tbl for accepting ppa list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The set_bb_tbl op takes a struct nvm_rq and only uses its ppa_list and nr_pages internally. Instead, make these two variables explicit. This allows a user to call it without initializing a struct nvm_rq first.
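For illustration, a minimal sketch of a call that no longer needs a struct nvm_rq; the block address is hypothetical:

	struct ppa_addr ppa;
	int ret;

	ppa.ppa = 0;
	ppa.g.ch = 0;
	ppa.g.lun = 0;
	ppa.g.blk = 7;				/* hypothetical block */
	ppa = generic_to_dev_addr(dev, ppa);	/* op expects device format */
	ret = dev->ops->set_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD);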
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index 737fbc3..b98ca19 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -280,7 +280,7 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type) nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas, 1); nvm_generic_to_addr_mode(dev, &rqd); - ret = dev->ops->set_bb_tbl(dev, &rqd, type); + ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_pages, type); nvm_free_rqd_ppalist(dev, &rqd); if (ret) { pr_err("nvm: sysblk failed bb mark\n"); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 45e3511..d426d95 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -441,8 +441,8 @@ out: return ret; } -static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, - int type) +static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, + int nr_ppas, int type) { struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_nvm_command c = {}; @@ -450,8 +450,8 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; c.set_bb.nsid = cpu_to_le32(ns->ns_id); - c.set_bb.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); + c.set_bb.spba = cpu_to_le64(ppas->ppa); + c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); c.set_bb.value = type; ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 9ae0b7c..af72ca7 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -45,7 +45,7 @@ typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, void *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); -typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); -- cgit v0.10.2 From 293a6e8e270f19f676ac4cbe6c653aaf27146ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:10 +0200 Subject: lightnvm: fix out of bound ppa lun id on bb tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ppa configured for retrieving the bad block table uses the internal lun id to set up the get bad block ppa. This increases monotonically with the number of luns available. When configuring a ppa, the channel and lun must be specified separately, leading to an out-of-bounds memory access in gennvm_block_bb when the lun id goes beyond the luns available within a channel. Additionally, remove the out-of-bounds check in gennvm_block_bb(), as it was buggy to begin with.
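A worked example of the overflow, with hypothetical geometry:

	/*
	 * 2 channels x 4 luns per channel: the lun with global id 5 is the
	 * second lun of channel 1. Using the global id sets ppa.g.lun = 5,
	 * which indexes past the channel's 4 luns; the correct address is
	 * ppa.g.ch = 1, ppa.g.lun = 1 (the per-channel lun_id).
	 */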
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 89b880a..61790ae 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -148,11 +148,6 @@ static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa, continue; blk = &lun->vlun.blocks[i]; - if (!blk) { - pr_err("gennvm: BB data is out of bounds.\n"); - return -EINVAL; - } - list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; lun->vlun.nr_free_blocks--; @@ -257,7 +252,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa.ppa = 0; ppa.g.ch = lun->vlun.chnl_id; - ppa.g.lun = lun->vlun.id; + ppa.g.lun = lun->vlun.lun_id; ret = nvm_get_bb_tbl(dev, ppa, blks); if (ret) -- cgit v0.10.2 From 57682b49154921306a5e7b79c883781230698878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 6 May 2016 20:03:11 +0200 Subject: lightnvm: do not free unused metadata on rrpc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rrpc does not save any metadata on a given request. Thus, do not attempt to free the metadata dma region. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index c7fef71..ffcfee6 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -711,8 +711,6 @@ static void rrpc_end_io(struct nvm_rq *rqd) if (npages > 1) nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list); - if (rqd->metadata) - nvm_dev_dma_free(rrpc->dev, rqd->metadata, rqd->dma_metadata); mempool_free(rqd, rrpc->rq_pool); } -- cgit v0.10.2 From 003fad376b924fc3a61c659f38da70356ec144fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 6 May 2016 20:03:12 +0200 Subject: lightnvm: enable metadata to be sent to device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable the metadata buffer to be sent to the device through the metadata field on the physical rw nvme command. The size of the metadata buffer must be dev->oob_size * the number of PPAs. Signed-off-by: Javier González Updated description.
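For illustration, a minimal sketch of sizing and attaching such a buffer; it assumes the device's dma pool entries are large enough for meta_len:

	int meta_len = dev->oob_size * rqd->nr_pages;	/* one OOB area per PPA */

	rqd->meta_list = nvm_dev_dma_alloc(dev, GFP_KERNEL,
						&rqd->dma_meta_list);
	if (!rqd->meta_list)
		return -ENOMEM;
	memset(rqd->meta_list, 0, meta_len);	/* fill with OOB data pre-submit */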
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index d426d95..69a47fb 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -467,6 +467,7 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, c->ph_rw.opcode = rqd->opcode; c->ph_rw.nsid = cpu_to_le32(ns->ns_id); c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); + c->ph_rw.metadata = cpu_to_le64(rqd->meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); c->ph_rw.length = cpu_to_le16(rqd->nr_pages - 1); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index af72ca7..678df4d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -230,8 +230,8 @@ struct nvm_rq { struct ppa_addr *ppa_list; - void *metadata; - dma_addr_t dma_metadata; + void *meta_list; + dma_addr_t dma_meta_list; struct completion *wait; nvm_end_io_fn *end_io; -- cgit v0.10.2 From 75b8564932ef646e8620deffacbe134846333948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 6 May 2016 20:03:13 +0200 Subject: lightnvm: rename dma helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now, the dma pool has been used exclusively to allocate the ppa list being sent to the device. In pblk (upcoming), we use these pools to allocate metadata too. Thus, we generalize the names of some variables in the dma helper functions to make the code more readable. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index de5db7b..1873a3b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -87,15 +87,15 @@ EXPORT_SYMBOL(nvm_unregister_tgt_type); void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, dma_addr_t *dma_handler) { - return dev->ops->dev_dma_alloc(dev, dev->ppalist_pool, mem_flags, + return dev->ops->dev_dma_alloc(dev, dev->dma_pool, mem_flags, dma_handler); } EXPORT_SYMBOL(nvm_dev_dma_alloc); -void nvm_dev_dma_free(struct nvm_dev *dev, void *ppa_list, +void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler) { - dev->ops->dev_dma_free(dev->ppalist_pool, ppa_list, dma_handler); + dev->ops->dev_dma_free(dev->dma_pool, addr, dma_handler); } EXPORT_SYMBOL(nvm_dev_dma_free); @@ -652,8 +652,8 @@ err: static void nvm_exit(struct nvm_dev *dev) { - if (dev->ppalist_pool) - dev->ops->destroy_dma_pool(dev->ppalist_pool); + if (dev->dma_pool) + dev->ops->destroy_dma_pool(dev->dma_pool); nvm_free(dev); pr_info("nvm: successfully unloaded\n"); @@ -687,9 +687,9 @@ int nvm_register(struct request_queue *q, char *disk_name, } if (dev->ops->max_phys_sect > 1) { - dev->ppalist_pool = dev->ops->create_dma_pool(dev, "ppalist"); - if (!dev->ppalist_pool) { - pr_err("nvm: could not create ppa pool\n"); + dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist"); + if (!dev->dma_pool) { + pr_err("nvm: could not create dma pool\n"); ret = -ENOMEM; goto err_init; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 69a47fb..8461f5a 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -565,10 +565,10 @@ static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, return dma_pool_alloc(pool, mem_flags, dma_handler); } -static void nvme_nvm_dev_dma_free(void *pool, void *ppa_list, +static void nvme_nvm_dev_dma_free(void *pool, void *addr, dma_addr_t dma_handler) {
- dma_pool_free(pool, ppa_list, dma_handler); + dma_pool_free(pool, addr, dma_handler); } static struct nvm_dev_ops nvme_nvm_dev_ops = { diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 678df4d..0e8e019 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -347,7 +347,7 @@ struct nvm_dev { unsigned max_pages_per_blk; unsigned long *lun_map; - void *ppalist_pool; + void *dma_pool; struct nvm_id identity; -- cgit v0.10.2 From b86d8d363ece8fb9a7ec7e951c7f041f47f0f552 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 6 May 2016 20:03:14 +0200 Subject: nvme/lightnvm: Log using the ctrl named device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align with the rest of the nvme subsystem. Signed-off-by: Sagi Grimberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 8461f5a..fadeb54 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -367,8 +367,8 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, entries, len); if (ret) { - dev_err(ns->ctrl->dev, "L2P table transfer failed (%d)\n", - ret); + dev_err(ns->ctrl->device, + "L2P table transfer failed (%d)\n", ret); ret = -EIO; goto out; } @@ -410,27 +410,28 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { - dev_err(ctrl->dev, "get bad block table failed (%d)\n", ret); + dev_err(ctrl->device, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(ctrl->dev, "bbt format mismatch\n"); + dev_err(ctrl->device, "bbt format mismatch\n"); ret = -EINVAL; goto out; } if (le16_to_cpu(bb_tbl->verid) != 1) { ret = -EINVAL; - dev_err(ctrl->dev, "bbt version not supported\n"); + dev_err(ctrl->device, "bbt version not supported\n"); goto out; } if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { ret = -EINVAL; - dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", + dev_err(ctrl->device, + "bbt unsuspected blocks returned (%u!=%u)", le32_to_cpu(bb_tbl->tblks), nr_blks); goto out; } @@ -457,7 +458,8 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, NULL, 0); if (ret) - dev_err(ns->ctrl->dev, "set bad block table failed (%d)\n", ret); + dev_err(ns->ctrl->device, "set bad block table failed (%d)\n", + ret); return ret; } -- cgit v0.10.2 From cca87bc9d359f1a03ac34ca095343d1bfa2e6b15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 6 May 2016 20:03:15 +0200 Subject: lightnvm: do not assume sequential lun alloc. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When doing GC, rrpc calculates the physical LUN to which the rrpc block belongs to. This calculation is based on the assumption that LUNs are assigned sequentially to the LUN list. Use the reference to the LUN instead. This saves us the calculation and allows us to align LUNs in a different manner to, for example, take advantage of device parallelism.
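A short sketch of the assumption being dropped; the offset value is hypothetical:

	/*
	 * Removed pattern: with rrpc->lun_offset = 4, a block on physical
	 * lun id 6 resolved to &rrpc->luns[6 - 4], which only works when
	 * luns are handed to the target sequentially. The back-pointer
	 * works for any assignment:
	 */
	struct rrpc_lun *rlun = rblk->rlun;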
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index ffcfee6..48862ead 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -405,9 +405,8 @@ static void rrpc_block_gc(struct work_struct *work) ws_gc); struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; + struct rrpc_lun *rlun = rblk->rlun; struct nvm_dev *dev = rrpc->dev; - struct nvm_lun *lun = rblk->parent->lun; - struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset]; mempool_free(gcb, rrpc->gcb_pool); pr_debug("nvm: block '%lu' being reclaimed\n", rblk->parent->id); @@ -508,9 +507,9 @@ static void rrpc_gc_queue(struct work_struct *work) ws_gc); struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; + struct rrpc_lun *rlun = rblk->rlun; struct nvm_lun *lun = rblk->parent->lun; struct nvm_block *blk = rblk->parent; - struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset]; spin_lock(&rlun->lock); list_add_tail(&rblk->prio, &rlun->prio_list); -- cgit v0.10.2 From 45bbd0529e5d50aa91a4bb5fffffc5e24df3ef7e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 May 2016 20:03:16 +0200 Subject: lightnvm: pass dma address to hardware rather than pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A recent change to lightnvm added code to pass a kernel pointer to the hardware, which gcc complained about: drivers/nvme/host/lightnvm.c: In function 'nvme_nvm_rqtocmd': drivers/nvme/host/lightnvm.c:472:32: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] c->ph_rw.metadata = cpu_to_le64(rqd->meta_list); It looks like this has no way of working anyway, so this changes the code to pass the dma_address instead. This was most likely what was intended here. Neither of the two are currently ever written to, so the effect is the same for now. Signed-off-by: Arnd Bergmann Fixes: a34b1eb78e21 ("lightnvm: enable metadata to be sent to device") Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index fadeb54..65de1e5 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -469,7 +469,7 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, c->ph_rw.opcode = rqd->opcode; c->ph_rw.nsid = cpu_to_le32(ns->ns_id); c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c->ph_rw.metadata = cpu_to_le64(rqd->meta_list); + c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); c->ph_rw.length = cpu_to_le16(rqd->nr_pages - 1); -- cgit v0.10.2 From 976bdfcae32ea10c2c8c2ecaeb0d85873f634dad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:17 +0200 Subject: lightnvm: remove mgt targets on mgt removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Targets associated with a device manager are not freed on device removal. They have to be manually removed before shutdown. Make sure any outstanding targets are freed upon shutdown. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 1873a3b..d3af771 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -596,13 +596,52 @@ err_fmtype: return ret; } +static void nvm_remove_target(struct nvm_target *t) +{ + struct nvm_tgt_type *tt = t->type; + struct gendisk *tdisk = t->disk; + struct request_queue *q = tdisk->queue; + + lockdep_assert_held(&nvm_lock); + + del_gendisk(tdisk); + blk_cleanup_queue(q); + + if (tt->exit) + tt->exit(tdisk->private_data); + + put_disk(tdisk); + + list_del(&t->list); + kfree(t); +} + +static void nvm_free_mgr(struct nvm_dev *dev) +{ + struct nvm_target *tgt, *tmp; + + if (!dev->mt) + return; + + down_write(&nvm_lock); + list_for_each_entry_safe(tgt, tmp, &nvm_targets, list) { + if (tgt->dev != dev) + continue; + + nvm_remove_target(tgt); + } + up_write(&nvm_lock); + + dev->mt->unregister_mgr(dev); + dev->mt = NULL; +} + static void nvm_free(struct nvm_dev *dev) { if (!dev) return; - if (dev->mt) - dev->mt->unregister_mgr(dev); + nvm_free_mgr(dev); kfree(dev->lptbl); kfree(dev->lun_map); @@ -808,6 +847,7 @@ static int nvm_create_target(struct nvm_dev *dev, t->type = tt; t->disk = tdisk; + t->dev = dev; down_write(&nvm_lock); list_add_tail(&t->list, &nvm_targets); @@ -823,26 +863,6 @@ err_t: return -ENOMEM; } -static void nvm_remove_target(struct nvm_target *t) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk *tdisk = t->disk; - struct request_queue *q = tdisk->queue; - - lockdep_assert_held(&nvm_lock); - - del_gendisk(tdisk); - blk_cleanup_queue(q); - - if (tt->exit) - tt->exit(tdisk->private_data); - - put_disk(tdisk); - - list_del(&t->list); - kfree(t); -} - static int __nvm_configure_create(struct nvm_ioctl_create *create) { struct nvm_dev *dev; @@ -1231,10 +1251,7 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) return -EINVAL; } - if (dev->mt) { - dev->mt->unregister_mgr(dev); - dev->mt = NULL; - } + nvm_free_mgr(dev); if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) return nvm_dev_factory(dev, fact.flags); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 0e8e019..cde31ff 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -200,6 +200,7 @@ struct nvm_id { struct nvm_target { struct list_head list; + struct nvm_dev *dev; struct nvm_tgt_type *type; struct gendisk *disk; }; -- cgit v0.10.2 From 04a8aa173bd9410849526a80dcf733da9c3e142d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 May 2016 20:03:18 +0200 Subject: lightnvm: expose gennvm_mark_blk to targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Targets can update a block's state when they have a reference to an in-memory virtual block. In the case that a target does not keep the block metadata in memory, it does not have a way to update this structure. Therefore, expose gennvm_mark_blk() through the media manager's ->mark_blk() callback and let targets update the state structure through this callback.
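For illustration, a minimal sketch of a target using the new callback, guarding against media managers that do not implement it:

	/* flag a block grown-bad after a failed write completion */
	if (dev->mt->mark_blk)
		dev->mt->mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);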
From 04a8aa173bd9410849526a80dcf733da9c3e142d Mon Sep 17 00:00:00 2001
From: Matias Bjørling
Date: Fri, 6 May 2016 20:03:18 +0200
Subject: lightnvm: expose gennvm_mark_blk to targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Targets can update a block's state when they hold a reference to an
in-memory virtual block. If a target does not keep the block metadata
in memory, it has no way to update this structure. Therefore, expose
gennvm_mark_blk() through the media manager's ->mark_blk() callback and
let targets update the state structure through it.

Signed-off-by: Matias Bjørling
Signed-off-by: Jens Axboe

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 61790ae..33a66d8c0 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -407,29 +407,28 @@ static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
 	spin_unlock(&vlun->lock);
 }
 
-static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa,
-								int type)
+static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 {
 	struct gen_nvm *gn = dev->mp;
 	struct gen_lun *lun;
 	struct nvm_block *blk;
 
 	pr_debug("gennvm: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n",
-			ppa->g.ch, ppa->g.lun, ppa->g.blk, ppa->g.pg, type);
+			ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type);
 
-	if (unlikely(ppa->g.ch > dev->nr_chnls ||
-			ppa->g.lun > dev->luns_per_chnl ||
-			ppa->g.blk > dev->blks_per_lun)) {
+	if (unlikely(ppa.g.ch > dev->nr_chnls ||
+			ppa.g.lun > dev->luns_per_chnl ||
+			ppa.g.blk > dev->blks_per_lun)) {
 		WARN_ON_ONCE(1);
 		pr_err("gennvm: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u",
-				ppa->g.ch, dev->nr_chnls,
-				ppa->g.lun, dev->luns_per_chnl,
-				ppa->g.blk, dev->blks_per_lun);
+				ppa.g.ch, dev->nr_chnls,
+				ppa.g.lun, dev->luns_per_chnl,
+				ppa.g.blk, dev->blks_per_lun);
 		return;
 	}
 
-	lun = &gn->luns[ppa->g.lun * ppa->g.ch];
-	blk = &lun->vlun.blocks[ppa->g.blk];
+	lun = &gn->luns[ppa.g.lun * ppa.g.ch];
+	blk = &lun->vlun.blocks[ppa.g.blk];
 
 	/* will be moved to bb list on put_blk from target */
 	blk->state = type;
@@ -448,12 +447,12 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
 
 	/* look up blocks and mark them as bad */
 	if (rqd->nr_pages == 1) {
-		gennvm_blk_set_type(dev, &rqd->ppa_addr, NVM_BLK_ST_BAD);
+		gennvm_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);
 		return;
 	}
 
 	while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs)
-		gennvm_blk_set_type(dev, &rqd->ppa_list[bit], NVM_BLK_ST_BAD);
+		gennvm_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD);
 }
 
 static void gennvm_end_io(struct nvm_rq *rqd)
@@ -544,6 +543,8 @@ static struct nvmm_type gennvm = {
 	.submit_io	= gennvm_submit_io,
 	.erase_blk	= gennvm_erase_blk,
 
+	.mark_blk	= gennvm_mark_blk,
+
 	.get_lun	= gennvm_get_lun,
 	.reserve_lun	= gennvm_reserve_lun,
 	.release_lun	= gennvm_release_lun,

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index cde31ff..3c53911 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -467,6 +467,7 @@ typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *,
 								unsigned long);
+typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
 typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int);
 typedef int (nvmm_reserve_lun)(struct nvm_dev *, int);
 typedef void (nvmm_release_lun)(struct nvm_dev *, int);
@@ -494,6 +495,9 @@ struct nvmm_type {
 	nvmm_submit_io_fn *submit_io;
 	nvmm_erase_blk_fn *erase_blk;
 
+	/* Bad block mgmt */
+	nvmm_mark_blk_fn *mark_blk;
+
 	/* Configuration management */
 	nvmm_get_lun_fn *get_lun;
 	nvmm_reserve_lun *reserve_lun;
--
cgit v0.10.2
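With ->mark_blk() exposed, a target that tracks only physical addresses can
flag a grown bad block without ever holding a struct nvm_block. A hedged
sketch of a caller (the wrapper name is hypothetical; dev->mt->mark_blk and
NVM_BLK_ST_BAD come from the patch above):

/* Sketch: report a bad block to the media manager by PPA alone. */
static void example_mark_bad(struct nvm_dev *dev, struct ppa_addr ppa)
{
	if (dev->mt && dev->mt->mark_blk)
		dev->mt->mark_blk(dev, ppa, NVM_BLK_ST_BAD);
}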
From df414b33bb1eb3a0ae52ccd4ecfec9323a4f89dc Mon Sep 17 00:00:00 2001
From: Matias Bjørling
Date: Fri, 6 May 2016 20:03:19 +0200
Subject: lightnvm: add is_cached entry to struct ppa_addr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A target requires a method to identify PPAs that are either cached in
memory or stored on disk. This can be maintained efficiently within the
PPA itself. The target's host-side translation table can then look up a
PPA and tell from it whether the entry is cached or on disk. If it is
cached, it is the target's responsibility to maintain this cache.

Signed-off-by: Matias Bjørling
Signed-off-by: Jens Axboe

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 3c53911..3a810d7 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -18,7 +18,7 @@ enum {
 #define NVM_SEC_BITS (8)
 #define NVM_PL_BITS (8)
 #define NVM_LUN_BITS (8)
-#define NVM_CH_BITS (8)
+#define NVM_CH_BITS (7)
 
 struct ppa_addr {
 	/* Generic structure for all addresses */
@@ -30,8 +30,14 @@ struct ppa_addr {
 			u64 pl		: NVM_PL_BITS;
 			u64 lun		: NVM_LUN_BITS;
 			u64 ch		: NVM_CH_BITS;
+			u64 reserved	: 1;
 		} g;
 
+		struct {
+			u64 line	: 63;
+			u64 is_cached	: 1;
+		} c;
+
 		u64 ppa;
 	};
 };
--
cgit v0.10.2
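Because g and c are views of the same u64, a host-side translation table can
store either a device address or a cache reference in one word. A hedged
sketch of how a target might use the c view (the cache-line numbering scheme
and helper names are illustrative, not part of the patch):

/* Sketch: encode a host-side cache entry in a ppa_addr and test for it. */
static struct ppa_addr example_cached_ppa(u64 cacheline)
{
	struct ppa_addr p;

	p.ppa = 0;
	p.c.line = cacheline;	/* target-assigned cache line number */
	p.c.is_cached = 1;	/* top bit: entry lives in host memory */

	return p;
}

static bool example_ppa_is_cached(struct ppa_addr p)
{
	return p.c.is_cached;
}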
From 6d5be9590b5e15124e3c8b319c8d7ce01abcf07d Mon Sep 17 00:00:00 2001
From: Javier González
Date: Fri, 6 May 2016 20:03:20 +0200
Subject: lightnvm: rename nr_pages to nr_ppas on nvm_rq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The number of PPAs contained in a request is not necessarily the number
of pages that it maps to, on either the target or the device side. To
avoid confusion, rename nr_pages to nr_ppas, since that is what the
variable actually contains.

Signed-off-by: Javier González
Signed-off-by: Matias Bjørling
Signed-off-by: Jens Axboe

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index d3af771..160c1a6 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -226,8 +226,8 @@ void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
 {
 	int i;
 
-	if (rqd->nr_pages > 1) {
-		for (i = 0; i < rqd->nr_pages; i++)
+	if (rqd->nr_ppas > 1) {
+		for (i = 0; i < rqd->nr_ppas; i++)
 			rqd->ppa_list[i] = dev_to_generic_addr(dev,
 							rqd->ppa_list[i]);
 	} else {
@@ -240,8 +240,8 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
 {
 	int i;
 
-	if (rqd->nr_pages > 1) {
-		for (i = 0; i < rqd->nr_pages; i++)
+	if (rqd->nr_ppas > 1) {
+		for (i = 0; i < rqd->nr_ppas; i++)
 			rqd->ppa_list[i] = generic_to_dev_addr(dev,
 							rqd->ppa_list[i]);
 	} else {
@@ -256,13 +256,13 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 	int i, plane_cnt, pl_idx;
 
 	if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
-		rqd->nr_pages = nr_ppas;
+		rqd->nr_ppas = nr_ppas;
 		rqd->ppa_addr = ppas[0];
 
 		return 0;
 	}
 
-	rqd->nr_pages = nr_ppas;
+	rqd->nr_ppas = nr_ppas;
 	rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
 	if (!rqd->ppa_list) {
 		pr_err("nvm: failed to allocate dma memory\n");
@@ -274,7 +274,7 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 			rqd->ppa_list[i] = ppas[i];
 	} else {
 		plane_cnt = dev->plane_mode;
-		rqd->nr_pages *= plane_cnt;
+		rqd->nr_ppas *= plane_cnt;
 
 		for (i = 0; i < nr_ppas; i++) {
 			for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
@@ -395,7 +395,7 @@ int nvm_submit_ppa_list(struct nvm_dev *dev, struct ppa_addr *ppa_list,
 
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
-	rqd.nr_pages = nr_ppas;
+	rqd.nr_ppas = nr_ppas;
 	if (nr_ppas > 1)
 		rqd.ppa_list = ppa_list;
 	else

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 33a66d8c0..ec9fb68 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -446,7 +446,7 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
 	nvm_addr_to_generic_mode(dev, rqd);
 
 	/* look up blocks and mark them as bad */
-	if (rqd->nr_pages == 1) {
+	if (rqd->nr_ppas == 1) {
 		gennvm_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);
 		return;
 	}

diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 48862ead..72aca96 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -695,7 +695,7 @@ static void rrpc_end_io(struct nvm_rq *rqd)
 {
 	struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance);
 	struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
-	uint8_t npages = rqd->nr_pages;
+	uint8_t npages = rqd->nr_ppas;
 	sector_t laddr = rrpc_get_laddr(rqd->bio) - npages;
 
 	if (bio_data_dir(rqd->bio) == WRITE)
@@ -883,7 +883,7 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 	bio_get(bio);
 	rqd->bio = bio;
 	rqd->ins = &rrpc->instance;
-	rqd->nr_pages = nr_pages;
+	rqd->nr_ppas = nr_pages;
 	rrq->flags = flags;
 
 	err = nvm_submit_io(rrpc->dev, rqd);
@@ -892,7 +892,7 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 		bio_put(bio);
 		if (!(flags & NVM_IOTYPE_GC)) {
 			rrpc_unlock_rq(rrpc, rqd);
-			if (rqd->nr_pages > 1)
+			if (rqd->nr_ppas > 1)
 				nvm_dev_dma_free(rrpc->dev,
 					rqd->ppa_list, rqd->dma_ppa_list);
 		}

diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index 2653484..87e84b5 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -251,7 +251,7 @@ static inline void rrpc_unlock_laddr(struct rrpc *rrpc,
 static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd)
 {
 	struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
-	uint8_t pages = rqd->nr_pages;
+	uint8_t pages = rqd->nr_ppas;
 
 	BUG_ON((r->l_start + pages) > rrpc->nr_sects);

diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index b98ca19..994697a 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -280,7 +280,7 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type)
 	nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas, 1);
 	nvm_generic_to_addr_mode(dev, &rqd);
 
-	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_pages, type);
+	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
 	nvm_free_rqd_ppalist(dev, &rqd);
 	if (ret) {
 		pr_err("nvm: sysblk failed bb mark\n");

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 65de1e5..a0af055 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -471,7 +471,7 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
 	c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa);
 	c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
 	c->ph_rw.control = cpu_to_le16(rqd->flags);
-	c->ph_rw.length = cpu_to_le16(rqd->nr_pages - 1);
+	c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
 
 	if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
 		c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
@@ -542,7 +542,7 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
 	c.erase.opcode = NVM_OP_ERASE;
 	c.erase.nsid = cpu_to_le32(ns->ns_id);
 	c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa);
-	c.erase.length = cpu_to_le16(rqd->nr_pages - 1);
+	c.erase.length = cpu_to_le16(rqd->nr_ppas - 1);
 
 	return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0);
 }

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 3a810d7..b2991c7 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -244,7 +244,7 @@ struct nvm_rq {
 	nvm_end_io_fn *end_io;
 
 	uint8_t opcode;
-	uint16_t nr_pages;
+	uint16_t nr_ppas;
 	uint16_t flags;
 
 	u64 ppa_status; /* ppa media status */
--
cgit v0.10.2
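The distinction the rename captures is visible in nvm_set_rqd_ppalist()
above: in multi-plane mode each caller-supplied address is expanded across
all planes, so the PPA count grows while the page count does not. A small
hedged sketch of that arithmetic (the numbers and helper are hypothetical,
not driver code):

/* Sketch: 4 addresses on a dual-plane device become 8 PPAs while
 * still mapping 4 logical pages, mirroring rqd->nr_ppas *= plane_cnt.
 */
static u16 example_expanded_ppas(u16 nr_addrs, int plane_cnt)
{
	return nr_addrs * plane_cnt;	/* e.g. 4 * 2 == 8 */
}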
From 116f7d4a21fe450efc652c4850eb27cda36c9db0 Mon Sep 17 00:00:00 2001
From: Javier González
Date: Fri, 6 May 2016 20:03:21 +0200
Subject: lightnvm: reserved space calculation incorrect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nvm_dev->max_pages_per_blk variable was removed in favor of the new
nvm->sec_per_blk variable, but ->max_pages_per_blk was still used in
rrpc_capacity, causing the reserved capacity to be reported as zero.
Replace it with ->sec_per_blk so the reserved area is calculated
correctly again.

Signed-off-by: Javier González

Updated patch description. Was "lightnvm: eliminate redundant variable"

Signed-off-by: Matias Bjørling
Signed-off-by: Jens Axboe

diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 72aca96..2103e97 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -1264,7 +1264,7 @@ static sector_t rrpc_capacity(void *private)
 	sector_t reserved, provisioned;
 
 	/* cur, gc, and two emergency blocks for each lun */
-	reserved = rrpc->nr_luns * dev->max_pages_per_blk * 4;
+	reserved = rrpc->nr_luns * dev->sec_per_blk * 4;
 	provisioned = rrpc->nr_sects - reserved;
 
 	if (reserved > rrpc->nr_sects) {

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index b2991c7..ef2c7d2 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -351,7 +351,6 @@ struct nvm_dev {
 	unsigned long total_blocks;
 	unsigned long total_secs;
 	int nr_luns;
-	unsigned max_pages_per_blk;
 	unsigned long *lun_map;
 
 	void *dma_pool;
--
cgit v0.10.2
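A worked example of the bug, using hypothetical geometry: the stale
->max_pages_per_blk field was no longer populated, so it read as 0 and the
reserved area collapsed to 0 sectors, exporting space rrpc needs for its
cur, gc, and emergency blocks. A sketch of the corrected rule (helper name
and numbers are illustrative only):

/* Sketch: reserved space per the rrpc_capacity() rule above
 * ("cur, gc, and two emergency blocks for each lun").
 */
static sector_t example_reserved(int nr_luns, int sec_per_blk)
{
	return (sector_t)nr_luns * sec_per_blk * 4;
}

/* example_reserved(8, 4096) == 131072 reserved sectors, where the
 * stale variable yielded 8 * 0 * 4 == 0.
 */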