From 4ead1a25ce436f73501d6358450f3ba0c1cb2ce3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:35 +0100 Subject: MAINTAINERS: Add linux-block list to LightNVM for patches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/MAINTAINERS b/MAINTAINERS index e9caa4b..018865e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6371,6 +6371,7 @@ F: arch/*/include/asm/pmem.h LIGHTNVM PLATFORM SUPPORT M: Matias Bjorling W: http://github/OpenChannelSSD +L: linux-block@vger.kernel.org S: Maintained F: drivers/lightnvm/ F: include/linux/lightnvm.h -- cgit v0.10.2 From aedf17f4515b12ba1cd73298e66baa69cf93010e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:36 +0100 Subject: lightnvm: change max_phys_sect to uint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The max_phys_sect variable is defined as a char. We do a boundary check to maximally allow 256 physical page descriptors per command. As we are not indexing from zero. This expression is always false. Bump the max_phys_sect to an unsigned int to support the range check. Signed-off-by: Matias Bjørling Reported-by: Geert Uytterhoeven Signed-off-by: Jens Axboe diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 69c9057..32b5369 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -220,7 +220,7 @@ struct nvm_dev_ops { nvm_dev_dma_alloc_fn *dev_dma_alloc; nvm_dev_dma_free_fn *dev_dma_free; - uint8_t max_phys_sect; + unsigned int max_phys_sect; }; struct nvm_lun { -- cgit v0.10.2 From 11450469830f2481a9e7cb181609288d40f41323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:37 +0100 Subject: lightnvm: update bad block table format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification was changed to reflect a multi-value bad block table. Instead of bit-based bad block table, the bad block table now allows eight bad block categories. Currently four are defined: * Factory bad blocks * Grown bad blocks * Device-side reserved blocks * Host-side reserved blocks The factory and grown bad blocks are the regular bad blocks. The reserved blocks are either for internal use or external use. In particular, the device-side reserved blocks allows the host to bootstrap from a limited number of flash blocks. Reducing the flash blocks to scan upon super block initialization. Support for both get bad block table and set bad block table is added. 
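As a rough illustration of the new table layout (not part of the patch), each block is now described by one byte rather than one bit. The enum names and values below are assumptions chosen to mirror the four categories listed above; the gennvm code in this patch only distinguishes zero (good) from non-zero (bad or reserved).

#include <stdint.h>

/* Hypothetical per-block category codes; only the zero/non-zero
 * distinction is actually used by gennvm_block_bb() in this patch. */
enum {
	BBT_GOOD      = 0x0,	/* usable block                */
	BBT_FACTORY   = 0x1,	/* factory bad block           */
	BBT_GROWN     = 0x2,	/* grown bad block             */
	BBT_DEV_RSVD  = 0x4,	/* device-side reserved block  */
	BBT_HOST_RSVD = 0x8,	/* host-side reserved block    */
};

/* Count blocks the host may actually use. */
static int count_usable_blocks(const uint8_t *blks, int nr_blocks)
{
	int i, usable = 0;

	for (i = 0; i < nr_blocks; i++)
		if (blks[i] == BBT_GOOD)
			usable++;
	return usable;
}
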
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ae1fb2b..8cfc011 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -64,19 +64,22 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) return 0; } -static int gennvm_block_bb(u32 lun_id, void *bb_bitmap, unsigned int nr_blocks, +static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, void *private) { struct gen_nvm *gn = private; - struct gen_lun *lun = &gn->luns[lun_id]; + struct nvm_dev *dev = gn->dev; + struct gen_lun *lun; struct nvm_block *blk; int i; - if (unlikely(bitmap_empty(bb_bitmap, nr_blocks))) - return 0; + ppa = addr_to_generic_mode(gn->dev, ppa); + lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; + + for (i = 0; i < nr_blocks; i++) { + if (blks[i] == 0) + continue; - i = -1; - while ((i = find_next_bit(bb_bitmap, nr_blocks, i + 1)) < nr_blocks) { blk = &lun->vlun.blocks[i]; if (!blk) { pr_err("gennvm: BB data is out of bounds.\n"); @@ -171,8 +174,16 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } if (dev->ops->get_bb_tbl) { - ret = dev->ops->get_bb_tbl(dev->q, lun->vlun.id, - dev->blks_per_lun, gennvm_block_bb, gn); + struct ppa_addr ppa; + + ppa.ppa = 0; + ppa.g.ch = lun->vlun.chnl_id; + ppa.g.lun = lun->vlun.id; + ppa = generic_to_addr_mode(dev, ppa); + + ret = dev->ops->get_bb_tbl(dev->q, ppa, + dev->blks_per_lun, + gennvm_block_bb, gn); if (ret) pr_err("gennvm: could not read BB table\n"); } @@ -199,6 +210,7 @@ static int gennvm_register(struct nvm_dev *dev) if (!gn) return -ENOMEM; + gn->dev = dev; gn->nr_luns = dev->nr_luns; dev->mp = gn; @@ -354,10 +366,10 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) { int i; - if (!dev->ops->set_bb) + if (!dev->ops->set_bb_tbl) return; - if (dev->ops->set_bb(dev->q, rqd, 1)) + if (dev->ops->set_bb_tbl(dev->q, rqd, 1)) return; gennvm_addr_to_generic_mode(dev, rqd); diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h index d23bd35..9c24b5b 100644 --- a/drivers/lightnvm/gennvm.h +++ b/drivers/lightnvm/gennvm.h @@ -35,6 +35,8 @@ struct gen_lun { }; struct gen_nvm { + struct nvm_dev *dev; + int nr_luns; struct gen_lun *luns; }; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index e0b7b95..2c35465 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -93,7 +93,7 @@ struct nvme_nvm_l2ptbl { __le16 cdw14[6]; }; -struct nvme_nvm_bbtbl { +struct nvme_nvm_getbbtbl { __u8 opcode; __u8 flags; __u16 command_id; @@ -101,10 +101,23 @@ struct nvme_nvm_bbtbl { __u64 rsvd[2]; __le64 prp1; __le64 prp2; - __le32 prp1_len; - __le32 prp2_len; - __le32 lbb; - __u32 rsvd11[3]; + __le64 spba; + __u32 rsvd4[4]; +}; + +struct nvme_nvm_setbbtbl { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 nlb; + __u8 value; + __u8 rsvd3; + __u32 rsvd4[3]; }; struct nvme_nvm_erase_blk { @@ -129,8 +142,8 @@ struct nvme_nvm_command { struct nvme_nvm_hb_rw hb_rw; struct nvme_nvm_ph_rw ph_rw; struct nvme_nvm_l2ptbl l2p; - struct nvme_nvm_bbtbl get_bb; - struct nvme_nvm_bbtbl set_bb; + struct nvme_nvm_getbbtbl get_bb; + struct nvme_nvm_setbbtbl set_bb; struct nvme_nvm_erase_blk erase; }; }; @@ -187,6 +200,20 @@ struct nvme_nvm_id { struct nvme_nvm_id_group groups[4]; } __packed; +struct nvme_nvm_bb_tbl { + __u8 tblid[4]; + __le16 verid; + __le16 revid; + __le32 rvsd1; + __le32 
tblks; + __le32 tfact; + __le32 tgrown; + __le32 tdresv; + __le32 thresv; + __le32 rsvd2[8]; + __u8 blk[0]; +}; + /* * Check we didn't inadvertently grow the command struct */ @@ -195,12 +222,14 @@ static inline void _nvme_nvm_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_bbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 128); BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 512); } static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) @@ -322,43 +351,80 @@ out: return ret; } -static int nvme_nvm_get_bb_tbl(struct request_queue *q, int lunid, - unsigned int nr_blocks, - nvm_bb_update_fn *update_bbtbl, void *priv) +static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, + int nr_blocks, nvm_bb_update_fn *update_bbtbl, + void *priv) { struct nvme_ns *ns = q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; - void *bb_bitmap; - u16 bb_bitmap_size; + struct nvme_nvm_bb_tbl *bb_tbl; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; c.get_bb.nsid = cpu_to_le32(ns->ns_id); - c.get_bb.lbb = cpu_to_le32(lunid); - bb_bitmap_size = ((nr_blocks >> 15) + 1) * PAGE_SIZE; - bb_bitmap = kmalloc(bb_bitmap_size, GFP_KERNEL); - if (!bb_bitmap) - return -ENOMEM; + c.get_bb.spba = cpu_to_le64(ppa.ppa); - bitmap_zero(bb_bitmap, nr_blocks); + bb_tbl = kzalloc(tblsz, GFP_KERNEL); + if (!bb_tbl) + return -ENOMEM; - ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, bb_bitmap, - bb_bitmap_size); + ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { dev_err(dev->dev, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } - ret = update_bbtbl(lunid, bb_bitmap, nr_blocks, priv); + if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || + bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { + dev_err(dev->dev, "bbt format mismatch\n"); + ret = -EINVAL; + goto out; + } + + if (le16_to_cpu(bb_tbl->verid) != 1) { + ret = -EINVAL; + dev_err(dev->dev, "bbt version not supported\n"); + goto out; + } + + if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { + ret = -EINVAL; + dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)", + le32_to_cpu(bb_tbl->tblks), nr_blocks); + goto out; + } + + ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); if (ret) { ret = -EINTR; goto out; } out: - kfree(bb_bitmap); + kfree(bb_tbl); + return ret; +} + +static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, + int type) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_dev *dev = ns->dev; + struct nvme_nvm_command c = {}; + int ret = 0; + + c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; + c.set_bb.nsid = cpu_to_le32(ns->ns_id); + c.set_bb.spba = cpu_to_le64(rqd->ppa_addr.ppa); + c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); + c.set_bb.value = type; + + ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); + if (ret) + dev_err(dev->dev, "set bad block table failed (%d)\n", ret); return ret; } @@ -474,6 +540,7 @@ static 
struct nvm_dev_ops nvme_nvm_dev_ops = { .get_l2p_tbl = nvme_nvm_get_l2p_tbl, .get_bb_tbl = nvme_nvm_get_bb_tbl, + .set_bb_tbl = nvme_nvm_set_bb_tbl, .submit_io = nvme_nvm_submit_io, .erase_block = nvme_nvm_erase_block, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32b5369..9b3dc1b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -191,11 +191,11 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) struct nvm_block; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(u32, void *, unsigned int, void *); +typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, int, unsigned int, +typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); @@ -210,7 +210,7 @@ struct nvm_dev_ops { nvm_id_fn *identity; nvm_get_l2p_tbl_fn *get_l2p_tbl; nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb; + nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; nvm_erase_blk_fn *erase_block; -- cgit v0.10.2 From 36d5dbc694e0d1b1909506666d7c27b53f7f9674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:38 +0100 Subject: lightnvm: update alignments for identify command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A single 8 bit and 16 bit reserve field were inserted in the specification to align fields appropriately. Reflect this in the identify group structure. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 2c35465..60687ed 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -155,11 +155,13 @@ struct nvme_nvm_id_group { __u8 num_ch; __u8 num_lun; __u8 num_pln; + __u8 rsvd1; __le16 num_blk; __le16 num_pg; __le16 fpg_sz; __le16 csecs; __le16 sos; + __le16 rsvd2; __le32 trdt; __le32 trdm; __le32 tprt; @@ -168,7 +170,7 @@ struct nvme_nvm_id_group { __le32 tbem; __le32 mpos; __le16 cpar; - __u8 reserved[913]; + __u8 reserved[910]; } __packed; struct nvme_nvm_addr_format { -- cgit v0.10.2 From 12be5edf68e785dd5dc8665db5a88152b49c1fe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:39 +0100 Subject: lightnvm: expose mccap in identify command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mccap field is required for I/O command option support. It defines the following flash access modes: * SLC mode * Erase/Program Suspension * Scramble On/Off * Encryption It is slotted in between mpos and cpar, changing the offset for cpar as well. 
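A minimal sketch of how a host might decode mccap, assuming one flag bit per capability in the order listed above; the bit positions and macro names are illustrative assumptions, not values taken from the specification.

#include <stdint.h>
#include <stdio.h>

#define MCCAP_SLC       (1U << 0)   /* SLC mode                 (assumed bit) */
#define MCCAP_SUSPEND   (1U << 1)   /* erase/program suspension (assumed bit) */
#define MCCAP_SCRAMBLE  (1U << 2)   /* scrambler on/off         (assumed bit) */
#define MCCAP_ENCRYPT   (1U << 3)   /* encryption               (assumed bit) */

static void print_mccap(uint32_t mccap)
{
	printf("slc=%d suspend=%d scramble=%d encrypt=%d\n",
	       !!(mccap & MCCAP_SLC), !!(mccap & MCCAP_SUSPEND),
	       !!(mccap & MCCAP_SCRAMBLE), !!(mccap & MCCAP_ENCRYPT));
}
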
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 60687ed..52b311c 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -169,8 +169,9 @@ struct nvme_nvm_id_group { __le32 tbet; __le32 tbem; __le32 mpos; + __le32 mccap; __le16 cpar; - __u8 reserved[910]; + __u8 reserved[906]; } __packed; struct nvme_nvm_addr_format { @@ -265,6 +266,7 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) dst->tbet = le32_to_cpu(src->tbet); dst->tbem = le32_to_cpu(src->tbem); dst->mpos = le32_to_cpu(src->mpos); + dst->mccap = le32_to_cpu(src->mccap); dst->cpar = le16_to_cpu(src->cpar); } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 9b3dc1b..2572856 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -74,6 +74,7 @@ struct nvm_id_group { u32 tbet; u32 tbem; u32 mpos; + u32 mccap; u16 cpar; u8 res[913]; } __packed; -- cgit v0.10.2 From 73387e7bed260c89628fc6a4e3632b45be9776b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:40 +0100 Subject: lightnvm: remove unused attrs in nvm_id structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvm_id, nvm_id_group and nvm_addr_format data structures contain reserved attributes. They are unused by media managers and targets. Remove them. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2572856..e6ef8aa 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -58,7 +58,6 @@ enum { struct nvm_id_group { u8 mtype; u8 fmtype; - u16 res16; u8 num_ch; u8 num_lun; u8 num_pln; @@ -76,8 +75,7 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; - u8 res[913]; -} __packed; +}; struct nvm_addr_format { u8 ch_offset; @@ -92,19 +90,16 @@ struct nvm_addr_format { u8 pg_len; u8 sect_offset; u8 sect_len; - u8 res[4]; }; struct nvm_id { u8 ver_id; u8 vmnt; u8 cgrps; - u8 res[5]; u32 cap; u32 dom; struct nvm_addr_format ppaf; u8 ppat; - u8 resv[224]; struct nvm_id_group groups[4]; } __packed; -- cgit v0.10.2 From 4264c980e3e9bb904b7f41dc9c64786cc5466bee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:41 +0100 Subject: lightnvm: check for NAND flash and its type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only NAND flash with SLC and MLC is supported. Make sure to not try to initialize TLC memory or other non-volatile memory types. 
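For reference, a small sketch of the identify codes the new check relies on; the numeric values (mtype 0 for NAND, fmtype 0 or 1 for SLC/MLC) follow from the diff below, while the symbolic names and helper are assumptions for illustration.

/* Media and flash type codes as interpreted by nvm_core_init() below. */
enum { MTYPE_NAND = 0 };                    /* only NAND is accepted        */
enum { FMTYPE_SLC = 0, FMTYPE_MLC = 1 };    /* TLC and others are rejected  */

static int nvm_media_supported(int mtype, int fmtype)
{
	return mtype == MTYPE_NAND &&
	       (fmtype == FMTYPE_SLC || fmtype == FMTYPE_MLC);
}
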
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index f659e60..0985a03 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -185,6 +185,16 @@ static int nvm_core_init(struct nvm_dev *dev) dev->plane_mode = NVM_PLANE_SINGLE; dev->max_rq_size = dev->ops->max_phys_sect * dev->sec_size; + if (grp->mtype != 0) { + pr_err("nvm: memory type not supported\n"); + return -EINVAL; + } + + if (grp->fmtype != 0 && grp->fmtype != 1) { + pr_err("nvm: flash type not supported\n"); + return -EINVAL; + } + if (grp->mpos & 0x020202) dev->plane_mode = NVM_PLANE_DOUBLE; if (grp->mpos & 0x040404) -- cgit v0.10.2 From edad2e6606ee62dd7dfc5b001fae39c5c8015a55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:42 +0100 Subject: lightnvm: prematurely activate nvm_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We register with nvm_devices when there registration can still fail. Move the final registration at the end of the nvm_register function to make sure we are fully registered when added to the nvm_devices list. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0985a03..40e6cfa 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -318,10 +318,6 @@ int nvm_register(struct request_queue *q, char *disk_name, if (ret) goto err_init; - down_write(&nvm_lock); - list_add(&dev->devices, &nvm_devices); - up_write(&nvm_lock); - if (dev->ops->max_phys_sect > 1) { dev->ppalist_pool = dev->ops->create_dma_pool(dev->q, "ppalist"); @@ -334,6 +330,10 @@ int nvm_register(struct request_queue *q, char *disk_name, return -EINVAL; } + down_write(&nvm_lock); + list_add(&dev->devices, &nvm_devices); + up_write(&nvm_lock); + return 0; err_init: kfree(dev); -- cgit v0.10.2 From c1480ad5943261e01a62eaa7132eab76f9c490e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:43 +0100 Subject: lightnvm: prevent double free on init error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both the nvm_register and nvm_init does a kfree(dev) on error. Make sure to only free it once. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 40e6cfa..899f6b9 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -160,11 +160,6 @@ int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk) } EXPORT_SYMBOL(nvm_erase_blk); -static void nvm_core_free(struct nvm_dev *dev) -{ - kfree(dev); -} - static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; @@ -223,8 +218,6 @@ static void nvm_free(struct nvm_dev *dev) if (dev->mt) dev->mt->unregister_mgr(dev); - - nvm_core_free(dev); } static int nvm_init(struct nvm_dev *dev) @@ -351,11 +344,12 @@ void nvm_unregister(char *disk_name) return; } - nvm_exit(dev); - down_write(&nvm_lock); list_del(&dev->devices); up_write(&nvm_lock); + + nvm_exit(dev); + kfree(dev); } EXPORT_SYMBOL(nvm_unregister); -- cgit v0.10.2 From 7386af270c72be65c7cb2ba4ad0d4e70dc373106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:44 +0100 Subject: lightnvm: remove linear and device addr modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The linear and device specific address modes can be replaced with a simple offset and bit length conversion that is generic across all devices. This both simplifies the specification and removes the special case for qemu nvme, that previously relied on the linear address mapping. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 899f6b9..790b1d7 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -174,8 +174,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->sec_size = grp->csecs; dev->oob_size = grp->sos; dev->sec_per_pg = grp->fpg_sz / grp->csecs; - dev->addr_mode = id->ppat; - dev->addr_format = id->ppaf; + memcpy(&dev->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); dev->plane_mode = NVM_PLANE_SINGLE; dev->max_rq_size = dev->ops->max_phys_sect * dev->sec_size; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 8cfc011..c0d0eb2 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -73,7 +73,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, struct nvm_block *blk; int i; - ppa = addr_to_generic_mode(gn->dev, ppa); + ppa = dev_to_generic_addr(gn->dev, ppa); lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; for (i = 0; i < nr_blocks; i++) { @@ -179,7 +179,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa.ppa = 0; ppa.g.ch = lun->vlun.chnl_id; ppa.g.lun = lun->vlun.id; - ppa = generic_to_addr_mode(dev, ppa); + ppa = generic_to_dev_addr(dev, ppa); ret = dev->ops->get_bb_tbl(dev->q, ppa, dev->blks_per_lun, @@ -304,10 +304,10 @@ static void gennvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) if (rqd->nr_pages > 1) { for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = addr_to_generic_mode(dev, + rqd->ppa_list[i] = dev_to_generic_addr(dev, rqd->ppa_list[i]); } else { - rqd->ppa_addr = addr_to_generic_mode(dev, rqd->ppa_addr); + rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); } } @@ -317,10 +317,10 @@ static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) if (rqd->nr_pages > 1) { for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = generic_to_addr_mode(dev, + rqd->ppa_list[i] = generic_to_dev_addr(dev, rqd->ppa_list[i]); } else { - rqd->ppa_addr = generic_to_addr_mode(dev, 
rqd->ppa_addr); + rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); } } diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 7ba64c8..75e59c3 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -123,12 +123,42 @@ static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk) return blk->id * rrpc->dev->pgs_per_blk; } +static struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, + struct ppa_addr r) +{ + struct ppa_addr l; + int secs, pgs, blks, luns; + sector_t ppa = r.ppa; + + l.ppa = 0; + + div_u64_rem(ppa, dev->sec_per_pg, &secs); + l.g.sec = secs; + + sector_div(ppa, dev->sec_per_pg); + div_u64_rem(ppa, dev->sec_per_blk, &pgs); + l.g.pg = pgs; + + sector_div(ppa, dev->pgs_per_blk); + div_u64_rem(ppa, dev->blks_per_lun, &blks); + l.g.blk = blks; + + sector_div(ppa, dev->blks_per_lun); + div_u64_rem(ppa, dev->luns_per_chnl, &luns); + l.g.lun = luns; + + sector_div(ppa, dev->luns_per_chnl); + l.g.ch = ppa; + + return l; +} + static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr) { struct ppa_addr paddr; paddr.ppa = addr; - return __linear_to_generic_addr(dev, paddr); + return linear_to_generic_addr(dev, paddr); } /* requires lun->lock taken */ diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 52b311c..9069be8 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -198,8 +198,7 @@ struct nvme_nvm_id { __le32 cap; __le32 dom; struct nvme_nvm_addr_format ppaf; - __u8 ppat; - __u8 resv[223]; + __u8 resv[224]; struct nvme_nvm_id_group groups[4]; } __packed; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e6ef8aa..cbe288a 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -99,7 +99,6 @@ struct nvm_id { u32 cap; u32 dom; struct nvm_addr_format ppaf; - u8 ppat; struct nvm_id_group groups[4]; } __packed; @@ -119,39 +118,28 @@ struct nvm_tgt_instance { #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 -#define NVM_SEC_BITS (8) -#define NVM_PL_BITS (6) -#define NVM_PG_BITS (16) #define NVM_BLK_BITS (16) -#define NVM_LUN_BITS (10) +#define NVM_PG_BITS (16) +#define NVM_SEC_BITS (8) +#define NVM_PL_BITS (8) +#define NVM_LUN_BITS (8) #define NVM_CH_BITS (8) struct ppa_addr { + /* Generic structure for all addresses */ union { - /* Channel-based PPA format in nand 4x2x2x2x8x10 */ - struct { - u64 ch : 4; - u64 sec : 2; /* 4 sectors per page */ - u64 pl : 2; /* 4 planes per LUN */ - u64 lun : 2; /* 4 LUNs per channel */ - u64 pg : 8; /* 256 pages per block */ - u64 blk : 10;/* 1024 blocks per plane */ - u64 resved : 36; - } chnl; - - /* Generic structure for all addresses */ struct { + u64 blk : NVM_BLK_BITS; + u64 pg : NVM_PG_BITS; u64 sec : NVM_SEC_BITS; u64 pl : NVM_PL_BITS; - u64 pg : NVM_PG_BITS; - u64 blk : NVM_BLK_BITS; u64 lun : NVM_LUN_BITS; u64 ch : NVM_CH_BITS; } g; u64 ppa; }; -} __packed; +}; struct nvm_rq { struct nvm_tgt_instance *ins; @@ -259,8 +247,7 @@ struct nvm_dev { int blks_per_lun; int sec_size; int oob_size; - int addr_mode; - struct nvm_addr_format addr_format; + struct nvm_addr_format ppaf; /* Calculated/Cached values. These do not reflect the actual usable * blocks at run-time. 
@@ -286,118 +273,45 @@ struct nvm_dev { char name[DISK_NAME_LEN]; }; -/* fallback conversion */ -static struct ppa_addr __generic_to_linear_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct ppa_addr l; - - l.ppa = r.g.sec + - r.g.pg * dev->sec_per_pg + - r.g.blk * (dev->pgs_per_blk * - dev->sec_per_pg) + - r.g.lun * (dev->blks_per_lun * - dev->pgs_per_blk * - dev->sec_per_pg) + - r.g.ch * (dev->blks_per_lun * - dev->pgs_per_blk * - dev->luns_per_chnl * - dev->sec_per_pg); - - return l; -} - -/* fallback conversion */ -static struct ppa_addr __linear_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct ppa_addr l; - int secs, pgs, blks, luns; - sector_t ppa = r.ppa; - - l.ppa = 0; - - div_u64_rem(ppa, dev->sec_per_pg, &secs); - l.g.sec = secs; - - sector_div(ppa, dev->sec_per_pg); - div_u64_rem(ppa, dev->sec_per_blk, &pgs); - l.g.pg = pgs; - - sector_div(ppa, dev->pgs_per_blk); - div_u64_rem(ppa, dev->blks_per_lun, &blks); - l.g.blk = blks; - - sector_div(ppa, dev->blks_per_lun); - div_u64_rem(ppa, dev->luns_per_chnl, &luns); - l.g.lun = luns; - - sector_div(ppa, dev->luns_per_chnl); - l.g.ch = ppa; - - return l; -} - -static struct ppa_addr __generic_to_chnl_addr(struct ppa_addr r) +static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, + struct ppa_addr r) { struct ppa_addr l; - l.ppa = 0; - - l.chnl.sec = r.g.sec; - l.chnl.pl = r.g.pl; - l.chnl.pg = r.g.pg; - l.chnl.blk = r.g.blk; - l.chnl.lun = r.g.lun; - l.chnl.ch = r.g.ch; + l.ppa = ((u64)r.g.blk) << dev->ppaf.blk_offset; + l.ppa |= ((u64)r.g.pg) << dev->ppaf.pg_offset; + l.ppa |= ((u64)r.g.sec) << dev->ppaf.sect_offset; + l.ppa |= ((u64)r.g.pl) << dev->ppaf.pln_offset; + l.ppa |= ((u64)r.g.lun) << dev->ppaf.lun_offset; + l.ppa |= ((u64)r.g.ch) << dev->ppaf.ch_offset; return l; } -static struct ppa_addr __chnl_to_generic_addr(struct ppa_addr r) +static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, + struct ppa_addr r) { struct ppa_addr l; - l.ppa = 0; - - l.g.sec = r.chnl.sec; - l.g.pl = r.chnl.pl; - l.g.pg = r.chnl.pg; - l.g.blk = r.chnl.blk; - l.g.lun = r.chnl.lun; - l.g.ch = r.chnl.ch; + /* + * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. 
+ */ + l.g.blk = (r.ppa >> dev->ppaf.blk_offset) & + (((1 << dev->ppaf.blk_len) - 1)); + l.g.pg |= (r.ppa >> dev->ppaf.pg_offset) & + (((1 << dev->ppaf.pg_len) - 1)); + l.g.sec |= (r.ppa >> dev->ppaf.sect_offset) & + (((1 << dev->ppaf.sect_len) - 1)); + l.g.pl |= (r.ppa >> dev->ppaf.pln_offset) & + (((1 << dev->ppaf.pln_len) - 1)); + l.g.lun |= (r.ppa >> dev->ppaf.lun_offset) & + (((1 << dev->ppaf.lun_len) - 1)); + l.g.ch |= (r.ppa >> dev->ppaf.ch_offset) & + (((1 << dev->ppaf.ch_len) - 1)); return l; } -static inline struct ppa_addr addr_to_generic_mode(struct nvm_dev *dev, - struct ppa_addr gppa) -{ - switch (dev->addr_mode) { - case NVM_ADDRMODE_LINEAR: - return __linear_to_generic_addr(dev, gppa); - case NVM_ADDRMODE_CHANNEL: - return __chnl_to_generic_addr(gppa); - default: - BUG(); - } - return gppa; -} - -static inline struct ppa_addr generic_to_addr_mode(struct nvm_dev *dev, - struct ppa_addr gppa) -{ - switch (dev->addr_mode) { - case NVM_ADDRMODE_LINEAR: - return __generic_to_linear_addr(dev, gppa); - case NVM_ADDRMODE_CHANNEL: - return __generic_to_chnl_addr(gppa); - default: - BUG(); - } - return gppa; -} - static inline int ppa_empty(struct ppa_addr ppa_addr) { return (ppa_addr.ppa == ADDR_EMPTY); -- cgit v0.10.2 From 2393bd39c77f21488ac7e5337cac1523e9ebd2c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:45 +0100 Subject: nvme: missing ppaf copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ppa format was not copied from the NVMe specific ppa format to the lightnvm specific ppa format. This led to the ppa format not being communicated to the layers above. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 9069be8..fd37123 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -300,6 +300,8 @@ static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) nvm_id->cgrps = nvme_nvm_id->cgrps; nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap); nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom); + memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf, + sizeof(struct nvme_nvm_addr_format)); ret = init_grps(nvm_id, nvme_nvm_id); out: -- cgit v0.10.2 From dad1b00977f9dc8120b48f9711deb4aa6462a3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:46 +0100 Subject: nvme: remove reserved double word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification was updated the remove the double word just after number of configuration groups and capabilities. Update the identify structure to reflect it. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index fd37123..7e82fe3 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -194,11 +194,11 @@ struct nvme_nvm_id { __u8 ver_id; __u8 vmnt; __u8 cgrps; - __u8 res[5]; + __u8 res; __le32 cap; __le32 dom; struct nvme_nvm_addr_format ppaf; - __u8 resv[224]; + __u8 resv[228]; struct nvme_nvm_id_group groups[4]; } __packed; -- cgit v0.10.2 From d09f9581b235a92abbe372531660b468fadabc17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 16 Nov 2015 15:34:47 +0100 Subject: lightnvm: cleanup queue before target removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prevents outstanding IOs to be sent for completion to target after the target has been removed. The flow is now: stop new IOs > cleanup queue > remove target. Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 790b1d7..8a556f3 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -460,11 +460,11 @@ static void nvm_remove_target(struct nvm_target *t) lockdep_assert_held(&nvm_lock); del_gendisk(tdisk); + blk_cleanup_queue(q); + if (tt->exit) tt->exit(tdisk->private_data); - blk_cleanup_queue(q); - put_disk(tdisk); list_del(&t->list); -- cgit v0.10.2 From 4736346bb47966254ee6d1fc50267a2609791cba Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 16 Nov 2015 23:30:00 +0800 Subject: elevator: use list_{first,prev,next}_entry To make the intention clearer, use list_{first,prev,next}_entry instead of list_entry. Signed-off-by: Geliang Tang Signed-off-by: Jens Axboe diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3de89d4..a163c48 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -21,10 +21,10 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq, static int noop_dispatch(struct request_queue *q, int force) { struct noop_data *nd = q->elevator->elevator_data; + struct request *rq; - if (!list_empty(&nd->queue)) { - struct request *rq; - rq = list_entry(nd->queue.next, struct request, queuelist); + rq = list_first_entry_or_null(&nd->queue, struct request, queuelist); + if (rq) { list_del_init(&rq->queuelist); elv_dispatch_sort(q, rq); return 1; @@ -46,7 +46,7 @@ noop_former_request(struct request_queue *q, struct request *rq) if (rq->queuelist.prev == &nd->queue) return NULL; - return list_entry(rq->queuelist.prev, struct request, queuelist); + return list_prev_entry(rq, queuelist); } static struct request * @@ -56,7 +56,7 @@ noop_latter_request(struct request_queue *q, struct request *rq) if (rq->queuelist.next == &nd->queue) return NULL; - return list_entry(rq->queuelist.next, struct request, queuelist); + return list_next_entry(rq, queuelist); } static int noop_init_queue(struct request_queue *q, struct elevator_type *e) -- cgit v0.10.2 From b2b7e00148a203e9934bbd17aebffae3f447ade7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 12 Nov 2015 20:25:10 +0100 Subject: null_blk: register as a LightNVM device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for registering as a LightNVM device. This allows us to evaluate the performance of the LightNVM subsystem. 
In /drivers/Makefile, LightNVM is moved above block device drivers to make sure that the LightNVM media managers have been initialized before drivers under /drivers/block are initialized. Signed-off-by: Matias Bjørling Fix by Jens Axboe to remove unneeded slab cache and the following memory leak. Signed-off-by: Jens Axboe diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 2f6c6ff..d8880ca 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -70,3 +70,6 @@ use_per_node_hctx=[0/1]: Default: 0 parameter. 1: The multi-queue block layer is instantiated with a hardware dispatch queue for each CPU node in the system. + +use_lightnvm=[0/1]: Default: 0 + Register device with LightNVM. Requires blk-mq to be used. diff --git a/drivers/Makefile b/drivers/Makefile index 73d0391..795d0ca 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_FB_I810) += video/fbdev/i810/ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ obj-$(CONFIG_PARPORT) += parport/ +obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ @@ -70,7 +71,6 @@ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ obj-$(CONFIG_IDE) += ide/ obj-$(CONFIG_SCSI) += scsi/ -obj-$(CONFIG_NVM) += lightnvm/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ obj-$(CONFIG_TARGET_CORE) += target/ diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 6255d1c..8165251 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -8,6 +8,7 @@ #include #include #include +#include struct nullb_cmd { struct list_head list; @@ -39,6 +40,7 @@ struct nullb { struct nullb_queue *queues; unsigned int nr_queues; + char disk_name[DISK_NAME_LEN]; }; static LIST_HEAD(nullb_list); @@ -119,6 +121,10 @@ static int nr_devices = 2; module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); +static bool use_lightnvm; +module_param(use_lightnvm, bool, S_IRUGO); +MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device"); + static int irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) @@ -427,6 +433,8 @@ static void null_del_dev(struct nullb *nullb) { list_del_init(&nullb->list); + if (use_lightnvm) + nvm_unregister(nullb->disk->disk_name); del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); if (queue_mode == NULL_Q_MQ) @@ -436,6 +444,125 @@ static void null_del_dev(struct nullb *nullb) kfree(nullb); } +#ifdef CONFIG_NVM + +static void null_lnvm_end_io(struct request *rq, int error) +{ + struct nvm_rq *rqd = rq->end_io_data; + struct nvm_dev *dev = rqd->dev; + + dev->mt->end_io(rqd, error); + + blk_put_request(rq); +} + +static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +{ + struct request *rq; + struct bio *bio = rqd->bio; + + rq = blk_mq_alloc_request(q, bio_rw(bio), GFP_KERNEL, 0); + if (IS_ERR(rq)) + return -ENOMEM; + + rq->cmd_type = REQ_TYPE_DRV_PRIV; + rq->__sector = bio->bi_iter.bi_sector; + rq->ioprio = bio_prio(bio); + + if (bio_has_data(bio)) + rq->nr_phys_segments = bio_phys_segments(q, bio); + + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + + rq->end_io_data = rqd; + + blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io); + + return 0; +} + +static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) +{ + sector_t size = gb * 1024 * 1024 * 1024ULL; + struct nvm_id_group *grp; + + id->ver_id = 
0x1; + id->vmnt = 0; + id->cgrps = 1; + id->cap = 0x3; + id->dom = 0x1; + id->ppat = NVM_ADDRMODE_LINEAR; + + do_div(size, bs); /* convert size to pages */ + grp = &id->groups[0]; + grp->mtype = 0; + grp->fmtype = 1; + grp->num_ch = 1; + grp->num_lun = 1; + grp->num_pln = 1; + grp->num_blk = size / 256; + grp->num_pg = 256; + grp->fpg_sz = bs; + grp->csecs = bs; + grp->trdt = 25000; + grp->trdm = 25000; + grp->tprt = 500000; + grp->tprm = 500000; + grp->tbet = 1500000; + grp->tbem = 1500000; + grp->mpos = 0x010101; /* single plane rwe */ + grp->cpar = hw_queue_depth; + + return 0; +} + +static void *null_lnvm_create_dma_pool(struct request_queue *q, char *name) +{ + mempool_t *virtmem_pool; + + virtmem_pool = mempool_create_page_pool(64, 0); + if (!virtmem_pool) { + pr_err("null_blk: Unable to create virtual memory pool\n"); + return NULL; + } + + return virtmem_pool; +} + +static void null_lnvm_destroy_dma_pool(void *pool) +{ + mempool_destroy(pool); +} + +static void *null_lnvm_dev_dma_alloc(struct request_queue *q, void *pool, + gfp_t mem_flags, dma_addr_t *dma_handler) +{ + return mempool_alloc(pool, mem_flags); +} + +static void null_lnvm_dev_dma_free(void *pool, void *entry, + dma_addr_t dma_handler) +{ + mempool_free(entry, pool); +} + +static struct nvm_dev_ops null_lnvm_dev_ops = { + .identity = null_lnvm_id, + .submit_io = null_lnvm_submit_io, + + .create_dma_pool = null_lnvm_create_dma_pool, + .destroy_dma_pool = null_lnvm_destroy_dma_pool, + .dev_dma_alloc = null_lnvm_dev_dma_alloc, + .dev_dma_free = null_lnvm_dev_dma_free, + + /* Simulate nvme protocol restriction */ + .max_phys_sect = 64, +}; +#else +static struct nvm_dev_ops null_lnvm_dev_ops; +#endif /* CONFIG_NVM */ + static int null_open(struct block_device *bdev, fmode_t mode) { return 0; @@ -575,11 +702,6 @@ static int null_add_dev(void) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); - disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) { - rv = -ENOMEM; - goto out_cleanup_blk_queue; - } mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); @@ -589,6 +711,21 @@ static int null_add_dev(void) blk_queue_logical_block_size(nullb->q, bs); blk_queue_physical_block_size(nullb->q, bs); + sprintf(nullb->disk_name, "nullb%d", nullb->index); + + if (use_lightnvm) { + rv = nvm_register(nullb->q, nullb->disk_name, + &null_lnvm_dev_ops); + if (rv) + goto out_cleanup_blk_queue; + goto done; + } + + disk = nullb->disk = alloc_disk_node(1, home_node); + if (!disk) { + rv = -ENOMEM; + goto out_cleanup_lightnvm; + } size = gb * 1024 * 1024 * 1024ULL; set_capacity(disk, size >> 9); @@ -598,10 +735,15 @@ static int null_add_dev(void) disk->fops = &null_fops; disk->private_data = nullb; disk->queue = nullb->q; - sprintf(disk->disk_name, "nullb%d", nullb->index); + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + add_disk(disk); +done: return 0; +out_cleanup_lightnvm: + if (use_lightnvm) + nvm_unregister(nullb->disk_name); out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: @@ -625,6 +767,12 @@ static int __init null_init(void) bs = PAGE_SIZE; } + if (use_lightnvm && queue_mode != NULL_Q_MQ) { + pr_warn("null_blk: LightNVM only supported for blk-mq\n"); + pr_warn("null_blk: defaults queue mode to blk-mq\n"); + queue_mode = NULL_Q_MQ; + } + if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { if (submit_queues < nr_online_nodes) { pr_warn("null_blk: submit_queues param is set to %u.", -- cgit v0.10.2 From 
1b2ff19e6a957b1ef0f365ad331b608af80e932e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 Nov 2015 14:25:52 +0100 Subject: blk-flush: Queue through IO scheduler when flush not required Currently blk_insert_flush() just adds flush request to q->queue_head when flush is not required. That completely bypasses IO scheduler so e.g. CFQ can be idling waiting for new request to arrive and will idle through the whole window unnecessarily. Luckily this only happens in rare cases as usually checks in generic_make_request_checks() clear FLUSH and FUA flags early if they are not needed. When no flushing is actually required, we can easily fix the problem by properly queueing the request through the IO scheduler. Ideally IO scheduler should be also made aware of requests queued via blk_flush_queue_rq(). However inserting flush request through IO scheduler can have unwanted side-effects since due to flush batching delaying the flush request in IO scheduler will delay all flush requests possibly coming from other processes. So we keep adding the request directly to q->queue_head. Signed-off-by: Jan Kara Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe diff --git a/block/blk-flush.c b/block/blk-flush.c index 9c423e5..c81d56e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -422,7 +422,7 @@ void blk_insert_flush(struct request *rq) if (q->mq_ops) { blk_mq_insert_request(rq, false, false, true); } else - list_add_tail(&rq->queuelist, &q->queue_head); + q->elevator->type->ops.elevator_add_req_fn(q, rq); return; } -- cgit v0.10.2 From 6824c5ef5e8900e61ce8ed40885cacc1c9301c14 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 18 Nov 2015 16:33:08 -0700 Subject: NVMe: Fix possible arithmetic overflow for max segments Reported-by: Paul Grabinar Signed-off-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8187df2..394fd16 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2268,7 +2268,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) if (dev->max_hw_sectors) { blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); blk_queue_max_segments(ns->queue, - ((dev->max_hw_sectors << 9) / dev->page_size) + 1); + (dev->max_hw_sectors / (dev->page_size >> 9)) + 1); } if (dev->stripe_size) blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); -- cgit v0.10.2 From 6bb9535bc3f59194a0ae17b17ca71aecd0f7e3a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 19 Nov 2015 12:50:08 +0100 Subject: null_blk: use ppa_cache pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using a page pool, we can save memory by only allocating room for 64 entries for the ppa command. Introduce a ppa_cache to allocate only the required memory for the ppa list. 
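As a rough back-of-the-envelope check (assuming 4 KiB pages and 8-byte ppa entries), the switch from a page pool to a 64-entry slab pool, sized to match the driver's max_phys_sect of 64, shrinks each ppa list allocation from 4096 to 512 bytes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	size_t page_obj = 4096;                   /* one page per list before (assumes PAGE_SIZE == 4096) */
	size_t slab_obj = 64 * sizeof(uint64_t);  /* 64 ppa entries after = 512 bytes */

	printf("per-ppa-list saving: %zu bytes\n", page_obj - slab_obj);
	return 0;
}
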
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8165251..3e9c4f9 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -47,6 +47,7 @@ static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; static int nullb_indexes; +static struct kmem_cache *ppa_cache; struct completion_queue { struct llist_head list; @@ -521,7 +522,7 @@ static void *null_lnvm_create_dma_pool(struct request_queue *q, char *name) { mempool_t *virtmem_pool; - virtmem_pool = mempool_create_page_pool(64, 0); + virtmem_pool = mempool_create_slab_pool(64, ppa_cache); if (!virtmem_pool) { pr_err("null_blk: Unable to create virtual memory pool\n"); return NULL; @@ -767,6 +768,12 @@ static int __init null_init(void) bs = PAGE_SIZE; } + if (use_lightnvm && bs != 4096) { + pr_warn("null_blk: LightNVM only supports 4k block size\n"); + pr_warn("null_blk: defaults block size to 4k\n"); + bs = 4096; + } + if (use_lightnvm && queue_mode != NULL_Q_MQ) { pr_warn("null_blk: LightNVM only supported for blk-mq\n"); pr_warn("null_blk: defaults queue mode to blk-mq\n"); @@ -803,15 +810,27 @@ static int __init null_init(void) if (null_major < 0) return null_major; + if (use_lightnvm) { + ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64), + 0, 0, NULL); + if (!ppa_cache) { + pr_err("null_blk: unable to create ppa cache\n"); + return -ENOMEM; + } + } + for (i = 0; i < nr_devices; i++) { if (null_add_dev()) { unregister_blkdev(null_major, "nullb"); - return -EINVAL; + goto err_ppa; } } pr_info("null: module loaded\n"); return 0; +err_ppa: + kmem_cache_destroy(ppa_cache); + return -EINVAL; } static void __exit null_exit(void) @@ -826,6 +845,8 @@ static void __exit null_exit(void) null_del_dev(nullb); } mutex_unlock(&lock); + + kmem_cache_destroy(ppa_cache); } module_init(null_init); -- cgit v0.10.2 From 5b40db99099ddebe31e9b1b759894cf09c0c6679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 19 Nov 2015 12:50:09 +0100 Subject: null_blk: use device addressing mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The linear addressing mode was removed in 7386af2. Make null_blk instead expose the ppa format geometry and support the generic addressing mode. 
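A hedged sketch of what the exposed geometry means in practice, using the offsets and lengths from the diff below (blk bits 0-15, pg 16-31, sect 32-39, pln 40-47, lun 48-55, ch 56-63); the struct and helper here are illustrative only:

#include <stdint.h>

struct ppa_fields {
	uint16_t blk, pg;
	uint8_t sect, pln, lun, ch;
};

/* Decode a 64-bit device address according to the null_blk ppa format. */
static struct ppa_fields decode_ppa(uint64_t ppa)
{
	struct ppa_fields f;

	f.blk  = ppa & 0xffff;
	f.pg   = (ppa >> 16) & 0xffff;
	f.sect = (ppa >> 32) & 0xff;
	f.pln  = (ppa >> 40) & 0xff;
	f.lun  = (ppa >> 48) & 0xff;
	f.ch   = (ppa >> 56) & 0xff;
	return f;
}
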
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 3e9c4f9..d51c24a 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -486,6 +486,7 @@ static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) { sector_t size = gb * 1024 * 1024 * 1024ULL; + sector_t blksize; struct nvm_id_group *grp; id->ver_id = 0x1; @@ -493,17 +494,34 @@ static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) id->cgrps = 1; id->cap = 0x3; id->dom = 0x1; - id->ppat = NVM_ADDRMODE_LINEAR; + + id->ppaf.blk_offset = 0; + id->ppaf.blk_len = 16; + id->ppaf.pg_offset = 16; + id->ppaf.pg_len = 16; + id->ppaf.sect_offset = 32; + id->ppaf.sect_len = 8; + id->ppaf.pln_offset = 40; + id->ppaf.pln_len = 8; + id->ppaf.lun_offset = 48; + id->ppaf.lun_len = 8; + id->ppaf.ch_offset = 56; + id->ppaf.ch_len = 8; do_div(size, bs); /* convert size to pages */ + do_div(size, 256); /* concert size to pgs pr blk */ grp = &id->groups[0]; grp->mtype = 0; - grp->fmtype = 1; + grp->fmtype = 0; grp->num_ch = 1; - grp->num_lun = 1; - grp->num_pln = 1; - grp->num_blk = size / 256; grp->num_pg = 256; + blksize = size; + do_div(size, (1 << 16)); + grp->num_lun = size + 1; + do_div(blksize, grp->num_lun); + grp->num_blk = blksize; + grp->num_pln = 1; + grp->fpg_sz = bs; grp->csecs = bs; grp->trdt = 25000; -- cgit v0.10.2 From 54514aa465e94316a4bf1c5dfe970536bec3e76f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 19 Nov 2015 12:50:10 +0100 Subject: null_blk: do not del gendisk with lightnvm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gendisk structure has not been initialized when using lightnvm. Make sure to not delete it upon exit. Also make sure that we use the appropriate disk_name at unregistration. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index d51c24a..5c8ba54 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -435,12 +435,14 @@ static void null_del_dev(struct nullb *nullb) list_del_init(&nullb->list); if (use_lightnvm) - nvm_unregister(nullb->disk->disk_name); - del_gendisk(nullb->disk); + nvm_unregister(nullb->disk_name); + else + del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); if (queue_mode == NULL_Q_MQ) blk_mq_free_tag_set(&nullb->tag_set); - put_disk(nullb->disk); + if (!use_lightnvm) + put_disk(nullb->disk); cleanup_queues(nullb); kfree(nullb); } -- cgit v0.10.2 From 480fc0db819d706ea5608545a12d33cf8d5a31ab Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Fri, 20 Nov 2015 13:47:53 +0100 Subject: lightnvm: wrong return value and redundant free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The return value should be non-zero under error conditions. Remove nvme_free(dev) to avoid free dev more than once. 
Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 8a556f3..bed47e7 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -222,14 +222,13 @@ static void nvm_free(struct nvm_dev *dev) static int nvm_init(struct nvm_dev *dev) { struct nvmm_type *mt; - int ret = 0; + int ret = -EINVAL; if (!dev->q || !dev->ops) - return -EINVAL; + return ret; if (dev->ops->identity(dev->q, &dev->identity)) { pr_err("nvm: device could not be identified\n"); - ret = -EINVAL; goto err; } @@ -275,7 +274,6 @@ static int nvm_init(struct nvm_dev *dev) dev->nr_chnls); return 0; err: - nvm_free(dev); pr_err("nvm: failed to initialize nvm\n"); return ret; } -- cgit v0.10.2 From 93e70c1f2883f2db2d6a1f339d0e26f00b138e4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 20 Nov 2015 13:47:54 +0100 Subject: lightnvm: missing free on init error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If either max_phys_sect is out of bound, the nvm_dev structure is not freed. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index bed47e7..f61d325 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -313,11 +313,13 @@ int nvm_register(struct request_queue *q, char *disk_name, "ppalist"); if (!dev->ppalist_pool) { pr_err("nvm: could not create ppa pool\n"); - return -ENOMEM; + ret = -ENOMEM; + goto err_init; } } else if (dev->ops->max_phys_sect > 256) { pr_info("nvm: max sectors supported is 256.\n"); - return -EINVAL; + ret = -EINVAL; + goto err_init; } down_write(&nvm_lock); -- cgit v0.10.2 From 47b3115ae7b799be8b77b0f024215ad4f68d6460 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Fri, 20 Nov 2015 13:47:55 +0100 Subject: nvme: lightnvm: use admin queues for admin cmds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the Open-Channel SSD Specification, the NVMe-NVM admin commands use vendor specific opcodes of NVMe, so use the NVMe admin queue to dispatch these commands. Signed-off-by: Wenwei Tao Updated by me to include set bad block table as well and also use the admin queue for l2p len calculation. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 7e82fe3..9202d1a 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -276,6 +276,7 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) { struct nvme_ns *ns = q->queuedata; + struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; int ret; @@ -288,8 +289,8 @@ static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) if (!nvme_nvm_id) return -ENOMEM; - ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, nvme_nvm_id, - sizeof(struct nvme_nvm_id)); + ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + nvme_nvm_id, sizeof(struct nvme_nvm_id)); if (ret) { ret = -EIO; goto out; @@ -315,7 +316,7 @@ static int nvme_nvm_get_l2p_tbl(struct request_queue *q, u64 slba, u32 nlb, struct nvme_ns *ns = q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; - u32 len = queue_max_hw_sectors(q) << 9; + u32 len = queue_max_hw_sectors(dev->admin_q) << 9; u32 nlb_pr_rq = len / sizeof(u64); u64 cmd_slba = slba; void *entries; @@ -333,8 +334,8 @@ static int nvme_nvm_get_l2p_tbl(struct request_queue *q, u64 slba, u32 nlb, c.l2p.slba = cpu_to_le64(cmd_slba); c.l2p.nlb = cpu_to_le32(cmd_nlb); - ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, - entries, len); + ret = nvme_submit_sync_cmd(dev->admin_q, + (struct nvme_command *)&c, entries, len); if (ret) { dev_err(dev->dev, "L2P table transfer failed (%d)\n", ret); @@ -375,7 +376,8 @@ static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; - ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, bb_tbl, tblsz); + ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + bb_tbl, tblsz); if (ret) { dev_err(dev->dev, "get bad block table failed (%d)\n", ret); ret = -EIO; @@ -427,7 +429,8 @@ static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); c.set_bb.value = type; - ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); + ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + NULL, 0); if (ret) dev_err(dev->dev, "set bad block table failed (%d)\n", ret); return ret; -- cgit v0.10.2 From 0b59733b95f9d7af6bee6e6a4d0d444eb694c514 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Fri, 20 Nov 2015 13:47:56 +0100 Subject: lightnvm: keep track of block counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain number of in use blocks, free blocks, and bad blocks in a per lun basis. This allows the upper layers to get information about the state of each lun. Also, account for blocks reserved to the device on the free block count. nr_free_blocks matches now the actual number of blocks on the free list when the device is booted. 
Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index c0d0eb2..43c01e0 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -60,6 +60,8 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) lun->vlun.lun_id = i % dev->luns_per_chnl; lun->vlun.chnl_id = i / dev->luns_per_chnl; lun->vlun.nr_free_blocks = dev->blks_per_lun; + lun->vlun.nr_inuse_blocks = 0; + lun->vlun.nr_bad_blocks = 0; } return 0; } @@ -87,6 +89,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, } list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; } return 0; @@ -139,6 +142,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) list_move_tail(&blk->list, &lun->used_list); blk->type = 1; lun->vlun.nr_free_blocks--; + lun->vlun.nr_inuse_blocks++; } } @@ -167,8 +171,10 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) block->id = cur_block_id++; /* First block is reserved for device */ - if (unlikely(lun_iter == 0 && blk_iter == 0)) + if (unlikely(lun_iter == 0 && blk_iter == 0)) { + lun->vlun.nr_free_blocks--; continue; + } list_add_tail(&block->list, &lun->free_list); } @@ -266,6 +272,7 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, blk->type = 1; lun->vlun.nr_free_blocks--; + lun->vlun.nr_inuse_blocks++; spin_unlock(&vlun->lock); out: @@ -283,16 +290,21 @@ static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) case 1: list_move_tail(&blk->list, &lun->free_list); lun->vlun.nr_free_blocks++; + lun->vlun.nr_inuse_blocks--; blk->type = 0; break; case 2: list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; + lun->vlun.nr_inuse_blocks--; break; default: WARN_ON_ONCE(1); pr_err("gennvm: erroneous block type (%lu -> %u)\n", blk->id, blk->type); list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; + lun->vlun.nr_inuse_blocks--; } spin_unlock(&vlun->lock); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cbe288a..831a20c 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -213,7 +213,9 @@ struct nvm_lun { int lun_id; int chnl_id; + unsigned int nr_inuse_blocks; /* Number of used blocks */ unsigned int nr_free_blocks; /* Number of unused blocks */ + unsigned int nr_bad_blocks; /* Number of bad blocks */ struct nvm_block *blocks; spinlock_t lock; -- cgit v0.10.2 From 2fde0e482db2b43bb4ed0e9aebfbe78ebcbbf5a6 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Fri, 20 Nov 2015 13:47:57 +0100 Subject: lightnvm: add free and bad lun info to show luns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add free block, used block, and bad block information to the show debug interface. This information is used to debug how targets track blocks. Also, change debug function name to make it more generic. 
Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index f61d325..5178645 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -544,7 +544,7 @@ static int nvm_configure_show(const char *val) if (!dev->mt) return 0; - dev->mt->free_blocks_print(dev); + dev->mt->lun_info_print(dev); return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 43c01e0..e20e74e 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -464,15 +464,24 @@ static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) return &gn->luns[lunid].vlun; } -static void gennvm_free_blocks_print(struct nvm_dev *dev) +static void gennvm_lun_info_print(struct nvm_dev *dev) { struct gen_nvm *gn = dev->mp; struct gen_lun *lun; unsigned int i; - gennvm_for_each_lun(gn, lun, i) - pr_info("%s: lun%8u\t%u\n", - dev->name, i, lun->vlun.nr_free_blocks); + + gennvm_for_each_lun(gn, lun, i) { + spin_lock(&lun->vlun.lock); + + pr_info("%s: lun%8u\t%u\t%u\t%u\n", + dev->name, i, + lun->vlun.nr_free_blocks, + lun->vlun.nr_inuse_blocks, + lun->vlun.nr_bad_blocks); + + spin_unlock(&lun->vlun.lock); + } } static struct nvmm_type gennvm = { @@ -490,7 +499,7 @@ static struct nvmm_type gennvm = { .erase_blk = gennvm_erase_blk, .get_lun = gennvm_get_lun, - .free_blocks_print = gennvm_free_blocks_print, + .lun_info_print = gennvm_lun_info_print, }; static int __init gennvm_module_init(void) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 831a20c..3db5552 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -380,7 +380,7 @@ typedef int (nvmm_end_io_fn)(struct nvm_rq *, int); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); -typedef void (nvmm_free_blocks_print_fn)(struct nvm_dev *); +typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); struct nvmm_type { const char *name; @@ -404,7 +404,7 @@ struct nvmm_type { nvmm_get_lun_fn *get_lun; /* Statistics */ - nvmm_free_blocks_print_fn *free_blocks_print; + nvmm_lun_info_print_fn *lun_info_print; struct list_head list; }; -- cgit v0.10.2 From 604e8c8da8854351496215d269c3fa93859e3fee Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 20 Nov 2015 08:38:13 -0700 Subject: NVMe: reap completion entries when deleting queue Make sure that there are no unprocesssed entries on a completion queue before deleting it, and check for validity of the CQ door bell before writing completions to it. This fixes problems with doing a sysfs reset of the device while it's handling IO. 
Tested-by: Jon Derrick Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 394fd16..930042fa 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -968,7 +968,8 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) return; - writel(head, nvmeq->q_db + nvmeq->dev->db_stride); + if (likely(nvmeq->cq_vector >= 0)) + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); nvmeq->cq_head = head; nvmeq->cq_phase = phase; @@ -2787,6 +2788,10 @@ static void nvme_del_queue_end(struct nvme_queue *nvmeq) { struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; nvme_put_dq(dq); + + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); } static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, -- cgit v0.10.2 From 8aeea03195ee6e33fcb00039c414eabfc37a4eb8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 20 Nov 2015 10:46:49 +0100 Subject: mtip32xx: use formatting capability of kthread_create_on_node kthread_create_on_node takes format+args, so there's no need to do the pretty-printing in advance. Moreover, "mtip_svc_thd_99" (including its '\0') only just fits in 16 bytes, so if index could ever go above 99 we'd have a stack buffer overflow. Signed-off-by: Rasmus Villemoes Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index a28a562..3457ac8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3810,7 +3810,6 @@ static int mtip_block_initialize(struct driver_data *dd) sector_t capacity; unsigned int index = 0; struct kobject *kobj; - unsigned char thd_name[16]; if (dd->disk) goto skip_create_disk; /* hw init done, before rebuild */ @@ -3958,10 +3957,9 @@ skip_create_disk: } start_service_thread: - sprintf(thd_name, "mtip_svc_thd_%02d", index); dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread, - dd, dd->numa_node, "%s", - thd_name); + dd, dd->numa_node, + "mtip_svc_thd_%02d", index); if (IS_ERR(dd->mtip_svc_handler)) { dev_err(&dd->pdev->dev, "service thread failed to start\n"); -- cgit v0.10.2 From 02e2a5bfebe99edcf9d694575a75032d53fe1b73 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Nov 2015 17:18:54 -0800 Subject: mac: validate mac_partition is within sector If md->signature == MAC_DRIVER_MAGIC and md->block_size == 1023, a single 512 byte sector would be read (secsize / 512). However the partition structure would be located past the end of the buffer (secsize % 512). 
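A quick worked example of the arithmetic described above: with block_size = 1023 the old code reads sector 1023/512 = 1 (a single 512-byte sector), but then dereferences data + 1023 % 512 = data + 511, so nearly the whole partition structure lies past the end of the buffer. Below is a hedged userspace sketch of the bounds check the fix introduces; the structure size is illustrative only.

#include <stdio.h>

#define SECTOR_SIZE 512
/* Illustrative size; the real struct mac_partition is defined in the kernel. */
#define PART_STRUCT_SIZE 136

static int part_offset_ok(unsigned secsize)
{
    unsigned datasize = secsize / SECTOR_SIZE * SECTOR_SIZE; /* round_down */
    unsigned partoffset = secsize % SECTOR_SIZE;

    /* Mirrors the added check: reject layouts where the partition entry
     * at partoffset would extend past the rounded-down size actually read. */
    return partoffset + PART_STRUCT_SIZE <= datasize;
}

int main(void)
{
    unsigned sizes[] = { 512, 1024, 1023, 2048, 4096 };
    for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
        printf("block_size=%u -> %s\n", sizes[i],
               part_offset_ok(sizes[i]) ? "in bounds" : "rejected");
    return 0;
}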
Signed-off-by: Kees Cook Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe diff --git a/block/partitions/mac.c b/block/partitions/mac.c index c2c48ec..621317a 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -32,7 +32,7 @@ int mac_partition(struct parsed_partitions *state) Sector sect; unsigned char *data; int slot, blocks_in_map; - unsigned secsize; + unsigned secsize, datasize, partoffset; #ifdef CONFIG_PPC_PMAC int found_root = 0; int found_root_goodness = 0; @@ -50,10 +50,14 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); - data = read_part_sector(state, secsize/512, &sect); + datasize = round_down(secsize, 512); + data = read_part_sector(state, datasize / 512, &sect); if (!data) return -1; - part = (struct mac_partition *) (data + secsize%512); + partoffset = secsize % 512; + if (partoffset + sizeof(*part) > datasize) + return -1; + part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); return 0; /* not a MacOS disk */ -- cgit v0.10.2 From b094f89ca42fbb8ce40174d5f85ca8430e499da6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Nov 2015 20:29:45 -0700 Subject: blk-mq: fix calling unplug callbacks with preempt disabled Liu reported that running certain parts of xfstests threw the following error: BUG: sleeping function called from invalid context at mm/page_alloc.c:3190 in_atomic(): 1, irqs_disabled(): 0, pid: 6, name: kworker/u16:0 3 locks held by kworker/u16:0/6: #0: ("writeback"){++++.+}, at: [] process_one_work+0x173/0x730 #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [] process_one_work+0x173/0x730 #2: (&type->s_umount_key#44){+++++.}, at: [] trylock_super+0x25/0x60 CPU: 5 PID: 6 Comm: kworker/u16:0 Tainted: G OE 4.3.0+ #3 Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011 Workqueue: writeback wb_workfn (flush-btrfs-108) ffffffff81a3abab ffff88042e282ba8 ffffffff8130191b ffffffff81a3abab 0000000000000c76 ffff88042e282ba8 ffff88042e27c180 ffff88042e282bd8 ffffffff8108ed95 ffff880400000004 0000000000000000 0000000000000c76 Call Trace: [] dump_stack+0x4f/0x74 [] ___might_sleep+0x185/0x240 [] __might_sleep+0x52/0x90 [] __alloc_pages_nodemask+0x268/0x410 [] ? sched_clock_local+0x1c/0x90 [] ? local_clock+0x21/0x40 [] ? __lock_release+0x420/0x510 [] ? __lock_acquired+0x16c/0x3c0 [] alloc_pages_current+0xc5/0x210 [] ? rbio_is_full+0x55/0x70 [btrfs] [] ? mark_held_locks+0x78/0xa0 [] ? _raw_spin_unlock_irqrestore+0x40/0x60 [] full_stripe_write+0x5a/0xc0 [btrfs] [] __raid56_parity_write+0x39/0x60 [btrfs] [] run_plug+0x11b/0x140 [btrfs] [] btrfs_raid_unplug+0x23/0x70 [btrfs] [] blk_flush_plug_list+0x82/0x1f0 [] blk_sq_make_request+0x1f9/0x740 [] ? generic_make_request_checks+0x222/0x7c0 [] ? blk_queue_enter+0x124/0x310 [] ? blk_queue_enter+0x92/0x310 [] generic_make_request+0x172/0x2c0 [] ? generic_make_request+0x164/0x2c0 [] submit_bio+0x70/0x140 [] ? rbio_add_io_page+0x99/0x150 [btrfs] [] finish_rmw+0x4d9/0x600 [btrfs] [] full_stripe_write+0x9c/0xc0 [btrfs] [] raid56_parity_write+0xef/0x160 [btrfs] [] btrfs_map_bio+0xe3/0x2d0 [btrfs] [] btrfs_submit_bio_hook+0x8d/0x1d0 [btrfs] [] submit_one_bio+0x74/0xb0 [btrfs] [] submit_extent_page+0xe5/0x1c0 [btrfs] [] __extent_writepage_io+0x408/0x4c0 [btrfs] [] ? alloc_dummy_extent_buffer+0x140/0x140 [btrfs] [] __extent_writepage+0x218/0x3a0 [btrfs] [] ? mark_held_locks+0x78/0xa0 [] extent_write_cache_pages.clone.0+0x2f9/0x400 [btrfs] [] extent_writepages+0x52/0x70 [btrfs] [] ? 
btrfs_set_inode_index+0x70/0x70 [btrfs] [] btrfs_writepages+0x27/0x30 [btrfs] [] do_writepages+0x23/0x40 [] __writeback_single_inode+0x89/0x4d0 [] ? writeback_sb_inodes+0x260/0x480 [] ? writeback_sb_inodes+0x260/0x480 [] ? writeback_sb_inodes+0x15f/0x480 [] writeback_sb_inodes+0x2d2/0x480 [] ? down_read_trylock+0x57/0x60 [] ? trylock_super+0x25/0x60 [] ? rcu_read_lock_sched_held+0x4f/0x90 [] __writeback_inodes_wb+0x8c/0xc0 [] wb_writeback+0x2b5/0x500 [] ? mark_held_locks+0x78/0xa0 [] ? __local_bh_enable_ip+0x68/0xc0 [] ? wb_do_writeback+0x62/0x310 [] wb_do_writeback+0xc1/0x310 [] ? set_worker_desc+0x79/0x90 [] wb_workfn+0x92/0x330 [] process_one_work+0x223/0x730 [] ? process_one_work+0x173/0x730 [] ? worker_thread+0x18f/0x430 [] worker_thread+0x11d/0x430 [] ? maybe_create_worker+0xf0/0xf0 [] ? maybe_create_worker+0xf0/0xf0 [] kthread+0xef/0x110 [] ? schedule_tail+0x1e/0xd0 [] ? __init_kthread_worker+0x70/0x70 [] ret_from_fork+0x3f/0x70 [] ? __init_kthread_worker+0x70/0x70 The issue is that we've got the software context pinned while calling blk_flush_plug_list(), which flushes callbacks that are allowed to sleep. btrfs and raid have such callbacks. Flip the checks around so that we can re-enable preemption earlier and flush plugs without preemption disabled. This only affects blk-mq driven devices, and only those that register a single queue. Reported-by: Liu Bo Tested-by: Liu Bo Cc: stable@kernel.org Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 3ae09de..6d6f8fe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1291,15 +1291,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); /* - * we do limited pluging. If bio can be merged, do merge. + * We do limited pluging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be * issued. So the plug list will have one request at most */ if (plug) { /* * The plug list might get flushed before this. If that - * happens, same_queue_rq is invalid and plug list is empty - **/ + * happens, same_queue_rq is invalid and plug list is + * empty + */ if (same_queue_rq && !list_empty(&plug->mq_list)) { old_rq = same_queue_rq; list_del_init(&old_rq->queuelist); @@ -1380,12 +1381,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); if (!request_count) trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { + + blk_mq_put_ctx(data.ctx); + + if (request_count >= BLK_MAX_REQUEST_COUNT) { blk_flush_plug_list(plug, false); trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); return cookie; } -- cgit v0.10.2 From 578270bfbd2803dc7b0b03fbc2ac119efbc73195 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 24 Nov 2015 10:35:29 +0800 Subject: block: fix segment split Inside blk_bio_segment_split(), the previous-bvec pointer (bvprvp) always points to the iterator's local variable, which is obviously wrong; fix it by pointing to the local variable 'bvprv' instead. 
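The one-line fix below matters because bio_for_each_segment() copies each bvec into the loop-local variable, so a pointer taken to that variable never describes the previous segment once the next iteration overwrites it. Here is a hedged userspace reduction of the bug with toy types, not the block layer's structures.

#include <stdio.h>

struct toy_vec { unsigned len; };

int main(void)
{
    struct toy_vec segments[] = { { 512 }, { 1024 }, { 2048 } };
    struct toy_vec bv, bvprv = { 0 };
    struct toy_vec *bvprvp = NULL;

    for (unsigned i = 0; i < 3; i++) {
        bv = segments[i];          /* what bio_for_each_segment() does */

        if (bvprvp)
            printf("current=%u previous(seen via bvprvp)=%u\n",
                   bv.len, bvprvp->len);

        bvprv = bv;
        /* Buggy form: bvprvp = &bv; the pointed-to value is overwritten
         * at the top of the next iteration, so "previous" would always
         * equal "current". */
        bvprvp = &bvprv;           /* fixed: point at the saved copy */
    }
    return 0;
}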
Fixes: 5014c311baa2b(block: fix bogus compiler warnings in blk-merge.c) Cc: stable@kernel.org #4.3 Reported-by: Michael Ellerman Reported-by: Mark Salter Tested-by: Laurent Dufour Tested-by: Mark Salter Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index de5716d8..f2efe8a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -98,7 +98,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, seg_size += bv.bv_len; bvprv = bv; - bvprvp = &bv; + bvprvp = &bvprv; sectors += bv.bv_len >> 9; continue; } @@ -108,7 +108,7 @@ new_segment: nsegs++; bvprv = bv; - bvprvp = &bv; + bvprvp = &bvprv; seg_size = bv.bv_len; sectors += bv.bv_len >> 9; } -- cgit v0.10.2 From 02e707424c2eadbcda68cd38876c9f4434ca8e1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 24 Nov 2015 10:35:30 +0800 Subject: blk-merge: fix blk_bio_segment_split Commit bdced438acd83a (block: setup bi_phys_segments after splitting) introduced computation of bio->bi_phys_segments during bio splitting. Unfortunately, neither bio->bi_seg_front_size nor bio->bi_seg_back_size is computed, so too many physical segments may be counted for one request, since both values are used to check whether a single segment can span two bios. This patch fixes the issue by computing the two variables in blk_bio_segment_split(). Fixes: bdced438acd83a(block: setup bi_phys_segments after splitting) Reported-by: Michael Ellerman Reported-by: Mark Salter Tested-by: Laurent Dufour Tested-by: Mark Salter Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index f2efe8a..50793cd 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -76,6 +76,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned seg_size = 0, nsegs = 0, sectors = 0; + unsigned front_seg_size = bio->bi_seg_front_size; + bool do_split = true; + struct bio *new = NULL; bio_for_each_segment(bv, bio, iter) { if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) @@ -111,13 +114,26 @@ new_segment: bvprvp = &bvprv; seg_size = bv.bv_len; sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; } - *segs = nsegs; - return NULL; + do_split = false; split: *segs = nsegs; + + if (do_split) { + new = bio_split(bio, sectors, GFP_NOIO, bs); + if (new) + bio = new; + } + + bio->bi_seg_front_size = front_seg_size; + if (seg_size > bio->bi_seg_back_size) + bio->bi_seg_back_size = seg_size; + + return do_split ? new : NULL; } void blk_queue_split(struct request_queue *q, struct bio **bio, -- cgit v0.10.2 From 12e57f59ca3344a588531f68eeede45666e8a6e0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 24 Nov 2015 10:35:31 +0800 Subject: blk-merge: warn if figured out segment number is bigger than nr_phys_segments We have seen many reports of this kind of issue, so add a warning in blk-merge where it can be triggered easily, instead of depending on warnings or bugs surfacing in individual drivers. 
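The warning added below fires when scatterlist mapping produces more segments than the request was sized for. As a hedged userspace sketch of that defensive pattern, the toy code here coalesces chunks by size only; the real block layer also requires physical contiguity and honours more queue limits.

#include <stdio.h>

#define MAX_SEGMENT_SIZE 65536   /* toy queue limit, not a real device's */

struct toy_req {
    unsigned nr_phys_segments;   /* what the split/merge path computed */
};

/* Toy stand-in for scatterlist mapping: coalesce adjacent chunks up to
 * the segment-size limit, then cross-check against the precomputed
 * count, mirroring the WARN_ON added by the patch that follows. */
static unsigned toy_map_sg(const struct toy_req *rq,
                           const unsigned *chunk_len, unsigned nr_chunks)
{
    unsigned nsegs = 0, cur = 0;

    for (unsigned i = 0; i < nr_chunks; i++) {
        if (cur && cur + chunk_len[i] <= MAX_SEGMENT_SIZE) {
            cur += chunk_len[i];          /* merge into current segment */
        } else {
            cur = chunk_len[i];           /* start a new segment */
            nsegs++;
        }
    }

    if (nsegs > rq->nr_phys_segments)
        fprintf(stderr, "WARN: built %u segments, request sized for %u\n",
                nsegs, rq->nr_phys_segments);

    return nsegs;
}

int main(void)
{
    unsigned chunks[] = { 4096, 4096, 61440, 8192 };
    struct toy_req rq = { .nr_phys_segments = 2 };   /* deliberately low */

    printf("mapped %u segments\n", toy_map_sg(&rq, chunks, 4));
    return 0;
}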
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index 50793cd..41a55ba 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -428,6 +428,12 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (sg) sg_mark_end(sg); + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > rq->nr_phys_segments); + return nsegs; } EXPORT_SYMBOL(blk_rq_map_sg); -- cgit v0.10.2