From 3681c85dffda70e551dead31c8d102bd69033fe8 Mon Sep 17 00:00:00 2001
From: Wenwei Tao <ww.tao0320@gmail.com>
Date: Sat, 5 Mar 2016 00:27:04 +0800
Subject: null_blk: add lightnvm null_blk device to the nullb_list

After register null_blk devices into lightnvm, we forget
to add these devices to the the nullb_list, makes them
invisible to the null_blk driver.

Signed-off-by: Wenwei Tao <ww.tao0320@gmail.com>
Fixes: a514379b0c77 ("null_blk: oops when initializing without lightnvm")
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 64a7b59..cab9759 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -742,10 +742,11 @@ static int null_add_dev(void)
 
 	add_disk(disk);
 
+done:
 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
 	mutex_unlock(&lock);
-done:
+
 	return 0;
 
 out_cleanup_lightnvm:
-- 
cgit v0.10.2


From 4c9dacb82d5aa36aa2568df60d897f2eb3d8819b Mon Sep 17 00:00:00 2001
From: Wenwei Tao <ww.tao0320@gmail.com>
Date: Thu, 3 Mar 2016 15:06:37 +0100
Subject: lightnvm: specify target's logical address area
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We can create more than one target on a lightnvm
device by specifying its begin lun and end lun.

But only specify the physical address area is not
enough, we need to get the corresponding non-
intersection logical address area division from
the backend device's logcial address space.
Otherwise the targets on the device might use
the same logical addresses cause incorrect
information in the device's l2p table.

Signed-off-by: Wenwei Tao <ww.tao0320@gmail.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 0d1fb6b..2925fd0 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -466,6 +466,7 @@ static int nvm_core_init(struct nvm_dev *dev)
 	dev->total_secs = dev->nr_luns * dev->sec_per_lun;
 	INIT_LIST_HEAD(&dev->online_targets);
 	mutex_init(&dev->mlock);
+	spin_lock_init(&dev->lock);
 
 	return 0;
 }
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index d65ec36..d460b37 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -20,6 +20,68 @@
 
 #include "gennvm.h"
 
+static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
+{
+	struct gen_nvm *gn = dev->mp;
+	struct gennvm_area *area, *prev, *next;
+	sector_t begin = 0;
+	sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9;
+
+	if (len > max_sectors)
+		return -EINVAL;
+
+	area = kmalloc(sizeof(struct gennvm_area), GFP_KERNEL);
+	if (!area)
+		return -ENOMEM;
+
+	prev = NULL;
+
+	spin_lock(&dev->lock);
+	list_for_each_entry(next, &gn->area_list, list) {
+		if (begin + len > next->begin) {
+			begin = next->end;
+			prev = next;
+			continue;
+		}
+		break;
+	}
+
+	if ((begin + len) > max_sectors) {
+		spin_unlock(&dev->lock);
+		kfree(area);
+		return -EINVAL;
+	}
+
+	area->begin = *lba = begin;
+	area->end = begin + len;
+
+	if (prev) /* insert into sorted order */
+		list_add(&area->list, &prev->list);
+	else
+		list_add(&area->list, &gn->area_list);
+	spin_unlock(&dev->lock);
+
+	return 0;
+}
+
+static void gennvm_put_area(struct nvm_dev *dev, sector_t begin)
+{
+	struct gen_nvm *gn = dev->mp;
+	struct gennvm_area *area;
+
+	spin_lock(&dev->lock);
+	list_for_each_entry(area, &gn->area_list, list) {
+		if (area->begin != begin)
+			continue;
+
+		list_del(&area->list);
+		spin_unlock(&dev->lock);
+		kfree(area);
+		return;
+	}
+	spin_unlock(&dev->lock);
+}
+
 static void gennvm_blocks_free(struct nvm_dev *dev)
 {
 	struct gen_nvm *gn = dev->mp;
@@ -229,6 +291,7 @@ static int gennvm_register(struct nvm_dev *dev)
 
 	gn->dev = dev;
 	gn->nr_luns = dev->nr_luns;
+	INIT_LIST_HEAD(&gn->area_list);
 	dev->mp = gn;
 
 	ret = gennvm_luns_init(dev, gn);
@@ -465,6 +528,10 @@ static struct nvmm_type gennvm = {
 
 	.get_lun		= gennvm_get_lun,
 	.lun_info_print		= gennvm_lun_info_print,
+
+	.get_area		= gennvm_get_area,
+	.put_area		= gennvm_put_area,
+
 };
 
 static int __init gennvm_module_init(void)
diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h
index 9c24b5b..04d7c23 100644
--- a/drivers/lightnvm/gennvm.h
+++ b/drivers/lightnvm/gennvm.h
@@ -39,8 +39,14 @@ struct gen_nvm {
 
 	int nr_luns;
 	struct gen_lun *luns;
+	struct list_head area_list;
 };
 
+struct gennvm_area {
+	struct list_head list;
+	sector_t begin;
+	sector_t end;	/* end is excluded */
+};
 #define gennvm_for_each_lun(bm, lun, i) \
 		for ((i) = 0, lun = &(bm)->luns[0]; \
 			(i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)])
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 8234378..c1e3c83 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -1053,8 +1053,11 @@ static int rrpc_map_init(struct rrpc *rrpc)
 {
 	struct nvm_dev *dev = rrpc->dev;
 	sector_t i;
+	u64 slba;
 	int ret;
 
+	slba = rrpc->soffset >> (ilog2(dev->sec_size) - 9);
+
 	rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects);
 	if (!rrpc->trans_map)
 		return -ENOMEM;
@@ -1076,7 +1079,7 @@ static int rrpc_map_init(struct rrpc *rrpc)
 		return 0;
 
 	/* Bring up the mapping table from device */
-	ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, rrpc_l2p_update,
+	ret = dev->ops->get_l2p_tbl(dev, slba, rrpc->nr_sects, rrpc_l2p_update,
 									rrpc);
 	if (ret) {
 		pr_err("nvm: rrpc: could not read L2P table.\n");
@@ -1086,7 +1089,6 @@ static int rrpc_map_init(struct rrpc *rrpc)
 	return 0;
 }
 
-
 /* Minimum pages needed within a lun */
 #define PAGE_POOL_SIZE 16
 #define ADDR_POOL_SIZE 64
@@ -1200,12 +1202,33 @@ err:
 	return -ENOMEM;
 }
 
+/* returns 0 on success and stores the beginning address in *begin */
+static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
+{
+	struct nvm_dev *dev = rrpc->dev;
+	struct nvmm_type *mt = dev->mt;
+	sector_t size = rrpc->nr_sects * dev->sec_size;
+
+	size >>= 9;
+
+	return mt->get_area(dev, begin, size);
+}
+
+static void rrpc_area_free(struct rrpc *rrpc)
+{
+	struct nvm_dev *dev = rrpc->dev;
+	struct nvmm_type *mt = dev->mt;
+
+	mt->put_area(dev, rrpc->soffset);
+}
+
 static void rrpc_free(struct rrpc *rrpc)
 {
 	rrpc_gc_free(rrpc);
 	rrpc_map_free(rrpc);
 	rrpc_core_free(rrpc);
 	rrpc_luns_free(rrpc);
+	rrpc_area_free(rrpc);
 
 	kfree(rrpc);
 }
@@ -1327,6 +1350,7 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
 	struct request_queue *bqueue = dev->q;
 	struct request_queue *tqueue = tdisk->queue;
 	struct rrpc *rrpc;
+	sector_t soffset;
 	int ret;
 
 	if (!(dev->identity.dom & NVM_RSP_L2P)) {
@@ -1352,6 +1376,13 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
 	/* simple round-robin strategy */
 	atomic_set(&rrpc->next_lun, -1);
 
+	ret = rrpc_area_init(rrpc, &soffset);
+	if (ret < 0) {
+		pr_err("nvm: rrpc: could not initialize area\n");
+		return ERR_PTR(ret);
+	}
+	rrpc->soffset = soffset;
+
 	ret = rrpc_luns_init(rrpc, lun_begin, lun_end);
 	if (ret) {
 		pr_err("nvm: rrpc: could not initialize luns\n");
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index 855f4a5..2653484 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -97,6 +97,7 @@ struct rrpc {
 	struct nvm_dev *dev;
 	struct gendisk *disk;
 
+	sector_t soffset; /* logical sector offset */
 	u64 poffset; /* physical page offset */
 	int lun_offset;
 
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index c3c4318..b466bd9 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -355,6 +355,7 @@ struct nvm_dev {
 	char name[DISK_NAME_LEN];
 
 	struct mutex mlock;
+	spinlock_t lock;
 };
 
 static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
@@ -467,6 +468,9 @@ typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *,
 typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int);
 typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *);
 
+typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
+typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t);
+
 struct nvmm_type {
 	const char *name;
 	unsigned int version[3];
@@ -491,6 +495,10 @@ struct nvmm_type {
 
 	/* Statistics */
 	nvmm_lun_info_print_fn *lun_info_print;
+
+	nvmm_get_area_fn *get_area;
+	nvmm_put_area_fn *put_area;
+
 	struct list_head list;
 };
 
-- 
cgit v0.10.2


From da1e284919b0b99c5bf0618b6c98cbaf2c17e62e Mon Sep 17 00:00:00 2001
From: Wenwei Tao <ww.tao0320@gmail.com>
Date: Thu, 3 Mar 2016 15:06:38 +0100
Subject: lightnvm: add a bitmap of luns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a bitmap of luns to indicate the status
of luns: inuse/available. When create targets
do the necessary check to avoid allocating luns
that are already allocated.

Signed-off-by: Wenwei Tao <ww.tao0320@gmail.com>
Freed dev->lun_map if nvm_core_init later failed in the init process.
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 2925fd0..0dc9a80 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -464,6 +464,10 @@ static int nvm_core_init(struct nvm_dev *dev)
 	dev->nr_luns = dev->luns_per_chnl * dev->nr_chnls;
 
 	dev->total_secs = dev->nr_luns * dev->sec_per_lun;
+	dev->lun_map = kcalloc(BITS_TO_LONGS(dev->nr_luns),
+					sizeof(unsigned long), GFP_KERNEL);
+	if (!dev->lun_map)
+		return -ENOMEM;
 	INIT_LIST_HEAD(&dev->online_targets);
 	mutex_init(&dev->mlock);
 	spin_lock_init(&dev->lock);
@@ -586,6 +590,7 @@ int nvm_register(struct request_queue *q, char *disk_name,
 
 	return 0;
 err_init:
+	kfree(dev->lun_map);
 	kfree(dev);
 	return ret;
 }
@@ -608,6 +613,7 @@ void nvm_unregister(char *disk_name)
 	up_write(&nvm_lock);
 
 	nvm_exit(dev);
+	kfree(dev->lun_map);
 	kfree(dev);
 }
 EXPORT_SYMBOL(nvm_unregister);
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index d460b37..b97801c 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -192,6 +192,9 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 		lun_id = div_u64(pba, dev->sec_per_lun);
 		lun = &gn->luns[lun_id];
 
+		if (!test_bit(lun_id, dev->lun_map))
+			__set_bit(lun_id, dev->lun_map);
+
 		/* Calculate block offset into lun */
 		pba = pba - (dev->sec_per_lun * lun_id);
 		blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)];
@@ -482,10 +485,23 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
 	return nvm_erase_ppa(dev, &addr, 1);
 }
 
+static int gennvm_reserve_lun(struct nvm_dev *dev, int lunid)
+{
+	return test_and_set_bit(lunid, dev->lun_map);
+}
+
+static void gennvm_release_lun(struct nvm_dev *dev, int lunid)
+{
+	WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
+}
+
 static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid)
 {
 	struct gen_nvm *gn = dev->mp;
 
+	if (unlikely(lunid >= dev->nr_luns))
+		return NULL;
+
 	return &gn->luns[lunid].vlun;
 }
 
@@ -527,6 +543,8 @@ static struct nvmm_type gennvm = {
 	.erase_blk		= gennvm_erase_blk,
 
 	.get_lun		= gennvm_get_lun,
+	.reserve_lun		= gennvm_reserve_lun,
+	.release_lun		= gennvm_release_lun,
 	.lun_info_print		= gennvm_lun_info_print,
 
 	.get_area		= gennvm_get_area,
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index c1e3c83..3ab6495 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -965,25 +965,11 @@ static void rrpc_requeue(struct work_struct *work)
 
 static void rrpc_gc_free(struct rrpc *rrpc)
 {
-	struct rrpc_lun *rlun;
-	int i;
-
 	if (rrpc->krqd_wq)
 		destroy_workqueue(rrpc->krqd_wq);
 
 	if (rrpc->kgc_wq)
 		destroy_workqueue(rrpc->kgc_wq);
-
-	if (!rrpc->luns)
-		return;
-
-	for (i = 0; i < rrpc->nr_luns; i++) {
-		rlun = &rrpc->luns[i];
-
-		if (!rlun->blocks)
-			break;
-		vfree(rlun->blocks);
-	}
 }
 
 static int rrpc_gc_init(struct rrpc *rrpc)
@@ -1143,6 +1129,23 @@ static void rrpc_core_free(struct rrpc *rrpc)
 
 static void rrpc_luns_free(struct rrpc *rrpc)
 {
+	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_lun *lun;
+	struct rrpc_lun *rlun;
+	int i;
+
+	if (!rrpc->luns)
+		return;
+
+	for (i = 0; i < rrpc->nr_luns; i++) {
+		rlun = &rrpc->luns[i];
+		lun = rlun->parent;
+		if (!lun)
+			break;
+		dev->mt->release_lun(dev, lun->id);
+		vfree(rlun->blocks);
+	}
+
 	kfree(rrpc->luns);
 }
 
@@ -1150,7 +1153,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 {
 	struct nvm_dev *dev = rrpc->dev;
 	struct rrpc_lun *rlun;
-	int i, j;
+	int i, j, ret = -EINVAL;
 
 	if (dev->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
 		pr_err("rrpc: number of pages per block too high.");
@@ -1166,25 +1169,26 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 
 	/* 1:1 mapping */
 	for (i = 0; i < rrpc->nr_luns; i++) {
-		struct nvm_lun *lun = dev->mt->get_lun(dev, lun_begin + i);
-
-		rlun = &rrpc->luns[i];
-		rlun->rrpc = rrpc;
-		rlun->parent = lun;
-		INIT_LIST_HEAD(&rlun->prio_list);
-		INIT_LIST_HEAD(&rlun->open_list);
-		INIT_LIST_HEAD(&rlun->closed_list);
+		int lunid = lun_begin + i;
+		struct nvm_lun *lun;
 
-		INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
-		spin_lock_init(&rlun->lock);
+		if (dev->mt->reserve_lun(dev, lunid)) {
+			pr_err("rrpc: lun %u is already allocated\n", lunid);
+			goto err;
+		}
 
-		rrpc->total_blocks += dev->blks_per_lun;
-		rrpc->nr_sects += dev->sec_per_lun;
+		lun = dev->mt->get_lun(dev, lunid);
+		if (!lun)
+			goto err;
 
+		rlun = &rrpc->luns[i];
+		rlun->parent = lun;
 		rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
 						rrpc->dev->blks_per_lun);
-		if (!rlun->blocks)
+		if (!rlun->blocks) {
+			ret = -ENOMEM;
 			goto err;
+		}
 
 		for (j = 0; j < rrpc->dev->blks_per_lun; j++) {
 			struct rrpc_block *rblk = &rlun->blocks[j];
@@ -1195,11 +1199,23 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 			INIT_LIST_HEAD(&rblk->prio);
 			spin_lock_init(&rblk->lock);
 		}
+
+		rlun->rrpc = rrpc;
+		INIT_LIST_HEAD(&rlun->prio_list);
+		INIT_LIST_HEAD(&rlun->open_list);
+		INIT_LIST_HEAD(&rlun->closed_list);
+
+		INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
+		spin_lock_init(&rlun->lock);
+
+		rrpc->total_blocks += dev->blks_per_lun;
+		rrpc->nr_sects += dev->sec_per_lun;
+
 	}
 
 	return 0;
 err:
-	return -ENOMEM;
+	return ret;
 }
 
 /* returns 0 on success and stores the beginning address in *begin */
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index b466bd9..0ee2c2c 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -346,6 +346,7 @@ struct nvm_dev {
 	int nr_luns;
 	unsigned max_pages_per_blk;
 
+	unsigned long *lun_map;
 	void *ppalist_pool;
 
 	struct nvm_id identity;
@@ -466,6 +467,8 @@ typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *,
 								unsigned long);
 typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int);
+typedef int (nvmm_reserve_lun)(struct nvm_dev *, int);
+typedef void (nvmm_release_lun)(struct nvm_dev *, int);
 typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *);
 
 typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
@@ -492,6 +495,8 @@ struct nvmm_type {
 
 	/* Configuration management */
 	nvmm_get_lun_fn *get_lun;
+	nvmm_reserve_lun *reserve_lun;
+	nvmm_release_lun *release_lun;
 
 	/* Statistics */
 	nvmm_lun_info_print_fn *lun_info_print;
-- 
cgit v0.10.2


From 9f867268436d799549909437e627e7cf279e1127 Mon Sep 17 00:00:00 2001
From: Matias Bjorling <m@bjorling.me>
Date: Thu, 3 Mar 2016 15:06:39 +0100
Subject: nvme: lightnvm: return ppa completion status
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PPAs sent to device is separately acknowledge in a 64bit status
variable. The status is stored in DW0 and DW1 of the completion queue
entry. Store this status inside the nvm_rq for further processing.

This can later be used to implement retry techniques for failed writes
and reads.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 42a01a9..9461dd6 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -146,6 +146,14 @@ struct nvme_nvm_command {
 	};
 };
 
+struct nvme_nvm_completion {
+	__le64	result;		/* Used by LightNVM to return ppa completions */
+	__le16	sq_head;	/* how much of this queue may be reclaimed */
+	__le16	sq_id;		/* submission queue that generated this entry */
+	__u16	command_id;	/* of the command which completed */
+	__le16	status;		/* did the command fail, and if so, why? */
+};
+
 #define NVME_NVM_LP_MLC_PAIRS 886
 struct nvme_nvm_lp_mlc {
 	__u16			num_pairs;
@@ -507,6 +515,10 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
 static void nvme_nvm_end_io(struct request *rq, int error)
 {
 	struct nvm_rq *rqd = rq->end_io_data;
+	struct nvme_nvm_completion *cqe = rq->special;
+
+	if (cqe)
+		rqd->ppa_status = le64_to_cpu(cqe->result);
 
 	nvm_end_io(rqd, error);
 
@@ -526,7 +538,8 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 	if (IS_ERR(rq))
 		return -ENOMEM;
 
-	cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
+	cmd = kzalloc(sizeof(struct nvme_nvm_command) +
+				sizeof(struct nvme_nvm_completion), GFP_KERNEL);
 	if (!cmd) {
 		blk_mq_free_request(rq);
 		return -ENOMEM;
@@ -545,7 +558,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 
 	rq->cmd = (unsigned char *)cmd;
 	rq->cmd_len = sizeof(struct nvme_nvm_command);
-	rq->special = (void *)0;
+	rq->special = cmd + 1;
 
 	rq->end_io_data = rqd;
 
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 0ee2c2c..cdcb2cc 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -242,6 +242,7 @@ struct nvm_rq {
 	uint16_t nr_pages;
 	uint16_t flags;
 
+	u64 ppa_status; /* ppa media status */
 	int error;
 };
 
-- 
cgit v0.10.2


From 719b59172cdcd5a2ba532b4bb4d56c36df20c28e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Thu, 3 Mar 2016 15:06:40 +0100
Subject: lightnvm: do not reserve lun on l2p loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the l2p table is loaded, addresses are checked for the lun they
belong to and luns are reserved accordingly. This assumes that metadata
is being stored in the backend device to recover the previous target
configuration. Since this is not yet implemented, this check collides
with some of the core initialization (e.g., sysblock initialization when
a page is formed by several sectors).

We take this check out and for now rely on that the right target will be
created instead. When metadata is stored to recover a target, this check
will come natural as part of the recovery strategy.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index b97801c..42c1c2a 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -192,9 +192,6 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 		lun_id = div_u64(pba, dev->sec_per_lun);
 		lun = &gn->luns[lun_id];
 
-		if (!test_bit(lun_id, dev->lun_map))
-			__set_bit(lun_id, dev->lun_map);
-
 		/* Calculate block offset into lun */
 		pba = pba - (dev->sec_per_lun * lun_id);
 		blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)];
-- 
cgit v0.10.2


From 29fd20b8e68a4d31a82909265b1e650b7b860f54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Thu, 3 Mar 2016 15:06:41 +0100
Subject: lightnvm: do not load L2P table if not supported
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An Open-Channel SSD can work on two modes: (i) hybrid mode, where the
L2P table is maintained both by the host and by the device; and (ii)
full host-based, where the L2P table is uniquely maintained by the host.

In the advent of a new target implementing the full host-based mode, do
not assume that the L2P table must be loaded on the generic media
manager; check device properties loaded on the identify command instead.

Signed-off-by: Javier González <javier@cnexlabs.com>
Moved into the following statement.
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 42c1c2a..72e124a 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -257,7 +257,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
 		}
 	}
 
-	if (dev->ops->get_l2p_tbl) {
+	if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) {
 		ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs,
 							gennvm_block_map, dev);
 		if (ret) {
-- 
cgit v0.10.2


From 5173cb814b36439a9d9537016965e75798b9f130 Mon Sep 17 00:00:00 2001
From: Alexey Khoroshilov <khoroshilov@ispras.ru>
Date: Sat, 19 Mar 2016 01:35:54 +0300
Subject: mtip32xx: fix checks for dma mapping errors

exec_drive_taskfile() checks for dma mapping errors by comparison
returned address with zero, while pci_dma_mapping_error() should be used.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index cc2e71d..25824c1 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2051,7 +2051,7 @@ static int exec_drive_taskfile(struct driver_data *dd,
 					 outbuf,
 					 taskout,
 					 DMA_TO_DEVICE);
-		if (outbuf_dma == 0) {
+		if (pci_dma_mapping_error(dd->pdev, outbuf_dma)) {
 			err = -ENOMEM;
 			goto abort;
 		}
@@ -2068,7 +2068,7 @@ static int exec_drive_taskfile(struct driver_data *dd,
 		inbuf_dma = pci_map_single(dd->pdev,
 					 inbuf,
 					 taskin, DMA_FROM_DEVICE);
-		if (inbuf_dma == 0) {
+		if (pci_dma_mapping_error(dd->pdev, inbuf_dma)) {
 			err = -ENOMEM;
 			goto abort;
 		}
-- 
cgit v0.10.2


From 897bb0c7f1ea82d7cc882b19790b5e1df00ffc29 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 19 Mar 2016 11:30:33 +0100
Subject: blk-mq: Use proper cpumask iterator

queue_for_each_ctx() iterates over per_cpu variables under the assumption that
the possible cpu mask cannot have holes. That's wrong as all cpumasks can have
holes. In case there are holes the iteration ends up accessing uninitialized
memory and crashing as a result.

Replace the macro by a proper for_each_possible_cpu() loop and drop the unused
macro blk_ctx_sum() which references queue_for_each_ctx().

Reported-by: Xiong Zhou <jencce.kernel@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 431fdda..4ea4dd8 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -416,12 +416,14 @@ void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
 static void blk_mq_sysfs_init(struct request_queue *q)
 {
 	struct blk_mq_ctx *ctx;
-	int i;
+	int cpu;
 
 	kobject_init(&q->mq_kobj, &blk_mq_ktype);
 
-	queue_for_each_ctx(q, ctx, i)
+	for_each_possible_cpu(cpu) {
+		ctx = per_cpu_ptr(q->queue_ctx, cpu);
 		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
+	}
 }
 
 int blk_mq_register_disk(struct gendisk *disk)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 050f7a1..1699baf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1798,11 +1798,12 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 	/*
 	 * Map software to hardware queues
 	 */
-	queue_for_each_ctx(q, ctx, i) {
+	for_each_possible_cpu(i) {
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
 		if (!cpumask_test_cpu(i, online_mask))
 			continue;
 
+		ctx = per_cpu_ptr(q->queue_ctx, i);
 		hctx = q->mq_ops->map_queue(q, i);
 
 		cpumask_set_cpu(i, hctx->cpumask);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 15a73d4..9ac9799 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -263,22 +263,8 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
 	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
 
-#define queue_for_each_ctx(q, ctx, i)					\
-	for ((i) = 0; (i) < (q)->nr_queues &&				\
-	     ({ ctx = per_cpu_ptr((q)->queue_ctx, (i)); 1; }); (i)++)
-
 #define hctx_for_each_ctx(hctx, ctx, i)					\
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
 	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
 
-#define blk_ctx_sum(q, sum)						\
-({									\
-	struct blk_mq_ctx *__x;						\
-	unsigned int __ret = 0, __i;					\
-									\
-	queue_for_each_ctx((q), __x, __i)				\
-		__ret += sum;						\
-	__ret;								\
-})
-
 #endif
-- 
cgit v0.10.2


From 614a4e3773148a31f58dc174bbf578ceb63510c2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Mar 2016 13:50:03 -0400
Subject: writeback, cgroup: fix premature wb_put() in
 locked_inode_to_wb_and_lock_list()

locked_inode_to_wb_and_lock_list() wb_get()'s the wb associated with
the target inode, unlocks inode, locks the wb's list_lock and verifies
that the inode is still associated with the wb.  To prevent the wb
going away between dropping inode lock and acquiring list_lock, the wb
is pinned while inode lock is held.  The wb reference is put right
after acquiring list_lock citing that the wb won't be dereferenced
anymore.

This isn't true.  If the inode is still associated with the wb, the
inode has reference and it's safe to return the wb; however, if inode
has been switched, the wb still needs to be unlocked which is a
dereference and can lead to use-after-free if it it races with wb
destruction.

Fix it by putting the reference after releasing list_lock.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 87e1d789bf55 ("writeback: implement [locked_]inode_to_wb_and_lock_list()")
Cc: stable@vger.kernel.org # v4.2+
Tested-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5c46ed9..7b9582e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
 		wb_get(wb);
 		spin_unlock(&inode->i_lock);
 		spin_lock(&wb->list_lock);
-		wb_put(wb);		/* not gonna deref it anymore */
 
 		/* i_wb may have changed inbetween, can't use inode_to_wb() */
-		if (likely(wb == inode->i_wb))
-			return wb;	/* @inode already has ref */
+		if (likely(wb == inode->i_wb)) {
+			wb_put(wb);	/* @inode already has ref */
+			return wb;
+		}
 
 		spin_unlock(&wb->list_lock);
+		wb_put(wb);
 		cpu_relax();
 		spin_lock(&inode->i_lock);
 	}
-- 
cgit v0.10.2


From aaf2559332ba272671bb870464a99b909b29a3a1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Mar 2016 13:52:04 -0400
Subject: writeback, cgroup: fix use of the wrong bdi_writeback which
 mismatches the inode

When cgroup writeback is in use, there can be multiple wb's
(bdi_writeback's) per bdi and an inode may switch among them
dynamically.  In a couple places, the wrong wb was used leading to
performing operations on the wrong list under the wrong lock
corrupting the io lists.

* writeback_single_inode() was taking @wb parameter and used it to
  remove the inode from io lists if it becomes clean after writeback.
  The callers of this function were always passing in the root wb
  regardless of the actual wb that the inode was associated with,
  which could also change while writeback is in progress.

  Fix it by dropping the @wb parameter and using
  inode_to_wb_and_lock_list() to determine and lock the associated wb.

* After writeback_sb_inodes() writes out an inode, it re-locks @wb and
  inode to remove it from or move it to the right io list.  It assumes
  that the inode is still associated with @wb; however, the inode may
  have switched to another wb while writeback was in progress.

  Fix it by using inode_to_wb_and_lock_list() to determine and lock
  the associated wb after writeback is complete.  As the function
  requires the original @wb->list_lock locked for the next iteration,
  in the unlikely case where the inode has changed association, switch
  the locks.

Kudos to Tahsin for pinpointing these subtle breakages.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: d10c80955265 ("writeback: implement foreign cgroup inode bdi_writeback switching")
Link: http://lkml.kernel.org/g/CAAeU0aMYeM_39Y2+PaRvyB1nqAPYZSNngJ1eBRmrxn7gKAt2Mg@mail.gmail.com
Reported-and-diagnosed-by: Tahsin Erdogan <tahsin@google.com>
Tested-by: Tahsin Erdogan <tahsin@google.com>
Cc: stable@vger.kernel.org # v4.2+
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7b9582e..fee81e8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1339,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
  * and does more profound writeback list handling in writeback_sb_inodes().
  */
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
-		       struct writeback_control *wbc)
+static int writeback_single_inode(struct inode *inode,
+				  struct writeback_control *wbc)
 {
+	struct bdi_writeback *wb;
 	int ret = 0;
 
 	spin_lock(&inode->i_lock);
@@ -1380,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	ret = __writeback_single_inode(inode, wbc);
 
 	wbc_detach_inode(wbc);
-	spin_lock(&wb->list_lock);
+
+	wb = inode_to_wb_and_lock_list(inode);
 	spin_lock(&inode->i_lock);
 	/*
 	 * If inode is clean, remove it from writeback lists. Otherwise don't
@@ -1455,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
+		struct bdi_writeback *tmp_wb;
 
 		if (inode->i_sb != sb) {
 			if (work->sb) {
@@ -1545,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb,
 			cond_resched();
 		}
 
-
-		spin_lock(&wb->list_lock);
+		/*
+		 * Requeue @inode if still dirty.  Be careful as @inode may
+		 * have been switched to another wb in the meantime.
+		 */
+		tmp_wb = inode_to_wb_and_lock_list(inode);
 		spin_lock(&inode->i_lock);
 		if (!(inode->i_state & I_DIRTY_ALL))
 			wrote++;
-		requeue_inode(inode, wb, &wbc);
+		requeue_inode(inode, tmp_wb, &wbc);
 		inode_sync_complete(inode);
 		spin_unlock(&inode->i_lock);
 
+		if (unlikely(tmp_wb != wb)) {
+			spin_unlock(&tmp_wb->list_lock);
+			spin_lock(&wb->list_lock);
+		}
+
 		/*
 		 * bail out to wb_writeback() often enough to check
 		 * background threshold and other termination conditions.
@@ -2340,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
  */
 int write_inode_now(struct inode *inode, int sync)
 {
-	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
 		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -2352,7 +2361,7 @@ int write_inode_now(struct inode *inode, int sync)
 		wbc.nr_to_write = 0;
 
 	might_sleep();
-	return writeback_single_inode(inode, wb, &wbc);
+	return writeback_single_inode(inode, &wbc);
 }
 EXPORT_SYMBOL(write_inode_now);
 
@@ -2369,7 +2378,7 @@ EXPORT_SYMBOL(write_inode_now);
  */
 int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
+	return writeback_single_inode(inode, wbc);
 }
 EXPORT_SYMBOL(sync_inode);
 
-- 
cgit v0.10.2


From d783e0bd02e700e7a893ef4fa71c69438ac1c276 Mon Sep 17 00:00:00 2001
From: Marta Rybczynska <mrybczyn@kalray.eu>
Date: Tue, 22 Mar 2016 16:02:06 +0100
Subject: nvme: avoid cqe corruption when update at the same time as read

Make sure the CQE phase (validity) is read before the rest of the
structure. The phase bit is the highest address and the CQE
read will happen on most platforms from lower to upper addresses
and will be done by multiple non-atomic loads. If the structure
is updated by PCI during the reads from the processor, the
processor may get a corrupted copy.

The addition of the new nvme_cqe_valid function that verifies
the validity bit also allows refactoring of the other CQE read
sequences.

Signed-off-by: Marta Rybczynska <marta.rybczynska@kalray.eu>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f8db70a..24ccda3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -723,6 +723,13 @@ static void nvme_complete_rq(struct request *req)
 	blk_mq_end_request(req, error);
 }
 
+/* We read the CQE phase first to check if the rest of the entry is valid */
+static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
+		u16 phase)
+{
+	return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
+}
+
 static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 {
 	u16 head, phase;
@@ -730,13 +737,10 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 	head = nvmeq->cq_head;
 	phase = nvmeq->cq_phase;
 
-	for (;;) {
+	while (nvme_cqe_valid(nvmeq, head, phase)) {
 		struct nvme_completion cqe = nvmeq->cqes[head];
-		u16 status = le16_to_cpu(cqe.status);
 		struct request *req;
 
-		if ((status & 1) != phase)
-			break;
 		if (++head == nvmeq->q_depth) {
 			head = 0;
 			phase = !phase;
@@ -767,7 +771,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 		req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
 		if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special)
 			memcpy(req->special, &cqe, sizeof(cqe));
-		blk_mq_complete_request(req, status >> 1);
+		blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1);
 
 	}
 
@@ -808,18 +812,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
 static irqreturn_t nvme_irq_check(int irq, void *data)
 {
 	struct nvme_queue *nvmeq = data;
-	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
-	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
-		return IRQ_NONE;
-	return IRQ_WAKE_THREAD;
+	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
+		return IRQ_WAKE_THREAD;
+	return IRQ_NONE;
 }
 
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
 
-	if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
-	    nvmeq->cq_phase) {
+	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
 		spin_lock_irq(&nvmeq->q_lock);
 		__nvme_process_cq(nvmeq, &tag);
 		spin_unlock_irq(&nvmeq->q_lock);
-- 
cgit v0.10.2