From 4d51abf9bcca01efa3afbe94d50cc2cda8095da8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 15 Jun 2014 13:37:33 -0700 Subject: block: Use dma_zalloc_coherent Use the zeroing function instead of dma_alloc_coherent & memset(,0,) Signed-off-by: Joe Perches Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e2bb8af..baee595 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1275,11 +1275,10 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq) return NULL; - nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), - &nvmeq->cq_dma_addr, GFP_KERNEL); + nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), &nvmeq->sq_dma_addr, GFP_KERNEL); -- cgit v0.10.2 From 6fccf9383b280d463a7dfe1e0d048aff8df8a25e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 18 Jun 2014 13:58:57 -0600 Subject: NVMe: Async event request Submits NVMe asynchronous event requests, one event up to the controller maximum or number of possible different event types (8), whichever is smaller. Events successfully returned by the controller are logged. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index baee595..42a62bb 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -207,6 +207,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) +#define CMD_CTX_ASYNC (0x31C + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) @@ -229,6 +230,17 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } + if (ctx == CMD_CTX_ASYNC) { + u32 result = le32_to_cpup(&cqe->result); + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + ++nvmeq->dev->event_limit; + if (status == NVME_SC_SUCCESS) + dev_warn(nvmeq->q_dmadev, + "async event result %08x\n", result); + return; + } dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } @@ -1161,6 +1173,8 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) continue; if (info[cmdid].ctx == CMD_CTX_CANCELLED) continue; + if (timeout && info[cmdid].ctx == CMD_CTX_ASYNC) + continue; if (timeout && nvmeq->dev->initialized) { nvme_abort_cmd(cmdid, nvmeq); continue; @@ -1842,6 +1856,27 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq) } } +static int nvme_submit_async_req(struct nvme_queue *nvmeq) +{ + struct nvme_command *c; + int cmdid; + + cmdid = alloc_cmdid(nvmeq, CMD_CTX_ASYNC, special_completion, 0); + if (cmdid < 0) + return cmdid; + + c = &nvmeq->sq_cmds[nvmeq->sq_tail]; + memset(c, 0, sizeof(*c)); + c->common.opcode = nvme_admin_async_event; + c->common.command_id = cmdid; + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -1875,6 +1910,12 @@ static int nvme_kthread(void *data) nvme_cancel_ios(nvmeq, true); 
nvme_resubmit_bios(nvmeq); nvme_resubmit_iods(nvmeq); + + while ((i == 0) && (dev->event_limit > 0)) { + if (nvme_submit_async_req(nvmeq)) + break; + dev->event_limit--; + } unlock: spin_unlock_irq(&nvmeq->q_lock); } @@ -2244,6 +2285,7 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; dev->vwc = ctrl->vwc; + dev->event_limit = min(ctrl->aerl + 1, 8); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2bf4031..974efd0 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -99,6 +99,7 @@ struct nvme_dev { u32 stripe_size; u16 oncs; u16 abort_limit; + u8 event_limit; u8 vwc; u8 initialized; }; -- cgit v0.10.2 From a7dd7957acf798ac406afd6631e64a27ac4a5bf1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 28 Jun 2014 09:00:27 -0400 Subject: NVMe: Update list of status codes Taken from the draft NVMe 1.1b specification. Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 29a7d86..134518b 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -440,9 +440,15 @@ enum { NVME_SC_FUSED_MISSING = 0xa, NVME_SC_INVALID_NS = 0xb, NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_SGL_INVALID_LAST = 0xd, + NVME_SC_SGL_INVALID_COUNT = 0xe, + NVME_SC_SGL_INVALID_DATA = 0xf, + NVME_SC_SGL_INVALID_METADATA = 0x10, + NVME_SC_SGL_INVALID_TYPE = 0x11, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_RESERVATION_CONFLICT = 0x83, NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, @@ -454,7 +460,15 @@ enum { NVME_SC_INVALID_VECTOR = 0x108, NVME_SC_INVALID_LOG_PAGE = 0x109, NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_FIRMWARE_NEEDS_RESET = 0x10b, + NVME_SC_INVALID_QUEUE = 0x10c, + NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, + NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, + NVME_SC_FEATURE_NOT_PER_NS = 0x10f, + NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_INVALID_PI = 0x181, + NVME_SC_READ_ONLY = 0x182, NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, NVME_SC_GUARD_CHECK = 0x282, -- cgit v0.10.2 From 1d0906246095184d1624c643c2088152d330c40a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 11:34:01 -0600 Subject: NVMe: Mismatched host/device page size support Adds support for devices with max page size smaller than the host's. In the case we encounter such a host/device combination, the driver will split a page into as many PRP entries as necessary for the device's page size capabilities. If the device's reported minimum page size is greater than the host's, the driver will not attempt to enable the device and return an error instead. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 42a62bb..e60bb0f 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -373,17 +373,17 @@ static __le64 **iod_list(struct nvme_iod *iod) * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. 
*/ -static int nvme_npages(unsigned size) +static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(__le64 *) * nvme_npages(nbytes, dev) + sizeof(struct scatterlist) * nseg, gfp); if (iod) { @@ -400,7 +400,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { - const int last_prp = PAGE_SIZE / 8 - 1; + const int last_prp = dev->page_size / 8 - 1; int i; __le64 **list = iod_list(iod); dma_addr_t prp_dma = iod->first_dma; @@ -491,26 +491,27 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, __le64 **list = iod_list(iod); dma_addr_t prp_dma; int nprps, i; + u32 page_size = dev->page_size; - length -= (PAGE_SIZE - offset); + length -= (page_size - offset); if (length <= 0) return total_len; - dma_len -= (PAGE_SIZE - offset); + dma_len -= (page_size - offset); if (dma_len) { - dma_addr += (PAGE_SIZE - offset); + dma_addr += (page_size - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= PAGE_SIZE) { + if (length <= page_size) { iod->first_dma = dma_addr; return total_len; } - nprps = DIV_ROUND_UP(length, PAGE_SIZE); + nprps = DIV_ROUND_UP(length, page_size); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -523,13 +524,13 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + PAGE_SIZE; + return (total_len - length) + page_size; } list[0] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == PAGE_SIZE / 8) { + if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) @@ -540,9 +541,9 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= PAGE_SIZE; - dma_addr += PAGE_SIZE; - length -= PAGE_SIZE; + dma_len -= page_size; + dma_addr += page_size; + length -= page_size; if (length <= 0) break; if (dma_len > 0) @@ -749,7 +750,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if ((bio->bi_rw & REQ_FLUSH) && psegs) return nvme_split_flush_data(nvmeq, bio); - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, ns->dev, GFP_ATOMIC); if (!iod) return -ENOMEM; @@ -1463,6 +1464,24 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u32 aqa; u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; + unsigned page_shift = PAGE_SHIFT; + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; + unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; + + if (page_shift < dev_page_min) { + dev_err(&dev->pci_dev->dev, + "Minimum device page size (%u) too large for " + "host (%u)\n", 1 << dev_page_min, + 1 << page_shift); + return -ENODEV; + } + if (page_shift > dev_page_max) { + dev_info(&dev->pci_dev->dev, + "Device maximum page size 
(%u) smaller than " + "host (%u); enabling work-around\n", + 1 << dev_page_max, 1 << page_shift); + page_shift = dev_page_max; + } result = nvme_disable_ctrl(dev, cap); if (result < 0) @@ -1478,8 +1497,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; + dev->page_size = 1 << page_shift; + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; - dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; @@ -1529,7 +1550,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, GFP_KERNEL); + iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); if (!iod) goto put_pages; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 974efd0..ed09074 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -38,6 +38,7 @@ struct nvme_bar { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) +#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) enum { NVME_CC_ENABLE = 1 << 0, @@ -97,6 +98,7 @@ struct nvme_dev { char firmware_rev[8]; u32 max_hw_sectors; u32 stripe_size; + u32 page_size; u16 oncs; u16 abort_limit; u8 event_limit; -- cgit v0.10.2 From 01079522f94b3a1c87cfa375be3917d2da6e6363 Mon Sep 17 00:00:00 2001 From: Dan McLeran Date: Mon, 23 Jun 2014 08:24:36 -0600 Subject: NVMe: Change nvme_enable_ctrl to set EN and manage CC thru ctrl_config. Change the behavior of nvme_enable_ctrl to set EN. Clear CC.SH for both nvme_enable_ctrl and nvme_disable_ctrl. Remove reading of the CC register and manage the state in dev->ctrl_config. 
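For reference, a minimal sketch of the readiness polling both paths rely on, mirroring the driver's existing nvme_wait_ready() (its hunk header is visible in the diff below); the timeout is derived from CAP.TO:

	static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
	{
		unsigned long timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
		u32 bit = enabled ? NVME_CSTS_RDY : 0;

		/* After CC.EN is toggled through the cached ctrl_config,
		 * poll CSTS.RDY until the controller reports that state. */
		while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
			msleep(100);
			if (fatal_signal_pending(current))
				return -EINTR;
			if (time_after(jiffies, timeout))
				return -ENODEV;
		}
		return 0;
	}

The point of the patch is that CC is now composed once in dev->ctrl_config and only written from these helpers, never read back from hardware.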
Signed-off-by: Dan McLeran [removed an unwanted write to CC] Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e60bb0f..0a98de8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1422,25 +1422,30 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) */ static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) { - u32 cc = readl(&dev->bar->cc); + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config &= ~NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); - if (cc & NVME_CC_ENABLE) - writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); return nvme_wait_ready(dev, cap, false); } static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) { + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); + return nvme_wait_ready(dev, cap, true); } static int nvme_shutdown_ctrl(struct nvme_dev *dev) { unsigned long timeout; - u32 cc; - cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; - writel(cc, &dev->bar->cc); + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_SHN_NORMAL; + + writel(dev->ctrl_config, &dev->bar->cc); timeout = 2 * HZ + jiffies; while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != @@ -1499,7 +1504,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) dev->page_size = 1 << page_shift; - dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; + dev->ctrl_config = NVME_CC_CSS_NVM; dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; @@ -1507,7 +1512,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) writel(aqa, &dev->bar->aqa); writeq(nvmeq->sq_dma_addr, &dev->bar->asq); writeq(nvmeq->cq_dma_addr, &dev->bar->acq); - writel(dev->ctrl_config, &dev->bar->cc); result = nvme_enable_ctrl(dev, cap); if (result) -- cgit v0.10.2 From badc34d4154de81b965629a5c3110e058eb8ca2b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 14:25:35 -0600 Subject: NVMe: Handling devices incapable of I/O This is a minor refactor for handling devices that are incapable of IO. The driver previously used special error codes to know that IO queues are unavailable, but we have an online queue count now. This also fixes an issue where the driver successfully sets the queue count, but either is unable to allocate an IO queue or the device can't create one for some reason. If the driver can successfully enable the device and get responses to admin commands, the driver will bring up a character device for management but not create block devices.
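For context, the online queue count comes from the Set Features (Number of Queues) completion; a sketch of the arithmetic in set_queue_count() (assumption: NSQA in bits 15:0 and NCQA in bits 31:16 of completion dword 0, both 0's based, per the NVMe spec):

	/* The controller reports how many IO submission and completion
	 * queues it actually allocated; only the smaller count is usable. */
	u32 result = le32_to_cpup(&cqe->result);
	int io_queues = min(result & 0xffff, result >> 16) + 1;

With this patch, a zero return from set_queue_count() (instead of -EBUSY) is what routes such a device to the management-only character interface.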
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 0a98de8..f3103aa 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2172,7 +2172,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) if (status > 0) { dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", status); - return -EBUSY; + return 0; } return min(result & 0xffff, result >> 16) + 1; } @@ -2214,7 +2214,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nr_io_queues = num_possible_cpus(); result = set_queue_count(dev, nr_io_queues); - if (result < 0) + if (result <= 0) return result; if (result < nr_io_queues) nr_io_queues = result; @@ -2740,7 +2740,7 @@ static int nvme_dev_start(struct nvme_dev *dev) } result = nvme_setup_io_queues(dev); - if (result && result != -EBUSY) + if (result) goto disable; return result; @@ -2777,9 +2777,9 @@ static int nvme_dev_resume(struct nvme_dev *dev) int ret; ret = nvme_dev_start(dev); - if (ret && ret != -EBUSY) + if (ret) return ret; - if (ret == -EBUSY) { + if (dev->online_queues < 2) { spin_lock(&dev_list_lock); dev->reset_workfn = nvme_remove_disks; queue_work(nvme_workq, &dev->reset_work); @@ -2852,17 +2852,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) kref_init(&dev->kref); result = nvme_dev_start(dev); - if (result) { - if (result == -EBUSY) - goto create_cdev; + if (result) goto release_pools; - } - result = nvme_dev_add(dev); + if (dev->online_queues > 1) + result = nvme_dev_add(dev); if (result) goto shutdown; - create_cdev: scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); dev->miscdev.minor = MISC_DYNAMIC_MINOR; dev->miscdev.parent = &pdev->dev; -- cgit v0.10.2 From c81f49758a677e878df4e6a33f29d8ce401bf66d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 15:24:53 -0600 Subject: NVMe: Use pci_stop_and_remove_bus_device_locked() Race conditions are theoretically possible between the NVMe PCI device removal and the generic PCI bus rescan and device removal that can be triggered via sysfs. To avoid those race conditions make the NVMe code use pci_stop_and_remove_bus_device_locked(). Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f3103aa..48a7127 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2759,7 +2759,7 @@ static int nvme_remove_dead_ctrl(void *arg) struct pci_dev *pdev = dev->pci_dev; if (pci_get_drvdata(pdev)) - pci_stop_and_remove_bus_device(pdev); + pci_stop_and_remove_bus_device_locked(pdev); kref_put(&dev->kref, nvme_free_dev); return 0; } -- cgit v0.10.2 From a67394790a9c52657273212647e350e36a292183 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 16:03:21 -0600 Subject: NVMe: Whitespace fixes Fixing tabs inadvertently converted to spaces. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 48a7127..c5f379f 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2892,12 +2892,12 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) { - struct nvme_dev *dev = pci_get_drvdata(pdev); + struct nvme_dev *dev = pci_get_drvdata(pdev); - if (prepare) - nvme_dev_shutdown(dev); - else - nvme_dev_resume(dev); + if (prepare) + nvme_dev_shutdown(dev); + else + nvme_dev_resume(dev); } static void nvme_shutdown(struct pci_dev *pdev) -- cgit v0.10.2 From 7c1b24503873e6cfc1d31f2e55c35358fd438351 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Jun 2014 11:18:12 -0600 Subject: NVMe: Skip orderly shutdown on failed devices Rather than skipping shutdown only for devices that have been removed, skip the orderly shutdown on failed devices to avoid the long timeout handling that inevitably happens when deleting queues on such a device. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c5f379f..ac36940 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2567,11 +2567,14 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; + u32 csts = -1; dev->initialized = 0; nvme_dev_list_remove(dev); - if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { + if (dev->bar) + csts = readl(&dev->bar->csts); + if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { struct nvme_queue *nvmeq = raw_nvmeq(dev, i); nvme_suspend_queue(nvmeq); -- cgit v0.10.2 From 2484f40780b97df1b5eb09e78ce4efaa78b21875 Mon Sep 17 00:00:00 2001 From: Dan McLeran Date: Tue, 1 Jul 2014 09:33:32 -0600 Subject: NVMe: Add shutdown timeout as module parameter. The current implementation hard-codes the shutdown timeout to 2 seconds. Some devices take longer than this to complete a normal shutdown. Changing the shutdown timeout to a module parameter with a default timeout of 5 seconds. 
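Since the parameter is registered with mode 0644, it can be given at load time (modprobe nvme shutdown_timeout=10) or changed later through /sys/module/nvme/parameters/shutdown_timeout. It is an unsigned char holding seconds, so the usable range is 0-255.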
Signed-off-by: Dan McLeran Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ac36940..3ada636 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -48,6 +48,7 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) #define IOD_TIMEOUT (retry_time * HZ) static unsigned char admin_timeout = 60; @@ -62,6 +63,10 @@ static unsigned char retry_time = 30; module_param(retry_time, byte, 0644); MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O"); +static unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + static int nvme_major; module_param(nvme_major, int, 0); @@ -1447,7 +1452,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) writel(dev->ctrl_config, &dev->bar->cc); - timeout = 2 * HZ + jiffies; + timeout = SHUTDOWN_TIMEOUT + jiffies; while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != NVME_CSTS_SHST_CMPLT) { msleep(100); -- cgit v0.10.2 From f435c2825b4cc6453b9a1f91418cabbd6ba08cc0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 7 Jul 2014 09:14:42 -0600 Subject: NVMe: Call nvme_free_queue directly Rather than relying on call_rcu, this patch directly frees the nvme_queue's memory after ensuring no readers exist. Some arch specific dma_free_coherent implementations may not be called from a call_rcu's soft interrupt context, hence the change. Signed-off-by: Keith Busch Reported-by: Matthew Minter Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3ada636..d3a025f 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -95,7 +95,7 @@ struct async_cmd_info { * commands and one for I/O commands). 
*/ struct nvme_queue { - struct rcu_head r_head; + struct llist_node node; struct device *q_dmadev; struct nvme_dev *dev; char irqname[24]; /* nvme4294967295-65535\0 */ @@ -1192,10 +1192,8 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) } } -static void nvme_free_queue(struct rcu_head *r) +static void nvme_free_queue(struct nvme_queue *nvmeq) { - struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head); - spin_lock_irq(&nvmeq->q_lock); while (bio_list_peek(&nvmeq->sq_cong)) { struct bio *bio = bio_list_pop(&nvmeq->sq_cong); @@ -1225,14 +1223,21 @@ static void nvme_free_queue(struct rcu_head *r) static void nvme_free_queues(struct nvme_dev *dev, int lowest) { + LLIST_HEAD(q_list); + struct nvme_queue *nvmeq, *next; + struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + nvmeq = raw_nvmeq(dev, i); rcu_assign_pointer(dev->queues[i], NULL); - call_rcu(&nvmeq->r_head, nvme_free_queue); + llist_add(&nvmeq->node, &q_list); dev->queue_count--; } + synchronize_rcu(); + entry = llist_del_all(&q_list); + llist_for_each_entry_safe(nvmeq, next, entry, node) + nvme_free_queue(nvmeq); } /** @@ -2929,7 +2934,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove(dev); nvme_dev_shutdown(dev); nvme_free_queues(dev, 0); - rcu_barrier(); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); -- cgit v0.10.2 From 302c6727e5eb4d0be0e02d077b65feb3e73ea254 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 18 Jul 2014 11:40:20 -0600 Subject: NVMe: Fix filesystem sync deadlock on removal This changes the order of deleting the gendisks so it happens after the nvme IO queues are freed. If a device is removed while a filesystem has associated dirty data, the removal will wait on these to complete before proceeding from del_gendisk, which could have caused deadlock before. The implication of this is that an orderly removal of a responsive device won't necessarily wait for dirty data to be written, but we are not guaranteed the device is even going to respond at this point either. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d3a025f..48a1be4 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2781,8 +2781,8 @@ static void nvme_remove_disks(struct work_struct *ws) { struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - nvme_dev_remove(dev); nvme_free_queues(dev, 1); + nvme_dev_remove(dev); } static int nvme_dev_resume(struct nvme_dev *dev) @@ -2931,9 +2931,9 @@ static void nvme_remove(struct pci_dev *pdev) flush_work(&dev->reset_work); flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); - nvme_dev_remove(dev); nvme_dev_shutdown(dev); nvme_free_queues(dev, 0); + nvme_dev_remove(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); -- cgit v0.10.2 From 5905535610fc6b11379a999ff45bfa39f0d605b6 Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Tue, 29 Jul 2014 13:31:36 -0700 Subject: NVMe: Correctly handle IOCTL_SUBMIT_IO when cpus > online queues nvme_submit_io_cmd() uses smp_processor_id() to pick an IO queue index. This patch fixes the case where there are more cpus from which the ioctl call can originate than online queues, which can happen when a device supports or was allocated fewer interrupt vectors than exist cpu cores. 
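A simplified sketch of the per-cpu mapping the fix reads; the plain round-robin below is an illustrative assumption, as the driver's real assignment pass also balances by CPU topology:

	/* Illustrative only: point each possible cpu at one of the online
	 * IO queues (qids 1..online_queues - 1), wrapping when cpus
	 * outnumber queues, so this_cpu_read(*dev->io_queue) always names
	 * a queue that actually exists. */
	unsigned cpu;

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(dev->io_queue, cpu) =
					(cpu % (dev->online_queues - 1)) + 1;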
Thanks to Keith Busch for the implementation suggestion. Signed-off-by: Sam Bradshaw Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 48a1be4..5d44c934 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -984,8 +984,8 @@ int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result, - NVME_IO_TIMEOUT); + return nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), cmd, + result, NVME_IO_TIMEOUT); } static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, -- cgit v0.10.2 From 062261be4e39e35bdf2fba16a4b2d8a432ae8281 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Mon, 18 Aug 2014 17:41:01 +0300 Subject: nvme: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 5d44c934..c1e3c1a 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1230,7 +1230,7 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) for (i = dev->queue_count - 1; i >= lowest; i--) { nvmeq = raw_nvmeq(dev, i); - rcu_assign_pointer(dev->queues[i], NULL); + RCU_INIT_POINTER(dev->queues[i], NULL); llist_add(&nvmeq->node, &q_list); dev->queue_count--; } -- cgit v0.10.2 From a96d4f5c2da66a0c1537bfd8aaa868b69595476a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 19 Aug 2014 19:15:59 -0600 Subject: NVMe: Reference count pci device If an nvme device is removed but user space has an open reference, the nvme driver would have been holding an invalid reference to its pci device. You may get a general protection fault on x86 h/w when the driver uses that reference in dma_map_sg(), as is done in nvme_map_user_pages() from the IOCTL interface. This patch fixes the fault by taking a reference on the pci device and holding it even after device removal until all opens on the nvme device are closed. 
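In outline, the pairing this patch adds (abridged sketch, not the full patch; the real release function also frees the queues and percpu data):

	dev->pci_dev = pci_dev_get(pdev);	/* in nvme_probe() */

	static void nvme_free_dev(struct kref *kref)
	{
		struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);

		/* Dropped only when the last nvme_dev reference goes away,
		 * so an open fd doing ioctl DMA mapping after hot removal
		 * still sees a valid &dev->pci_dev->dev. */
		pci_dev_put(dev->pci_dev);
		nvme_free_namespaces(dev);
		kfree(dev);
	}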
Signed-off-by: Keith Busch Reported-by: Nilesh Choudhury Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c1e3c1a..955b569 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2678,6 +2678,7 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + pci_dev_put(dev->pci_dev); nvme_free_namespaces(dev); free_percpu(dev->io_queue); kfree(dev->queues); @@ -2853,11 +2854,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); - dev->pci_dev = pdev; + dev->pci_dev = pci_dev_get(pdev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) - goto free; + goto put_pci; result = nvme_setup_prp_pools(dev); if (result) @@ -2895,6 +2896,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_prp_pools(dev); release: nvme_release_instance(dev); + put_pci: + pci_dev_put(dev->pci_dev); free: free_percpu(dev->io_queue); kfree(dev->queues); -- cgit v0.10.2 From e179729a821c1b82375f5593533c027462e753b7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2014 13:55:38 -0600 Subject: NVMe: Remove duplicate compat SG_IO code We can return -ENOIOCTLCMD and the ioctl will be handled by fs/compat_ioctl.c instead. This removes a lot of duplicate code in the nvme driver. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 955b569..37f22b6 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1807,11 +1807,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nvme_ns *ns = bdev->bd_disk->private_data; - switch (cmd) { case SG_IO: - return nvme_sg_io32(ns, arg); + return -ENOIOCTLCMD; } return nvme_ioctl(bdev, mode, cmd, arg); } diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index a4cd6d6..dabafa6a 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -3016,152 +3016,6 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) return retcode; } -#ifdef CONFIG_COMPAT -typedef struct sg_io_hdr32 { - compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ - compat_int_t dxfer_direction; /* [i] data transfer direction */ - unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ - unsigned char mx_sb_len; /* [i] max length to write to sbp */ - unsigned short iovec_count; /* [i] 0 implies no scatter gather */ - compat_uint_t dxfer_len; /* [i] byte count of data transfer */ - compat_uint_t dxferp; /* [i], [*io] points to data transfer memory - or scatter gather list */ - compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ - compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ - compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ - compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... 
*/ - compat_int_t pack_id; /* [i->o] unused internally (normally) */ - compat_uptr_t usr_ptr; /* [i->o] unused internally */ - unsigned char status; /* [o] scsi status */ - unsigned char masked_status; /* [o] shifted, masked scsi status */ - unsigned char msg_status; /* [o] messaging level data (optional) */ - unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ - unsigned short host_status; /* [o] errors from host adapter */ - unsigned short driver_status; /* [o] errors from software driver */ - compat_int_t resid; /* [o] dxfer_len - actual_transferred */ - compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ - compat_uint_t info; /* [o] auxiliary information */ -} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ - -typedef struct sg_iovec32 { - compat_uint_t iov_base; - compat_uint_t iov_len; -} sg_iovec32_t; - -static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) -{ - sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); - sg_iovec32_t __user *iov32 = dxferp; - int i; - - for (i = 0; i < iovec_count; i++) { - u32 base, len; - - if (get_user(base, &iov32[i].iov_base) || - get_user(len, &iov32[i].iov_len) || - put_user(compat_ptr(base), &iov[i].iov_base) || - put_user(len, &iov[i].iov_len)) - return -EFAULT; - } - - if (put_user(iov, &sgio->dxferp)) - return -EFAULT; - return 0; -} - -int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg) -{ - sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg; - sg_io_hdr_t __user *sgio; - u16 iovec_count; - u32 data; - void __user *dxferp; - int err; - int interface_id; - - if (get_user(interface_id, &sgio32->interface_id)) - return -EFAULT; - if (interface_id != 'S') - return -EINVAL; - - if (get_user(iovec_count, &sgio32->iovec_count)) - return -EFAULT; - - { - void __user *top = compat_alloc_user_space(0); - void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + - (iovec_count * sizeof(sg_iovec_t))); - if (new > top) - return -EINVAL; - - sgio = new; - } - - /* Ok, now construct. 
*/ - if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, - (2 * sizeof(int)) + - (2 * sizeof(unsigned char)) + - (1 * sizeof(unsigned short)) + - (1 * sizeof(unsigned int)))) - return -EFAULT; - - if (get_user(data, &sgio32->dxferp)) - return -EFAULT; - dxferp = compat_ptr(data); - if (iovec_count) { - if (sg_build_iovec(sgio, dxferp, iovec_count)) - return -EFAULT; - } else { - if (put_user(dxferp, &sgio->dxferp)) - return -EFAULT; - } - - { - unsigned char __user *cmdp; - unsigned char __user *sbp; - - if (get_user(data, &sgio32->cmdp)) - return -EFAULT; - cmdp = compat_ptr(data); - - if (get_user(data, &sgio32->sbp)) - return -EFAULT; - sbp = compat_ptr(data); - - if (put_user(cmdp, &sgio->cmdp) || - put_user(sbp, &sgio->sbp)) - return -EFAULT; - } - - if (copy_in_user(&sgio->timeout, &sgio32->timeout, - 3 * sizeof(int))) - return -EFAULT; - - if (get_user(data, &sgio32->usr_ptr)) - return -EFAULT; - if (put_user(compat_ptr(data), &sgio->usr_ptr)) - return -EFAULT; - - err = nvme_sg_io(ns, sgio); - if (err >= 0) { - void __user *datap; - - if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, - sizeof(int)) || - get_user(datap, &sgio->usr_ptr) || - put_user((u32)(unsigned long)datap, - &sgio32->usr_ptr) || - copy_in_user(&sgio32->status, &sgio->status, - (4 * sizeof(unsigned char)) + - (2 * sizeof(unsigned short)) + - (3 * sizeof(int)))) - err = -EFAULT; - } - - return err; -} -#endif - int nvme_sg_get_version_num(int __user *ip) { return put_user(sg_version_num, ip); -- cgit v0.10.2 From 695a4fe79ffa70023238b6b2d4c20fe1a05288fb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2014 13:55:39 -0600 Subject: NVMe: Fix SG_IO status values We've only been setting the sg_io_hdr status values on SCSI commands that require an nvme command to complete the translation. The fields in the struct are output parameters, so we have to set them, otherwise user space will see whatever was in memory from before. In the case of compat SG_IO, this would reveal kernel memory. This fixes the issue by initializing the sg_io_hdr with successful status. Signed-off-by: Keith Busch Acked-by: Vishal Verma Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index dabafa6a..046ae33 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2915,6 +2915,14 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; + /* + * Prime the hdr with good status for scsi commands that don't require + * an nvme command for translation. + */ + retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + if (retcode) + return retcode; + opcode = cmd[0]; switch (opcode) { -- cgit v0.10.2 From b4ff9c8ddb6f0cec99a53ab26a5aa2ed0162c472 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 Aug 2014 09:06:12 -0600 Subject: NVMe: Translate NVMe status to errno This returns a more appropriate error for the "capacity exceeded" status. In case other NVMe statuses have a better errno, this patch adds a convience function to translate an NVMe status code to an errno for IO commands, defaulting to the current -EIO. 
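A sketch of a completion-path caller, using the cqe handling found elsewhere in the driver (bio_endio() still takes an error argument in this kernel):

	/* Strip the phase tag bit, then translate the NVMe status field
	 * into an errno before completing the bio. */
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	bio_endio(bio, nvme_error_status(status));	/* 0, -ENOSPC or -EIO */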
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 37f22b6..3153692 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -450,6 +450,18 @@ static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) } } +static int nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; + } +} + static void bio_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { @@ -469,7 +481,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, wake_up(&nvmeq->sq_full); return; } - error = -EIO; + error = nvme_error_status(status); } if (iod->nents) { dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, -- cgit v0.10.2 From 7be50e93fbc281967589a04be5b1125539b0d0e2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 10 Sep 2014 15:48:47 -0600 Subject: NVMe: Fix nvmeq waitqueue entry initialization We need to update the nvme queue's wait_queue_t entry during each initialization since the nvme_thread may be ended and restarted when the device is reset. If a device reset occurs during a large amount of buffered IO, it would take a lot longer to complete the outstanding requests due to the 1 second polling instead of waking up as completions occur. Fixes: b9afca3efb18a9b8392cb544a3e29e8b1168400c Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3153692..16a886c 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1333,7 +1333,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->cq_head = 0; nvmeq->cq_phase = 1; init_waitqueue_head(&nvmeq->sq_full); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); bio_list_init(&nvmeq->sq_cong); INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -1373,6 +1372,8 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) struct nvme_dev *dev = nvmeq->dev; unsigned extra = nvme_queue_extra(nvmeq->q_depth); + spin_lock_irq(&nvmeq->q_lock); + init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1382,6 +1383,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) nvme_cancel_ios(nvmeq, false); nvmeq->q_suspended = 0; dev->online_queues++; + spin_unlock_irq(&nvmeq->q_lock); } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) @@ -1401,10 +1403,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_sq; - spin_lock_irq(&nvmeq->q_lock); nvme_init_queue(nvmeq, qid); - spin_unlock_irq(&nvmeq->q_lock); - return result; release_sq: @@ -1543,9 +1542,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result) return result; - spin_lock_irq(&nvmeq->q_lock); - nvme_init_queue(nvmeq, 0); - spin_unlock_irq(&nvmeq->q_lock); return result; } @@ -2762,6 +2758,7 @@ static int nvme_dev_start(struct nvme_dev *dev) result = nvme_thread ? 
PTR_ERR(nvme_thread) : -EINTR; goto disable; } + nvme_init_queue(raw_nvmeq(dev, 0), 0); result = nvme_setup_io_queues(dev); if (result) -- cgit v0.10.2 From 1b9dbf7fe02d85f70e8efc1c12c070206c0d5c5f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 10 Sep 2014 17:21:14 -0600 Subject: NVMe: Add revalidate_disk callback This adds a callback to revalidate the disk and change its block size and capacity if needed. Before, a user would have to remove + rescan an entire device if they changed the logical block size using an NVMe Format or other vendor specific command; now they can just run something that issues the BLKRRPART IOCTL, like # hdparm -z /dev/nvmeXnY This can also be used in response to the 1.2 Spec's Namespace Attribute Change asynchronous event. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 16a886c..93ee55e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1853,6 +1853,35 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_dev *dev = ns->dev; + struct nvme_id_ns *id; + dma_addr_t dma_addr; + int lbaf; + + id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + if (!id) { + dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n", + __func__); + return 0; + } + + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) + goto free; + + lbaf = id->flbas & 0xf; + ns->lba_shift = id->lbaf[lbaf].ds; + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + free: + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return 0; +} + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, @@ -1860,6 +1889,7 @@ static const struct block_device_operations nvme_fops = { .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, }; static void nvme_resubmit_iods(struct nvme_queue *nvmeq) -- cgit v0.10.2 From 7963e521811ed19396d47793cc77d87c643ab891 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 12 Sep 2014 16:07:20 -0600 Subject: NVMe: Passthrough IOCTL for IO commands The NVME_IOCTL_SUBMIT_IO only works for IO commands with block data transfers and isn't usable for other NVMe commands like flush, data set management, or any sort of vendor unique command. The NVME_IOCTL_ADMIN_CMD, however, can easily be modified to accept arbitrary IO commands in addition to arbitrary admin commands without breaking backward compatibility. This patch just adds a new IOCTL to distinguish if the driver should submit the command on an IO or Admin queue. 
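A userspace sketch of the new ioctl (hypothetical program; assumes the patched <linux/nvme.h> uapi header and CAP_SYS_ADMIN, which nvme_user_cmd() requires):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/nvme.h>

	int main(void)
	{
		struct nvme_passthru_cmd cmd;
		int fd = open("/dev/nvme0n1", O_RDONLY);

		if (fd < 0)
			return 1;
		memset(&cmd, 0, sizeof(cmd));
		cmd.opcode = nvme_cmd_flush;	/* an IO command with no data */
		cmd.nsid = 1;
		if (ioctl(fd, NVME_IOCTL_IO_CMD, &cmd) < 0)	/* IO queue */
			perror("NVME_IOCTL_IO_CMD");
		return 0;
	}

The same struct submitted through NVME_IOCTL_ADMIN_CMD goes to the admin queue; that choice is all the new ioq flag in nvme_user_cmd() selects.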
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 93ee55e..70be3a1 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1732,10 +1732,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_admin_cmd(struct nvme_dev *dev, - struct nvme_admin_cmd __user *ucmd) +static int nvme_user_cmd(struct nvme_dev *dev, + struct nvme_passthru_cmd __user *ucmd, bool ioq) { - struct nvme_admin_cmd cmd; + struct nvme_passthru_cmd cmd; struct nvme_command c; int status, length; struct nvme_iod *uninitialized_var(iod); @@ -1774,6 +1774,9 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, ADMIN_TIMEOUT; if (length != cmd.data_len) status = -ENOMEM; + else if (ioq) + status = nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), &c, + &cmd.result, timeout); else status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); @@ -1799,7 +1802,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns->dev, (void __user *)arg); + return nvme_user_cmd(ns->dev, (void __user *)arg, false); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->dev, (void __user *)arg, true); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -2743,7 +2748,9 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) struct nvme_dev *dev = f->private_data; switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(dev, (void __user *)arg); + return nvme_user_cmd(dev, (void __user *)arg, false); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(dev, (void __user *)arg, true); default: return -ENOTTY; } diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 134518b..9710655 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -503,7 +503,7 @@ struct nvme_user_io { __u16 appmask; }; -struct nvme_admin_cmd { +struct nvme_passthru_cmd { __u8 opcode; __u8 flags; __u16 rsvd1; @@ -524,8 +524,11 @@ struct nvme_admin_cmd { __u32 result; }; +#define nvme_admin_cmd nvme_passthru_cmd + #define NVME_IOCTL_ID _IO('N', 0x40) #define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) #endif /* _UAPI_LINUX_NVME_H */ -- cgit v0.10.2 From 60e0545cc92ce4172c7069d559568717f64af332 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Sep 2014 14:08:38 -0600 Subject: NVMe: Updates for 1.1 spec Updating commands and structures for NVMe 1.1 updates, mostly for nvme reservations. There are no additional in-kernel uses, but this is for the uapi. While doing this, I noticed that the software progress features was using the wrong value, so updating that value as well. 
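Tying this to the passthrough ioctl added earlier in the series, a hypothetical helper that fills the reservation status structure below (assumption: per NVMe 1.1, Reservation Report takes NUMD, the 0's based dword count of the receive buffer, in cdw10; includes as in the earlier userspace sketch plus <stdint.h>):

	int nvme_resv_report(int fd, __u32 nsid, void *buf, __u32 len)
	{
		struct nvme_passthru_cmd cmd;

		memset(&cmd, 0, sizeof(cmd));
		cmd.opcode = nvme_cmd_resv_report;	/* 0x0e, added below */
		cmd.nsid = nsid;
		cmd.addr = (__u64)(uintptr_t)buf;	/* nvme_reservation_status
							   plus regctl_ds[] */
		cmd.data_len = len;
		cmd.cdw10 = (len >> 2) - 1;		/* NUMD, 0's based */
		return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
	}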
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 9710655..26386cf 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -181,6 +181,22 @@ enum { NVME_LBART_ATTRIB_HIDE = 1 << 1, }; +struct nvme_reservation_status { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[13]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 hostid; + __le64 rkey; + } regctl_ds[]; +}; + /* I/O commands */ enum nvme_opcode { @@ -189,7 +205,12 @@ enum nvme_opcode { nvme_cmd_read = 0x02, nvme_cmd_write_uncor = 0x04, nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, nvme_cmd_dsm = 0x09, + nvme_cmd_resv_register = 0x0d, + nvme_cmd_resv_report = 0x0e, + nvme_cmd_resv_acquire = 0x11, + nvme_cmd_resv_release = 0x15, }; struct nvme_common_command { @@ -305,7 +326,11 @@ enum { NVME_FEAT_IRQ_CONFIG = 0x09, NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, - NVME_FEAT_SW_PROGRESS = 0x0c, + NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_SW_PROGRESS = 0x80, + NVME_FEAT_HOST_ID = 0x81, + NVME_FEAT_RESV_MASK = 0x82, + NVME_FEAT_RESV_PERSIST = 0x83, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, -- cgit v0.10.2 From 387caa5a2b11ecb1abbbec8ef9d798f757ba3e5f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 22 Sep 2014 13:46:19 -0600 Subject: NVMe: Fix device probe waiting on kthread If we ever do parallel device probing, we need to wake up all processes waiting for nvme kthread to start, not just one. This is currently serialized so the bug is not reachable today, but fixing this anyway in the hopes we implement parallel or asynchronous probe in the future. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 70be3a1..0196fce 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2787,7 +2787,7 @@ static int nvme_dev_start(struct nvme_dev *dev) if (start_thread) { nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up(&nvme_kthread_wait); + wake_up_all(&nvme_kthread_wait); } else wait_event_killable(nvme_kthread_wait, nvme_thread); -- cgit v0.10.2 From 5940c8578fe720afbf4ef041bad0d72a101f1d88 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 4 Nov 2014 13:18:10 -0700 Subject: NVMe: Clear QUEUE_FLAG_STACKABLE The nvme namespace request_queue's flags are initialized to QUEUE_FLAG_DEFAULT, which currently sets QUEUE_FLAG_STACKABLE. The device-mapper indicates this flag means the block driver is requset based, though this driver is bio-based and problems will occur if an nvme namespace is used with a request based dm device. This patch clears the stackable flag. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 0196fce..8fffc68 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2030,6 +2030,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, if (!ns->queue) goto out_free_ns; ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue); -- cgit v0.10.2 From 9e60352cf83faaba57f99f6960b545687b8bbb20 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 3 Oct 2014 11:15:47 -0600 Subject: NVMe: Do not open disks that are being deleted It is possible the block layer will request to open a block device after the driver deleted it. Subsequent releases will cause a double free, or the disk's private_data is pointing to freed memory. This patch protects the driver's freed disks from being opened and accessed: the nvme namespaces are freed only when the device's refcount is 0, so at that moment there were no active openers and no more should be allowed, and it is safe to clear the disk's private_data that is about to be freed. Signed-off-by: Keith Busch Reported-by: Henry Chow Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8fffc68..fb21d36 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1832,11 +1832,18 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_open(struct block_device *bdev, fmode_t mode) { - struct nvme_ns *ns = bdev->bd_disk->private_data; - struct nvme_dev *dev = ns->dev; + int ret = 0; + struct nvme_ns *ns; - kref_get(&dev->kref); - return 0; + spin_lock(&dev_list_lock); + ns = bdev->bd_disk->private_data; + if (!ns) + ret = -ENXIO; + else if (!kref_get_unless_zero(&ns->dev->kref)) + ret = -ENXIO; + spin_unlock(&dev_list_lock); + + return ret; } static void nvme_free_dev(struct kref *kref); @@ -2711,6 +2718,11 @@ static void nvme_free_namespaces(struct nvme_dev *dev) list_for_each_entry_safe(ns, next, &dev->namespaces, list) { list_del(&ns->list); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + put_disk(ns->disk); kfree(ns); } -- cgit v0.10.2 From 9dbbfab7d54109626031bf3bc476fb1804113970 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 6 Oct 2014 15:23:06 -0600 Subject: NVMe: Do not over allocate for discard requests Discard requests are often for very large ranges. The discard size is not representative of the data transfer size so we don't need to allocate for such a large prp list. This patch requests allocating only enough for the memory needed for the data transfer and saves a little over 8k of memory per max discard request. Signed-off-by: Keith Busch Reported-by: Paul Grabinar Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index fb21d36..c70eff3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -763,11 +763,13 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct nvme_iod *iod; int psegs = bio_phys_segments(ns->queue, bio); int result; + unsigned size = !(bio->bi_rw & REQ_DISCARD) ? 
bio->bi_iter.bi_size : + sizeof(struct nvme_dsm_range); if ((bio->bi_rw & REQ_FLUSH) && psegs) return nvme_split_flush_data(nvmeq, bio); - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, ns->dev, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); if (!iod) return -ENOMEM; -- cgit v0.10.2 From a4aea5623d4a54682b6ff5c18196d7802f3e478f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 4 Nov 2014 08:20:14 -0700 Subject: NVMe: Convert to blk-mq This converts the NVMe driver to a blk-mq request-based driver. The NVMe driver is currently bio-based and implements queue logic within itself. By using blk-mq, a lot of these responsibilities can be moved and simplified. The patch is divided into the following blocks: * Per-command data and cmdid have been moved into the struct request field. The cmdid_data can be retrieved using blk_mq_rq_to_pdu() and id maintenance are now handled by blk-mq through the rq->tag field. * The logic for splitting bio's has been moved into the blk-mq layer. The driver instead notifies the block layer about limited gap support in SG lists. * blk-mq handles timeouts and is reimplemented within nvme_timeout(). This both includes abort handling and command cancelation. * Assignment of nvme queues to CPUs are replaced with the blk-mq version. The current blk-mq strategy is to assign the number of mapped queues and CPUs to provide synergy, while the nvme driver assign as many nvme hw queues as possible. This can be implemented in blk-mq if needed. * NVMe queues are merged with the tags structure of blk-mq. * blk-mq takes care of setup/teardown of nvme queues and guards invalid accesses. Therefore, RCU-usage for nvme queues can be removed. * IO tracing and accounting are handled by blk-mq and therefore removed. * Queue suspension logic is replaced with the logic from the block layer. Contributions in this patch from: Sam Bradshaw Jens Axboe Keith Busch Robert Nelson Acked-by: Keith Busch Acked-by: Jens Axboe Updated for new ->queue_rq() prototype. 
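In outline, the shape of the conversion (a sketch with illustrative field values, not the patch itself; the real cmd_size also reserves nvme_iod space behind the nvme_cmd_info):

	static struct blk_mq_ops nvme_mq_ops = {
		.queue_rq	= nvme_queue_rq,
		.map_queue	= blk_mq_map_queue,
		.init_hctx	= nvme_init_hctx,
		.init_request	= nvme_init_request,
		.timeout	= nvme_timeout,
	};

	static int nvme_register_tagset(struct nvme_dev *dev)
	{
		/* hypothetical helper: one hw context per IO queue, with
		 * per-command state carved out of each request as a PDU */
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.queue_depth = NVME_Q_DEPTH - 1;
		dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
		dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
		dev->tagset.driver_data = dev;
		return blk_mq_alloc_tag_set(&dev->tagset);
	}

Wherever a request is in hand, its per-command data is then just blk_mq_rq_to_pdu(req), which is what replaces the old cmdid_data bitmap and the nvme_cmd_info() array lookup in the hunks below.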
Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c70eff3..39050a3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -13,9 +13,9 @@ */ #include -#include #include #include +#include #include #include #include @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -42,9 +41,8 @@ #include #include -#include - #define NVME_Q_DEPTH 1024 +#define NVME_AQ_DEPTH 64 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) @@ -81,10 +79,12 @@ static wait_queue_head_t nvme_kthread_wait; static struct notifier_block nvme_nb; static void nvme_reset_failed_dev(struct work_struct *ws); +static int nvme_process_cq(struct nvme_queue *nvmeq); struct async_cmd_info { struct kthread_work work; struct kthread_worker *worker; + struct request *req; u32 result; int status; void *ctx; @@ -104,10 +104,6 @@ struct nvme_queue { volatile struct nvme_completion *cqes; dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; - wait_queue_head_t sq_full; - wait_queue_t sq_cong_wait; - struct bio_list sq_cong; - struct list_head iod_bio; u32 __iomem *q_db; u16 q_depth; u16 cq_vector; @@ -117,10 +113,8 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; - u8 q_suspended; - cpumask_var_t cpu_mask; struct async_cmd_info cmdinfo; - unsigned long cmdid_data[]; + struct blk_mq_hw_ctx *hctx; }; /* @@ -148,62 +142,72 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, struct nvme_cmd_info { nvme_completion_fn fn; void *ctx; - unsigned long timeout; int aborted; + struct nvme_queue *nvmeq; }; -static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[0]; + + WARN_ON(nvmeq->hctx); + nvmeq->hctx = hctx; + hctx->driver_data = nvmeq; + return 0; } -static unsigned nvme_queue_extra(int depth) +static int nvme_admin_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[0]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; } -/** - * alloc_cmdid() - Allocate a Command ID - * @nvmeq: The queue that will be used for this command - * @ctx: A pointer that will be passed to the handler - * @handler: The function to call on completion - * - * Allocate a Command ID for a queue. The data passed in will - * be passed to the completion handler. This is implemented by using - * the bottom two bits of the ctx pointer to store the handler ID. - * Passing in a pointer that's not 4-byte aligned will cause a BUG. - * We can change this if it becomes a problem. - * - * May be called with local interrupts disabled and the q_lock held, - * or with interrupts enabled and no locks held. 
- */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - int cmdid; + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[ + (hctx_idx % dev->queue_count) + 1]; - do { - cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); - if (cmdid >= depth) - return -EBUSY; - } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + if (!nvmeq->hctx) + nvmeq->hctx = hctx; + + /* nvmeq queues are shared between namespaces. We assume here that + * blk-mq map the tags so they match up with the nvme queue tags. */ + WARN_ON(nvmeq->hctx->tags != hctx->tags); - info[cmdid].fn = handler; - info[cmdid].ctx = ctx; - info[cmdid].timeout = jiffies + timeout; - info[cmdid].aborted = 0; - return cmdid; + hctx->driver_data = nvmeq; + return 0; } -static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static int nvme_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - int cmdid; - wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); - return (cmdid < 0) ? -EINTR : cmdid; + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; +} + +static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, + nvme_completion_fn handler) +{ + cmd->fn = handler; + cmd->ctx = ctx; + cmd->aborted = 0; } /* Special values must be less than 0x1000 */ @@ -211,18 +215,12 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) -#define CMD_CTX_ASYNC (0x31C + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) return; - if (ctx == CMD_CTX_ABORT) { - ++nvmeq->dev->abort_limit; - return; - } if (ctx == CMD_CTX_COMPLETED) { dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", @@ -235,110 +233,89 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - if (ctx == CMD_CTX_ASYNC) { - u32 result = le32_to_cpup(&cqe->result); - u16 status = le16_to_cpup(&cqe->status) >> 1; - - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->event_limit; - if (status == NVME_SC_SUCCESS) - dev_warn(nvmeq->q_dmadev, - "async event result %08x\n", result); - return; - } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
- */ -static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) { void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) { - if (fn) - *fn = special_completion; - return CMD_CTX_INVALID; - } if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_COMPLETED; - clear_bit(cmdid, nvmeq->cmdid_data); - wake_up(&nvmeq->sq_full); + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_CANCELLED; return ctx; } -static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_CANCELLED; - return ctx; -} + struct request *req = ctx; -static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) -{ - return rcu_dereference_raw(dev->queues[qid]); + u32 result = le32_to_cpup(&cqe->result); + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + ++nvmeq->dev->event_limit; + if (status == NVME_SC_SUCCESS) + dev_warn(nvmeq->q_dmadev, + "async event result %08x\n", result); + + blk_put_request(req); } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +static void abort_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - struct nvme_queue *nvmeq; - unsigned queue_id = get_cpu_var(*dev->io_queue); + struct request *req = ctx; + + u16 status = le16_to_cpup(&cqe->status) >> 1; + u32 result = le32_to_cpup(&cqe->result); - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[queue_id]); - if (nvmeq) - return nvmeq; + blk_put_request(req); - rcu_read_unlock(); - put_cpu_var(*dev->io_queue); - return NULL; + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + ++nvmeq->dev->abort_limit; } -static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +static void async_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - rcu_read_unlock(); - put_cpu_var(nvmeq->dev->io_queue); + struct async_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + queue_kthread_work(cmdinfo->worker, &cmdinfo->work); + blk_put_request(cmdinfo->req); } -static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) - __acquires(RCU) +static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, + unsigned int tag) { - struct nvme_queue *nvmeq; - - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[q_idx]); - if (nvmeq) - return nvmeq; + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); - rcu_read_unlock(); - return NULL; + return blk_mq_rq_to_pdu(req); } -static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. 
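Comments like this one mean the function runs with the queue spinlock held and local interrupts off, so it must not sleep; callers reachable from any context instead take the lock with the irq-saving variants, as the nvme_submit_cmd() wrapper below does. A generic sketch of that convention, with hypothetical names:

    #include <linux/spinlock.h>

    struct my_queue {
            spinlock_t lock;
            /* ... */
    };

    void do_op_locked(struct my_queue *q);  /* hypothetical; must not sleep */

    static void my_op(struct my_queue *q)
    {
            unsigned long flags;

            spin_lock_irqsave(&q->lock, flags);
            do_op_locked(q);                /* no sleeping allocations here */
            spin_unlock_irqrestore(&q->lock, flags);
    }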
+ */ +static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, + nvme_completion_fn *fn) { - rcu_read_unlock(); + struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); + void *ctx; + if (tag >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + if (fn) + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_COMPLETED; + return ctx; } /** @@ -348,26 +325,29 @@ static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) * * Safe to use from interrupt context */ -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { - unsigned long flags; - u16 tail; - spin_lock_irqsave(&nvmeq->q_lock, flags); - if (nvmeq->q_suspended) { - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - return -EBUSY; - } - tail = nvmeq->sq_tail; + u16 tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); if (++tail == nvmeq->q_depth) tail = 0; writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); return 0; } +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + int ret; + spin_lock_irqsave(&nvmeq->q_lock, flags); + ret = __nvme_submit_cmd(nvmeq, cmd); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + return ret; +} + static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; @@ -397,7 +377,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) iod->length = nbytes; iod->nents = 0; iod->first_dma = 0ULL; - iod->start_time = jiffies; } return iod; @@ -421,35 +400,6 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } -static void nvme_start_io_acct(struct bio *bio) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - int cpu = part_stat_lock(); - part_round_stats(cpu, &disk->part0); - part_stat_inc(cpu, &disk->part0, ios[rw]); - part_stat_add(cpu, &disk->part0, sectors[rw], - bio_sectors(bio)); - part_inc_in_flight(&disk->part0, rw); - part_stat_unlock(); - } -} - -static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - unsigned long duration = jiffies - start_time; - int cpu = part_stat_lock(); - part_stat_add(cpu, &disk->part0, ticks[rw], duration); - part_round_stats(cpu, &disk->part0); - part_dec_in_flight(&disk->part0, rw); - part_stat_unlock(); - } -} - static int nvme_error_status(u16 status) { switch (status & 0x7ff) { @@ -462,36 +412,37 @@ static int nvme_error_status(u16 status) } } -static void bio_completion(struct nvme_queue *nvmeq, void *ctx, +static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct bio *bio = iod->private; + struct request *req = iod->private; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + u16 status = le16_to_cpup(&cqe->status) >> 1; - int error = 0; if (unlikely(status)) { - if (!(status & NVME_SC_DNR || - bio->bi_rw & REQ_FAILFAST_MASK) && - (jiffies - iod->start_time) < IOD_TIMEOUT) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); - wake_up(&nvmeq->sq_full); + if (!(status & NVME_SC_DNR || 
blk_noretry_request(req)) + && (jiffies - req->start_time) < req->timeout) { + blk_mq_requeue_request(req); + blk_mq_kick_requeue_list(req->q); return; } - error = nvme_error_status(status); - } - if (iod->nents) { - dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - nvme_end_io_acct(bio, iod->start_time); - } + req->errors = nvme_error_status(status); + } else + req->errors = 0; + + if (cmd_rq->aborted) + dev_warn(&nvmeq->dev->pci_dev->dev, + "completing aborted command with status:%04x\n", + status); + + if (iod->nents) + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(nvmeq->dev, iod); - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); - bio_endio(bio, error); + blk_mq_complete_request(req); } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -574,88 +525,25 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, return total_len; } -static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, - int len) -{ - struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); - if (!split) - return -ENOMEM; - - trace_block_split(bdev_get_queue(bio->bi_bdev), bio, - split->bi_iter.bi_sector); - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up(&nvmeq->sq_full); - - return 0; -} - -/* NVMe scatterlists require no holes in the virtual address */ -#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ - (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) - -static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct bio *bio, enum dma_data_direction dma_dir, int psegs) -{ - struct bio_vec bvec, bvprv; - struct bvec_iter iter; - struct scatterlist *sg = NULL; - int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; - int first = 1; - - if (nvmeq->dev->stripe_size) - split_len = nvmeq->dev->stripe_size - - ((bio->bi_iter.bi_sector << 9) & - (nvmeq->dev->stripe_size - 1)); - - sg_init_table(iod->sg, psegs); - bio_for_each_segment(bvec, bio, iter) { - if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) { - sg->length += bvec.bv_len; - } else { - if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec)) - return nvme_split_and_submit(bio, nvmeq, - length); - - sg = sg ? sg + 1 : iod->sg; - sg_set_page(sg, bvec.bv_page, - bvec.bv_len, bvec.bv_offset); - nsegs++; - } - - if (split_len - length < bvec.bv_len) - return nvme_split_and_submit(bio, nvmeq, split_len); - length += bvec.bv_len; - bvprv = bvec; - first = 0; - } - iod->nents = nsegs; - sg_mark_end(sg); - if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) - return -ENOMEM; - - BUG_ON(length != bio->bi_iter.bi_size); - return length; -} - -static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio, struct nvme_iod *iod, int cmdid) +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. 
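The 16 bytes mentioned here are the size of one NVMe Dataset Management range descriptor, which the driver's uapi header lays out as three little-endian fields:

    #include <linux/types.h>

    struct nvme_dsm_range {
            __le32  cattr;          /* context attributes */
            __le32  nlb;            /* number of logical blocks */
            __le64  slba;           /* starting LBA */
    };
    /* 4 + 4 + 8 = 16 bytes, small enough for the driver's "small" DMA pool */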
+ */ +static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_iod *iod) { struct nvme_dsm_range *range = (struct nvme_dsm_range *)iod_list(iod)[0]; struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.command_id = cmdid; + cmnd->dsm.command_id = req->tag; cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); cmnd->dsm.nr = 0; @@ -664,11 +552,9 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, +static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; @@ -681,49 +567,34 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) +static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, + struct nvme_ns *ns) { - struct bio *bio = iod->private; - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + struct request *req = iod->private; struct nvme_command *cmnd; - int cmdid; - u16 control; - u32 dsmgmt; + u16 control = 0; + u32 dsmgmt = 0; - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; - - if (bio->bi_rw & REQ_DISCARD) - return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if (bio->bi_rw & REQ_FLUSH) - return nvme_submit_flush(nvmeq, ns, cmdid); - - control = 0; - if (bio->bi_rw & REQ_FUA) + if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; - if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) control |= NVME_RW_LR; - dsmgmt = 0; - if (bio->bi_rw & REQ_RAHEAD) + if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; memset(cmnd, 0, sizeof(*cmnd)); - cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; - cmnd->rw.command_id = cmdid; + cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); - cmnd->rw.length = - cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -734,47 +605,37 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) return 0; } -static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) -{ - struct bio *split = bio_clone(bio, GFP_ATOMIC); - if (!split) - return -ENOMEM; - - split->bi_iter.bi_size = 0; - split->bi_phys_segments = 0; - bio->bi_rw &= ~REQ_FLUSH; - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up_process(nvme_thread); - - return 0; -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_queue *nvmeq = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = bio_phys_segments(ns->queue, bio); - int result; - unsigned size = !(bio->bi_rw & REQ_DISCARD) ? bio->bi_iter.bi_size : + int psegs = req->nr_phys_segments; + int result = BLK_MQ_RQ_QUEUE_BUSY; + enum dma_data_direction dma_dir; + unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) : sizeof(struct nvme_dsm_range); - if ((bio->bi_rw & REQ_FLUSH) && psegs) - return nvme_split_flush_data(nvmeq, bio); + /* + * Requeued IO has already been prepped + */ + iod = req->special; + if (iod) + goto submit_iod; iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); if (!iod) - return -ENOMEM; + return result; + + iod->private = req; + req->special = iod; - iod->private = bio; - if (bio->bi_rw & REQ_DISCARD) { + nvme_set_info(cmd, iod, req_completion); + + if (req->cmd_flags & REQ_DISCARD) { void *range; /* * We reuse the small pool to allocate the 16-byte range here @@ -784,33 +645,45 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, &iod->first_dma); - if (!range) { - result = -ENOMEM; - goto free_iod; - } + if (!range) + goto finish_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; } else if (psegs) { - result = nvme_map_bio(nvmeq, iod, bio, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - psegs); - if (result <= 0) - goto free_iod; - if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != - result) { - result = -ENOMEM; - goto free_iod; + dma_dir = rq_data_dir(req) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; + + sg_init_table(iod->sg, psegs); + iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + if (!iod->nents) { + result = BLK_MQ_RQ_QUEUE_ERROR; + goto finish_cmd; } - nvme_start_io_acct(bio); - } - if (unlikely(nvme_submit_iod(nvmeq, iod))) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); + + if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) + goto finish_cmd; + + if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod, + blk_rq_bytes(req), GFP_ATOMIC)) + goto finish_cmd; } - return 0; - free_iod: + blk_mq_start_request(req); + + submit_iod: + spin_lock_irq(&nvmeq->q_lock); + if (req->cmd_flags & REQ_DISCARD) + nvme_submit_discard(nvmeq, ns, req, iod); + else if (req->cmd_flags & REQ_FLUSH) + nvme_submit_flush(nvmeq, ns, req->tag); + else + nvme_submit_iod(nvmeq, iod, ns); + + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; + + finish_cmd: + nvme_finish_cmd(nvmeq, req->tag, NULL); nvme_free_iod(nvmeq->dev, iod); return result; } @@ -833,8 +706,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) head = 0; phase = !phase; } - - ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); fn(nvmeq, ctx, &cqe); } @@ -855,29 +727,13 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) return 1; } -static void nvme_make_request(struct request_queue *q, struct bio *bio) +/* Admin queue isn't initialized as a request queue. If at some point this + * happens anyway, make sure to notify the user */ +static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - if (!nvmeq) { - bio_endio(bio, -EIO); - return; - } - - spin_lock_irq(&nvmeq->q_lock); - if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); + WARN_ON_ONCE(1); + return BLK_MQ_RQ_QUEUE_ERROR; } static irqreturn_t nvme_irq(int irq, void *data) @@ -901,10 +757,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info * + cmd_info) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid, NULL); + cancel_cmd_info(cmd_info, NULL); spin_unlock_irq(&nvmeq->q_lock); } @@ -927,45 +784,31 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, - struct nvme_command *cmd, +static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, u32 *result, unsigned timeout) { - int cmdid, ret; + int ret; struct sync_cmd_info cmdinfo; - struct nvme_queue *nvmeq; - - nvmeq = lock_nvmeq(dev, q_idx); - if (!nvmeq) - return -ENODEV; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); - if (cmdid < 0) { - unlock_nvmeq(nvmeq); - return cmdid; - } - cmd->common.command_id = cmdid; + cmd->common.command_id = req->tag; + + nvme_set_info(cmd_rq, &cmdinfo, sync_completion); set_current_state(TASK_KILLABLE); ret = nvme_submit_cmd(nvmeq, cmd); if (ret) { - free_cmdid(nvmeq, cmdid, NULL); - unlock_nvmeq(nvmeq); + nvme_finish_cmd(nvmeq, req->tag, NULL); set_current_state(TASK_RUNNING); - return ret; } - unlock_nvmeq(nvmeq); schedule_timeout(timeout); if (cmdinfo.status == -EINTR) { - nvmeq = lock_nvmeq(dev, q_idx); - if (nvmeq) { - nvme_abort_command(nvmeq, cmdid); - unlock_nvmeq(nvmeq); - } + nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); return -EINTR; } @@ -975,59 +818,99 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, return cmdinfo.status; } -static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, +static int nvme_submit_async_admin_req(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_command c; + struct nvme_cmd_info *cmd_info; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + + cmd_info = blk_mq_rq_to_pdu(req); + nvme_set_info(cmd_info, req, async_req_completion); + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_async_event; + c.common.command_id = req->tag; + + return __nvme_submit_cmd(nvmeq, &c); +} + +static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct nvme_command *cmd, struct async_cmd_info *cmdinfo, unsigned timeout) { - int cmdid; + struct nvme_queue *nvmeq = dev->queues[0]; + struct request *req; + struct nvme_cmd_info *cmd_rq; - cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout); - if (cmdid < 0) - return cmdid; + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + + req->timeout = timeout; + cmd_rq = blk_mq_rq_to_pdu(req); + cmdinfo->req = req; + nvme_set_info(cmd_rq, cmdinfo, async_completion); cmdinfo->status = -EINTR; - cmd->common.command_id = cmdid; + + cmd->common.command_id = req->tag; + return nvme_submit_cmd(nvmeq, cmd); } -int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) +int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result, unsigned timeout) { - return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + res = nvme_submit_sync_cmd(req, cmd, result, timeout); + blk_put_request(req); + return res; } -int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), cmd, 
- result, NVME_IO_TIMEOUT); + return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); } -static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, - struct nvme_command *cmd, struct async_cmd_info *cmdinfo) +int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_command *cmd, u32 *result) { - return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, - ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), + false); + if (!req) + return -ENOMEM; + res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); + blk_put_request(req); + return res; } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { - int status; struct nvme_command c; memset(&c, 0, sizeof(c)); c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; @@ -1039,16 +922,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; @@ -1060,10 +939,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1119,28 +995,27 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, } /** - * nvme_abort_cmd - Attempt aborting a command - * @cmdid: Command id of a timed out IO - * @queue: The queue with timed out IO + * nvme_abort_req - Attempt aborting a request * * Schedule controller reset if the command was already aborted once before and * still hasn't been returned to the driver, or if this is the admin queue. 
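In outline, the policy this comment describes escalates in two steps; the sketch below is a paraphrase with toy stand-ins (struct nvme_dev is reduced to one field, and schedule_controller_reset() and send_admin_abort() are hypothetical names for the real reset and abort-submission paths):

    /* Toy stand-ins for the driver's real types and paths. */
    struct nvme_dev { int abort_limit; };
    struct request;
    void schedule_controller_reset(struct nvme_dev *dev);
    void send_admin_abort(struct nvme_dev *dev, struct request *req);

    static void abort_policy(struct nvme_dev *dev, struct request *req,
                             bool admin_queue, bool already_aborted)
    {
            if (admin_queue || already_aborted) {
                    /* second timeout, or an admin command: reset controller */
                    schedule_controller_reset(dev);
                    return;
            }
            if (dev->abort_limit) {
                    --dev->abort_limit;
                    send_admin_abort(dev, req);     /* first try: NVMe Abort */
            }
    }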
*/ -static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) +static void nvme_abort_req(struct request *req) { - int a_cmdid; - struct nvme_command cmd; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; struct nvme_dev *dev = nvmeq->dev; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - struct nvme_queue *adminq; + struct request *abort_req; + struct nvme_cmd_info *abort_cmd; + struct nvme_command cmd; - if (!nvmeq->qid || info[cmdid].aborted) { + if (!nvmeq->qid || cmd_rq->aborted) { if (work_busy(&dev->reset_work)) return; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "I/O %d QID %d timeout, reset controller\n", cmdid, - nvmeq->qid); + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); return; @@ -1149,89 +1024,79 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) if (!dev->abort_limit) return; - adminq = rcu_dereference(dev->queues[0]); - a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, - ADMIN_TIMEOUT); - if (a_cmdid < 0) + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, + false); + if (!abort_req) return; + abort_cmd = blk_mq_rq_to_pdu(abort_req); + nvme_set_info(abort_cmd, abort_req, abort_completion); + memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = cmdid; + cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = a_cmdid; + cmd.abort.command_id = abort_req->tag; --dev->abort_limit; - info[cmdid].aborted = 1; - info[cmdid].timeout = jiffies + ADMIN_TIMEOUT; + cmd_rq->aborted = 1; - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, + dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, nvmeq->qid); - nvme_submit_cmd(adminq, &cmd); + if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) { + dev_warn(nvmeq->q_dmadev, + "Could not abort I/O %d QID %d", + req->tag, nvmeq->qid); + blk_put_request(req); + } } -/** - * nvme_cancel_ios - Cancel outstanding I/Os - * @queue: The queue to cancel I/Os on - * @timeout: True to only cancel I/Os which have timed out - */ -static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, + struct request *req, void *data, bool reserved) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; + struct nvme_queue *nvmeq = data; + void *ctx; + nvme_completion_fn fn; + struct nvme_cmd_info *cmd; + static struct nvme_completion cqe = { + .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), + }; - for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), - }; + cmd = blk_mq_rq_to_pdu(req); - if (timeout && !time_after(now, info[cmdid].timeout)) - continue; - if (info[cmdid].ctx == CMD_CTX_CANCELLED) - continue; - if (timeout && info[cmdid].ctx == CMD_CTX_ASYNC) - continue; - if (timeout && nvmeq->dev->initialized) { - nvme_abort_cmd(cmdid, nvmeq); - continue; - } - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, - nvmeq->qid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq, ctx, &cqe); - } + if (cmd->ctx == CMD_CTX_CANCELLED) + return; + + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", + req->tag, nvmeq->qid); + ctx = cancel_cmd_info(cmd, &fn); + fn(nvmeq, 
ctx, &cqe); } -static void nvme_free_queue(struct nvme_queue *nvmeq) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - spin_lock_irq(&nvmeq->q_lock); - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - bio_endio(bio, -EIO); - } - while (!list_empty(&nvmeq->iod_bio)) { - static struct nvme_completion cqe = { - .status = cpu_to_le16( - (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), - }; - struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, - struct nvme_iod, - node); - list_del(&iod->node); - bio_completion(nvmeq, iod, &cqe); - } - spin_unlock_irq(&nvmeq->q_lock); + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd->nvmeq; + + dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, + nvmeq->qid); + if (nvmeq->dev->initialized) + nvme_abort_req(req); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; +} +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - if (nvmeq->qid) - free_cpumask_var(nvmeq->cpu_mask); kfree(nvmeq); } @@ -1243,10 +1108,10 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) int i; for (i = dev->queue_count - 1; i >= lowest; i--) { - nvmeq = raw_nvmeq(dev, i); - RCU_INIT_POINTER(dev->queues[i], NULL); + struct nvme_queue *nvmeq = dev->queues[i]; llist_add(&nvmeq->node, &q_list); dev->queue_count--; + dev->queues[i] = NULL; } synchronize_rcu(); entry = llist_del_all(&q_list); @@ -1257,19 +1122,12 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) /** * nvme_suspend_queue - put queue into suspended state * @nvmeq - queue to suspend - * - * Returns 1 if already suspended, 0 otherwise. 
*/ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) { - spin_unlock_irq(&nvmeq->q_lock); - return 1; - } - nvmeq->q_suspended = 1; nvmeq->dev->online_queues--; spin_unlock_irq(&nvmeq->q_lock); @@ -1281,15 +1139,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_clear_queue(struct nvme_queue *nvmeq) { + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, false); + if (hctx && hctx->tags) + blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); } static void nvme_disable_queue(struct nvme_dev *dev, int qid) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); + struct nvme_queue *nvmeq = dev->queues[qid]; if (!nvmeq) return; @@ -1309,8 +1170,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = nvme_queue_extra(depth); - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); if (!nvmeq) return NULL; @@ -1324,9 +1184,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq->sq_cmds) goto free_cqdma; - if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL)) - goto free_sqdma; - nvmeq->q_dmadev = dmadev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", @@ -1334,22 +1191,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; - init_waitqueue_head(&nvmeq->sq_full); - bio_list_init(&nvmeq->sq_cong); - INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; nvmeq->qid = qid; - nvmeq->q_suspended = 1; dev->queue_count++; - rcu_assign_pointer(dev->queues[qid], nvmeq); + dev->queues[qid] = nvmeq; return nvmeq; - free_sqdma: - dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds, - nvmeq->sq_dma_addr); free_cqdma: dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); @@ -1372,18 +1222,13 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - unsigned extra = nvme_queue_extra(nvmeq->q_depth); spin_lock_irq(&nvmeq->q_lock); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset(nvmeq->cmdid_data, 0, extra); memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); - nvme_cancel_ios(nvmeq, false); - nvmeq->q_suspended = 0; dev->online_queues++; spin_unlock_irq(&nvmeq->q_lock); } @@ -1486,6 +1331,52 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) return 0; } +static struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_admin_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_admin_init_hctx, + .init_request = nvme_admin_init_request, + .timeout = nvme_timeout, +}; + +static struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, +}; + +static int nvme_alloc_admin_tags(struct 
nvme_dev *dev) +{ + if (!dev->admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; + dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + + dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (!dev->admin_q) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + } + + return 0; +} + +static void nvme_free_admin_tags(struct nvme_dev *dev) +{ + if (dev->admin_q) + blk_mq_free_tag_set(&dev->admin_tagset); +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; @@ -1515,9 +1406,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = raw_nvmeq(dev, 0); + nvmeq = dev->queues[0]; if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0); if (!nvmeq) return -ENOMEM; } @@ -1538,13 +1429,23 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) result = nvme_enable_ctrl(dev, cap); if (result) - return result; + goto free_nvmeq; + + result = nvme_alloc_admin_tags(dev); + if (result) + goto free_nvmeq; result = queue_request_irq(dev, nvmeq, nvmeq->irqname); if (result) - return result; + goto free_tags; return result; + + free_tags: + nvme_free_admin_tags(dev); + free_nvmeq: + nvme_free_queues(dev, 0); + return result; } struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, @@ -1702,7 +1603,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; else - status = nvme_submit_io_cmd(dev, &c, NULL); + status = nvme_submit_io_cmd(dev, ns, &c, NULL); if (meta_len) { if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { @@ -1734,8 +1635,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_cmd(struct nvme_dev *dev, - struct nvme_passthru_cmd __user *ucmd, bool ioq) +static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -1774,13 +1675,23 @@ static int nvme_user_cmd(struct nvme_dev *dev, timeout = cmd.timeout_ms ? 
msecs_to_jiffies(cmd.timeout_ms) : ADMIN_TIMEOUT; + if (length != cmd.data_len) status = -ENOMEM; - else if (ioq) - status = nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), &c, - &cmd.result, timeout); - else - status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); + else if (ns) { + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, + (GFP_KERNEL|__GFP_WAIT), false); + if (!req) + status = -ENOMEM; + else { + status = nvme_submit_sync_cmd(req, &c, &cmd.result, + timeout); + blk_put_request(req); + } + } else + status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); @@ -1804,9 +1715,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->dev, (void __user *)arg, false); + return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->dev, (void __user *)arg, true); + return nvme_user_cmd(ns->dev, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -1906,62 +1817,6 @@ static const struct block_device_operations nvme_fops = { .revalidate_disk= nvme_revalidate_disk, }; -static void nvme_resubmit_iods(struct nvme_queue *nvmeq) -{ - struct nvme_iod *iod, *next; - - list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) { - if (unlikely(nvme_submit_iod(nvmeq, iod))) - break; - list_del(&iod->node); - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - } -} - -static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -{ - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - bio_list_add_head(&nvmeq->sq_cong, bio); - break; - } - } -} - -static int nvme_submit_async_req(struct nvme_queue *nvmeq) -{ - struct nvme_command *c; - int cmdid; - - cmdid = alloc_cmdid(nvmeq, CMD_CTX_ASYNC, special_completion, 0); - if (cmdid < 0) - return cmdid; - - c = &nvmeq->sq_cmds[nvmeq->sq_tail]; - memset(c, 0, sizeof(*c)); - c->common.opcode = nvme_admin_async_event; - c->common.command_id = cmdid; - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; -} - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -1977,34 +1832,26 @@ static int nvme_kthread(void *data) continue; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "Failed status, reset controller\n"); + "Failed status: %x, reset controller\n", + readl(&dev->bar->csts)); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); continue; } - rcu_read_lock(); for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = - rcu_dereference(dev->queues[i]); + struct nvme_queue *nvmeq = dev->queues[i]; if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) - goto unlock; nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, true); - nvme_resubmit_bios(nvmeq); - nvme_resubmit_iods(nvmeq); while 
((i == 0) && (dev->event_limit > 0)) { - if (nvme_submit_async_req(nvmeq)) + if (nvme_submit_async_admin_req(dev)) break; dev->event_limit--; } - unlock: spin_unlock_irq(&nvmeq->q_lock); } - rcu_read_unlock(); } spin_unlock(&dev_list_lock); schedule_timeout(round_jiffies_relative(HZ)); @@ -2027,29 +1874,29 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, { struct nvme_ns *ns; struct gendisk *disk; + int node = dev_to_node(&dev->pci_dev->dev); int lbaf; if (rt->attributes & NVME_LBART_ATTRIB_HIDE) return NULL; - ns = kzalloc(sizeof(*ns), GFP_KERNEL); + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return NULL; - ns->queue = blk_alloc_queue(GFP_KERNEL); + ns->queue = blk_mq_init_queue(&dev->tagset); if (!ns->queue) goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue); - blk_queue_make_request(ns->queue, nvme_make_request); + queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue); ns->dev = dev; ns->queue->queuedata = ns; - disk = alloc_disk(0); + disk = alloc_disk_node(0, node); if (!disk) goto out_free_queue; + ns->ns_id = nsid; ns->disk = disk; lbaf = id->flbas & 0xf; @@ -2058,6 +1905,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->stripe_size) + blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); if (dev->vwc & NVME_CTRL_VWC_PRESENT) blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); @@ -2083,143 +1932,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, return NULL; } -static int nvme_find_closest_node(int node) -{ - int n, val, min_val = INT_MAX, best_node = node; - - for_each_online_node(n) { - if (n == node) - continue; - val = node_distance(node, n); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - return best_node; -} - -static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq, - int count) -{ - int cpu; - for_each_cpu(cpu, qmask) { - if (cpumask_weight(nvmeq->cpu_mask) >= count) - break; - if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask)) - *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid; - } -} - -static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus, - const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue) -{ - int next_cpu; - for_each_cpu(next_cpu, new_mask) { - cpumask_or(mask, mask, get_cpu_mask(next_cpu)); - cpumask_or(mask, mask, topology_thread_cpumask(next_cpu)); - cpumask_and(mask, mask, unassigned_cpus); - nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue); - } -} - static void nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; + unsigned i; - max = min(dev->max_qid, num_online_cpus()); - for (i = dev->queue_count; i <= max; i++) + for (i = dev->queue_count; i <= dev->max_qid; i++) if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) break; - max = min(dev->queue_count - 1, num_online_cpus()); - for (i = dev->online_queues; i <= max; i++) - if (nvme_create_queue(raw_nvmeq(dev, i), i)) + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) + if (nvme_create_queue(dev->queues[i], i)) break; } -/* - * If there are 
fewer queues than online cpus, this will try to optimally - * assign a queue to multiple cpus by grouping cpus that are "close" together: - * thread siblings, core, socket, closest node, then whatever else is - * available. - */ -static void nvme_assign_io_queues(struct nvme_dev *dev) -{ - unsigned cpu, cpus_per_queue, queues, remainder, i; - cpumask_var_t unassigned_cpus; - - nvme_create_io_queues(dev); - - queues = min(dev->online_queues - 1, num_online_cpus()); - if (!queues) - return; - - cpus_per_queue = num_online_cpus() / queues; - remainder = queues - (num_online_cpus() - queues * cpus_per_queue); - - if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) - return; - - cpumask_copy(unassigned_cpus, cpu_online_mask); - cpu = cpumask_first(unassigned_cpus); - for (i = 1; i <= queues; i++) { - struct nvme_queue *nvmeq = lock_nvmeq(dev, i); - cpumask_t mask; - - cpumask_clear(nvmeq->cpu_mask); - if (!cpumask_weight(unassigned_cpus)) { - unlock_nvmeq(nvmeq); - break; - } - - mask = *get_cpu_mask(cpu); - nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_thread_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_core_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node(cpu_to_node(cpu)), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node( - nvme_find_closest_node( - cpu_to_node(cpu))), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - unassigned_cpus, - nvmeq, cpus_per_queue); - - WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue, - "nvme%d qid:%d mis-matched queue-to-cpu assignment\n", - dev->instance, i); - - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - nvmeq->cpu_mask); - cpumask_andnot(unassigned_cpus, unassigned_cpus, - nvmeq->cpu_mask); - cpu = cpumask_next(cpu, unassigned_cpus); - if (remainder && !--remainder) - cpus_per_queue++; - unlock_nvmeq(nvmeq); - } - WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n", - dev->instance); - i = 0; - cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask); - for_each_cpu(cpu, unassigned_cpus) - *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1; - free_cpumask_var(unassigned_cpus); -} - static int set_queue_count(struct nvme_dev *dev, int count) { int status; @@ -2243,33 +1968,9 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } -static void nvme_cpu_workfn(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work); - if (dev->initialized) - nvme_assign_io_queues(dev); -} - -static int nvme_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct nvme_dev *dev; - - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) - schedule_work(&dev->cpu_work); - spin_unlock(&dev_list_lock); - break; - } - return NOTIFY_OK; -} - static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = raw_nvmeq(dev, 0); + struct nvme_queue *adminq = dev->queues[0]; struct pci_dev *pdev = dev->pci_dev; int result, i, vecs, nr_io_queues, size; @@ -2321,14 +2022,12 @@ static int 
nvme_setup_io_queues(struct nvme_dev *dev) dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); - if (result) { - adminq->q_suspended = 1; + if (result) goto free_queues; - } /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_assign_io_queues(dev); + nvme_create_io_queues(dev); return 0; @@ -2378,8 +2077,30 @@ static int nvme_dev_add(struct nvme_dev *dev) if (ctrl->mdts) dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) + (pdev->device == 0x0953) && ctrl->vs[3]) { + unsigned int max_hw_sectors; + dev->stripe_size = 1 << (ctrl->vs[3] + shift); + max_hw_sectors = dev->stripe_size >> (shift - 9); + if (dev->max_hw_sectors) { + dev->max_hw_sectors = min(max_hw_sectors, + dev->max_hw_sectors); + } else + dev->max_hw_sectors = max_hw_sectors; + } + + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->tagset.queue_depth = + min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->tagset)) + goto out; id_ns = mem; for (i = 1; i <= nn; i++) { @@ -2529,7 +2250,8 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, c.delete_queue.qid = cpu_to_le16(nvmeq->qid); init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo); + return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, + ADMIN_TIMEOUT); } static void nvme_del_cq_work_handler(struct kthread_work *work) @@ -2592,7 +2314,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) atomic_set(&dq.refcount, 0); dq.worker = &worker; for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; if (nvme_suspend_queue(nvmeq)) continue; @@ -2637,7 +2359,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) csts = readl(&dev->bar->csts); if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; nvme_suspend_queue(nvmeq); nvme_clear_queue(nvmeq); } @@ -2649,6 +2371,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_dev_unmap(dev); } +static void nvme_dev_remove_admin(struct nvme_dev *dev) +{ + if (dev->admin_q && !blk_queue_dying(dev->admin_q)) + blk_cleanup_queue(dev->admin_q); +} + static void nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns; @@ -2736,7 +2464,7 @@ static void nvme_free_dev(struct kref *kref) pci_dev_put(dev->pci_dev); nvme_free_namespaces(dev); - free_percpu(dev->io_queue); + blk_mq_free_tag_set(&dev->tagset); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2761,11 +2489,16 @@ static int nvme_dev_release(struct inode *inode, struct file *f) static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { struct nvme_dev *dev = f->private_data; + struct nvme_ns *ns; + switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(dev, (void __user *)arg, false); + return nvme_user_cmd(dev, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(dev, (void __user *)arg, true); + if 
(list_empty(&dev->namespaces)) + return -ENOTTY; + ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); + return nvme_user_cmd(dev, ns, (void __user *)arg); default: return -ENOTTY; } @@ -2779,6 +2512,22 @@ static const struct file_operations nvme_dev_fops = { .compat_ioctl = nvme_dev_ioctl, }; +static void nvme_set_irq_hints(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq; + int i; + + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; + + if (!nvmeq->hctx) + continue; + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + nvmeq->hctx->cpumask); + } +} + static int nvme_dev_start(struct nvme_dev *dev) { int result; @@ -2810,12 +2559,15 @@ static int nvme_dev_start(struct nvme_dev *dev) result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; goto disable; } - nvme_init_queue(raw_nvmeq(dev, 0), 0); + + nvme_init_queue(dev->queues[0], 0); result = nvme_setup_io_queues(dev); if (result) goto disable; + nvme_set_irq_hints(dev); + return result; disable: @@ -2866,7 +2618,7 @@ static void nvme_dev_reset(struct nvme_dev *dev) { nvme_dev_shutdown(dev); if (nvme_dev_resume(dev)) { - dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); + dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); kref_get(&dev->kref); if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", dev->instance))) { @@ -2891,28 +2643,28 @@ static void nvme_reset_workfn(struct work_struct *work) static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int result = -ENOMEM; + int node, result = -ENOMEM; struct nvme_dev *dev; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, 0); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), - GFP_KERNEL); + dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), + GFP_KERNEL, node); if (!dev->entry) goto free; - dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), - GFP_KERNEL); + dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), + GFP_KERNEL, node); if (!dev->queues) goto free; - dev->io_queue = alloc_percpu(unsigned short); - if (!dev->io_queue) - goto free; INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); - INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); dev->pci_dev = pci_dev_get(pdev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); @@ -2942,11 +2694,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto remove; + nvme_set_irq_hints(dev); + dev->initialized = 1; return 0; remove: nvme_dev_remove(dev); + nvme_dev_remove_admin(dev); nvme_free_namespaces(dev); shutdown: nvme_dev_shutdown(dev); @@ -2958,7 +2713,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) put_pci: pci_dev_put(dev->pci_dev); free: - free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2991,11 +2745,12 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); - flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); + nvme_dev_remove(dev); nvme_dev_shutdown(dev); + nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); - nvme_dev_remove(dev); + nvme_free_admin_tags(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); @@ -3079,18 +2834,11 @@ 
static int __init nvme_init(void) else if (result > 0) nvme_major = result; - nvme_nb.notifier_call = &nvme_cpu_notify; - result = register_hotcpu_notifier(&nvme_nb); - if (result) - goto unregister_blkdev; - result = pci_register_driver(&nvme_driver); if (result) - goto unregister_hotcpu; + goto unregister_blkdev; return 0; - unregister_hotcpu: - unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 046ae33..49f86d1 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2105,7 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_offset += unit_num_blocks; - nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); if (nvme_sc != NVME_SC_SUCCESS) { nvme_unmap_user_pages(dev, (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, @@ -2658,7 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out; @@ -2686,7 +2686,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -2894,7 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.dsm.nr = cpu_to_le32(ndesc - 1); c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ed09074..258945f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -19,6 +19,7 @@ #include #include #include +#include struct nvme_bar { __u64 cap; /* Controller Capabilities */ @@ -71,8 +72,10 @@ extern unsigned char nvme_io_timeout; */ struct nvme_dev { struct list_head node; - struct nvme_queue __rcu **queues; - unsigned short __percpu *io_queue; + struct nvme_queue **queues; + struct request_queue *admin_q; + struct blk_mq_tag_set tagset; + struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; struct pci_dev *pci_dev; struct dma_pool *prp_page_pool; @@ -91,7 +94,6 @@ struct nvme_dev { struct miscdevice miscdev; work_func_t reset_workfn; struct work_struct reset_work; - struct work_struct cpu_work; char name[12]; char serial[20]; char model[40]; @@ -135,7 +137,6 @@ struct nvme_iod { int offset; /* Of PRP list */ int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ - unsigned long start_time; dma_addr_t first_dma; struct list_head node; struct scatterlist sg[0]; @@ -153,12 +154,14 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) */ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod); -int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int , gfp_t); +int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t); struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length); void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod 
*iod); -int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *); +int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *, + struct nvme_command *, u32 *); +int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, u32 *result); int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, -- cgit v0.10.2 From 9f173b33843552c48e0136e02e7362c8229c8e57 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Nov 2014 23:39:09 +0300 Subject: NVMe: blk_mq_alloc_request() returns error pointers We recently converted this to blk_mq but the error checks have to be updated to check for IS_ERR() instead of NULL. Fixes: a4aea5623d4a ('NVMe: Convert to blk-mq') Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 39050a3..f7a8717 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -826,8 +826,8 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) struct request *req; req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); cmd_info = blk_mq_rq_to_pdu(req); nvme_set_info(cmd_info, req, async_req_completion); @@ -848,8 +848,8 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct nvme_cmd_info *cmd_rq; req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); req->timeout = timeout; cmd_rq = blk_mq_rq_to_pdu(req); @@ -1026,7 +1026,7 @@ static void nvme_abort_req(struct request *req) abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, false); - if (!abort_req) + if (IS_ERR(abort_req)) return; abort_cmd = blk_mq_rq_to_pdu(abort_req); @@ -1884,7 +1884,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, if (!ns) return NULL; ns->queue = blk_mq_init_queue(&dev->tagset); - if (!ns->queue) + if (IS_ERR(ns->queue)) goto out_free_ns; queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); -- cgit v0.10.2 From a64e6bb4db601b561f2d3a80dfc554ddfe73db74 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Wed, 5 Nov 2014 18:47:07 +0800 Subject: NVMe: __nvme_submit_admin_cmd() can be static drivers/block/nvme-core.c:865:5: sparse: symbol '__nvme_submit_admin_cmd' was not declared. Should it be static? Signed-off-by: Fengguang Wu Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f7a8717..8393f91 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -862,7 +862,7 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, return nvme_submit_cmd(nvmeq, cmd); } -int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result, unsigned timeout) { int res; -- cgit v0.10.2 From 179e20b8df97e0c7541a1bae30cad53ecc7a5e86 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 10 Nov 2014 17:21:09 +0100 Subject: drbd: Minor cleanups . Update comments . drbd_set_{in,out_of}_sync(): Remove unused parameters . Move common code into adm_del_resource() . 
Redefine ERR_MINOR_EXISTS -> ERR_MINOR_OR_VOLUME_EXISTS Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index a2dfa16..1318e32 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -827,8 +827,7 @@ static int update_sync_bits(struct drbd_device *device, * */ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, - enum update_sync_bits_mode mode, - const char *file, const unsigned int line) + enum update_sync_bits_mode mode) { /* Is called from worker and receiver context _only_ */ unsigned long sbnr, ebnr, lbnr; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 9b22f8f..c14b718 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1662,14 +1662,13 @@ extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long stil enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, - enum update_sync_bits_mode mode, - const char *file, const unsigned int line); + enum update_sync_bits_mode mode); #define drbd_set_in_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, SET_IN_SYNC) #define drbd_set_out_of_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC) #define drbd_rs_failed_io(device, sector, size) \ - __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); extern int drbd_initialize_al(struct drbd_device *, void *); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 973c185..cce25a5 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2731,7 +2731,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig device = minor_to_device(minor); if (device) - return ERR_MINOR_EXISTS; + return ERR_MINOR_OR_VOLUME_EXISTS; /* GFP_KERNEL, we are outside of all write-out paths */ device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL); @@ -2794,7 +2794,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); if (id < 0) { if (id == -ENOSPC) { - err = ERR_MINOR_EXISTS; + err = ERR_MINOR_OR_VOLUME_EXISTS; drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_no_minor_idr; @@ -2804,7 +2804,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); if (id < 0) { if (id == -ENOSPC) { - err = ERR_MINOR_EXISTS; + err = ERR_MINOR_OR_VOLUME_EXISTS; drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_idr_remove_minor; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1cd47df..c145619 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2052,7 +2052,7 @@ check_net_options(struct drbd_connection *connection, struct net_conf *new_net_c rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf); 
rcu_read_unlock(); - /* connection->volumes protected by genl_lock() here */ + /* connection->peer_devices protected by genl_lock() here */ idr_for_each_entry(&connection->peer_devices, peer_device, i) { struct drbd_device *device = peer_device->device; if (!device->bitmap) { @@ -3483,7 +3483,7 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) * that first_peer_device(device)->connection and device->vnr match the request. */ if (adm_ctx.device) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) - retcode = ERR_MINOR_EXISTS; + retcode = ERR_MINOR_OR_VOLUME_EXISTS; /* else: still NO_ERROR */ goto out; } @@ -3530,6 +3530,27 @@ out: return 0; } +static int adm_del_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + + for_each_connection(connection, resource) { + if (connection->cstate > C_STANDALONE) + return ERR_NET_CONFIGURED; + } + if (!idr_is_empty(&resource->devices)) + return ERR_RES_IN_USE; + + list_del_rcu(&resource->resources); + /* Make sure all threads have actually stopped: state handling only + * does drbd_thread_stop_nowait(). */ + list_for_each_entry(connection, &resource->connections, connections) + drbd_thread_stop(&connection->worker); + synchronize_rcu(); + drbd_free_resource(resource); + return NO_ERROR; +} + int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -3575,14 +3596,6 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) } } - /* If we reach this, all volumes (of this connection) are Secondary, - * Disconnected, Diskless, aka Unconfigured. Make sure all threads have - * actually stopped, state handling only does drbd_thread_stop_nowait(). */ - for_each_connection(connection, resource) - drbd_thread_stop(&connection->worker); - - /* Now, nothing can fail anymore */ - /* delete volumes */ idr_for_each_entry(&resource->devices, device, i) { retcode = adm_del_minor(device); @@ -3593,10 +3606,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) } } - list_del_rcu(&resource->resources); - synchronize_rcu(); - drbd_free_resource(resource); - retcode = NO_ERROR; + retcode = adm_del_resource(resource); out: mutex_unlock(&resource->adm_mutex); finish: @@ -3608,7 +3618,6 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; struct drbd_resource *resource; - struct drbd_connection *connection; enum drbd_ret_code retcode; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); @@ -3616,27 +3625,10 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) return retcode; if (retcode != NO_ERROR) goto finish; - resource = adm_ctx.resource; - mutex_lock(&resource->adm_mutex); - for_each_connection(connection, resource) { - if (connection->cstate > C_STANDALONE) { - retcode = ERR_NET_CONFIGURED; - goto out; - } - } - if (!idr_is_empty(&resource->devices)) { - retcode = ERR_RES_IN_USE; - goto out; - } - list_del_rcu(&resource->resources); - for_each_connection(connection, resource) - drbd_thread_stop(&connection->worker); - synchronize_rcu(); - drbd_free_resource(resource); - retcode = NO_ERROR; -out: + mutex_lock(&resource->adm_mutex); + retcode = adm_del_resource(resource); mutex_unlock(&resource->adm_mutex); finish: drbd_adm_finish(&adm_ctx, info, retcode); diff --git a/include/linux/drbd.h b/include/linux/drbd.h index debb70d..8723f2a 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -172,7 +172,7 @@ enum drbd_ret_code { ERR_RES_NOT_KNOWN = 158, 
ERR_RES_IN_USE = 159, ERR_MINOR_CONFIGURED = 160, - ERR_MINOR_EXISTS = 161, + ERR_MINOR_OR_VOLUME_EXISTS = 161, ERR_INVALID_REQUEST = 162, ERR_NEED_APV_100 = 163, ERR_NEED_ALLOW_TWO_PRI = 164, -- cgit v0.10.2 From f221f4bcc5f40e2967e4596ef167bdbc987c8e9d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 10 Nov 2014 17:21:10 +0100 Subject: drbd: Only use drbd_msg_put_info() in drbd_nl.c Avoid generic netlink calls in other parts of the code base. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index c14b718..d2a7589 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1454,7 +1454,6 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ -extern int drbd_msg_put_info(struct sk_buff *skb, const char *info); extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index cce25a5..1fc8342 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2532,10 +2532,6 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) return -ENOMEM; - /* - retcode = ERR_NOMEM; - drbd_msg_put_info("unable to allocate cpumask"); - */ /* silently ignore cpu mask on UP kernel */ if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { @@ -2793,20 +2789,16 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { + if (id == -ENOSPC) err = ERR_MINOR_OR_VOLUME_EXISTS; - drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); - } goto out_no_minor_idr; } kref_get(&device->kref); id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { + if (id == -ENOSPC) err = ERR_MINOR_OR_VOLUME_EXISTS; - drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); - } goto out_idr_remove_minor; } kref_get(&device->kref); @@ -2825,10 +2817,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { + if (id == -ENOSPC) err = ERR_INVALID_REQUEST; - drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already"); - } goto out_idr_remove_from_resource; } kref_get(&connection->kref); @@ -2836,7 +2826,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig if (init_submitter(device)) { err = ERR_NOMEM; - drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue"); goto out_idr_remove_vol; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index c145619..4782d07 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -92,7 +92,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only * reason it could fail was no space in skb, and there are 4k available. 
*/ -int drbd_msg_put_info(struct sk_buff *skb, const char *info) +static int drbd_msg_put_info(struct sk_buff *skb, const char *info) { struct nlattr *nla; int err = -EMSGSIZE; -- cgit v0.10.2 From a88215312c5ed74697973f6c9f0fce718bcf18ad Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 10 Nov 2014 17:21:11 +0100 Subject: drbd: fix race between role change and handshake Symptoms: If DRBD was "cleanly shut down" (all in sync, both Secondary before disconnect, identical data generation uuids), and then one side was promoted *during* the next connection handshake, the role change could confuse the handshake. The Primary would get stuck in WFBitmapS, the Secondary would log unexpected cstate (Connected) in receive_bitmap and get stuck in WFBitmapT. Fix: The test in is_valid_soft_transition was wrong. It appeared to work only because the disallowed actions (promote/attach) do not touch the cstate. The previous condition failed to demand a cstate change in one clause. In order to avoid deadlocks, give up the state_mutex while waiting for the transient state to go away. Conflicts: drbd/drbd_state.c drbd/drbd_state.h drbd/drbd_wrappers.h Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 4782d07..74df8cf 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -588,7 +588,7 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for val.i = 0; val.role = new_role; while (try++ < max_tries) { - rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE); + rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE); /* in case we first succeeded to outdate, * but now suddenly could establish a connection */ diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 84b11f8..4529d92 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -215,6 +215,18 @@ static bool no_peer_wf_report_params(struct drbd_connection *connection) return rv; } +static void wake_up_all_devices(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + wake_up(&peer_device->device->state_wait); + rcu_read_unlock(); + +} + /** * cl_wide_st_chg() - true if the state change is a cluster wide one @@ -410,6 +422,22 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask, return rv; } +enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + BUG_ON(f & CS_SERIALIZE); + + wait_event_cmd(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE, + mutex_unlock(device->state_mutex), + mutex_lock(device->state_mutex)); + + return rv; +} + static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) { drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", @@ -629,14 +657,11 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) rv = SS_IN_TRANSIENT_STATE; - /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) - rv = SS_IN_TRANSIENT_STATE; */ - /* While establishing a connection only allow cstate to change. - Delay/refuse role changes, detach attach etc...
*/ + Delay/refuse role changes, detach attach etc... (they do not touch cstate) */ if (test_bit(STATE_SENT, &connection->flags) && - !(os.conn == C_WF_REPORT_PARAMS || - (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) || + (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) rv = SS_IN_TRANSIENT_STATE; if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) @@ -1032,8 +1057,10 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, /* Wake up role changes, that were delayed because of connection establishing */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && - no_peer_wf_report_params(connection)) + no_peer_wf_report_params(connection)) { clear_bit(STATE_SENT, &connection->flags); + wake_up_all_devices(connection); + } wake_up(&device->misc_wait); wake_up(&device->state_wait); diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index cc41605..7f53c40 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -117,6 +117,11 @@ extern enum drbd_state_rv _drbd_request_state(struct drbd_device *, union drbd_state, union drbd_state, enum chg_state_flags); + +extern enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, + union drbd_state, enum chg_state_flags); + extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, enum chg_state_flags, struct completion *done); -- cgit v0.10.2 From ff8bd88b73fad369a12465dfa52ad5c0bf088ced Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 10 Nov 2014 17:21:12 +0100 Subject: drbd: fix resync throttling initialization If for some reason DRBD resync was the only activity on a backend device, drbd_rs_c_min_rate_throttle() would mistakenly decide that it is still initialization time, and keep throttling the resync. This patch explicitly initializes ->rs_last_events to the current backend event counters, and drops the rs_last_events == 0 from the throttle condition. 
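In sketch form, the new baseline seeding condenses to the following (illustrative only; the helper name is invented here, but the body matches the drbd_rs_controller_reset() hunk below):

    /* Sketch: seed rs_last_events from the backing device's read and
     * write sector counters, so the first throttle check compares
     * against a real baseline instead of 0.
     */
    static void seed_rs_last_events(struct drbd_device *device)
    {
        struct gendisk *disk =
            device->ldev->backing_bdev->bd_contains->bd_disk;

        device->rs_last_events =
            (int)part_stat_read(&disk->part0, sectors[0]) +
            (int)part_stat_read(&disk->part0, sectors[1]);
    }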
Reported-by: Mikhail Sugakov Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6960fb0..d169b4a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2482,7 +2482,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) - || !device->rs_last_events || curr_events - device->rs_last_events > 64) { + || curr_events - device->rs_last_events > 64) { unsigned long rs_left; int i; diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 4529d92..2d7dd26 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -1099,7 +1099,6 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, set_ov_position(device, ns.conn); device->rs_start = now; - device->rs_last_events = 0; device->rs_last_sect_ev = 0; device->ov_last_oos_size = 0; device->ov_last_oos_start = 0; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d2d1f97..d0fae55 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1592,11 +1592,15 @@ void drbd_resync_after_changed(struct drbd_device *device) void drbd_rs_controller_reset(struct drbd_device *device) { + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; struct fifo_buffer *plan; atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; + device->rs_last_events = + (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. @@ -1743,7 +1747,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_failed = 0; device->rs_paused = 0; device->rs_same_csum = 0; - device->rs_last_events = 0; device->rs_last_sect_ev = 0; device->rs_total = tw; device->rs_start = now; -- cgit v0.10.2 From 3b9d35d744bb5139f9fed57f38c019bb8c7d351c Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 10 Nov 2014 17:21:13 +0100 Subject: drbd: merge_bvec_fn: properly remap bvm->bi_bdev This was not noticed for many years. It affects operation if md raid is used as a backing device for DRBD. CC: stable@kernel.org # v3.2+ Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5a01c53..90319b1 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1545,6 +1545,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; if (b->merge_bvec_fn) { + bvm->bi_bdev = device->ldev->backing_bdev; backing_limit = b->merge_bvec_fn(b, bvm, bvec); limit = min(limit, backing_limit); } -- cgit v0.10.2 From 9581f97a687724ea41cf2e145dda4751161198c1 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 10 Nov 2014 17:21:14 +0100 Subject: drbd: Fix state change in case of connection timeout A connection timeout affects all volumes of a resource! Under the following conditions: A resource with multiple volumes AND ko-count >=1 AND a write request triggers the timeout (ko-count * timeout) DRBD's internal state gets confused.
That in turn may lead to very misleading follow-up failures. E.g. "BUG: scheduling while atomic" CC: stable@kernel.org # v3.17 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 90319b1..3b797cd 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1629,7 +1629,7 @@ void request_timer_fn(unsigned long data) time_after(now, req_peer->pre_send_jif + ent) && !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); } if (dt && oldest_submit_jif != now && time_after(now, oldest_submit_jif + dt) && -- cgit v0.10.2 From e805b983d3fc852b80e30327e42c8c5f0c55c62c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 10 Nov 2014 17:21:15 +0100 Subject: drbd: Remove a useless copy of kernel_setsockopt() Old backward-compat cruft. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d2a7589..b905e98 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1557,52 +1557,31 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -/* Yes, there is kernel_setsockopt, but only since 2.6.18. - * So we have our own copy of it here. */ -static inline int drbd_setsockopt(struct socket *sock, int level, int optname, - char *optval, int optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int err; - - uoptval = (char __user __force *)optval; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, uoptval, - optlen); - set_fs(oldfs); - return err; -} - static inline void drbd_tcp_cork(struct socket *sock) { int val = 1; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val)); } static inline void drbd_tcp_uncork(struct socket *sock) { int val = 0; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val)); } static inline void drbd_tcp_nodelay(struct socket *sock) { int val = 1; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char*)&val, sizeof(val)); } static inline void drbd_tcp_quickack(struct socket *sock) { int val = 2; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char*)&val, sizeof(val)); } -- cgit v0.10.2 From 9d135bb8c2a0d2e54b84ebc1b7d41852614fead8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Nov 2014 10:43:42 -0700 Subject: NVMe: replace blk_put_request() with blk_mq_free_request() No point in using blk_put_request(), since we know we are blk-mq. This only makes sense in core code where we could be dealing with either legacy or blk-mq drivers.
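In sketch form, the direct-free pattern for a driver that knows it is blk-mq looks like this (illustrative only; queue and allocation flags as used elsewhere in this series):

    /* Sketch: allocate and free a request without going through the
     * legacy blk_put_request() path.
     */
    struct request *req;

    req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
    if (IS_ERR(req))
        return PTR_ERR(req);
    /* ... build the command, submit it, wait for completion ... */
    blk_mq_free_request(req);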
Additionally, use blk_mq_free_hctx_request() for the request completion fast path, where we already know the mapping from request to hardware queue. Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8393f91..bbac17f 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -262,7 +262,7 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); - blk_put_request(req); + blk_mq_free_hctx_request(nvmeq->hctx, req); } static void abort_completion(struct nvme_queue *nvmeq, void *ctx, @@ -273,7 +273,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx, u16 status = le16_to_cpup(&cqe->status) >> 1; u32 result = le32_to_cpup(&cqe->result); - blk_put_request(req); + blk_mq_free_hctx_request(nvmeq->hctx, req); dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); ++nvmeq->dev->abort_limit; @@ -286,7 +286,7 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx, cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_put_request(cmdinfo->req); + blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req); } static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, @@ -872,7 +872,7 @@ static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cm if (!req) return -ENOMEM; res = nvme_submit_sync_cmd(req, cmd, result, timeout); - blk_put_request(req); + blk_mq_free_request(req); return res; } @@ -893,7 +893,7 @@ int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, if (!req) return -ENOMEM; res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); - blk_put_request(req); + blk_mq_free_request(req); return res; } @@ -1047,7 +1047,7 @@ static void nvme_abort_req(struct request *req) dev_warn(nvmeq->q_dmadev, "Could not abort I/O %d QID %d", req->tag, nvmeq->qid); - blk_put_request(req); + blk_mq_free_request(req); } } @@ -1688,7 +1688,7 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, else { status = nvme_submit_sync_cmd(req, &c, &cmd.result, timeout); - blk_put_request(req); + blk_mq_free_request(req); } } else status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); -- cgit v0.10.2 From 6dcc0cf6cb3120cedc0d4c12171894f3d6415981 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Nov 2014 08:21:18 -0700 Subject: NVMe: nvme_submit_async_admin_req() must use atomic rq allocation We are called for async event notification issues, and the nvmeq lock is already held. If we fail the request allocation, we'll just retry next time. Reported-by: Julia Lawall Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index bbac17f..fb4b205 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -825,7 +825,7 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) struct nvme_cmd_info *cmd_info; struct request *req; - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, false); if (IS_ERR(req)) return PTR_ERR(req); -- cgit v0.10.2 From 139768895309c6c1d6913e909e9c9422f81a1640 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Nov 2014 08:45:31 -0700 Subject: NVMe: enable IO stats by default Before the blk-mq conversion they were on by default, we should not change behavior there. 
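In sketch form, the change is simply to stop clearing the flag at namespace setup (illustrative; that blk-mq initializes queues with QUEUE_FLAG_IO_STAT already set is an assumption about this kernel era):

    /* Sketch: blk-mq queues come up with I/O accounting enabled, so
     * dropping the explicit clear restores the pre-conversion default.
     */
    queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
    /* no longer: queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue); */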
Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index fb4b205..677d7b9 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1889,7 +1889,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue); - queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue); ns->dev = dev; ns->queue->queuedata = ns; -- cgit v0.10.2 From e32efbfc35c1b06f1bfe3e6d737acdd14d27baed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Nov 2014 09:49:26 -0700 Subject: NVMe: make setup work for devices that don't do INTx The setup/probe part currently relies on INTx being there and working, that's not always the case. For devices that don't advertise INTx, enable a single MSIx vector early on and disable it again before we ask for our full range of queue vecs. Acked-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 677d7b9..9310fe5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1998,6 +1998,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, adminq); + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. + */ + if (!pdev->irq) + pci_disable_msix(pdev); + for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); @@ -2150,10 +2157,22 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) goto disable; + if (readl(&dev->bar->csts) == -1) { result = -ENODEV; goto unmap; } + + /* + * Some devices don't advertise INTx interrupts, pre-enable a single + * MSIX vec for setup. We'll adjust this later. + */ + if (!pdev->irq) { + result = pci_enable_msix(pdev, dev->entry, 1); + if (result < 0) + goto unmap; + } + cap = readq(&dev->bar->cap); dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); -- cgit v0.10.2 From 2c30540b38d683d4c7f06d13a451f67d4362d7b1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Nov 2014 09:47:32 -0700 Subject: NVMe: add ->exit_hctx() hook If we do teardown and setup of the queue and block related parts of the driver, then we should clear nvmeq->hctx once we kill the hardware queue.
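In sketch form, the hook just drops the cached back-pointer when the hardware queue dies (this mirrors the hunk below):

    /* Sketch: clear the hctx pointer on hardware queue teardown, so a
     * later re-setup cannot observe a stale request-to-queue mapping.
     */
    static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
    {
        struct nvme_queue *nvmeq = hctx->driver_data;

        nvmeq->hctx = NULL;
    }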
Acked-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 9310fe5..ba278ae 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -171,6 +171,13 @@ static int nvme_admin_init_request(void *data, struct request *req, return 0; } +static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + nvmeq->hctx = NULL; +} + static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -1335,6 +1342,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_admin_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, + .exit_hctx = nvme_exit_hctx, .init_request = nvme_admin_init_request, .timeout = nvme_timeout, }; @@ -1343,6 +1351,7 @@ static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, + .exit_hctx = nvme_exit_hctx, .init_request = nvme_init_request, .timeout = nvme_timeout, }; -- cgit v0.10.2 From be7837e89d610046ae8dd28dc504df09261d9f91 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Nov 2014 09:50:19 -0700 Subject: NVMe: fail pci initialization if the device doesn't have any BARs The PCI init of NVMe doesn't check for valid bars before proceeding to map and use BAR 0. If the device is hosed (or firmware is), then we should catch this case and give up early. This fixes a: [ 1662.035778] WARNING: CPU: 0 PID: 4 at arch/x86/mm/ioremap.c:63 __ioremap_check_ram+0xa7/0xc0() and later badness on such a device. Acked-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ba278ae..c154165 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2156,6 +2156,9 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + goto disable_pci; + if (pci_request_selected_regions(pdev, bars, "nvme")) goto disable_pci; -- cgit v0.10.2 From c78b47136f7adebcf953f541c7d9c4c745a54e3f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 21 Nov 2014 15:16:32 -0700 Subject: NVMe: Update module version major number It's already near impossible to tell what bits someone is running based on a 'modinfo nvme', and I don't want to try guessing if someone is running blk-mq or bio-based. Let's make it obvious with the module version that the blk-mq conversion is a major change. Future bio-based versions can increment to 0.10 in a fork if revisions occur. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c154165..c7ea07c 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2888,6 +2888,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.9"); +MODULE_VERSION("1.0"); module_init(nvme_init); module_exit(nvme_exit); -- cgit v0.10.2 From aae4933da9488827d341c31b970b2f62ac45a496 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 24 Nov 2014 11:05:24 +0800 Subject: md/bcache: use generic io stats accounting functions to simplify io stat accounting Use generic io stats accounting help functions (generic_{start,end}_io_acct) to simplify io stat accounting. 
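The helpers bracket each bio roughly like this (a condensed sketch of the pattern the hunks below switch to; 'disk' stands in for the driver's gendisk):

    /* Sketch: generic per-partition I/O accounting around a bio. */
    unsigned long start_time = jiffies;
    int rw = bio_data_dir(bio);

    generic_start_io_acct(rw, bio_sectors(bio), &disk->part0);
    /* ... submit the bio and wait for it to complete ... */
    generic_end_io_acct(rw, &disk->part0, start_time);

Compared to the open-coded part_stat_* sequences, the helpers also keep the in-flight counters consistent in one place.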
Signed-off-by: Gu Zheng Acked-by: Kent Overstreet Signed-off-by: Jens Axboe diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 62e6e98..ab43fad 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -601,13 +601,8 @@ static void request_endio(struct bio *bio, int error) static void bio_complete(struct search *s) { if (s->orig_bio) { - int cpu, rw = bio_data_dir(s->orig_bio); - unsigned long duration = jiffies - s->start_time; - - cpu = part_stat_lock(); - part_round_stats(cpu, &s->d->disk->part0); - part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); - part_stat_unlock(); + generic_end_io_acct(bio_data_dir(s->orig_bio), + &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); bio_endio(s->orig_bio, s->iop.error); @@ -959,12 +954,9 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) struct search *s; struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int cpu, rw = bio_data_dir(bio); + int rw = bio_data_dir(bio); - cpu = part_stat_lock(); - part_stat_inc(cpu, &d->disk->part0, ios[rw]); - part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); - part_stat_unlock(); + generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); bio->bi_bdev = dc->bdev; bio->bi_iter.bi_sector += dc->sb.data_offset; @@ -1074,12 +1066,9 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) struct search *s; struct closure *cl; struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; - int cpu, rw = bio_data_dir(bio); + int rw = bio_data_dir(bio); - cpu = part_stat_lock(); - part_stat_inc(cpu, &d->disk->part0, ios[rw]); - part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); - part_stat_unlock(); + generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); cl = &s->cl; -- cgit v0.10.2 From 244808543e6e9b46ea1135589877f4d4bd2925c5 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 24 Nov 2014 11:05:25 +0800 Subject: drbd: use generic io stats accounting functions to simplify io stat accounting Use generic io stats accounting help functions (generic_{start,end}_io_acct) to simplify io stat accounting. Signed-off-by: Gu Zheng Signed-off-by: Jens Axboe diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3b797cd..34f2f0b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -36,29 +36,15 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, /* Update disk stats at start of I/O request */ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) { - const int rw = bio_data_dir(req->master_bio); - int cpu; - cpu = part_stat_lock(); - part_round_stats(cpu, &device->vdisk->part0); - part_stat_inc(cpu, &device->vdisk->part0, ios[rw]); - part_stat_add(cpu, &device->vdisk->part0, sectors[rw], req->i.size >> 9); - (void) cpu; /* The macro invocations above want the cpu argument, I do not like - the compiler warning about cpu only assigned but never used... 
*/ - part_inc_in_flight(&device->vdisk->part0, rw); - part_stat_unlock(); + generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, + &device->vdisk->part0); } /* Update disk stats when completing request upwards */ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) { - int rw = bio_data_dir(req->master_bio); - unsigned long duration = jiffies - req->start_jif; - int cpu; - cpu = part_stat_lock(); - part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration); - part_round_stats(cpu, &device->vdisk->part0); - part_dec_in_flight(&device->vdisk->part0, rw); - part_stat_unlock(); + generic_end_io_acct(bio_data_dir(req->master_bio), + &device->vdisk->part0, req->start_jif); } static struct drbd_request *drbd_req_new(struct drbd_device *device, -- cgit v0.10.2 From 18c0b223cf9901727ef3b02da6711ac930b4e5d4 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 24 Nov 2014 11:05:26 +0800 Subject: md: use generic io stats accounting functions to simplify io stat accounting Use generic io stats accounting help functions (generic_{start,end}_io_acct) to simplify io stat accounting. Signed-off-by: Gu Zheng Signed-off-by: Jens Axboe diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 58f3927..b1cdf69 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -602,13 +602,10 @@ static void end_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->bio; unsigned long duration = jiffies - io->start_time; - int pending, cpu; + int pending; int rw = bio_data_dir(bio); - cpu = part_stat_lock(); - part_round_stats(cpu, &dm_disk(md)->part0); - part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); - part_stat_unlock(); + generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, @@ -1648,16 +1645,12 @@ static void _dm_request(struct request_queue *q, struct bio *bio) { int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; - int cpu; int srcu_idx; struct dm_table *map; map = dm_get_live_table(md, &srcu_idx); - cpu = part_stat_lock(); - part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); - part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); - part_stat_unlock(); + generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 9233c71..056ccd2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -247,7 +247,6 @@ static void md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); struct mddev *mddev = q->queuedata; - int cpu; unsigned int sectors; if (mddev == NULL || mddev->pers == NULL @@ -284,10 +283,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio) sectors = bio_sectors(bio); mddev->pers->make_request(mddev, bio); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); - part_stat_unlock(); + generic_start_io_acct(rw, sectors, &mddev->gendisk->part0); if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) wake_up(&mddev->sb_wait); -- cgit v0.10.2 From fa573f72790c2c2b9bfd758c5d33959af4a39915 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 24 Nov 2014 11:05:27 +0800 Subject: block/rsxx: use generic io stats accounting functions to simplify io stat 
accounting Use generic io stats accounting help functions (generic_{start,end}_io_acct) to simplify io stat accounting. Signed-off-by: Gu Zheng Signed-off-by: Jens Axboe diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 40ee770..ac8c62c 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -112,37 +112,16 @@ static const struct block_device_operations rsxx_fops = { static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) { - struct hd_struct *part0 = &card->gendisk->part0; - int rw = bio_data_dir(bio); - int cpu; - - cpu = part_stat_lock(); - - part_round_stats(cpu, part0); - part_inc_in_flight(part0, rw); - - part_stat_unlock(); + generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio), + &card->gendisk->part0); } static void disk_stats_complete(struct rsxx_cardinfo *card, struct bio *bio, unsigned long start_time) { - struct hd_struct *part0 = &card->gendisk->part0; - unsigned long duration = jiffies - start_time; - int rw = bio_data_dir(bio); - int cpu; - - cpu = part_stat_lock(); - - part_stat_add(cpu, part0, sectors[rw], bio_sectors(bio)); - part_stat_inc(cpu, part0, ios[rw]); - part_stat_add(cpu, part0, ticks[rw], duration); - - part_round_stats(cpu, part0); - part_dec_in_flight(part0, rw); - - part_stat_unlock(); + generic_end_io_acct(bio_data_dir(bio), &card->gendisk->part0, + start_time); } static void bio_dma_done_cb(struct rsxx_cardinfo *card, -- cgit v0.10.2 From 709c8667ada1fa26ac941bb875892afc0046b3e2 Mon Sep 17 00:00:00 2001 From: Matias Bjorling Date: Wed, 26 Nov 2014 14:45:48 -0700 Subject: null_blk: boundary check queue_mode and irqmode When either the queue_mode or irqmode parameter is set outside its boundaries, the driver will not complete requests. This stalls driver initialization when partitions are probed. Fix by rejecting out-of-bound values when the parameter is stored. Signed-off-by: Matias Bjørling Updated by me to have the parse+check in just one function.
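For reference, the shared parse+check helper reads like this (a sketch matching the hunk below; out-of-range values are refused with -EINVAL at set time):

    /* Sketch: validate a module parameter before storing it. */
    static int null_param_store_val(const char *str, int *val, int min, int max)
    {
        int ret, new_val;

        ret = kstrtoint(str, 10, &new_val);
        if (ret)
            return -EINVAL;

        if (new_val < min || new_val > max)
            return -EINVAL;

        *val = new_val;
        return 0;
    }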
Signed-off-by: Jens Axboe diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index caa6121..ae9f615 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -78,7 +78,33 @@ module_param(home_node, int, S_IRUGO); MODULE_PARM_DESC(home_node, "Home node for the device"); static int queue_mode = NULL_Q_MQ; -module_param(queue_mode, int, S_IRUGO); + +static int null_param_store_val(const char *str, int *val, int min, int max) +{ + int ret, new_val; + + ret = kstrtoint(str, 10, &new_val); + if (ret) + return -EINVAL; + + if (new_val < min || new_val > max) + return -EINVAL; + + *val = new_val; + return 0; +} + +static int null_set_queue_mode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ); +} + +static struct kernel_param_ops null_queue_mode_param_ops = { + .set = null_set_queue_mode, + .get = param_get_int, +}; + +device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); static int gb = 250; @@ -94,7 +120,19 @@ module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static int irqmode = NULL_IRQ_SOFTIRQ; -module_param(irqmode, int, S_IRUGO); + +static int null_set_irqmode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &irqmode, NULL_IRQ_NONE, + NULL_IRQ_TIMER); +} + +static struct kernel_param_ops null_irqmode_param_ops = { + .set = null_set_irqmode, + .get = param_get_int, +}; + +device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); static int completion_nsec = 10000; -- cgit v0.10.2 From 9af8785a38d4528d6675247f873b0f1ae29f3be8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 3 Dec 2014 17:07:13 -0700 Subject: NVMe: Fix command setup on IO retry On retry, the req->special is pointing to an IOD that is already set up, but we still need to set up the command context and callback, otherwise you'll see false twice completed errors and leak requests. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c7ea07c..bcbdf83 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -640,8 +640,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, iod->private = req; req->special = iod; - nvme_set_info(cmd, iod, req_completion); - if (req->cmd_flags & REQ_DISCARD) { void *range; /* @@ -677,6 +675,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); submit_iod: + nvme_set_info(cmd, iod, req_completion); spin_lock_irq(&nvmeq->q_lock); if (req->cmd_flags & REQ_DISCARD) nvme_submit_discard(nvmeq, ns, req, iod); -- cgit v0.10.2 From ad42d391ae388bd4ce4d53e8d7b4089cfdcba411 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 1 Dec 2014 14:01:13 +0100 Subject: xen/blkfront: improve protection against issuing unsupported REQ_FUA The guard against issuing unsupported REQ_FUA and REQ_FLUSH was introduced in d11e61583 and was factored out into blkif_request_flush_valid() in 0f1ca65ee. However: 1) This check is incomplete. In case we negotiated to feature_flush = REQ_FLUSH and flush_op = BLKIF_OP_FLUSH_DISKCACHE (so FUA is unsupported) a FUA request will still pass the check. 2) blkif_request_flush_valid() is misnamed. It is bool but returns true when the request is invalid.
3) When blkif_request_flush_valid() fails -EIO is being returned. It seems that -EOPNOTSUPP is more appropriate here. Fix all of the above issues. This patch is based on the original patch by Laszlo Ersek and a comment by Jeff Moyer. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Laszlo Ersek Reviewed-by: Boris Ostrovsky Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5ac312f..2e6c103 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -582,12 +582,14 @@ static inline void flush_requests(struct blkfront_info *info) notify_remote_via_irq(info->irq); } -static inline bool blkif_request_flush_valid(struct request *req, - struct blkfront_info *info) +static inline bool blkif_request_flush_invalid(struct request *req, + struct blkfront_info *info) { return ((req->cmd_type != REQ_TYPE_FS) || - ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) && - !info->flush_op)); + ((req->cmd_flags & REQ_FLUSH) && + !(info->feature_flush & REQ_FLUSH)) || + ((req->cmd_flags & REQ_FUA) && + !(info->feature_flush & REQ_FUA))); } /* @@ -612,8 +614,8 @@ static void do_blkif_request(struct request_queue *rq) blk_start_request(req); - if (blkif_request_flush_valid(req, info)) { - __blk_end_request_all(req, -EIO); + if (blkif_request_flush_invalid(req, info)) { + __blk_end_request_all(req, -EOPNOTSUPP); continue; } -- cgit v0.10.2 From fdf9b9650366c575650edcaaede3a68efbb24919 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Dec 2014 15:25:10 +0100 Subject: xen/blkfront: remove redundant flush_op flush_op is unambiguously defined by feature_flush: REQ_FUA | REQ_FLUSH -> BLKIF_OP_WRITE_BARRIER REQ_FLUSH -> BLKIF_OP_FLUSH_DISKCACHE 0 -> 0 and thus can be removed. This is just a cleanup. The patch was suggested by Boris Ostrovsky. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Boris Ostrovsky Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2e6c103..2236c6f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -126,7 +126,6 @@ struct blkfront_info unsigned int persistent_gnts_c; unsigned long shadow_free; unsigned int feature_flush; - unsigned int flush_op; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; unsigned int discard_granularity; @@ -479,7 +478,19 @@ static int blkif_queue_request(struct request *req) * way. (It's also a FLUSH+FUA, since it is * guaranteed ordered WRT previous writes.) */ - ring_req->operation = info->flush_op; + switch (info->feature_flush & + ((REQ_FLUSH|REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + ring_req->operation = + BLKIF_OP_WRITE_BARRIER; + break; + case REQ_FLUSH: + ring_req->operation = + BLKIF_OP_FLUSH_DISKCACHE; + break; + default: + ring_req->operation = 0; + } } ring_req->u.rw.nr_segments = nseg; } @@ -685,20 +696,26 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, return 0; } +static const char *flush_info(unsigned int feature_flush) +{ + switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + return "barrier: enabled;"; + case REQ_FLUSH: + return "flush diskcache: enabled;"; + default: + return "barrier or flush: disabled;"; + } +} static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", - info->gd->disk_name, - info->flush_op == BLKIF_OP_WRITE_BARRIER ? - "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? 
- "flush diskcache" : "barrier or flush"), - info->feature_flush ? "enabled;" : "disabled;", - "persistent grants:", - info->feature_persistent ? "enabled;" : "disabled;", - "indirect descriptors:", - info->max_indirect_segments ? "enabled;" : "disabled;"); + pr_info("blkfront: %s: %s %s %s %s %s\n", + info->gd->disk_name, flush_info(info->feature_flush), + "persistent grants:", info->feature_persistent ? + "enabled;" : "disabled;", "indirect descriptors:", + info->max_indirect_segments ? "enabled;" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -1190,7 +1207,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) if (error == -EOPNOTSUPP) error = 0; info->feature_flush = 0; - info->flush_op = 0; xlvbd_flush(info); } /* fall through */ @@ -1810,7 +1826,6 @@ static void blkfront_connect(struct blkfront_info *info) physical_sector_size = sector_size; info->feature_flush = 0; - info->flush_op = 0; err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "feature-barrier", "%d", &barrier, @@ -1823,10 +1838,8 @@ static void blkfront_connect(struct blkfront_info *info) * * If there are barriers, then we use flush. */ - if (!err && barrier) { + if (!err && barrier) info->feature_flush = REQ_FLUSH | REQ_FUA; - info->flush_op = BLKIF_OP_WRITE_BARRIER; - } /* * And if there is "feature-flush-cache" use that above * barriers. @@ -1835,10 +1848,8 @@ static void blkfront_connect(struct blkfront_info *info) "feature-flush-cache", "%d", &flush, NULL); - if (!err && flush) { + if (!err && flush) info->feature_flush = REQ_FLUSH; - info->flush_op = BLKIF_OP_FLUSH_DISKCACHE; - } err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "feature-discard", "%d", &discard, -- cgit v0.10.2 From c87fd5407e1aa1cfc6b393b03bf67010cf643dbe Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Wed, 10 Dec 2014 13:00:31 -0700 Subject: NVMe: fix freeing of wrong request in abort path We allocate 'abort_req', but free 'req' in case of an error submitting the IO. Signed-off-by: Sam Bradshaw Signed-off-by: Jens Axboe diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index bcbdf83..cf9b8a8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1053,7 +1053,7 @@ static void nvme_abort_req(struct request *req) dev_warn(nvmeq->q_dmadev, "Could not abort I/O %d QID %d", req->tag, nvmeq->qid); - blk_mq_free_request(req); + blk_mq_free_request(abort_req); } } -- cgit v0.10.2 From 97fe383222cb8a01fb67a8823498ad2edcc20b35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Dec 2014 13:02:44 -0700 Subject: NVMe: fix error return checking from blk_mq_alloc_request() We return an error pointer or the request, not NULL. Half the call paths got it right, the others didn't. Fix those up. 
Signed-off-by: Jens Axboe

From 97fe383222cb8a01fb67a8823498ad2edcc20b35 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 10 Dec 2014 13:02:44 -0700
Subject: NVMe: fix error return checking from blk_mq_alloc_request()

We return an error pointer or the request, not NULL. Half the call
paths got it right, the others didn't. Fix those up.

Signed-off-by: Jens Axboe

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index cf9b8a8..2cc2cee 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -875,8 +875,8 @@ static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cm
 	struct request *req;
 
 	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	res = nvme_submit_sync_cmd(req, cmd, result, timeout);
 	blk_mq_free_request(req);
 	return res;
@@ -896,8 +896,8 @@ int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 
 	req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT),
 								false);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
 	blk_mq_free_request(req);
 	return res;
@@ -1691,8 +1691,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 
 		req = blk_mq_alloc_request(ns->queue, WRITE,
 						(GFP_KERNEL|__GFP_WAIT), false);
-		if (!req)
-			status = -ENOMEM;
+		if (IS_ERR(req))
+			status = PTR_ERR(req);
 		else {
 			status = nvme_submit_sync_cmd(req, &c, &cmd.result,
 								timeout);
--
cgit v0.10.2

From 285dffc9101244ac65c29672a1fb3fe614b52238 Mon Sep 17 00:00:00 2001
From: Indraneel M
Date: Thu, 11 Dec 2014 08:24:18 -0700
Subject: NVMe: Fix FS mount issue (hot-remove followed by hot-add)

After hot-remove of a device with a mounted partition, when the device
is hot-added again, the new node reappears as nvme0n1. Mounting this
new node fails with the error:

mount: mount /dev/nvme0n1p1 on /mnt failed: File exists.

The old node's FS entries still exist and the kernel can't re-create
procfs and sysfs entries for the new node with the same name. The patch
fixes this issue.

Acked-by: Keith Busch
Signed-off-by: Indraneel M
Signed-off-by: Jens Axboe

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 2cc2cee..95f2310 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -2493,6 +2493,7 @@ static void nvme_free_dev(struct kref *kref)
 
 	pci_dev_put(dev->pci_dev);
 	nvme_free_namespaces(dev);
+	nvme_release_instance(dev);
 	blk_mq_free_tag_set(&dev->tagset);
 	kfree(dev->queues);
 	kfree(dev->entry);
@@ -2780,7 +2781,6 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_remove_admin(dev);
 	nvme_free_queues(dev, 0);
 	nvme_free_admin_tags(dev);
-	nvme_release_instance(dev);
 	nvme_release_prp_pools(dev);
 	kref_put(&dev->kref, nvme_free_dev);
 }
--
cgit v0.10.2
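The fix works by deferring the instance-number release to the kref release
callback, so a new hot-added device cannot claim "nvme0" while stale
references still pin the old one. The sketch below models that ordering in
userspace C; struct dev, dev_put, and release_instance are illustrative
stand-ins, not the driver's actual API:

#include <stdio.h>

struct dev {
	int refs;
	int instance;		/* e.g. the 0 in "nvme0" */
};

static int instance_in_use[4];

static void release_instance(struct dev *d)
{
	instance_in_use[d->instance] = 0;
	printf("instance %d released\n", d->instance);
}

static void dev_put(struct dev *d)
{
	/* Model of kref_put(): release resources only at refcount zero. */
	if (--d->refs == 0)
		release_instance(d);
}

int main(void)
{
	struct dev d = { .refs = 2, .instance = 0 };
	instance_in_use[0] = 1;

	dev_put(&d);	/* hot-remove path: the name must stay reserved... */
	printf("after remove: instance 0 in use = %d\n", instance_in_use[0]);
	dev_put(&d);	/* ...until the last holder (e.g. a mount) goes away */
	return 0;
}

Releasing the instance eagerly in the remove path, as the old code did, is
what allowed a hot-added device to collide with the departing one's name.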
Acked-by: Keith Busch
Signed-off-by: Jens Axboe

From fe54303ee2be293c1c5c7a53a152453789cabc2f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 11 Dec 2014 13:58:39 -0700
Subject: NVMe: fix retry/error logic in nvme_queue_rq()

The logic around retrying and erroring IO in nvme_queue_rq() is broken
in a few ways:

- If we fail allocating dma memory for a discard, we return retry. We
  have the 'iod' stored in ->special, but we free the 'iod'.

- For a normal request, if we fail dma mapping or setting up prps, we
  have the same iod situation. Additionally, we haven't set the
  callback for the request yet, so we also potentially leak IOMMU
  resources.

Get rid of the ->special 'iod' store. The retry is uncommon enough that
it's not worth optimizing for or holding on to resources to attempt to
speed it up. Additionally, it's usually best practice to free any
request-related resources when doing retries.

Acked-by: Keith Busch
Signed-off-by: Jens Axboe

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 95f2310..e92bdf4c 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -621,24 +621,15 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
 	int psegs = req->nr_phys_segments;
-	int result = BLK_MQ_RQ_QUEUE_BUSY;
 	enum dma_data_direction dma_dir;
 	unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
 						sizeof(struct nvme_dsm_range);
 
-	/*
-	 * Requeued IO has already been prepped
-	 */
-	iod = req->special;
-	if (iod)
-		goto submit_iod;
-
 	iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
 	if (!iod)
-		return result;
+		return BLK_MQ_RQ_QUEUE_BUSY;
 
 	iod->private = req;
-	req->special = iod;
 
 	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
@@ -651,7 +642,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 						GFP_ATOMIC,
 						&iod->first_dma);
 		if (!range)
-			goto finish_cmd;
+			goto retry_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
 	} else if (psegs) {
@@ -659,22 +650,22 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 		sg_init_table(iod->sg, psegs);
 		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
-		if (!iod->nents) {
-			result = BLK_MQ_RQ_QUEUE_ERROR;
-			goto finish_cmd;
-		}
+		if (!iod->nents)
+			goto error_cmd;
 
 		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
-			goto finish_cmd;
+			goto retry_cmd;
 
-		if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
-						blk_rq_bytes(req), GFP_ATOMIC))
-			goto finish_cmd;
+		if (blk_rq_bytes(req) !=
+		    nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
+			dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg,
+					iod->nents, dma_dir);
+			goto retry_cmd;
+		}
 	}
 
 	blk_mq_start_request(req);
 
- submit_iod:
 	nvme_set_info(cmd, iod, req_completion);
 	spin_lock_irq(&nvmeq->q_lock);
 	if (req->cmd_flags & REQ_DISCARD)
@@ -688,10 +679,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	spin_unlock_irq(&nvmeq->q_lock);
 	return BLK_MQ_RQ_QUEUE_OK;
 
- finish_cmd:
-	nvme_finish_cmd(nvmeq, req->tag, NULL);
+ error_cmd:
 	nvme_free_iod(nvmeq->dev, iod);
-	return result;
+	return BLK_MQ_RQ_QUEUE_ERROR;
+ retry_cmd:
+	nvme_free_iod(nvmeq->dev, iod);
+	return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
 static int nvme_process_cq(struct nvme_queue *nvmeq)
--
cgit v0.10.2
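The rewritten error paths follow one discipline: whether returning BUSY
(retry later) or ERROR (fail the request), all per-request resources are
released first, and a later retry re-preps from scratch. A compact
userspace model of that discipline; the return codes, struct iod, and the
map_data() failure knob are stand-ins for illustration only:

#include <stdio.h>
#include <stdlib.h>

enum { QUEUE_OK, QUEUE_BUSY, QUEUE_ERROR };	/* stand-in return codes */

struct iod { void *buf; };

static int map_data(struct iod *iod, int fail)
{
	if (fail)
		return -1;
	iod->buf = NULL;
	return 0;
}

static int queue_rq(int nsegs, int force_map_failure)
{
	struct iod *iod = malloc(sizeof(*iod));
	if (!iod)
		return QUEUE_BUSY;	/* transient: let the block layer retry */

	if (nsegs == 0) {
		/* Nothing mappable: a permanent failure, still freed first. */
		free(iod);
		return QUEUE_ERROR;
	}

	if (map_data(iod, force_map_failure) < 0) {
		/* Transient failure: drop everything; the retry re-preps. */
		free(iod);
		return QUEUE_BUSY;
	}

	free(iod);		/* demo only: normally freed at completion */
	return QUEUE_OK;
}

int main(void)
{
	printf("ok    -> %d\n", queue_rq(1, 0));
	printf("error -> %d\n", queue_rq(0, 0));
	printf("busy  -> %d\n", queue_rq(1, 1));
	return 0;
}

Holding resources across a requeue, as the old ->special stash did, trades
a rare fast-path win for permanent bookkeeping hazards on every error path.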
Acked-by: Keith Busch
Signed-off-by: Jens Axboe

From 849c6e7746e4f6317ace6aa7d2fcdcd844e99ddb Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 12 Dec 2014 08:53:40 -0700
Subject: NVMe: fix race condition in nvme_submit_sync_cmd()

If we have a race between the schedule timing out and the command
completing, we could have the task issuing the command exit
nvme_submit_sync_cmd() while the irq is running sync_completion(). If
that happens, we could be corrupting memory, since the stack that held
'cmdinfo' is no longer valid.

Fix this by always calling nvme_abort_cmd_info(). Once that call
completes, we know that we have either run sync_completion() if the
completion came in, or that we will never run it since we now have
special_completion() as the command callback handler.

Acked-by: Keith Busch
Signed-off-by: Jens Axboe

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e92bdf4c..b1d5d87 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -804,12 +804,19 @@ static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
 		nvme_finish_cmd(nvmeq, req->tag, NULL);
 		set_current_state(TASK_RUNNING);
 	}
-	schedule_timeout(timeout);
+	ret = schedule_timeout(timeout);
 
-	if (cmdinfo.status == -EINTR) {
-		nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req));
+	/*
+	 * Ensure that sync_completion has either run, or that it will
+	 * never run.
+	 */
+	nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req));
+
+	/*
+	 * We never got the completion
+	 */
+	if (cmdinfo.status == -EINTR)
 		return -EINTR;
-	}
 
 	if (result)
 		*result = cmdinfo.result;
--
cgit v0.10.2
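The underlying pattern is worth spelling out: before a waiter's
stack-allocated completion state goes out of scope, the completion callback
must be swapped to a harmless handler under the same lock the callback runs
under; afterwards the real callback has either already run or never will. A
pthreads model of this idea; the names sync_completion/special_completion
mirror the commit message, but the code is an illustration, not the
driver's:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void (*callback)(int *);	/* completion handler, swappable */
static int *on_stack_status;

static void sync_completion(int *status)    { *status = 0; }
static void special_completion(int *status) { (void)status; }

static void *irq_side(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	if (callback)
		callback(on_stack_status);	/* may touch waiter's stack */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	int status = -1;	/* models the on-stack 'cmdinfo' */

	on_stack_status = &status;
	callback = sync_completion;
	pthread_create(&t, NULL, irq_side, NULL);

	/*
	 * "Abort" unconditionally: after this swap completes under the
	 * lock, the completion has either already run, or it will only
	 * ever run the harmless handler. Either way the stack holding
	 * 'status' is now safe to unwind.
	 */
	pthread_mutex_lock(&lock);
	callback = special_completion;
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("status = %d\n", status);
	return 0;
}

The old code performed the swap only on the -EINTR path, leaving the
completed-late window open; doing it unconditionally closes the race.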