From c2789bd403f4c0c541a359cf318b1dda9a14234f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 20 Nov 2015 22:16:46 +0100 Subject: block: rename request_queue slab cache Name the cache after the actual name of the struct. Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 5131993b..c0c884e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3513,7 +3513,7 @@ int __init blk_dev_init(void) request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("blkdev_queue", + blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0; -- cgit v0.10.2 From d674d4145e1909e0e4c394bb2633ef73d539bd27 Mon Sep 17 00:00:00 2001 From: Wei Tang Date: Tue, 24 Nov 2015 09:58:45 +0800 Subject: block: do not initialise globals to 0 or NULL This patch fixes the checkpatch.pl error to blk-exec.c: ERROR: do not initialise globals to 0 or NULL Signed-off-by: Wei Tang Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index c0c884e..c88a946 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -51,7 +51,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -struct kmem_cache *request_cachep = NULL; +struct kmem_cache *request_cachep; /* * For queue allocation -- cgit v0.10.2 From 1fe8f348416b3fb35ea3f24fa92bb1d29ffe7b0b Mon Sep 17 00:00:00 2001 From: Wei Tang Date: Tue, 24 Nov 2015 09:58:46 +0800 Subject: block: do not initialise statics to 0 or NULL This patch fixes the checkpatch.pl error to genhd.c: ERROR: do not initialise statics to 0 or NULL Signed-off-by: Wei Tang Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index e5cafa5..78140b4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1421,7 +1421,7 @@ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs = 0; +static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { -- cgit v0.10.2 From bd5cecea43ef379e82250addd0303e2f9ede6793 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 21 Nov 2015 17:27:31 +0800 Subject: bio: use offset_in_page macro Use offset_in_page macro instead of (addr & ~PAGE_MASK). Signed-off-by: Geliang Tang Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 4f184d9..dbabd48 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1125,7 +1125,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int i, ret; int nr_pages = 0; unsigned int len = iter->count; - unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; for (i = 0; i < iter->nr_segs; i++) { unsigned long uaddr; @@ -1304,7 +1304,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, goto out_unmap; } - offset = uaddr & ~PAGE_MASK; + offset = offset_in_page(uaddr); for (j = cur_page; j < page_limit; j++) { unsigned int bytes = PAGE_SIZE - offset; -- cgit v0.10.2 From 3b627a3f934c493ada71217f14681e5157e95783 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Nov 2015 15:58:53 -0700 Subject: block: clarify blk_add_timer() use case for blk-mq Just a comment update on not needing queue_lock, and that we aren't really adding the request to a timeout list for !mq. 
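As a minimal sketch of the locking contract this comment change documents (the wrapper below and its name are hypothetical; only the claim that the !mq case links the request onto a lock-protected timeout list, while blk-mq scans the tag map, comes from the patch itself):

  #include <linux/blkdev.h>
  #include "blk.h"   /* blk_add_timer() is block-layer internal */

  /*
   * Hypothetical wrapper, for illustration only: the legacy (!mq) case
   * must hold queue_lock because blk_add_timer() links the request onto
   * q->timeout_list; the mq case doesn't care, since timed-out requests
   * are found by scanning the tag map instead.
   */
  static void arm_request_timer(struct request_queue *q, struct request *rq)
  {
          if (!q->mq_ops) {
                  spin_lock_irq(q->queue_lock);
                  blk_add_timer(rq);
                  spin_unlock_irq(q->queue_lock);
          } else {
                  blk_add_timer(rq);
          }
  }
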
Signed-off-by: Jens Axboe diff --git a/block/blk-timeout.c b/block/blk-timeout.c index aa40aa9..3610af5 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -186,6 +186,7 @@ unsigned long blk_rq_timeout(unsigned long timeout) * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. + * Queue lock must be held for the non-mq case, mq case doesn't care. */ void blk_add_timer(struct request *req) { @@ -209,6 +210,11 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; + + /* + * Only the non-mq case needs to add the request to a protected list. + * For the mq case we simply scan the tag map. + */ if (!q->mq_ops) list_add_tail(&req->timeout_list, &req->q->timeout_list); -- cgit v0.10.2 From d7cf931dd9f18ce8ee7a0a9b7813a19fb2c8f5e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Nov 2015 10:12:54 -0700 Subject: Revert "blk-flush: Queue through IO scheduler when flush not required" This reverts commit 1b2ff19e6a957b1ef0f365ad331b608af80e932e. Jan writes: -- Thanks for report! After some investigation I found out we allocate elevator specific data in __get_request() only for non-flush requests. And this is actually required since the flush machinery uses the space in struct request for something else. Doh. So my patch is just wrong and not easy to fix since at the time __get_request() is called we are not sure whether the flush machinery will be used in the end. Jens, please revert 1b2ff19e6a957b1ef0f365ad331b608af80e932e. Thanks! I'm somewhat surprised that you can reliably hit the race where flushing gets disabled for the device just while the request is in flight. But I guess during boot it makes some sense. -- So let's just revert it, we can fix the queue run manually after the fact. This race is rare enough that it didn't trigger in testing, it requires the specific disable-while-in-flight scenario to trigger. diff --git a/block/blk-flush.c b/block/blk-flush.c index c81d56e..9c423e5 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -422,7 +422,7 @@ void blk_insert_flush(struct request *rq) if (q->mq_ops) { blk_mq_insert_request(rq, false, false, true); } else - q->elevator->type->ops.elevator_add_req_fn(q, rq); + list_add_tail(&rq->queuelist, &q->queue_head); return; } -- cgit v0.10.2 From 6f3b0e8bcf3cbb87a7459b3ed018d31d918df3f8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 09:13:05 +0100 Subject: blk-mq: add a flags parameter to blk_mq_alloc_request We already have the reserved flag, and a nowait flag awkwardly encoded as a gfp_t. Add a real flags argument to make the scheme more extensible and allow for a nicer calling convention. 
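A sketch of the resulting calling convention (the helper below is made up for illustration; compare the nvme and mtip32xx hunks in this patch for real call sites):

  #include <linux/blk-mq.h>
  #include <linux/err.h>
  #include <linux/fs.h>

  static int issue_reserved_cmd(struct request_queue *q)
  {
          struct request *rq;

          /* was: blk_mq_alloc_request(q, WRITE, GFP_ATOMIC, true) */
          rq = blk_mq_alloc_request(q, WRITE,
                                    BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
          if (IS_ERR(rq))
                  return PTR_ERR(rq);

          /* ... fill in and issue the command ... */

          blk_mq_free_request(rq);
          return 0;
  }

A blocking allocation from the normal tag pool is simply a flags value of 0; callers that previously passed a gfp_t map "allows direct reclaim" onto the absence of BLK_MQ_REQ_NOWAIT, as the converted blk_get_request() in this patch does.
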
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index c88a946..5ec9960 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -630,7 +630,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, gfp_t gfp) +int blk_queue_enter(struct request_queue *q, bool nowait) { while (true) { int ret; @@ -638,7 +638,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp) if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; - if (!gfpflags_allow_blocking(gfp)) + if (nowait) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, @@ -1276,7 +1276,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return blk_mq_alloc_request(q, rw, + (gfp_mask & __GFP_DIRECT_RECLAIM) ? + 0 : BLK_MQ_REQ_NOWAIT); else return blk_old_get_request(q, rw, gfp_mask); } @@ -2044,8 +2046,7 @@ blk_qc_t generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { - + if (likely(blk_queue_enter(q, false) == 0)) { ret = q->make_request_fn(q, bio); blk_queue_exit(q); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index a07ca34..abdbb47 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!gfpflags_allow_blocking(data->gfp)) + if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; bs = bt_wait_ptr(bt, hctx); @@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data, data->ctx = blk_mq_get_ctx(data->q); data->hctx = data->q->mq_ops->map_queue(data->q, data->ctx->cpu); - if (data->reserved) { + if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { last_tag = &data->ctx->last_tag; @@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { - if (!data->reserved) - return __blk_mq_get_tag(data); - - return __blk_mq_get_reserved_tag(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + return __blk_mq_get_reserved_tag(data); + return __blk_mq_get_tag(data); } static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6d6f8fe..93a4e19 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -229,8 +229,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) return NULL; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, - bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + unsigned int flags) { struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; @@ -238,24 +238,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_queue_enter(q, gfp); + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, - reserved, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { + if (!rq 
&& !(flags & BLK_MQ_REQ_NOWAIT)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; } @@ -1175,8 +1173,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, rw |= REQ_SYNC; trace_block_getrq(q, bio, rw); - blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); if (unlikely(!rq)) { __blk_mq_run_hw_queue(hctx); @@ -1185,8 +1182,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, - __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; diff --git a/block/blk-mq.h b/block/blk-mq.h index 713820b..eaede8e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -96,8 +96,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - gfp_t gfp; - bool reserved; + unsigned int flags; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -105,13 +104,11 @@ struct blk_mq_alloc_data { }; static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, - struct request_queue *q, gfp_t gfp, bool reserved, - struct blk_mq_ctx *ctx, - struct blk_mq_hw_ctx *hctx) + struct request_queue *q, unsigned int flags, + struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) { data->q = q; - data->gfp = gfp; - data->reserved = reserved; + data->flags = flags; data->ctx = ctx; data->hctx = hctx; } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3457ac8..10bd8d0 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -173,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; - rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true); + rq = blk_mq_alloc_request(dd->queue, 0, BLK_MQ_REQ_RESERVED); return blk_mq_rq_to_pdu(rq); } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 5c8ba54..fa742dd 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -464,7 +464,7 @@ static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) struct request *rq; struct bio *bio = rqd->bio; - rq = blk_mq_alloc_request(q, bio_rw(bio), GFP_KERNEL, 0); + rq = blk_mq_alloc_request(q, bio_rw(bio), 0); if (IS_ERR(rq)) return -ENOMEM; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 9202d1a..d5622f9 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -470,7 +470,7 @@ static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) struct bio *bio = rqd->bio; struct nvme_nvm_command *cmd; - rq = blk_mq_alloc_request(q, bio_rw(bio), GFP_KERNEL, 0); + rq = blk_mq_alloc_request(q, bio_rw(bio), 0); if (IS_ERR(rq)) return -ENOMEM; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f3b53af..b8a0222 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1041,7 +1041,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct 
nvme_command *cmd, struct request *req; int ret; - req = blk_mq_alloc_request(q, write, GFP_KERNEL, false); + req = blk_mq_alloc_request(q, write, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -1094,7 +1094,8 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) struct nvme_cmd_info *cmd_info; struct request *req; - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true); + req = blk_mq_alloc_request(dev->admin_q, WRITE, + BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED); if (IS_ERR(req)) return PTR_ERR(req); @@ -1119,7 +1120,7 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct request *req; struct nvme_cmd_info *cmd_rq; - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + req = blk_mq_alloc_request(dev->admin_q, WRITE, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -1320,8 +1321,8 @@ static void nvme_abort_req(struct request *req) if (!dev->abort_limit) return; - abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, - false); + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, + BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) return; diff --git a/fs/block_dev.c b/fs/block_dev.c index c25639e..aa1a4598 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -395,7 +395,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); @@ -432,7 +432,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index daf17d7..7fc9296 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -188,8 +188,14 @@ void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); + +enum { + BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ + BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ +}; + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, - gfp_t gfp, bool reserved); + unsigned int flags); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0d2b79..e711f29 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -794,7 +794,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, gfp_t gfp); +extern int blk_queue_enter(struct request_queue *q, bool nowait); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); -- cgit v0.10.2 From e0e827b9fc71fbed1a9cd246067c2a4dbd3ea220 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 2 Dec 2015 16:57:06 +0530 Subject: blk-mq: Reuse hardware context cpumask for tags hctx->cpumask is 
already populated and let the tag cpumask follow that instead of going through a new for loop. Signed-off-by: Raghavendra K T Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 93a4e19..35da318 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1850,6 +1850,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); + cpumask_copy(hctx->tags->cpumask, hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping @@ -1863,14 +1864,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } - - queue_for_each_ctx(q, ctx, i) { - if (!cpumask_test_cpu(i, online_mask)) - continue; - - hctx = q->mq_ops->map_queue(q, i); - cpumask_set_cpu(i, hctx->tags->cpumask); - } } static void queue_set_hctx_shared(struct request_queue *q, bool shared) -- cgit v0.10.2 From bffed457160ab48282ca6d0d58646b3bbc2fa554 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 2 Dec 2015 16:59:05 +0530 Subject: blk-mq: Avoid memoryless numa node encoded in hctx numa_node In architecture like powerpc, we can have cpus without any local memory attached to it (a.k.a memoryless nodes). In such cases cpu to node mapping can result in memory allocation hints for block hctx->numa_node populated with node values which does not have real memory. Instead use local_memory_node(), which is guaranteed to have memory. local_memory_node is a noop in other architectures that does not support memoryless nodes. Signed-off-by: Raghavendra K T Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 8764c24..d0634bc 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -113,7 +113,7 @@ int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) for_each_possible_cpu(i) { if (index == mq_map[i]) - return cpu_to_node(i); + return local_memory_node(cpu_to_node(i)); } return NUMA_NO_NODE; diff --git a/block/blk-mq.c b/block/blk-mq.c index 35da318..6889d71 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1790,7 +1790,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, * not, we remain on the home node of the device */ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = cpu_to_node(i); + hctx->numa_node = local_memory_node(cpu_to_node(i)); } } -- cgit v0.10.2 From cda22646adaa453519fac28222f20b0d73aa8562 Mon Sep 17 00:00:00 2001 From: Mike Krinkin Date: Thu, 3 Dec 2015 17:32:30 +0300 Subject: block: add call to split trace point There is a split tracepoint that is supposed to be called when bio is splitted, and it was called in bio_split function until commit 4b1faf931650d4a35b2a ("block: Kill bio_pair_split()"). But now, no one reports splits, so this patch adds call to trace_block_split back in blk_queue_split right after split. 
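Looking back at the memoryless-node patch above, a hedged sketch of why the fallback matters for allocation hints (the helper and its caller are hypothetical):

  #include <linux/slab.h>
  #include <linux/topology.h>

  static void *alloc_hctx_data(int cpu, size_t size)
  {
          /*
           * On powerpc, cpu_to_node() may name a node with no memory
           * attached; local_memory_node() falls back to a node that is
           * guaranteed to have memory, so the hint passed to the slab
           * allocator is always usable.  On architectures without
           * memoryless nodes it is a no-op.
           */
          int node = local_memory_node(cpu_to_node(cpu));

          return kzalloc_node(size, GFP_KERNEL, node);
  }
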
Signed-off-by: Mike Krinkin Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index 41a55ba..0e5643a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,6 +7,8 @@ #include #include +#include + #include "blk.h" static struct bio *blk_bio_discard_split(struct request_queue *q, @@ -159,6 +161,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, split->bi_rw |= REQ_NOMERGE; bio_chain(split, *bio); + trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); *bio = split; } -- cgit v0.10.2 From e36f6204288088fda50d1c84830340ccb70f85ff Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jan 2016 15:08:39 -0700 Subject: block: split bios to max possible length This splits bio in the middle of a vector to form the largest possible bio at the h/w's desired alignment, and guarantees the bio being split will have some data. The criteria for splitting is changed from the max sectors to the h/w's optimal sector alignment if it is provided. For h/w that advertise their block storage's underlying chunk size, it's a big performance win to not submit commands that cross them. If sector alignment is not provided, this patch uses the max sectors as before. This addresses the performance issue commit d380561113 attempted to fix, but was reverted due to splitting logic error. Signed-off-by: Keith Busch Cc: Jens Axboe Cc: Ming Lei Cc: Kent Overstreet Cc: # 4.4.x- Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index 0e5643a..237b087 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -83,9 +83,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *new = NULL; bio_for_each_segment(bv, bio, iter) { - if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) - goto split; - /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -93,6 +90,22 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) goto split; + if (sectors + (bv.bv_len >> 9) > + blk_max_size_offset(q, bio->bi_iter.bi_sector)) { + /* + * Consider this a new segment if we're splitting in + * the middle of this vector. 
+ */ + if (nsegs < queue_max_segments(q) && + sectors < blk_max_size_offset(q, + bio->bi_iter.bi_sector)) { + nsegs++; + sectors = blk_max_size_offset(q, + bio->bi_iter.bi_sector); + } + goto split; + } + if (bvprvp && blk_queue_cluster(q)) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; -- cgit v0.10.2 From ed8a9d2c816e0b0a2c5a605505e0f09ae9010dd6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 20 Nov 2015 22:18:43 +0100 Subject: block: use bd{grab,put}() instead of open-coding - bd_acquire() and bd_forget() open-code bdgrab() and bdput() - raw driver uses igrab() but never checks its return value and always holds another ref from bind_set() while calling it, so it's equivalent to bdgrab() Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 60316fb..9b9809b 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -71,7 +71,7 @@ static int raw_open(struct inode *inode, struct file *filp) err = -ENODEV; if (!bdev) goto out; - igrab(bdev->bd_inode); + bdgrab(bdev); err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open); if (err) goto out; diff --git a/fs/block_dev.c b/fs/block_dev.c index aa1a4598..8b02c90 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -696,7 +696,7 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - ihold(bdev->bd_inode); + bdgrab(bdev); spin_unlock(&bdev_lock); return bdev; } @@ -712,7 +712,7 @@ static struct block_device *bd_acquire(struct inode *inode) * So, we can access it via ->i_mapping always * without igrab(). */ - ihold(bdev->bd_inode); + bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); @@ -735,7 +735,7 @@ void bd_forget(struct inode *inode) spin_unlock(&bdev_lock); if (bdev) - iput(bdev->bd_inode); + bdput(bdev); } /** -- cgit v0.10.2
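
A sketch of the reference-counting pattern this last patch converges on (the helper below is illustrative only, not taken from the tree):

  #include <linux/fs.h>

  /*
   * Pin a block_device with bdgrab()/bdput() instead of reaching into
   * bdev->bd_inode with ihold()/iput() or an unchecked igrab().
   */
  static void with_bdev_ref(struct block_device *bdev,
                            void (*use)(struct block_device *))
  {
          bdgrab(bdev);   /* takes the underlying inode reference internally */
          use(bdev);
          bdput(bdev);
  }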