From 81b76485afb54e3efeb1de5edbef761f4ad2830e Mon Sep 17 00:00:00 2001
From: Howard Cochran <hcochran@kernelspring.com>
Date: Thu, 10 Mar 2016 01:12:39 -0500
Subject: writeback: Fix performance regression in wb_over_bg_thresh()

Commit 947e9762a8dd ("writeback: update wb_over_bg_thresh() to use
wb_domain aware operations") unintentionally changed this function's
meaning from "are there more dirty pages than the background writeback
threshold" to "are there more dirty pages than the writeback threshold".
The background writeback threshold is typically half of the writeback
threshold, so this had the effect of raising the number of dirty pages
required to cause a writeback worker to perform background writeout.

This can cause a very severe performance regression when a BDI uses
BDI_CAP_STRICTLIMIT because balance_dirty_pages() and the writeback worker
can now disagree on whether writeback should be initiated.

For example, in a system having 1GB of RAM, a single spinning disk, and
a "pass-through" FUSE filesystem mounted over the disk, application code
mmapped a 128MB file on the disk and was randomly dirtying pages in that
mapping.

Because FUSE uses strictlimit and has a default max_ratio of only 1%,
in balance_dirty_pages, thresh is ~200, bg_thresh is ~100, and the
dirty_freerun_ceiling is the average of those, ~150. So, it pauses the
dirtying processes when we have 151 dirty pages and wakes up a
background writeback worker. But the worker tests the wrong threshold
(200 instead of 100), so it does not initiate writeback and just
returns.

Thus, balance_dirty_pages keeps looping, sleeping and then waking up the
worker who will do nothing. It remains stuck in this state until the few
dirty pages that we have finally expire and we write them back for that
reason. Then the whole process repeats, resulting in near-zero
throughput through the FUSE BDI.

The fix is to call the parameterized variant of wb_calc_thresh, so that
the worker will do writeback if the bg_thresh is exceeded which was the
bahavior before the referenced commit.

Fixes: 947e9762a8dd ("writeback: update wb_over_bg_thresh() to use
                     wb_domain aware operations")
Signed-off-by: Howard Cochran <hcochran@kernelspring.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 999792d..bc5149d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1910,7 +1910,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
 	if (gdtc->dirty > gdtc->bg_thresh)
 		return true;
 
-	if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+	if (wb_stat(wb, WB_RECLAIMABLE) >
+	    wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
 		return true;
 
 	if (mdtc) {
@@ -1924,7 +1925,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
 		if (mdtc->dirty > mdtc->bg_thresh)
 			return true;
 
-		if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+		if (wb_stat(wb, WB_RECLAIMABLE) >
+		    wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
 			return true;
 	}
 
-- 
cgit v0.10.2


From 37e58237a16b94fcd2c2d1b7e9c6e1ca661c231b Mon Sep 17 00:00:00 2001
From: Ming Lin <ming.l@ssi.samsung.com>
Date: Tue, 22 Mar 2016 00:24:44 -0700
Subject: block: add offset in blk_add_request_payload()

We could kmalloc() the payload, so need the offset in page.

Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index b60537b..c502277 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1523,6 +1523,7 @@ EXPORT_SYMBOL(blk_put_request);
  * blk_add_request_payload - add a payload to a request
  * @rq: request to update
  * @page: page backing the payload
+ * @offset: offset in page
  * @len: length of the payload.
  *
  * This allows to later add a payload to an already submitted request by
@@ -1533,12 +1534,12 @@ EXPORT_SYMBOL(blk_put_request);
  * discard requests should ever use it.
  */
 void blk_add_request_payload(struct request *rq, struct page *page,
-		unsigned int len)
+		int offset, unsigned int len)
 {
 	struct bio *bio = rq->bio;
 
 	bio->bi_io_vec->bv_page = page;
-	bio->bi_io_vec->bv_offset = 0;
+	bio->bi_io_vec->bv_offset = offset;
 	bio->bi_io_vec->bv_len = len;
 
 	bio->bi_iter.bi_size = len;
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 586f916..9a9ec21 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -562,7 +562,7 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
 	put_unaligned_be32(count, &buf[16]);
 
 	req = skreq->req;
-	blk_add_request_payload(req, page, len);
+	blk_add_request_payload(req, page, 0, len);
 }
 
 static void skd_request_fn_not_online(struct request_queue *q);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index f52b74c..69b0a4a 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -779,7 +779,7 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
 	 * discarded on disk. This allows us to report completion on the full
 	 * amount of blocks described by the request.
 	 */
-	blk_add_request_payload(rq, page, len);
+	blk_add_request_payload(rq, page, 0, len);
 	ret = scsi_init_io(cmd);
 	rq->__data_len = nr_bytes;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 669e419..bbaa767 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -779,7 +779,7 @@ extern struct request *blk_make_request(struct request_queue *, struct bio *,
 extern void blk_rq_set_block_pc(struct request *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern void blk_add_request_payload(struct request *rq, struct page *page,
-		unsigned int len);
+		int offset, unsigned int len);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 			     struct bio_set *bs, gfp_t gfp_mask,
-- 
cgit v0.10.2


From e0489487ec9cd79ee1fa0dc5d3789c08b0e51a2c Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 10 Mar 2016 13:58:46 +0200
Subject: blk-mq: Export tagset iter function

Its useful to iterate on all the active tags in cases
where we will need to fail all the queues IO.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
[hch: carefully check for valid tagsets]
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index abdbb47..2fd0428 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -474,6 +474,18 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 }
 EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
 
+void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
+		busy_tag_iter_fn *fn, void *priv)
+{
+	int i;
+
+	for (i = 0; i < tagset->nr_hw_queues; i++) {
+		if (tagset->tags && tagset->tags[i])
+			blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
+	}
+}
+EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
+
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9ac9799..c808fec 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -240,6 +240,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 		void *priv);
+void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
+		busy_tag_iter_fn *fn, void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
-- 
cgit v0.10.2


From 93e9d8e836cb1a9a58b33eb6643bf061c6119ef2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 12 Apr 2016 12:32:46 -0600
Subject: block: add ability to flag write back caching on a device

Add an internal helper and flag for setting whether a queue has
write back caching, or write through (or none). Add a sysfs file
to show this as well, and make it changeable from user space.

This will replace the (awkward) blk_queue_flush() interface that
drivers currently use to inform the block layer of write cache state
and capabilities.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index e5d9148..dce25d8 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -141,6 +141,15 @@ control of this block device to that new IO scheduler. Note that writing
 an IO scheduler name to this file will attempt to load that IO scheduler
 module, if it isn't already present in the system.
 
+write_cache (RW)
+----------------
+When read, this file will display whether the device has write back
+caching enabled or not. It will return "write back" for the former
+case, and "write through" for the latter. Writing to this file can
+change the kernels view of the device, but it doesn't alter the
+device state. This means that it might not be safe to toggle the
+setting from "write back" to "write through", since that will also
+eliminate cache flushes issued by the kernel.
 
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 331e4ee..c903bee 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -846,6 +846,32 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
 }
 EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 
+/**
+ * blk_queue_write_cache - configure queue's write cache
+ * @q:		the request queue for the device
+ * @wc:		write back cache on or off
+ * @fua:	device supports FUA writes, if true
+ *
+ * Tell the block layer about the write cache of @q.
+ */
+void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
+{
+	spin_lock_irq(q->queue_lock);
+	if (wc) {
+		queue_flag_set(QUEUE_FLAG_WC, q);
+		q->flush_flags = REQ_FLUSH;
+	} else
+		queue_flag_clear(QUEUE_FLAG_WC, q);
+	if (fua) {
+		if (wc)
+			q->flush_flags |= REQ_FUA;
+		queue_flag_set(QUEUE_FLAG_FUA, q);
+	} else
+		queue_flag_clear(QUEUE_FLAG_FUA, q);
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(blk_queue_write_cache);
+
 static int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 995b58d..9920596 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -347,6 +347,38 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	return ret;
 }
 
+static ssize_t queue_wc_show(struct request_queue *q, char *page)
+{
+	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+		return sprintf(page, "write back\n");
+
+	return sprintf(page, "write through\n");
+}
+
+static ssize_t queue_wc_store(struct request_queue *q, const char *page,
+			      size_t count)
+{
+	int set = -1;
+
+	if (!strncmp(page, "write back", 10))
+		set = 1;
+	else if (!strncmp(page, "write through", 13) ||
+		 !strncmp(page, "none", 4))
+		set = 0;
+
+	if (set == -1)
+		return -EINVAL;
+
+	spin_lock_irq(q->queue_lock);
+	if (set)
+		queue_flag_set(QUEUE_FLAG_WC, q);
+	else
+		queue_flag_clear(QUEUE_FLAG_WC, q);
+	spin_unlock_irq(q->queue_lock);
+
+	return count;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -478,6 +510,12 @@ static struct queue_sysfs_entry queue_poll_entry = {
 	.store = queue_poll_store,
 };
 
+static struct queue_sysfs_entry queue_wc_entry = {
+	.attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_wc_show,
+	.store = queue_wc_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -503,6 +541,7 @@ static struct attribute *default_attrs[] = {
 	&queue_iostats_entry.attr,
 	&queue_random_entry.attr,
 	&queue_poll_entry.attr,
+	&queue_wc_entry.attr,
 	NULL,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bbaa767..ba72687 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -491,6 +491,8 @@ struct request_queue {
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
 #define QUEUE_FLAG_NO_SG_MERGE 21	/* don't attempt to merge SG segments*/
 #define QUEUE_FLAG_POLL	       22	/* IO polling enabled if set */
+#define QUEUE_FLAG_WC	       23	/* Write back caching */
+#define QUEUE_FLAG_FUA	       24	/* device supports FUA writes */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -1009,6 +1011,7 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
+extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
-- 
cgit v0.10.2


From 1fcbcc333f1fae6e11cc0839a6e72bc1a3e830bf Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 25 Apr 2016 16:50:14 -0700
Subject: block: copy NOMERGE flag from bio to request

bio might have NOMERGE flag set, for example blk_queue_split sets it.
When we initiate request, copy this flag too.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 86a38ea..77e5d81 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -208,7 +208,7 @@ enum rq_flag_bits {
 #define REQ_COMMON_MASK \
 	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
 	 REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
-	 REQ_SECURE | REQ_INTEGRITY)
+	 REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 #define BIO_NO_ADVANCE_ITER_MASK	(REQ_DISCARD|REQ_WRITE_SAME)
-- 
cgit v0.10.2


From 9082e87bfbf83579b97e3bfc45d81f3e50da2177 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 16 Apr 2016 14:55:27 -0400
Subject: block: remove struct bio_batch

It can be replaced with a combination of bio_chain and submit_bio_wait.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Signed-off-by: Sagi Grimberg <sagig@grimberg.me>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9ebf653..700d248 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,21 +9,17 @@
 
 #include "blk.h"
 
-struct bio_batch {
-	atomic_t		done;
-	int			error;
-	struct completion	*wait;
-};
-
-static void bio_batch_end_io(struct bio *bio)
+static struct bio *next_bio(struct bio *bio, int rw, unsigned int nr_pages,
+		gfp_t gfp)
 {
-	struct bio_batch *bb = bio->bi_private;
+	struct bio *new = bio_alloc(gfp, nr_pages);
+
+	if (bio) {
+		bio_chain(bio, new);
+		submit_bio(rw, bio);
+	}
 
-	if (bio->bi_error && bio->bi_error != -EOPNOTSUPP)
-		bb->error = bio->bi_error;
-	if (atomic_dec_and_test(&bb->done))
-		complete(bb->wait);
-	bio_put(bio);
+	return new;
 }
 
 /**
@@ -40,13 +36,11 @@ static void bio_batch_end_io(struct bio *bio)
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
 {
-	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q = bdev_get_queue(bdev);
 	int type = REQ_WRITE | REQ_DISCARD;
 	unsigned int granularity;
 	int alignment;
-	struct bio_batch bb;
-	struct bio *bio;
+	struct bio *bio = NULL;
 	int ret = 0;
 	struct blk_plug plug;
 
@@ -66,25 +60,15 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		type |= REQ_SECURE;
 	}
 
-	atomic_set(&bb.done, 1);
-	bb.error = 0;
-	bb.wait = &wait;
-
 	blk_start_plug(&plug);
 	while (nr_sects) {
 		unsigned int req_sects;
 		sector_t end_sect, tmp;
 
-		bio = bio_alloc(gfp_mask, 1);
-		if (!bio) {
-			ret = -ENOMEM;
-			break;
-		}
-
 		/* Make sure bi_size doesn't overflow */
 		req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
 
-		/*
+		/**
 		 * If splitting a request, and the next starting sector would be
 		 * misaligned, stop the discard at the previous aligned sector.
 		 */
@@ -98,18 +82,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			req_sects = end_sect - sector;
 		}
 
+		bio = next_bio(bio, type, 1, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
-		bio->bi_end_io = bio_batch_end_io;
 		bio->bi_bdev = bdev;
-		bio->bi_private = &bb;
 
 		bio->bi_iter.bi_size = req_sects << 9;
 		nr_sects -= req_sects;
 		sector = end_sect;
 
-		atomic_inc(&bb.done);
-		submit_bio(type, bio);
-
 		/*
 		 * We can loop for a long time in here, if someone does
 		 * full device discards (like mkfs). Be nice and allow
@@ -118,15 +98,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		 */
 		cond_resched();
 	}
+	if (bio)
+		ret = submit_bio_wait(type, bio);
 	blk_finish_plug(&plug);
 
-	/* Wait for bios in-flight */
-	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion_io(&wait);
-
-	if (bb.error)
-		return bb.error;
-	return ret;
+	return ret != -EOPNOTSUPP ? ret : 0;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
@@ -145,11 +121,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 			    sector_t nr_sects, gfp_t gfp_mask,
 			    struct page *page)
 {
-	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q = bdev_get_queue(bdev);
 	unsigned int max_write_same_sectors;
-	struct bio_batch bb;
-	struct bio *bio;
+	struct bio *bio = NULL;
 	int ret = 0;
 
 	if (!q)
@@ -158,21 +132,10 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	/* Ensure that max_write_same_sectors doesn't overflow bi_size */
 	max_write_same_sectors = UINT_MAX >> 9;
 
-	atomic_set(&bb.done, 1);
-	bb.error = 0;
-	bb.wait = &wait;
-
 	while (nr_sects) {
-		bio = bio_alloc(gfp_mask, 1);
-		if (!bio) {
-			ret = -ENOMEM;
-			break;
-		}
-
+		bio = next_bio(bio, REQ_WRITE | REQ_WRITE_SAME, 1, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
-		bio->bi_end_io = bio_batch_end_io;
 		bio->bi_bdev = bdev;
-		bio->bi_private = &bb;
 		bio->bi_vcnt = 1;
 		bio->bi_io_vec->bv_page = page;
 		bio->bi_io_vec->bv_offset = 0;
@@ -186,18 +149,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 			bio->bi_iter.bi_size = nr_sects << 9;
 			nr_sects = 0;
 		}
-
-		atomic_inc(&bb.done);
-		submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio);
 	}
 
-	/* Wait for bios in-flight */
-	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion_io(&wait);
-
-	if (bb.error)
-		return bb.error;
-	return ret;
+	if (bio)
+		ret = submit_bio_wait(REQ_WRITE | REQ_WRITE_SAME, bio);
+	return ret != -EOPNOTSUPP ? ret : 0;
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
 
@@ -216,28 +172,15 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 				  sector_t nr_sects, gfp_t gfp_mask)
 {
 	int ret;
-	struct bio *bio;
-	struct bio_batch bb;
+	struct bio *bio = NULL;
 	unsigned int sz;
-	DECLARE_COMPLETION_ONSTACK(wait);
 
-	atomic_set(&bb.done, 1);
-	bb.error = 0;
-	bb.wait = &wait;
-
-	ret = 0;
 	while (nr_sects != 0) {
-		bio = bio_alloc(gfp_mask,
-				min(nr_sects, (sector_t)BIO_MAX_PAGES));
-		if (!bio) {
-			ret = -ENOMEM;
-			break;
-		}
-
+		bio = next_bio(bio, WRITE,
+				min(nr_sects, (sector_t)BIO_MAX_PAGES),
+				gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev   = bdev;
-		bio->bi_end_io = bio_batch_end_io;
-		bio->bi_private = &bb;
 
 		while (nr_sects != 0) {
 			sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
@@ -247,18 +190,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			if (ret < (sz << 9))
 				break;
 		}
-		ret = 0;
-		atomic_inc(&bb.done);
-		submit_bio(WRITE, bio);
 	}
 
-	/* Wait for bios in-flight */
-	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion_io(&wait);
-
-	if (bb.error)
-		return bb.error;
-	return ret;
+	if (bio)
+		return submit_bio_wait(WRITE, bio);
+	return 0;
 }
 
 /**
-- 
cgit v0.10.2


From 38f252553300ee1d3346a5273e95fe1dd60ca50a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 16 Apr 2016 14:55:28 -0400
Subject: block: add __blkdev_issue_discard

This is a version of blkdev_issue_discard which doesn't wait for
the I/O to complete, but instead allows the caller to submit
the final bio and/or chain it to others.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Signed-off-by: Sagi Grimberg <sagig@grimberg.me>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 700d248..ccbce2b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -22,45 +22,25 @@ static struct bio *next_bio(struct bio *bio, int rw, unsigned int nr_pages,
 	return new;
 }
 
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev:	blockdev to issue discard for
- * @sector:	start sector
- * @nr_sects:	number of sectors to discard
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	BLKDEV_IFL_* flags to control behaviour
- *
- * Description:
- *    Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, int type, struct bio **biop)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
-	int type = REQ_WRITE | REQ_DISCARD;
+	struct bio *bio = *biop;
 	unsigned int granularity;
 	int alignment;
-	struct bio *bio = NULL;
-	int ret = 0;
-	struct blk_plug plug;
 
 	if (!q)
 		return -ENXIO;
-
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
+	if ((type & REQ_SECURE) && !blk_queue_secdiscard(q))
+		return -EOPNOTSUPP;
 
 	/* Zero-sector (unknown) and one-sector granularities are the same.  */
 	granularity = max(q->limits.discard_granularity >> 9, 1U);
 	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
 
-	if (flags & BLKDEV_DISCARD_SECURE) {
-		if (!blk_queue_secdiscard(q))
-			return -EOPNOTSUPP;
-		type |= REQ_SECURE;
-	}
-
-	blk_start_plug(&plug);
 	while (nr_sects) {
 		unsigned int req_sects;
 		sector_t end_sect, tmp;
@@ -98,7 +78,38 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		 */
 		cond_resched();
 	}
-	if (bio)
+
+	*biop = bio;
+	return 0;
+}
+EXPORT_SYMBOL(__blkdev_issue_discard);
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+	int type = REQ_WRITE | REQ_DISCARD;
+	struct bio *bio = NULL;
+	struct blk_plug plug;
+	int ret;
+
+	if (flags & BLKDEV_DISCARD_SECURE)
+		type |= REQ_SECURE;
+
+	blk_start_plug(&plug);
+	ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, type,
+			&bio);
+	if (!ret && bio)
 		ret = submit_bio_wait(type, bio);
 	blk_finish_plug(&plug);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ba72687..b79131a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1131,6 +1131,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, int type, struct bio **biop);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-- 
cgit v0.10.2


From a21f2a3ec62abe2e06500d6550659a0ff5624fbb Mon Sep 17 00:00:00 2001
From: Michael Callahan <michaelcallahan@fb.com>
Date: Tue, 3 May 2016 11:12:49 -0400
Subject: block: Minor blk_account_io_start usage cleanup

blk_account_io_start does not need to be wrapped with blk_do_io_stat
ais it already checks for that condition.

Signed-off-by: Michael Callahan <michaelcallahan@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1699baf..0c2ed83 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1122,8 +1122,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
 	init_request_from_bio(rq, bio);
 
-	if (blk_do_io_stat(rq))
-		blk_account_io_start(rq, 1);
+	blk_account_io_start(rq, 1);
 }
 
 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
-- 
cgit v0.10.2


From bbd848e0fade51ae51dab86a0683069cef89953f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 5 May 2016 11:54:21 -0400
Subject: block: reinstate early return of -EOPNOTSUPP from
 blkdev_issue_discard

Commit 38f25255330 ("block: add __blkdev_issue_discard") incorrectly
disallowed the early return of -EOPNOTSUPP if the device doesn't support
discard (or secure discard).  This early return of -EOPNOTSUPP has
always been part of blkdev_issue_discard() interface so there isn't a
good reason to break that behaviour -- especially when it can be easily
reinstated.

The nuance of allowing early return of -EOPNOTSUPP vs disallowing late
return of -EOPNOTSUPP is: if the overall device never advertised support
for discards and one is issued to the device it is beneficial to inform
the caller that discards are not supported via -EOPNOTSUPP.  But if a
device advertises discard support it means that at least a subset of the
device does have discard support -- but it could be that discards issued
to some regions of a stacked device will not be supported.  In that case
the late return of -EOPNOTSUPP must be disallowed.

Fixes: 38f25255330 ("block: add __blkdev_issue_discard")
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-lib.c b/block/blk-lib.c
index ccbce2b..23d7f30 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -109,11 +109,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	blk_start_plug(&plug);
 	ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, type,
 			&bio);
-	if (!ret && bio)
+	if (!ret && bio) {
 		ret = submit_bio_wait(type, bio);
+		if (ret == -EOPNOTSUPP)
+			ret = 0;
+	}
 	blk_finish_plug(&plug);
 
-	return ret != -EOPNOTSUPP ? ret : 0;
+	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
-- 
cgit v0.10.2


From 0ef5a50c1658d4d96a44f145bcb92ff3310c75b1 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 5 May 2016 11:54:22 -0400
Subject: block: make bio_inc_remaining() interface accessible again

Commit 326e1dbb57 ("block: remove management of bi_remaining when
restoring original bi_end_io") made bio_inc_remaining() private to bio.c
because the only use-case that made sense was confined to the
bio_chain() interface.

Since that time DM thinp went on to use bio_chain() in its relatively
complex implementation of async discard support.  That implementation,
even when converted over to use the new async __blkdev_issue_discard()
interface, depends on deferred completion of the original discard bio --
which is most appropriately implemented using bio_inc_remaining().

DM thinp foolishly duplicated bio_inc_remaining(), local to dm-thin.c as
__bio_inc_remaining(), so re-exporting bio_inc_remaining() allows us to
put an end to that foolishness.

All said, bio_inc_remaining() should really only be used in conjunction
with bio_chain().  It isn't intended for generic bio reference counting.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/bio.c b/block/bio.c
index 807d25e..0e4aa42 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -311,17 +311,6 @@ static void bio_chain_endio(struct bio *bio)
 	bio_endio(__bio_chain_endio(bio));
 }
 
-/*
- * Increment chain count for the bio. Make sure the CHAIN flag update
- * is visible before the raised count.
- */
-static inline void bio_inc_remaining(struct bio *bio)
-{
-	bio_set_flag(bio, BIO_CHAIN);
-	smp_mb__before_atomic();
-	atomic_inc(&bio->__bi_remaining);
-}
-
 /**
  * bio_chain - chain bio completions
  * @bio: the target bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6b7481f..9faebf7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -703,6 +703,17 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 }
 
 /*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio_set_flag(bio, BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
+/*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
  * These memory pools in turn all allocate from the bio_slab
-- 
cgit v0.10.2


From b7d7641e2a8640a2f6ac7cdadd6c56eb12728b70 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 9 May 2016 17:22:13 -0700
Subject: blktrace: delete garbage for message trace

commit f4a1d08ce65 introduces a regression. Originally for
BLK_TN_MESSAGE, we add message in trace and return. The commit ignores
the early return and add garbage info.

Signed-off-by: Shaohua Li <shli@fb.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f94e7a2..8ae9ea2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1349,6 +1349,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	if (t->action == BLK_TN_MESSAGE) {
 		log_action(iter, long_act ? "message" : "m");
 		blk_log_msg(s, iter->ent);
+		return trace_handle_return(s);
 	}
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-- 
cgit v0.10.2


From 8d1547e08dda8848f01185b3fd6bb946b68de99f Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 9 May 2016 17:22:14 -0700
Subject: blktrace: add missed mask name

BLK_TC_NOTIFY is missed in mask_maps, so we can't print out notify or
set mask with 'notify' name.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8ae9ea2..9aef865 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1552,6 +1552,7 @@ static const struct {
 	{ BLK_TC_COMPLETE,	"complete"	},
 	{ BLK_TC_FS,		"fs"		},
 	{ BLK_TC_PC,		"pc"		},
+	{ BLK_TC_NOTIFY,	"notify"	},
 	{ BLK_TC_AHEAD,		"ahead"		},
 	{ BLK_TC_META,		"meta"		},
 	{ BLK_TC_DISCARD,	"discard"	},
-- 
cgit v0.10.2


From 59fa0224cfea31dde596e29555de94c961b139f9 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 9 May 2016 17:22:15 -0700
Subject: blk-throttle: don't parse cgroup path if trace isn't enabled

if trace isn't enabled, parsing cgroup path just wastes cpu

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2149a1d..47a3e54 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -211,15 +211,14 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
  *
  * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
  * throtl_grp; otherwise, just "throtl".
- *
- * TODO: this should be made a function and name formatting should happen
- * after testing whether blktrace is enabled.
  */
 #define throtl_log(sq, fmt, args...)	do {				\
 	struct throtl_grp *__tg = sq_to_tg((sq));			\
 	struct throtl_data *__td = sq_to_td((sq));			\
 									\
 	(void)__td;							\
+	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
+		break;							\
 	if ((__tg)) {							\
 		char __pbuf[128];					\
 									\
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index afc1343..0f3172b 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -57,6 +57,14 @@ void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 	} while (0)
 #define BLK_TN_MAX_MSG		128
 
+static inline bool blk_trace_note_message_enabled(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+	if (likely(!bt))
+		return false;
+	return bt->act_mask & BLK_TC_NOTIFY;
+}
+
 extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
 				void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
@@ -79,6 +87,7 @@ extern struct attribute_group blk_trace_attr_group;
 # define blk_trace_remove(q)				(-ENOTTY)
 # define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
 # define blk_trace_remove_sysfs(dev)			do { } while (0)
+# define blk_trace_note_message_enabled(q)		(false)
 static inline int blk_trace_init_sysfs(struct device *dev)
 {
 	return 0;
-- 
cgit v0.10.2


From b3a834b1596ac668df206aa2bb1f191c31f5f5e4 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Mon, 16 May 2016 09:54:47 -0600
Subject: blk-mq: fix undefined behaviour in order_to_size()

When this_order variable in blk_mq_init_rq_map() becomes zero
the code incorrectly decrements the variable and passes the result
to order_to_size() helper causing undefined behaviour:

 UBSAN: Undefined behaviour in block/blk-mq.c:1459:27
 shift exponent 4294967295 is too large for 32-bit type 'unsigned int'
 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.6.0-rc6-00072-g33656a1 #22

Fix the code by checking this_order variable for not having the zero
value first.

Reported-by: Meelis Roos <mroos@linux.ee>
Fixes: 320ae51feed5 ("blk-mq: new multi-queue block IO queueing mechanism")
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0c2ed83..7df9c92 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1495,7 +1495,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 		int to_do;
 		void *p;
 
-		while (left < order_to_size(this_order - 1) && this_order)
+		while (this_order && left < order_to_size(this_order - 1))
 			this_order--;
 
 		do {
-- 
cgit v0.10.2