From 5014c311baa2b21384321fa4a9f617a92e3e56f0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 2 Sep 2015 16:46:02 -0600
Subject: block: fix bogus compiler warnings in blk-merge.c

The compiler can't figure out that bvprv is initialized whenever 'prev'
is set to 1 as well. Use a pointer to bvprv instead, setting it to NULL
initially, and get rid of the 'prev' tracking. This dumbs it down
enough that gcc is happy.

Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-merge.c b/block/blk-merge.c
index d088cff..cce23ba 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -67,10 +67,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 					 struct bio_set *bs)
 {
 	struct bio *split;
-	struct bio_vec bv, bvprv;
+	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned seg_size = 0, nsegs = 0, sectors = 0;
-	int prev = 0;
 
 	bio_for_each_segment(bv, bio, iter) {
 		sectors += bv.bv_len >> 9;
@@ -82,20 +81,20 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (prev && bvec_gap_to_prev(q, &bvprv, bv.bv_offset))
+		if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
 			goto split;
 
-		if (prev && blk_queue_cluster(q)) {
+		if (bvprvp && blk_queue_cluster(q)) {
 			if (seg_size + bv.bv_len > queue_max_segment_size(q))
 				goto new_segment;
-			if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
+			if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv))
 				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
+			if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv))
 				goto new_segment;
 
 			seg_size += bv.bv_len;
 			bvprv = bv;
-			prev = 1;
+			bvprvp = &bv;
 			continue;
 		}
 new_segment:
@@ -104,7 +103,7 @@ new_segment:
 
 		nsegs++;
 		bvprv = bv;
-		prev = 1;
+		bvprvp = &bv;
 		seg_size = bv.bv_len;
 	}
 
-- 
cgit v0.10.2


From de65d2d26f81f7f84c7258b3a137f20e8fa5bb6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <mb@lightnvm.io>
Date: Mon, 31 Aug 2015 14:17:18 +0200
Subject: null_blk: fix memory leak on cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Driver was not freeing the memory allocated for internal nullb queues.
This patch frees the memory during driver unload.

Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 17269a3..d394a85 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -406,6 +406,22 @@ static struct blk_mq_ops null_mq_ops = {
 	.complete	= null_softirq_done_fn,
 };
 
+static void cleanup_queue(struct nullb_queue *nq)
+{
+	kfree(nq->tag_map);
+	kfree(nq->cmds);
+}
+
+static void cleanup_queues(struct nullb *nullb)
+{
+	int i;
+
+	for (i = 0; i < nullb->nr_queues; i++)
+		cleanup_queue(&nullb->queues[i]);
+
+	kfree(nullb->queues);
+}
+
 static void null_del_dev(struct nullb *nullb)
 {
 	list_del_init(&nullb->list);
@@ -415,6 +431,7 @@ static void null_del_dev(struct nullb *nullb)
 	if (queue_mode == NULL_Q_MQ)
 		blk_mq_free_tag_set(&nullb->tag_set);
 	put_disk(nullb->disk);
+	cleanup_queues(nullb);
 	kfree(nullb);
 }
 
@@ -459,22 +476,6 @@ static int setup_commands(struct nullb_queue *nq)
 	return 0;
 }
 
-static void cleanup_queue(struct nullb_queue *nq)
-{
-	kfree(nq->tag_map);
-	kfree(nq->cmds);
-}
-
-static void cleanup_queues(struct nullb *nullb)
-{
-	int i;
-
-	for (i = 0; i < nullb->nr_queues; i++)
-		cleanup_queue(&nullb->queues[i]);
-
-	kfree(nullb->queues);
-}
-
 static int setup_queues(struct nullb *nullb)
 {
 	nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue),
-- 
cgit v0.10.2


From 5fdb7e1b976dc9d18aff8c711e51d17c4c324a0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Mon, 31 Aug 2015 14:17:31 +0200
Subject: null_blk: fix wrong capacity when bs is not 512 bytes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

set_capacity() sets device's capacity using 512 bytes sectors.
null_blk calculates the number of sectors by size / bs, which
set_capacity is called with. This led to null_blk exposing the
wrong number of sectors when bs is not 512 bytes.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index d394a85..a295b98 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -589,8 +589,7 @@ static int null_add_dev(void)
 	blk_queue_physical_block_size(nullb->q, bs);
 
 	size = gb * 1024 * 1024 * 1024ULL;
-	sector_div(size, bs);
-	set_capacity(disk, size);
+	set_capacity(disk, size >> 9);
 
 	disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
 	disk->major		= null_major;
-- 
cgit v0.10.2


From 5e7c4274a70aa2d6f485996d0ca1dad52d0039ca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 3 Sep 2015 19:28:20 +0300
Subject: block: Check for gaps on front and back merges

We are checking for gaps to previous bio_vec, which can
only detect back merges gaps. Moreover, at the point where
we check for a gap, we don't know if we will attempt a back
or a front merge. Thus, check for gap to prev in a back merge
attempt and check for a gap to next in a front merge attempt.

Signed-off-by: Jens Axboe <axboe@fb.com>
[sagig: Minor rename change]
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>

diff --git a/block/blk-merge.c b/block/blk-merge.c
index cce23ba..d9eddbc 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -438,6 +438,8 @@ no_merge:
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
 		     struct bio *bio)
 {
+	if (req_gap_back_merge(req, bio))
+		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req)) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -456,6 +458,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		      struct bio *bio)
 {
+
+	if (req_gap_front_merge(req, bio))
+		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req)) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -482,14 +487,6 @@ static bool req_no_special_merge(struct request *req)
 	return !q->mq_ops && req->special;
 }
 
-static int req_gap_to_prev(struct request *req, struct bio *next)
-{
-	struct bio *prev = req->biotail;
-
-	return bvec_gap_to_prev(req->q, &prev->bi_io_vec[prev->bi_vcnt - 1],
-			next->bi_io_vec[0].bv_offset);
-}
-
 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 				struct request *next)
 {
@@ -504,7 +501,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	if (req_no_special_merge(req) || req_no_special_merge(next))
 		return 0;
 
-	if (req_gap_to_prev(req, next->bio))
+	if (req_gap_back_merge(req, next->bio))
 		return 0;
 
 	/*
@@ -712,10 +709,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	    !blk_write_same_mergeable(rq->bio, bio))
 		return false;
 
-	/* Only check gaps if the bio carries data */
-	if (bio_has_data(bio) && req_gap_to_prev(rq, bio))
-		return false;
-
 	return true;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a622f27..2ff94de 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1368,6 +1368,26 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
 		((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
 }
 
+static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
+			 struct bio *next)
+{
+	if (!bio_has_data(prev))
+		return false;
+
+	return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1],
+				next->bi_io_vec[0].bv_offset);
+}
+
+static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
+{
+	return bio_will_gap(req->q, req->biotail, bio);
+}
+
+static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
+{
+	return bio_will_gap(req->q, bio, req->bio);
+}
+
 struct work_struct;
 int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
-- 
cgit v0.10.2


From 7f39add3b08cbbdb99abe50e6d7c342e6800d684 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Fri, 11 Sep 2015 09:03:04 -0600
Subject: block: Refuse request/bio merges with gaps in the integrity payload

If a driver sets the block queue virtual boundary mask, it means that
it cannot handle gaps so we must not allow those in the integrity
payload as well.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>

Fixed up by me to have duplicate integrity merge functions, depending
on whether block integrity is enabled or not. Fixes a compilations
issue with CONFIG_BLK_DEV_INTEGRITY unset.

Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index f548b64..75f29cf 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -204,6 +204,9 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
 	    q->limits.max_integrity_segments)
 		return false;
 
+	if (integrity_req_gap_back_merge(req, next->bio))
+		return false;
+
 	return true;
 }
 EXPORT_SYMBOL(blk_integrity_merge_rq);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d9eddbc..574ea7c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -440,6 +440,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 {
 	if (req_gap_back_merge(req, bio))
 		return 0;
+	if (blk_integrity_rq(req) &&
+	    integrity_req_gap_back_merge(req, bio))
+		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req)) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -461,6 +464,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 
 	if (req_gap_front_merge(req, bio))
 		return 0;
+	if (blk_integrity_rq(req) &&
+	    integrity_req_gap_front_merge(req, bio))
+		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req)) {
 		req->cmd_flags |= REQ_NOMERGE;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2ff94de..1aac731 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1514,6 +1514,26 @@ queue_max_integrity_segments(struct request_queue *q)
 	return q->limits.max_integrity_segments;
 }
 
+static inline bool integrity_req_gap_back_merge(struct request *req,
+						struct bio *next)
+{
+	struct bio_integrity_payload *bip = bio_integrity(req->bio);
+	struct bio_integrity_payload *bip_next = bio_integrity(next);
+
+	return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+				bip_next->bip_vec[0].bv_offset);
+}
+
+static inline bool integrity_req_gap_front_merge(struct request *req,
+						 struct bio *bio)
+{
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+	struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
+
+	return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+				bip_next->bip_vec[0].bv_offset);
+}
+
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 struct bio;
@@ -1580,6 +1600,16 @@ static inline bool blk_integrity_is_initialized(struct gendisk *g)
 {
 	return 0;
 }
+static inline bool integrity_req_gap_back_merge(struct request *req,
+						struct bio *next)
+{
+	return false;
+}
+static inline bool integrity_req_gap_front_merge(struct request *req,
+						 struct bio *bio)
+{
+	return false;
+}
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-- 
cgit v0.10.2


From 87a816df537e096d404add543ef47b796906c130 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Tue, 8 Sep 2015 09:33:35 -0600
Subject: block: Refuse adding appending a gapped integrity page to a bio

This is only theoretical at the moment given that the only
subsystems that generate integrity payloads are the block layer
itself and the scsi target (which generate well aligned integrity
payloads). But when we will expose integrity meta-data to user-space,
we'll need to refuse appending a page with a gap (if the queue
virtual boundary is set).

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4aecca7..14b8faf 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -140,6 +140,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 
 	iv = bip->bip_vec + bip->bip_vcnt;
 
+	if (bip->bip_vcnt &&
+	    bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+			     &bip->bip_vec[bip->bip_vcnt - 1], offset))
+		return 0;
+
 	iv->bv_page = page;
 	iv->bv_len = len;
 	iv->bv_offset = offset;
-- 
cgit v0.10.2


From 46348456c1791053dcbe5a9e21825b10a3c8a8fb Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 3 Sep 2015 19:28:23 +0300
Subject: block: Copy a user iovec if it includes gaps

For drivers that don't support gaps in the SG lists handed to
them we must bounce (copy the user buffers) and pass a bio that
does not include gaps. This doesn't matter for any current user,
but will help to allow iser which can't handle gaps to use the
block virtual boundary instead of using driver-local bounce
buffering when handling SG_IO commands.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-map.c b/block/blk-map.c
index 2338416..f565e11 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -9,6 +9,24 @@
 
 #include "blk.h"
 
+static bool iovec_gap_to_prv(struct request_queue *q,
+			     struct iovec *prv, struct iovec *cur)
+{
+	unsigned long prev_end;
+
+	if (!queue_virt_boundary(q))
+		return false;
+
+	if (prv->iov_base == NULL && prv->iov_len == 0)
+		/* prv is not set - don't check */
+		return false;
+
+	prev_end = (unsigned long)(prv->iov_base + prv->iov_len);
+
+	return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) ||
+		prev_end & queue_virt_boundary(q));
+}
+
 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio)
 {
@@ -67,7 +85,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	struct bio *bio;
 	int unaligned = 0;
 	struct iov_iter i;
-	struct iovec iov;
+	struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
 
 	if (!iter || !iter->count)
 		return -EINVAL;
@@ -81,8 +99,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		/*
 		 * Keep going so we check length of all segments
 		 */
-		if (uaddr & queue_dma_alignment(q))
+		if ((uaddr & queue_dma_alignment(q)) ||
+		    iovec_gap_to_prv(q, &prv, &iov))
 			unaligned = 1;
+
+		prv.iov_base = iov.iov_base;
+		prv.iov_len = iov.iov_len;
 	}
 
 	if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
-- 
cgit v0.10.2


From 6fe810bda0bd9a5d7674fc671fac27b8aa8ec243 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 5 Sep 2015 15:47:36 -0400
Subject: block: blkg_destroy_all() should clear q->root_blkg and
 ->root_rl.blkg

While making the root blkg unconditional, ec13b1d6f0a0 ("blkcg: always
create the blkcg_gq for the root blkcg") removed the part which clears
q->root_blkg and ->root_rl.blkg during q exit.  This leaves the two
pointers dangling after blkg_destroy_all().  blk-throttle exit path
performs blkg traversals and dereferences ->root_blkg and can lead to
the following oops.

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000558
 IP: [<ffffffff81389746>] __blkg_lookup+0x26/0x70
 ...
 task: ffff88001b4e2580 ti: ffff88001ac0c000 task.ti: ffff88001ac0c000
 RIP: 0010:[<ffffffff81389746>]  [<ffffffff81389746>] __blkg_lookup+0x26/0x70
 ...
 Call Trace:
  [<ffffffff8138d14a>] blk_throtl_drain+0x5a/0x110
  [<ffffffff8138a108>] blkcg_drain_queue+0x18/0x20
  [<ffffffff81369a70>] __blk_drain_queue+0xc0/0x170
  [<ffffffff8136a101>] blk_queue_bypass_start+0x61/0x80
  [<ffffffff81388c59>] blkcg_deactivate_policy+0x39/0x100
  [<ffffffff8138d328>] blk_throtl_exit+0x38/0x50
  [<ffffffff8138a14e>] blkcg_exit_queue+0x3e/0x50
  [<ffffffff8137016e>] blk_release_queue+0x1e/0xc0
 ...

While the bug is a straigh-forward use-after-free bug, it is tricky to
reproduce because blkg release is RCU protected and the rest of exit
path usually finishes before RCU grace period.

This patch fixes the bug by updating blkg_destro_all() to clear
q->root_blkg and ->root_rl.blkg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Richard W.M. Jones" <rjones@redhat.com>
Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Link: http://lkml.kernel.org/g/CA+5PVA5rzQ0s4723n5rHBcxQa9t0cW8BPPBekr_9aMRoWt2aYg@mail.gmail.com
Fixes: ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg")
Cc: stable@vger.kernel.org # v4.2+
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d6283b3..9cc48d1d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -387,6 +387,9 @@ static void blkg_destroy_all(struct request_queue *q)
 		blkg_destroy(blkg);
 		spin_unlock(&blkcg->lock);
 	}
+
+	q->root_blkg = NULL;
+	q->root_rl.blkg = NULL;
 }
 
 /*
-- 
cgit v0.10.2


From 52cc6eead9095e2faf2ec7afc013aa3af1f01ac5 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 17 Sep 2015 09:58:38 -0600
Subject: block: blk-merge: fast-clone bio when splitting rw bios

biovecs has become immutable since v3.13, so it isn't necessary
to allocate biovecs for the new cloned bios, then we can save
one extra biovecs allocation/copy, and the allocation is often
not fixed-length and a bit more expensive.

For example, if the 'max_sectors_kb' of null blk's queue is set
as 16(32 sectors) via sysfs just for making more splits, this patch
can increase throught about ~70% in the sequential read test over
null_blk(direct io, bs: 1M).

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Ming Lin <ming.l@ssi.samsung.com>
Cc: Dongsu Park <dpark@posteo.net>
Signed-off-by: Ming Lei <ming.lei@canonical.com>

This fixes a performance regression introduced by commit 54efd50bfd,
and allows us to take full advantage of the fact that we have immutable
bio_vecs. Hand applied, as it rejected violently with commit
5014c311baa2.

Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 574ea7c..c4e9c37 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -66,15 +66,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 					 struct bio *bio,
 					 struct bio_set *bs)
 {
-	struct bio *split;
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned seg_size = 0, nsegs = 0, sectors = 0;
 
 	bio_for_each_segment(bv, bio, iter) {
-		sectors += bv.bv_len >> 9;
-
-		if (sectors > queue_max_sectors(q))
+		if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q))
 			goto split;
 
 		/*
@@ -95,6 +92,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 			seg_size += bv.bv_len;
 			bvprv = bv;
 			bvprvp = &bv;
+			sectors += bv.bv_len >> 9;
 			continue;
 		}
 new_segment:
@@ -105,21 +103,12 @@ new_segment:
 		bvprv = bv;
 		bvprvp = &bv;
 		seg_size = bv.bv_len;
+		sectors += bv.bv_len >> 9;
 	}
 
 	return NULL;
 split:
-	split = bio_clone_bioset(bio, GFP_NOIO, bs);
-
-	split->bi_iter.bi_size -= iter.bi_size;
-	bio->bi_iter = iter;
-
-	if (bio_integrity(bio)) {
-		bio_integrity_advance(bio, split->bi_iter.bi_size);
-		bio_integrity_trim(split, 0, bio_sectors(split));
-	}
-
-	return split;
+	return bio_split(bio, sectors, GFP_NOIO, bs);
 }
 
 void blk_queue_split(struct request_queue *q, struct bio **bio,
-- 
cgit v0.10.2


From 994518799930fc363d47cb7cf0d1abed1790bf16 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Fri, 18 Sep 2015 00:06:28 +0800
Subject: block: fix bounce_end_io

When bio bounce is involved, one new bio and its biovecs are
cloned from the comming bio, which can be one fast-cloned bio
from upper layer(such as dm).

So it is obviously wrong to assume the start index of the coming(
original) bio's io vector is zero, which can be any value between
0 and (bi_max_vecs - 1), especially in case of bio split.

This patch fixes Fedora's booting oops on i386, often with the
following kernel log together:

> [    9.026738] systemd[1]: Switching root.
> [    9.036467] systemd-journald[149]: Received SIGTERM from PID 1
> (systemd).
> [    9.082262] BUG: Bad page state in process kworker/u5:1  pfn:372ac
> [    9.083989] page:f3d32ae0 count:0 mapcount:0 mapping:f2252178
> index:0x16a
> [    9.085755] flags: 0x40020021(locked|lru|mappedtodisk)
> [    9.087284] page dumped because: page still charged to cgroup
> [    9.088772] bad because of flags:
> [    9.089731] flags: 0x21(locked|lru)
> [    9.090818] page->mem_cgroup:f2c3e400

Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Tested-by: Adam Williamson <awilliam@redhat.com>
Cc: Ming Lin <mlin@kernel.org>
Cc: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/bounce.c b/block/bounce.c
index 2c310ea..457ebd5 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -128,12 +128,14 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 	struct bio *bio_orig = bio->bi_private;
 	struct bio_vec *bvec, *org_vec;
 	int i;
+	int start = bio_orig->bi_iter.bi_idx;
 
 	/*
 	 * free up bounce indirect pages used
 	 */
 	bio_for_each_segment_all(bvec, bio, i) {
-		org_vec = bio_orig->bi_io_vec + i;
+		org_vec = bio_orig->bi_io_vec + i + start;
+
 		if (bvec->bv_page == org_vec->bv_page)
 			continue;
 
-- 
cgit v0.10.2