From 5014c311baa2b21384321fa4a9f617a92e3e56f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Sep 2015 16:46:02 -0600 Subject: block: fix bogus compiler warnings in blk-merge.c The compiler can't figure out that bvprv is initialized whenever 'prev' is set to 1 as well. Use a pointer to bvprv instead, setting it to NULL initially, and get rid of the 'prev' tracking. This dumbs it down enough that gcc is happy. Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index d088cff..cce23ba 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -67,10 +67,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio_set *bs) { struct bio *split; - struct bio_vec bv, bvprv; + struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned seg_size = 0, nsegs = 0, sectors = 0; - int prev = 0; bio_for_each_segment(bv, bio, iter) { sectors += bv.bv_len >> 9; @@ -82,20 +81,20 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ - if (prev && bvec_gap_to_prev(q, &bvprv, bv.bv_offset)) + if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) goto split; - if (prev && blk_queue_cluster(q)) { + if (bvprvp && blk_queue_cluster(q)) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) + if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) + if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) goto new_segment; seg_size += bv.bv_len; bvprv = bv; - prev = 1; + bvprvp = &bv; continue; } new_segment: @@ -104,7 +103,7 @@ new_segment: nsegs++; bvprv = bv; - prev = 1; + bvprvp = &bv; seg_size = bv.bv_len; } -- cgit v0.10.2 From de65d2d26f81f7f84c7258b3a137f20e8fa5bb6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 31 Aug 2015 14:17:18 +0200 Subject: null_blk: fix memory leak on cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Driver was not freeing the memory allocated for internal nullb queues. This patch frees the memory during driver unload. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 17269a3..d394a85 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -406,6 +406,22 @@ static struct blk_mq_ops null_mq_ops = { .complete = null_softirq_done_fn, }; +static void cleanup_queue(struct nullb_queue *nq) +{ + kfree(nq->tag_map); + kfree(nq->cmds); +} + +static void cleanup_queues(struct nullb *nullb) +{ + int i; + + for (i = 0; i < nullb->nr_queues; i++) + cleanup_queue(&nullb->queues[i]); + + kfree(nullb->queues); +} + static void null_del_dev(struct nullb *nullb) { list_del_init(&nullb->list); @@ -415,6 +431,7 @@ static void null_del_dev(struct nullb *nullb) if (queue_mode == NULL_Q_MQ) blk_mq_free_tag_set(&nullb->tag_set); put_disk(nullb->disk); + cleanup_queues(nullb); kfree(nullb); } @@ -459,22 +476,6 @@ static int setup_commands(struct nullb_queue *nq) return 0; } -static void cleanup_queue(struct nullb_queue *nq) -{ - kfree(nq->tag_map); - kfree(nq->cmds); -} - -static void cleanup_queues(struct nullb *nullb) -{ - int i; - - for (i = 0; i < nullb->nr_queues; i++) - cleanup_queue(&nullb->queues[i]); - - kfree(nullb->queues); -} - static int setup_queues(struct nullb *nullb) { nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue), -- cgit v0.10.2 From 5fdb7e1b976dc9d18aff8c711e51d17c4c324a0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 31 Aug 2015 14:17:31 +0200 Subject: null_blk: fix wrong capacity when bs is not 512 bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit set_capacity() sets device's capacity using 512 bytes sectors. null_blk calculates the number of sectors by size / bs, which set_capacity is called with. This led to null_blk exposing the wrong number of sectors when bs is not 512 bytes. Signed-off-by: Matias Bjørling Reviewed-by: Ross Zwisler Signed-off-by: Jens Axboe diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index d394a85..a295b98 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -589,8 +589,7 @@ static int null_add_dev(void) blk_queue_physical_block_size(nullb->q, bs); size = gb * 1024 * 1024 * 1024ULL; - sector_div(size, bs); - set_capacity(disk, size); + set_capacity(disk, size >> 9); disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; -- cgit v0.10.2 From 5e7c4274a70aa2d6f485996d0ca1dad52d0039ca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Sep 2015 19:28:20 +0300 Subject: block: Check for gaps on front and back merges We are checking for gaps to previous bio_vec, which can only detect back merges gaps. Moreover, at the point where we check for a gap, we don't know if we will attempt a back or a front merge. Thus, check for gap to prev in a back merge attempt and check for a gap to next in a front merge attempt. Signed-off-by: Jens Axboe [sagig: Minor rename change] Signed-off-by: Sagi Grimberg diff --git a/block/blk-merge.c b/block/blk-merge.c index cce23ba..d9eddbc 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -438,6 +438,8 @@ no_merge: int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + if (req_gap_back_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -456,6 +458,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + + if (req_gap_front_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -482,14 +487,6 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } -static int req_gap_to_prev(struct request *req, struct bio *next) -{ - struct bio *prev = req->biotail; - - return bvec_gap_to_prev(req->q, &prev->bi_io_vec[prev->bi_vcnt - 1], - next->bi_io_vec[0].bv_offset); -} - static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -504,7 +501,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; - if (req_gap_to_prev(req, next->bio)) + if (req_gap_back_merge(req, next->bio)) return 0; /* @@ -712,10 +709,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; - /* Only check gaps if the bio carries data */ - if (bio_has_data(bio) && req_gap_to_prev(rq, bio)) - return false; - return true; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a622f27..2ff94de 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1368,6 +1368,26 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); } +static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, + struct bio *next) +{ + if (!bio_has_data(prev)) + return false; + + return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], + next->bi_io_vec[0].bv_offset); +} + +static inline bool req_gap_back_merge(struct request *req, struct bio *bio) +{ + return bio_will_gap(req->q, req->biotail, bio); +} + +static inline bool req_gap_front_merge(struct request *req, struct bio *bio) +{ + return bio_will_gap(req->q, bio, req->bio); +} + struct work_struct; int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); -- cgit v0.10.2 From 7f39add3b08cbbdb99abe50e6d7c342e6800d684 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 11 Sep 2015 09:03:04 -0600 Subject: block: Refuse request/bio merges with gaps in the integrity payload If a driver sets the block queue virtual boundary mask, it means that it cannot handle gaps so we must not allow those in the integrity payload as well. Signed-off-by: Sagi Grimberg Fixed up by me to have duplicate integrity merge functions, depending on whether block integrity is enabled or not. Fixes a compilations issue with CONFIG_BLK_DEV_INTEGRITY unset. Signed-off-by: Jens Axboe diff --git a/block/blk-integrity.c b/block/blk-integrity.c index f548b64..75f29cf 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -204,6 +204,9 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, q->limits.max_integrity_segments) return false; + if (integrity_req_gap_back_merge(req, next->bio)) + return false; + return true; } EXPORT_SYMBOL(blk_integrity_merge_rq); diff --git a/block/blk-merge.c b/block/blk-merge.c index d9eddbc..574ea7c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -440,6 +440,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, { if (req_gap_back_merge(req, bio)) return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_back_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -461,6 +464,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, if (req_gap_front_merge(req, bio)) return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_front_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2ff94de..1aac731 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1514,6 +1514,26 @@ queue_max_integrity_segments(struct request_queue *q) return q->limits.max_integrity_segments; } +static inline bool integrity_req_gap_back_merge(struct request *req, + struct bio *next) +{ + struct bio_integrity_payload *bip = bio_integrity(req->bio); + struct bio_integrity_payload *bip_next = bio_integrity(next); + + return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + bip_next->bip_vec[0].bv_offset); +} + +static inline bool integrity_req_gap_front_merge(struct request *req, + struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_payload *bip_next = bio_integrity(req->bio); + + return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + bip_next->bip_vec[0].bv_offset); +} + #else /* CONFIG_BLK_DEV_INTEGRITY */ struct bio; @@ -1580,6 +1600,16 @@ static inline bool blk_integrity_is_initialized(struct gendisk *g) { return 0; } +static inline bool integrity_req_gap_back_merge(struct request *req, + struct bio *next) +{ + return false; +} +static inline bool integrity_req_gap_front_merge(struct request *req, + struct bio *bio) +{ + return false; +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ -- cgit v0.10.2 From 87a816df537e096d404add543ef47b796906c130 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 8 Sep 2015 09:33:35 -0600 Subject: block: Refuse adding appending a gapped integrity page to a bio This is only theoretical at the moment given that the only subsystems that generate integrity payloads are the block layer itself and the scsi target (which generate well aligned integrity payloads). But when we will expose integrity meta-data to user-space, we'll need to refuse appending a page with a gap (if the queue virtual boundary is set). Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4aecca7..14b8faf 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -140,6 +140,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; + if (bip->bip_vcnt && + bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + &bip->bip_vec[bip->bip_vcnt - 1], offset)) + return 0; + iv->bv_page = page; iv->bv_len = len; iv->bv_offset = offset; -- cgit v0.10.2 From 46348456c1791053dcbe5a9e21825b10a3c8a8fb Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 3 Sep 2015 19:28:23 +0300 Subject: block: Copy a user iovec if it includes gaps For drivers that don't support gaps in the SG lists handed to them we must bounce (copy the user buffers) and pass a bio that does not include gaps. This doesn't matter for any current user, but will help to allow iser which can't handle gaps to use the block virtual boundary instead of using driver-local bounce buffering when handling SG_IO commands. Signed-off-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe diff --git a/block/blk-map.c b/block/blk-map.c index 2338416..f565e11 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,6 +9,24 @@ #include "blk.h" +static bool iovec_gap_to_prv(struct request_queue *q, + struct iovec *prv, struct iovec *cur) +{ + unsigned long prev_end; + + if (!queue_virt_boundary(q)) + return false; + + if (prv->iov_base == NULL && prv->iov_len == 0) + /* prv is not set - don't check */ + return false; + + prev_end = (unsigned long)(prv->iov_base + prv->iov_len); + + return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || + prev_end & queue_virt_boundary(q)); +} + int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -67,7 +85,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct bio *bio; int unaligned = 0; struct iov_iter i; - struct iovec iov; + struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; if (!iter || !iter->count) return -EINVAL; @@ -81,8 +99,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, /* * Keep going so we check length of all segments */ - if (uaddr & queue_dma_alignment(q)) + if ((uaddr & queue_dma_alignment(q)) || + iovec_gap_to_prv(q, &prv, &iov)) unaligned = 1; + + prv.iov_base = iov.iov_base; + prv.iov_len = iov.iov_len; } if (unaligned || (q->dma_pad_mask & iter->count) || map_data) -- cgit v0.10.2 From 6fe810bda0bd9a5d7674fc671fac27b8aa8ec243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 5 Sep 2015 15:47:36 -0400 Subject: block: blkg_destroy_all() should clear q->root_blkg and ->root_rl.blkg While making the root blkg unconditional, ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg") removed the part which clears q->root_blkg and ->root_rl.blkg during q exit. This leaves the two pointers dangling after blkg_destroy_all(). blk-throttle exit path performs blkg traversals and dereferences ->root_blkg and can lead to the following oops. BUG: unable to handle kernel NULL pointer dereference at 0000000000000558 IP: [] __blkg_lookup+0x26/0x70 ... task: ffff88001b4e2580 ti: ffff88001ac0c000 task.ti: ffff88001ac0c000 RIP: 0010:[] [] __blkg_lookup+0x26/0x70 ... Call Trace: [] blk_throtl_drain+0x5a/0x110 [] blkcg_drain_queue+0x18/0x20 [] __blk_drain_queue+0xc0/0x170 [] blk_queue_bypass_start+0x61/0x80 [] blkcg_deactivate_policy+0x39/0x100 [] blk_throtl_exit+0x38/0x50 [] blkcg_exit_queue+0x3e/0x50 [] blk_release_queue+0x1e/0xc0 ... While the bug is a straigh-forward use-after-free bug, it is tricky to reproduce because blkg release is RCU protected and the rest of exit path usually finishes before RCU grace period. This patch fixes the bug by updating blkg_destro_all() to clear q->root_blkg and ->root_rl.blkg. Signed-off-by: Tejun Heo Reported-by: "Richard W.M. Jones" Reported-by: Josh Boyer Link: http://lkml.kernel.org/g/CA+5PVA5rzQ0s4723n5rHBcxQa9t0cW8BPPBekr_9aMRoWt2aYg@mail.gmail.com Fixes: ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg") Cc: stable@vger.kernel.org # v4.2+ Tested-by: Richard W.M. Jones Signed-off-by: Jens Axboe diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d6283b3..9cc48d1d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -387,6 +387,9 @@ static void blkg_destroy_all(struct request_queue *q) blkg_destroy(blkg); spin_unlock(&blkcg->lock); } + + q->root_blkg = NULL; + q->root_rl.blkg = NULL; } /* -- cgit v0.10.2 From 52cc6eead9095e2faf2ec7afc013aa3af1f01ac5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 17 Sep 2015 09:58:38 -0600 Subject: block: blk-merge: fast-clone bio when splitting rw bios biovecs has become immutable since v3.13, so it isn't necessary to allocate biovecs for the new cloned bios, then we can save one extra biovecs allocation/copy, and the allocation is often not fixed-length and a bit more expensive. For example, if the 'max_sectors_kb' of null blk's queue is set as 16(32 sectors) via sysfs just for making more splits, this patch can increase throught about ~70% in the sequential read test over null_blk(direct io, bs: 1M). Cc: Christoph Hellwig Cc: Kent Overstreet Cc: Ming Lin Cc: Dongsu Park Signed-off-by: Ming Lei This fixes a performance regression introduced by commit 54efd50bfd, and allows us to take full advantage of the fact that we have immutable bio_vecs. Hand applied, as it rejected violently with commit 5014c311baa2. Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index 574ea7c..c4e9c37 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -66,15 +66,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, struct bio_set *bs) { - struct bio *split; struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned seg_size = 0, nsegs = 0, sectors = 0; bio_for_each_segment(bv, bio, iter) { - sectors += bv.bv_len >> 9; - - if (sectors > queue_max_sectors(q)) + if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) goto split; /* @@ -95,6 +92,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, seg_size += bv.bv_len; bvprv = bv; bvprvp = &bv; + sectors += bv.bv_len >> 9; continue; } new_segment: @@ -105,21 +103,12 @@ new_segment: bvprv = bv; bvprvp = &bv; seg_size = bv.bv_len; + sectors += bv.bv_len >> 9; } return NULL; split: - split = bio_clone_bioset(bio, GFP_NOIO, bs); - - split->bi_iter.bi_size -= iter.bi_size; - bio->bi_iter = iter; - - if (bio_integrity(bio)) { - bio_integrity_advance(bio, split->bi_iter.bi_size); - bio_integrity_trim(split, 0, bio_sectors(split)); - } - - return split; + return bio_split(bio, sectors, GFP_NOIO, bs); } void blk_queue_split(struct request_queue *q, struct bio **bio, -- cgit v0.10.2 From 994518799930fc363d47cb7cf0d1abed1790bf16 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 18 Sep 2015 00:06:28 +0800 Subject: block: fix bounce_end_io When bio bounce is involved, one new bio and its biovecs are cloned from the comming bio, which can be one fast-cloned bio from upper layer(such as dm). So it is obviously wrong to assume the start index of the coming( original) bio's io vector is zero, which can be any value between 0 and (bi_max_vecs - 1), especially in case of bio split. This patch fixes Fedora's booting oops on i386, often with the following kernel log together: > [ 9.026738] systemd[1]: Switching root. > [ 9.036467] systemd-journald[149]: Received SIGTERM from PID 1 > (systemd). > [ 9.082262] BUG: Bad page state in process kworker/u5:1 pfn:372ac > [ 9.083989] page:f3d32ae0 count:0 mapcount:0 mapping:f2252178 > index:0x16a > [ 9.085755] flags: 0x40020021(locked|lru|mappedtodisk) > [ 9.087284] page dumped because: page still charged to cgroup > [ 9.088772] bad because of flags: > [ 9.089731] flags: 0x21(locked|lru) > [ 9.090818] page->mem_cgroup:f2c3e400 Reported-by: Josh Boyer Tested-by: Adam Williamson Cc: Ming Lin Cc: Mike Snitzer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/bounce.c b/block/bounce.c index 2c310ea..457ebd5 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -128,12 +128,14 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, *org_vec; int i; + int start = bio_orig->bi_iter.bi_idx; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; + org_vec = bio_orig->bi_io_vec + i + start; + if (bvec->bv_page == org_vec->bv_page) continue; -- cgit v0.10.2