From 34b48db66e08ca1c1bc07cf305d672ac940268dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 6 Sep 2014 16:08:05 -0700 Subject: block: remove artificial max_hw_sectors cap Set max_sectors to the value the driver provides as the hardware limit by default. Linux has had proper I/O throttling for a long time and doesn't rely on an artificially small maximum I/O size anymore. By not limiting the I/O size by default we remove an annoying tuning step required for most Linux installations. Note that both the user and, if absolutely required, the driver can still impose a limit for FS requests below max_hw_sectors_kb. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/block/blk-settings.c b/block/blk-settings.c index aa02247..6ed2cbe 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -257,9 +257,7 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ __func__, max_hw_sectors); } - limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_sectors = limits->max_hw_sectors = max_hw_sectors; } EXPORT_SYMBOL(blk_limits_max_hw_sectors); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index dd73e1f..46c282f 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); - blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(q, 1024); q->backing_dev_info.name = "aoe"; q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; d->bufpool = mp; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0207a78..74d14db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1186,7 +1186,6 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, - BLK_DEF_MAX_SECTORS = 1024, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -- cgit v0.10.2 From 74c450521dd8d245b982da62592a18aa6f88b045 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Oct 2014 11:14:52 -0600 Subject: blk-mq: add a 'list' parameter to ->queue_rq() Since we have the notion of a 'last' request in a chain, we can use this to have the hardware optimize the issuing of requests. Add a list_head parameter to queue_rq that the driver can use to temporarily store hw commands for issue when 'last' is true. If we are doing a chain of requests, pass in a NULL list for the first request to force issue of that immediately, then batch the remainder for deferred issue until the last request has been sent. Instead of adding yet another argument to the hot ->queue_rq path, encapsulate the passed arguments in a blk_mq_queue_data structure. This is passed as a constant, and has been tested as faster than passing 4 (or even 3) args through ->queue_rq. Update drivers for the new ->queue_rq() prototype. There are no functional changes in this patch for drivers - if they don't use the passed-in list, then they will just queue requests individually like before. 
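As an illustrative aside, not part of the patch: a minimal sketch of how a driver written against the new prototype might use bd->last to coalesce doorbell writes. foo_dev, foo_ring_add() and foo_ring_doorbell() are hypothetical stand-ins for driver-private state and hardware access.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/* Hypothetical driver state and hardware helpers, stubbed for the sketch. */
struct foo_dev {
        void *ring;                     /* submission ring, doorbell, ... */
};

static bool foo_ring_add(struct foo_dev *dev, struct request *rq)
{
        /* place the command for @rq into the device's submission ring */
        return true;
}

static void foo_ring_doorbell(struct foo_dev *dev)
{
        /* tell the hardware that new commands are waiting */
}

static int foo_queue_rq(struct blk_mq_hw_ctx *hctx,
                        const struct blk_mq_queue_data *bd)
{
        struct foo_dev *dev = hctx->driver_data;
        struct request *rq = bd->rq;

        blk_mq_start_request(rq);

        if (!foo_ring_add(dev, rq))
                return BLK_MQ_RQ_QUEUE_BUSY;    /* ring full, retry later */

        /*
         * Notify the hardware only for the last request of the current
         * chain; earlier requests stay parked in the submission ring.
         */
        if (bd->last)
                foo_ring_doorbell(dev);

        return BLK_MQ_RQ_QUEUE_OK;
}

A driver that wants to batch the requests themselves, not just the doorbell, can stash them on bd->list and flush on bd->last; the BLK_MQ_F_DEFER_ISSUE flag added later in this series opts the sync submission path into that scheme as well.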
Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 68929ba..7e53038 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -680,6 +680,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct request *rq; LIST_HEAD(rq_list); + LIST_HEAD(driver_list); + struct list_head *dptr; int queued; WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); @@ -706,16 +708,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } /* + * Start off with dptr being NULL, so we start the first request + * immediately, even if we have more pending. + */ + dptr = NULL; + + /* * Now process all the entries, sending them to the driver. */ queued = 0; while (!list_empty(&rq_list)) { + struct blk_mq_queue_data bd; int ret; rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); + bd.rq = rq; + bd.list = dptr; + bd.last = list_empty(&rq_list); + + ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; @@ -734,6 +747,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (ret == BLK_MQ_RQ_QUEUE_BUSY) break; + + /* + * We've done the first request. If we have more than 1 + * left in the list, set dptr to defer issue. + */ + if (!dptr && rq_list.next != rq_list.prev) + dptr = &driver_list; } if (!queued) @@ -1153,6 +1173,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) } if (is_sync) { + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; int ret; blk_mq_bio_to_request(rq, bio); @@ -1162,7 +1187,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) * error (busy), just add it to our list as we previously * would have done */ - ret = q->mq_ops->queue_rq(data.hctx, rq, true); + ret = q->mq_ops->queue_rq(data.hctx, &bd); if (ret == BLK_MQ_RQ_QUEUE_OK) goto done; else { diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1bd5f52..3bd7ca9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3775,9 +3775,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool last) +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request *rq = bd->rq; int ret; if (unlikely(mtip_check_unal_depth(hctx, rq))) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 2671a3f..8433bc8 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -313,15 +313,15 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool last) +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - cmd->rq = rq; + cmd->rq = bd->rq; cmd->nq = hctx->driver_data; - blk_mq_start_request(rq); + blk_mq_start_request(bd->rq); null_handle_cmd(cmd); return BLK_MQ_RQ_QUEUE_OK; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c6a27d5..cecd3f9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -158,10 +158,11 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int 
virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, - bool last) +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { struct virtio_blk *vblk = hctx->queue->queuedata; + struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; @@ -222,7 +223,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, return BLK_MQ_RQ_QUEUE_ERROR; } - if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) + if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) notify = true; spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9eff8a3..161dcc9 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1858,9 +1858,10 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, - bool last) +static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request *req = bd->rq; struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c9be158..be01d7a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -79,7 +79,13 @@ struct blk_mq_tag_set { struct list_head tag_list; }; -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool); +struct blk_mq_queue_data { + struct request *rq; + struct list_head *list; + bool last; +}; + +typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v0.10.2 From e167dfb53cb85fde7b15f644e9dbef7ba31896b6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Oct 2014 11:18:26 -0600 Subject: blk-mq: add BLK_MQ_F_DEFER_ISSUE support flag Drivers can now tell blk-mq if they take advantage of the deferred issue through 'last' or not. If they do, don't do queue-direct for sync IO. This is a preparation patch for the nvme conversion. Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 7e53038..b355b59 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1172,7 +1172,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } - if (is_sync) { + /* + * If the driver supports defer issued based on 'last', then + * queue it up like normal since we can potentially save some + * CPU this way. + */ + if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index be01d7a..c3b64ec 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -146,6 +146,7 @@ enum { BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_SYSFS_UP = 1 << 3, + BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, -- cgit v0.10.2 From b8ab956c544e51a8d19ea3f38b54355d6aa92c33 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Nov 2014 12:52:41 +0100 Subject: block: Expand a bit documentation about elevator_allow_merge_fn Explain that two requests can be merged without elevator_allow_merge_fn() being called. 
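For illustration only, a hedged sketch of where the two hooks sit in a hypothetical io scheduler, using the 3.18-era elevator_ops field names: elevator_allow_merge_fn can veto a bio being merged into a request, while request-with-request merges are only reported after the fact through elevator_merge_req_fn.

#include <linux/elevator.h>
#include <linux/blkdev.h>
#include <linux/module.h>

/* Hypothetical io scheduler; only the two merge-related hooks are shown. */
static int foo_allow_merge(struct request_queue *q, struct request *rq,
                           struct bio *bio)
{
        /*
         * Consulted before @bio is merged into @rq; returning 0 vetoes the
         * merge.  It is NOT consulted when two already queued requests are
         * merged with each other.
         */
        return 1;
}

static void foo_merged_requests(struct request_queue *q, struct request *rq,
                                struct request *next)
{
        /*
         * Pure notification: @next has already been merged into @rq, so the
         * scheduler can only update its own bookkeeping here.
         */
}

static struct elevator_type foo_iosched = {
        .ops = {
                .elevator_allow_merge_fn        = foo_allow_merge,
                .elevator_merge_req_fn          = foo_merged_requests,
        },
        .elevator_name  = "foo",
        .elevator_owner = THIS_MODULE,
};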
Signed-off-by: Jan Kara Signed-off-by: Jens Axboe diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 2101e71..f1323c6 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -946,7 +946,11 @@ elevator_allow_merge_fn called whenever the block layer determines request safely. The io scheduler may still want to stop a merge at this point if it results in some sort of conflict internally, - this hook allows it to do that. + this hook allows it to do that. Note however + that two *requests* can still be merged at later + time. Currently the io scheduler has no way to + prevent that. It can only learn about the fact + from elevator_merge_req_fn callback. elevator_dispatch_fn* fills the dispatch queue with ready requests. I/O schedulers are free to postpone requests by -- cgit v0.10.2 From 9c6ac78eb3521c5937b2dd8a7d1b300f41092f45 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Oct 2014 15:38:21 -0400 Subject: writeback: fix a subtle race condition in I_DIRTY clearing After invoking ->dirty_inode(), __mark_inode_dirty() does smp_mb() and tests inode->i_state locklessly to see whether it already has all the necessary I_DIRTY bits set. The comment above the barrier doesn't contain any useful information - memory barriers can't ensure "changes are seen by all cpus" by itself. And it sure enough was broken. Please consider the following scenario. CPU 0 CPU 1 ------------------------------------------------------------------------------- enters __writeback_single_inode() grabs inode->i_lock tests PAGECACHE_TAG_DIRTY which is clear enters __set_page_dirty() grabs mapping->tree_lock sets PAGECACHE_TAG_DIRTY releases mapping->tree_lock leaves __set_page_dirty() enters __mark_inode_dirty() smp_mb() sees I_DIRTY_PAGES set leaves __mark_inode_dirty() clears I_DIRTY_PAGES releases inode->i_lock Now @inode has dirty pages w/ I_DIRTY_PAGES clear. This doesn't seem to lead to an immediately critical problem because requeue_inode() later checks PAGECACHE_TAG_DIRTY instead of I_DIRTY_PAGES when deciding whether the inode needs to be requeued for IO and there are enough unintentional memory barriers inbetween, so while the inode ends up with inconsistent I_DIRTY_PAGES flag, it doesn't fall off the IO list. The lack of explicit barrier may also theoretically affect the other I_DIRTY bits which deal with metadata dirtiness. There is no guarantee that a strong enough barrier exists between I_DIRTY_[DATA]SYNC clearing and write_inode() writing out the dirtied inode. Filesystem inode writeout path likely has enough stuff which can behave as full barrier but it's theoretically possible that the writeout may not see all the updates from ->dirty_inode(). Fix it by adding an explicit smp_mb() after I_DIRTY clearing. Note that I_DIRTY_PAGES needs a special treatment as it always needs to be cleared to be interlocked with the lockless test on __mark_inode_dirty() side. It's cleared unconditionally and reinstated after smp_mb() if the mapping still has dirty pages. Also add comments explaining how and why the barriers are paired. Lightly tested. 
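A distilled, hypothetical sketch of the barrier pairing the patch establishes; 'struct obj' stands in for the inode, 'state' for inode->i_state and 'has_dirty_data' for the PAGECACHE_TAG_DIRTY tag.

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/types.h>

struct obj {
        spinlock_t      lock;           /* plays the role of i_lock */
        unsigned long   state;          /* DIRTY flag lives here */
        bool            has_dirty_data; /* "page cache has dirty pages" */
};

#define OBJ_DIRTY       1UL

/* Writeback side, mirroring __writeback_single_inode(). */
static unsigned long obj_start_writeback(struct obj *o)
{
        unsigned long dirty;

        spin_lock(&o->lock);
        dirty = o->state & OBJ_DIRTY;
        o->state &= ~OBJ_DIRTY;

        /*
         * Paired with the smp_mb() in obj_mark_dirty(): either the marker
         * sees OBJ_DIRTY cleared (and re-dirties under the lock), or we see
         * the data it dirtied and reinstate the flag ourselves.
         */
        smp_mb();

        if (o->has_dirty_data)
                o->state |= OBJ_DIRTY;
        spin_unlock(&o->lock);

        return dirty;   /* caller writes metadata only if it was set */
}

/* Dirtying side, mirroring __set_page_dirty() + __mark_inode_dirty(). */
static void obj_mark_dirty(struct obj *o)
{
        o->has_dirty_data = true;

        /* Paired with the smp_mb() in obj_start_writeback() above. */
        smp_mb();

        if (o->state & OBJ_DIRTY)       /* lockless fast path */
                return;

        spin_lock(&o->lock);
        o->state |= OBJ_DIRTY;
        spin_unlock(&o->lock);
}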
Signed-off-by: Tejun Heo Cc: Jan Kara Cc: Mikulas Patocka Cc: Jens Axboe Cc: Al Viro Cc: stable@vger.kernel.org Reviewed-by: Jan Kara Signed-off-by: Jens Axboe diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ef9bef1..2d609a5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -479,12 +479,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * write_inode() */ spin_lock(&inode->i_lock); - /* Clear I_DIRTY_PAGES if we've written out all dirty pages */ - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - inode->i_state &= ~I_DIRTY_PAGES; + dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + inode->i_state &= ~I_DIRTY; + + /* + * Paired with smp_mb() in __mark_inode_dirty(). This allows + * __mark_inode_dirty() to test i_state without grabbing i_lock - + * either they see the I_DIRTY bits cleared or we see the dirtied + * inode. + * + * I_DIRTY_PAGES is always cleared together above even if @mapping + * still has dirty pages. The flag is reinstated after smp_mb() if + * necessary. This guarantees that either __mark_inode_dirty() + * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. + */ + smp_mb(); + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + inode->i_state |= I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); + /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -1148,12 +1164,11 @@ void __mark_inode_dirty(struct inode *inode, int flags) } /* - * make sure that changes are seen by all cpus before we test i_state - * -- mikulas + * Paired with smp_mb() in __writeback_single_inode() for the + * following lockless i_state test. See there for details. */ smp_mb(); - /* avoid the locking if we can */ if ((inode->i_state & flags) == flags) return; -- cgit v0.10.2 From 398205b8391b208f0034a392242867b28ad8af3d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 7 Nov 2014 23:03:59 +0100 Subject: blk_mq: call preempt_disable/enable in blk_mq_run_hw_queue, and only if needed preempt_disable/enable surrounds every call to blk_mq_run_hw_queue, except the one in blk-flush.c. In fact that one is always asynchronous, and it does not need smp_processor_id(). We can do the same for all other calls, avoiding preempt_disable when async is true. This avoids peppering blk-mq.c with preemption-disabled regions. 
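A distilled sketch, not the actual blk-mq code, of the pattern this patch (and the get_cpu()/put_cpu() conversion that follows it) converges on: the current-CPU/cpumask check is only meaningful while preemption is disabled, so the function handles that itself and callers no longer need to wrap it.

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/types.h>

/*
 * run() is only called while preemption is disabled and the current CPU is
 * known to be in @mask; otherwise (or for async callers) the work would be
 * punted to a workqueue, as blk-mq does via kblockd.
 */
static void run_hw_queue_sketch(const struct cpumask *mask,
                                void (*run)(void), bool async)
{
        if (!async) {
                int cpu = get_cpu();            /* disables preemption */

                if (cpumask_test_cpu(cpu, mask)) {
                        run();          /* cannot migrate until put_cpu() */
                        put_cpu();
                        return;
                }
                put_cpu();
        }

        /* async, or we are on the wrong CPU: defer to a workqueue instead */
}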
Cc: Jens Axboe Cc: Thomas Gleixner Reported-by: Clark Williams Tested-by: Clark Williams Signed-off-by: Paolo Bonzini Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index b355b59..8b309e8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -801,9 +801,18 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return; - if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) - __blk_mq_run_hw_queue(hctx); - else if (hctx->queue->nr_hw_queues == 1) + if (!async) { + preempt_disable(); + if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); + preempt_enable(); + return; + } + + preempt_enable(); + } + + if (hctx->queue->nr_hw_queues == 1) kblockd_schedule_delayed_work(&hctx->run_work, 0); else { unsigned int cpu; @@ -824,9 +833,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - preempt_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); } } EXPORT_SYMBOL(blk_mq_run_queues); @@ -853,9 +860,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); blk_mq_run_hw_queue(hctx, false); - preempt_enable(); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -880,9 +885,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); } } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); -- cgit v0.10.2 From 2a90d4aae5509e9cf1ba848c5d0b3458201160a0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 7 Nov 2014 23:04:00 +0100 Subject: blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable blk-mq is using preempt_disable/enable in order to ensure that the queue runners are placed on the right CPU. This does not work with the RT patches, because __blk_mq_run_hw_queue takes a non-raw spinlock with the preemption-disabled region. If there is contention on the lock, this violates the rules for preemption-disabled regions. While this should be easily fixable within the RT patches just by doing migrate_disable/enable, we can do better and document _why_ this particular region runs with disabled preemption. After the previous patch, it is trivial to switch it to get/put_cpu; the RT patches then can change it to get_cpu_light, which lets virtio-blk run under RT kernels. Cc: Jens Axboe Cc: Thomas Gleixner Reported-by: Clark Williams Tested-by: Clark Williams Signed-off-by: Paolo Bonzini Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 8b309e8..06ab068 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -802,14 +802,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) return; if (!async) { - preempt_disable(); - if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) { + int cpu = get_cpu(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); - preempt_enable(); + put_cpu(); return; } - preempt_enable(); + put_cpu(); } if (hctx->queue->nr_hw_queues == 1) -- cgit v0.10.2 From 1a3b595a281a44be4074fe33b317a0a4854b4197 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Nov 2014 10:40:48 -0700 Subject: blk-mq: export blk_mq_free_request() Drivers that know they are blk-mq should just use this function instead of calling through blk_put_request(). 
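A hypothetical usage sketch: a blk-mq driver that allocates a request for an internal command can now release it directly instead of going through blk_put_request(), which would re-derive the hardware queue. The error handling assumes the 3.18-era blk_mq_alloc_request(), which returns an ERR_PTR on failure.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>

static int foo_send_internal_cmd(struct request_queue *q)
{
        struct request *rq;

        rq = blk_mq_alloc_request(q, WRITE, GFP_KERNEL, false);
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        /* ... fill in the command, issue it, wait for completion ... */

        blk_mq_free_request(rq);        /* was: blk_put_request(rq) */
        return 0;
}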
Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 06ab068..fdf1215 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -280,6 +280,7 @@ void blk_mq_free_request(struct request *rq) hctx = q->mq_ops->map_queue(q, ctx->cpu); __blk_mq_free_request(hctx, ctx, rq); } +EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, int error) { -- cgit v0.10.2 From 7c7f2f2bc9a63f9605a16eabac59fc655dfe7c9a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Nov 2014 10:41:57 -0700 Subject: blk-mq: add blk_mq_free_hctx_request() It's silly to use blk_mq_free_request() which in turn maps the request to the hardware queue, for places where we already know what the hardware queue is. This saves us an extra mapping of a hardware queue on request completion, if the caller knows this information already. Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index fdf1215..4347aa2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -269,16 +269,23 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, blk_mq_queue_exit(q); } -void blk_mq_free_request(struct request *rq) +void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; ctx->rq_completed[rq_is_sync(rq)]++; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); __blk_mq_free_request(hctx, ctx, rq); + +} +EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); + +void blk_mq_free_request(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq->q; + + hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + blk_mq_free_hctx_request(hctx, rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c3b64ec..fb0a4fb 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -169,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); +void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); -- cgit v0.10.2 From 5fabcb4c33fe11c7e3afdf805fde26c1a54d0953 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Nov 2014 13:06:22 -0700 Subject: genhd: check for int overflow in disk_expand_part_tbl() We can get here from blkdev_ioctl() -> blkpg_ioctl() -> add_partition() with a user passed in partno value. If we pass in 0x7fffffff, the new target in disk_expand_part_tbl() overflows the 'int' and we access beyond the end of ptbl->part[] and even write to it when we do the rcu_assign_pointer() to assign the new partition. Reported-by: David Ramos Cc: stable@kernel.org Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index bd30606..0a536dc 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) struct disk_part_tbl *old_ptbl = disk->part_tbl; struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; - int target = partno + 1; + int i, target; size_t size; - int i; + + /* + * check for int overflow, since we can get here from blkpg_ioctl() + * with a user passed 'partno'. 
+ */ + target = partno + 1; + if (target < 0) + return -EINVAL; /* disk_max_parts() is zero during initialization, ignore if so */ if (disk_max_parts(disk) && target > disk_max_parts(disk)) -- cgit v0.10.2 From 394ffa503bc40e32d7f54a9b817264e81ce131b4 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 24 Nov 2014 11:05:22 +0800 Subject: blk: introduce generic io stat accounting helper functions Many block drivers account io stats based on bios (e.g. NVMe...); the request-based blk_account_io_start/end() does not make sense for them. Introduce similar helpers, generic_start/end_io_acct(), that work on raw sectors and can replace the open-coded io accounting in such drivers. 
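A hypothetical caller sketch using the new helpers with the signatures added by this patch; foo_disk and the synchronous handling are stand-ins (real drivers typically call generic_end_io_acct() from their completion path).

#include <linux/bio.h>
#include <linux/genhd.h>
#include <linux/jiffies.h>

struct foo_disk {
        struct gendisk *disk;
};

static void foo_make_request(struct foo_disk *dev, struct bio *bio)
{
        int rw = bio_data_dir(bio);
        unsigned long start = jiffies;

        /* account the IO against the whole-disk partition */
        generic_start_io_acct(rw, bio_sectors(bio), &dev->disk->part0);

        /* ... actually service the bio ... */

        generic_end_io_acct(rw, &dev->disk->part0, start);
        bio_endio(bio, 0);
}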
Signed-off-by: Gu Zheng Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 3e6e198..3d4a072 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1739,6 +1739,34 @@ void bio_check_pages_dirty(struct bio *bio) } } +void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part) +{ + int cpu = part_stat_lock(); + + part_round_stats(cpu, part); + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, sectors[rw], sectors); + part_inc_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_start_io_acct); + +void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time) +{ + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_end_io_acct); + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE void bio_flush_dcache_pages(struct bio *bi) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 7347f48..efead0b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -443,6 +443,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); +void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part); +void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time); + #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" #endif -- cgit v0.10.2 From a33c1ba2913802b6fb23e974bb2f6a4e73c8b7ce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Nov 2014 15:02:42 -0700 Subject: blk-mq: use 'nr_cpu_ids' as highest CPU ID count for hwq <-> cpu map We currently use num_possible_cpus(), but that breaks on sparc64 where the CPU ID space is discontig. Use nr_cpu_ids as the highest CPU ID instead, so we don't end up reading from invalid memory. Cc: stable@kernel.org # 3.13+ Signed-off-by: Jens Axboe diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 1065d7c..72e5ed6 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -90,7 +90,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) unsigned int *map; /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, set->numa_node); if (!map) return NULL; -- cgit v0.10.2 From 70114c393ccaa43ca38e6b36b9469ed2c35acc49 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Nov 2014 15:52:30 -0700 Subject: blk-mq: cleanup tag free handling We only call __blk_mq_put_tag() and __blk_mq_put_reserved_tag() from blk_mq_put_tag(), so just inline the two calls instead of having them as separate functions. 
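A distilled sketch of the tag numbering convention the inlined code relies on: values below nr_reserved_tags index the reserved bitmap, everything above is offset into the normal bitmap. The struct and raw bitmaps here are simplified stand-ins for blk_mq_tags and its blk_align_bitmap words.

#include <linux/bitops.h>
#include <linux/bug.h>

struct tagmap_sketch {
        unsigned int    nr_tags;                /* normal tags */
        unsigned int    nr_reserved_tags;       /* reserved tags */
        unsigned long   *bitmap_tags;
        unsigned long   *breserved_tags;
};

static void put_tag_sketch(struct tagmap_sketch *t, unsigned int tag)
{
        if (tag >= t->nr_reserved_tags) {
                unsigned int real_tag = tag - t->nr_reserved_tags;

                BUG_ON(real_tag >= t->nr_tags);
                clear_bit(real_tag, t->bitmap_tags);    /* normal tag */
        } else {
                clear_bit(tag, t->breserved_tags);      /* reserved tag */
        }
}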
Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8317175..230ef30 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -360,21 +360,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) } } -static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) -{ - BUG_ON(tag >= tags->nr_tags); - - bt_clear_tag(&tags->bitmap_tags, tag); -} - -static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, - unsigned int tag) -{ - BUG_ON(tag >= tags->nr_reserved_tags); - - bt_clear_tag(&tags->breserved_tags, tag); -} - void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag) { @@ -383,10 +368,13 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, if (tag >= tags->nr_reserved_tags) { const int real_tag = tag - tags->nr_reserved_tags; - __blk_mq_put_tag(tags, real_tag); + BUG_ON(real_tag >= tags->nr_tags); + bt_clear_tag(&tags->bitmap_tags, real_tag); *last_tag = real_tag; - } else - __blk_mq_put_reserved_tag(tags, tag); + } else { + BUG_ON(tag >= tags->nr_reserved_tags); + bt_clear_tag(&tags->breserved_tags, tag); + } } static void bt_for_each(struct blk_mq_hw_ctx *hctx, -- cgit v0.10.2 From 6637fadf25657e619a50fde5ff3ae09a98d20eb5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sun, 30 Nov 2014 16:00:58 -0800 Subject: blk-mq: move the kdump check to blk_mq_alloc_tag_set We call blk_mq_alloc_tag_set() first then blk_mq_init_queue(). The requests are allocated in the former function. So the kdump check should be moved to there to really save memory. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 27a347f..4854e70 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1804,16 +1804,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!ctx) return ERR_PTR(-ENOMEM); - /* - * If a crashdump is active, then we are potentially in a very - * memory constrained environment. Limit us to 1 queue and - * 64 tags to prevent using too much memory. - */ - if (is_kdump_kernel()) { - set->nr_hw_queues = 1; - set->queue_depth = min(64U, set->queue_depth); - } - hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, set->numa_node); @@ -2070,6 +2060,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->queue_depth = BLK_MQ_MAX_DEPTH; } + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. + */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->queue_depth = min(64U, set->queue_depth); + } + set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); -- cgit v0.10.2 From b32232073e8061b41258bff2a10a06a91677480a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Dec 2014 08:46:34 -0700 Subject: blk-mq: fix hang in bt_get() Avoid that if there are fewer hardware queues than CPU threads that bt_get() can hang. The symptoms of the hang were as follows: * All tags allocated for a particular hardware queue. * (nr_tags) pending commands for that hardware queue. * No pending commands for the software queues associated with that hardware queue. 
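A distilled, hypothetical sketch of the bt_get() wait loop once this fix (and the retry added by the next patch) is in place; try_get() and kick_hw_queue() stand in for __bt_get() and blk_mq_run_hw_queue().

#include <linux/sched.h>
#include <linux/wait.h>

static int get_tag_sketch(wait_queue_head_t *wq,
                          int (*try_get)(void),
                          void (*kick_hw_queue)(void))
{
        DEFINE_WAIT(wait);
        int tag;

        do {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);

                tag = try_get();
                if (tag != -1)
                        break;

                /* Out of tags: push pending IO to the hardware first. */
                kick_hw_queue();

                /* Running the queue may have completed requests; retry. */
                tag = try_get();
                if (tag != -1)
                        break;

                io_schedule();
        } while (1);

        finish_wait(wq, &wait);
        return tag;
}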
Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 230ef30..eb55492 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -254,6 +254,13 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) break; + /* + * We're out of tags on this hardware queue, kick any + * pending IO submits before going to sleep waiting for + * some to complete. + */ + blk_mq_run_hw_queue(hctx, false); + blk_mq_put_ctx(data->ctx); io_schedule(); -- cgit v0.10.2 From 080ff3511450fd73948697fef34a3cc382675b59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Dec 2014 08:49:06 -0700 Subject: blk-mq: re-check for available tags after running the hardware queue If we run out of tags and have to sleep, we run the hardware queue to kick pending IO into gear. During that run, we may have completed requests, so re-check if we have free tags before going to sleep. Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index eb55492..bab4bff 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -261,6 +261,14 @@ static int bt_get(struct blk_mq_alloc_data *data, */ blk_mq_run_hw_queue(hctx, false); + /* + * Retry tag allocation after running the hardware queue, + * as running the queue may also have found completions. + */ + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + break; + blk_mq_put_ctx(data->ctx); io_schedule(); -- cgit v0.10.2 From 19c66e59ce57e7b181625cbb408d48eb10837763 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 3 Dec 2014 19:38:04 +0800 Subject: blk-mq: prevent unmapped hw queue from being scheduled When a hardware queue has no mapped software queues, it shouldn't be scheduled. Otherwise a WARNING or OOPS can be triggered. The blk_mq_hw_queue_mapped() helper is introduced to fix the problem. 
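A distilled sketch of the guard being added; hw_queue_mapped_sketch() mirrors the new blk_mq_hw_queue_mapped() helper. An unmapped hardware queue (possible when a driver exposes more hardware queues than there are CPUs) has no software queues and no tags set up, so running it can only trip warnings.

#include <linux/blk-mq.h>

static inline bool hw_queue_mapped_sketch(struct blk_mq_hw_ctx *hctx)
{
        return hctx->nr_ctx && hctx->tags;
}

static void maybe_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        if (unlikely(!hw_queue_mapped_sketch(hctx)))
                return;         /* nothing could ever be queued here */

        /* ... run or schedule the queue as blk_mq_run_hw_queue() does ... */
}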
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 4854e70..b21a3b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -589,7 +589,7 @@ static void blk_mq_rq_timer(unsigned long priv) * If not software queues are currently mapped to this * hardware queue, there's nothing to check */ - if (!hctx->nr_ctx || !hctx->tags) + if (!blk_mq_hw_queue_mapped(hctx)) continue; blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); @@ -809,7 +809,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || + !blk_mq_hw_queue_mapped(hctx))) return; if (!async) { @@ -916,6 +917,9 @@ static void blk_mq_delay_work_fn(struct work_struct *work) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { + if (unlikely(!blk_mq_hw_queue_mapped(hctx))) + return; + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->delay_work, msecs_to_jiffies(msecs)); } diff --git a/block/blk-mq.h b/block/blk-mq.h index d567d52..206230e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -115,4 +115,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) +{ + return hctx->nr_ctx && hctx->tags; +} + #endif -- cgit v0.10.2 From 45a9c9d909b24c6ad0e28a7946e7486e73010319 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:57:48 +0100 Subject: blk-mq: Fix a use-after-free blk-mq users are allowed to free the memory request_queue.tag_set points at after blk_cleanup_queue() has finished but before blk_release_queue() has started. This can happen e.g. in the SCSI core. The SCSI core namely embeds the tag_set structure in a SCSI host structure. The SCSI host structure is freed by scsi_host_dev_release(). This function is called after blk_cleanup_queue() finished but can be called before blk_release_queue(). This means that it is not safe to access request_queue.tag_set from inside blk_release_queue(). Hence remove the blk_sync_queue() call from blk_release_queue(). This call is not necessary - outstanding requests must have finished before blk_release_queue() is called. Additionally, move the blk_mq_free_queue() call from blk_release_queue() to blk_cleanup_queue() to avoid that struct request_queue.tag_set gets accessed after it has been freed. 
This patch avoids that the following kernel oops can be triggered when deleting a SCSI host for which scsi-mq was enabled: Call Trace: [] lock_acquire+0xc4/0x270 [] mutex_lock_nested+0x61/0x380 [] blk_mq_free_queue+0x30/0x180 [] blk_release_queue+0x84/0xd0 [] kobject_cleanup+0x7b/0x1a0 [] kobject_put+0x30/0x70 [] blk_put_queue+0x15/0x20 [] disk_release+0x99/0xd0 [] device_release+0x36/0xb0 [] kobject_cleanup+0x7b/0x1a0 [] kobject_put+0x30/0x70 [] put_disk+0x1a/0x20 [] __blkdev_put+0x135/0x1b0 [] blkdev_put+0x50/0x160 [] kill_block_super+0x44/0x70 [] deactivate_locked_super+0x44/0x60 [] deactivate_super+0x4e/0x70 [] cleanup_mnt+0x43/0x90 [] __cleanup_mnt+0x12/0x20 [] task_work_run+0xac/0xe0 [] do_notify_resume+0x61/0xa0 [] int_signal+0x12/0x17 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Robert Elliott Cc: Ming Lei Cc: Alexander Gordeev Cc: # v3.13+ Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 0421b53..93f9152 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -525,6 +525,9 @@ void blk_cleanup_queue(struct request_queue *q) del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); + if (q->mq_ops) + blk_mq_free_queue(q); + spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1fac434..935ea2a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -492,17 +492,15 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) * Currently, its primary task it to free all the &struct request * structures that were allocated to the queue and the queue itself. * - * Caveat: - * Hopefully the low level driver will have finished any - * outstanding requests first... + * Note: + * The low level driver must have finished any outstanding requests first + * via blk_cleanup_queue(). **/ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - blk_sync_queue(q); - blkcg_exit_queue(q); if (q->elevator) { @@ -517,9 +515,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - if (q->mq_ops) - blk_mq_free_queue(q); - else + if (!q->mq_ops) blk_free_flush_queue(q->fq); blk_trace_shutdown(q); -- cgit v0.10.2 From 9e98e9d7cf6e9d2ec1cce45e8d5ccaf3f9b386f3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:58:11 +0100 Subject: blk-mq: Avoid that __bt_get_word() wraps multiple times If __bt_get_word() is called with last_tag != 0, if the first find_next_zero_bit() fails, if after wrap-around the test_and_set_bit() call fails and find_next_zero_bit() succeeds, if the next test_and_set_bit() call fails and subsequently find_next_zero_bit() does not find a zero bit, then another wrap-around will occur. Avoid this by introducing an additional local variable. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Robert Elliott Cc: Ming Lei Cc: Alexander Gordeev Cc: # v3.13+ Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bab4bff..0f5e22a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -137,6 +137,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) { int tag, org_last_tag, end; + bool wrap = last_tag != 0; org_last_tag = last_tag; end = bm->depth; @@ -148,8 +149,9 @@ restart: * We started with an offset, start from 0 to * exhaust the map. 
*/ - if (org_last_tag && last_tag) { - end = last_tag; + if (wrap) { + wrap = false; + end = org_last_tag; last_tag = 0; goto restart; } -- cgit v0.10.2 From c38d185d4af12e8be63ca4b6745d99449c450f12 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:58:35 +0100 Subject: blk-mq: Fix a race between bt_clear_tag() and bt_get() What we need is the following two guarantees: * Any thread that observes the effect of the test_and_set_bit() by __bt_get_word() also observes the preceding addition of 'current' to the appropriate wait list. This is guaranteed by the semantics of the spin_unlock() operation performed by prepare_and_wait(). Hence the conversion of test_and_set_bit_lock() into test_and_set_bit(). * The wait lists are examined by bt_clear() after the tag bit has been cleared. clear_bit_unlock() guarantees that any thread that observes that the bit has been cleared also observes the store operations preceding clear_bit_unlock(). However, clear_bit_unlock() does not prevent that the wait lists are examined before that the tag bit is cleared. Hence the addition of a memory barrier between clear_bit() and the wait list examination. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Robert Elliott Cc: Ming Lei Cc: Alexander Gordeev Cc: # v3.13+ Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 0f5e22a..e47c4c7 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -158,7 +158,7 @@ restart: return -1; } last_tag = tag + 1; - } while (test_and_set_bit_lock(tag, &bm->word)); + } while (test_and_set_bit(tag, &bm->word)); return tag; } @@ -357,11 +357,10 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) struct bt_wait_state *bs; int wait_cnt; - /* - * The unlock memory barrier need to order access to req in free - * path and clearing tag bit - */ - clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); bs = bt_wake_ptr(bt); if (!bs) -- cgit v0.10.2 From 52f7eb945f2ba62b324bb9ae16d945326a961dcf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:59:48 +0100 Subject: blk-mq: Micro-optimize bt_get() Remove a superfluous finish_wait() call. Convert the two bt_wait_ptr() calls into a single call. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Robert Elliott Cc: Ming Lei Cc: Alexander Gordeev Signed-off-by: Jens Axboe diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e47c4c7..1b7229f 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -248,8 +248,8 @@ static int bt_get(struct blk_mq_alloc_data *data, if (!(data->gfp & __GFP_WAIT)) return -1; - bs = bt_wait_ptr(bt, hctx); do { + bs = bt_wait_ptr(bt, hctx); prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); tag = __bt_get(hctx, bt, last_tag); @@ -285,8 +285,6 @@ static int bt_get(struct blk_mq_alloc_data *data, hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); } while (1); finish_wait(&bs->wait, &wait); -- cgit v0.10.2 From 959f5f5b2fa7ac3bdd37c91076e560c06513f1e6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:59:21 +0100 Subject: blk-mq: Use all available hardware queues Suppose that a system has two CPU sockets, three cores per socket, that it does not support hyperthreading and that four hardware queues are provided by a block driver. 
With the current algorithm this will lead to the following assignment of CPU cores to hardware queues: HWQ 0: 0 1 HWQ 1: 2 3 HWQ 2: 4 5 HWQ 3: (none) This patch changes the queue assignment into: HWQ 0: 0 1 HWQ 1: 2 HWQ 2: 3 4 HWQ 3: 5 In other words, this patch has the following three effects: - All four hardware queues are used instead of only three. - CPU cores are spread more evenly over hardware queues. For the above example the range of the number of CPU cores associated with a single HWQ is reduced from [0..2] to [1..2]. - If the number of HWQ's is a multiple of the number of CPU sockets it is now guaranteed that all CPU cores associated with a single HWQ reside on the same CPU socket. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: Alexander Gordeev Signed-off-by: Jens Axboe diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 72e5ed6..5f13f4d 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -17,7 +17,7 @@ static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, const int cpu) { - return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); + return cpu * nr_queues / nr_cpus; } static int get_first_sibling(unsigned int cpu) -- cgit v0.10.2 From 7eca210375dcc029ad69c0cf48d2bf7a71f0121d Mon Sep 17 00:00:00 2001 From: Arianna Avanzini Date: Tue, 9 Dec 2014 14:57:45 -0700 Subject: blktrace: don't let the sysfs interface remove trace from running list Currently, blktrace can be started/stopped via its ioctl-based interface (used by the userspace blktrace tool) or via its ftrace interface. The function blk_trace_remove_queue(), called each time an "enable" tunable of the ftrace interface transitions to zero, removes the trace from the running list, even if no function from the sysfs interface adds it to such a list. This leads to a null pointer dereference. This commit changes the blk_trace_remove_queue() function so that it does not remove the blk_trace from the running list. v2: - Now the patch removes the invocation of list_del() instead of adding an useless if branch, as suggested by Namhyung Kim. Signed-off-by: Arianna Avanzini Signed-off-by: Jens Axboe diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c1bd4ad..bd05fd2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1493,9 +1493,6 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - spin_lock_irq(&running_trace_lock); - list_del(&bt->running_list); - spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } -- cgit v0.10.2 From 06a41a99d13d8e919e9a00a4849e6b85ae492592 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 10 Dec 2014 16:38:30 +0100 Subject: blk-mq: Fix uninitialized kobject at CPU hotplugging When a CPU is hotplugged, the current blk-mq spews a warning like: kobject '(null)' (ffffe8ffffc8b5d8): tried to add an uninitialized object, something is seriously wrong. 
CPU: 1 PID: 1386 Comm: systemd-udevd Not tainted 3.18.0-rc7-2.g088d59b-default #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_171129-lamiak 04/01/2014 0000000000000000 0000000000000002 ffffffff81605f07 ffffe8ffffc8b5d8 ffffffff8132c7a0 ffff88023341d370 0000000000000020 ffff8800bb05bd58 ffff8800bb05bd08 000000000000a0a0 000000003f441940 0000000000000007 Call Trace: [] dump_trace+0x86/0x330 [] show_stack_log_lvl+0x94/0x170 [] show_stack+0x21/0x50 [] dump_stack+0x41/0x51 [] kobject_add+0xa0/0xb0 [] blk_mq_register_hctx+0x91/0xb0 [] blk_mq_sysfs_register+0x3e/0x60 [] blk_mq_queue_reinit_notify+0xf8/0x190 [] notifier_call_chain+0x4c/0x70 [] cpu_notify+0x23/0x50 [] _cpu_up+0x157/0x170 [] cpu_up+0x89/0xb0 [] cpu_subsys_online+0x35/0x80 [] device_online+0x5d/0xa0 [] online_store+0x75/0x80 [] kernfs_fop_write+0xda/0x150 [] vfs_write+0xb2/0x1f0 [] SyS_write+0x42/0xb0 [] system_call_fastpath+0x16/0x1b [<00007f0132fb24e0>] 0x7f0132fb24e0 This is indeed because of an uninitialized kobject for blk_mq_ctx. The blk_mq_ctx kobjects are initialized in blk_mq_sysfs_init(), but it goes loop over hctx_for_each_ctx(), i.e. it initializes only for online CPUs. Thus, when a CPU is hotplugged, the ctx for the newly onlined CPU is registered without initialization. This patch fixes the issue by initializing the all ctx kobjects belonging to each queue. Bugzilla: https://bugzilla.novell.com/show_bug.cgi?id=908794 Cc: Signed-off-by: Takashi Iwai Signed-off-by: Jens Axboe diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 371d880..1630a20 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -390,16 +390,15 @@ static void blk_mq_sysfs_init(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; - int i, j; + int i; kobject_init(&q->mq_kobj, &blk_mq_ktype); - queue_for_each_hw_ctx(q, hctx, i) { + queue_for_each_hw_ctx(q, hctx, i) kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - hctx_for_each_ctx(hctx, ctx, j) - kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); - } + queue_for_each_ctx(q, ctx, i) + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } /* see blk_register_queue() */ -- cgit v0.10.2 From fcbf6a087a7e4d3f03d28333678a1010810a53c3 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Wed, 10 Dec 2014 14:16:53 -0800 Subject: bio: modify __bio_add_page() to accept pages that don't start a new segment The original behaviour is to refuse to add a new page if the maximum number of segments has been reached, regardless of the fact the page we are going to add can be merged into the last segment or not. Unfortunately, when the system runs under heavy memory fragmentation conditions, a driver may try to add multiple pages to the last segment. The original code won't accept them and EBUSY will be reported to userspace. This patch modifies the function so it refuses to add a page only in case the latter starts a new segment and the maximum number of segments has already been reached. 
The bug can be easily reproduced with the st driver: 1) set CONFIG_SCSI_MPT2SAS_MAX_SGE or CONFIG_SCSI_MPT3SAS_MAX_SGE to 16 2) modprobe st buffer_kbs=1024 3) #dd if=/dev/zero of=/dev/st0 bs=1M count=10 dd: error writing `/dev/st0': Device or resource busy Signed-off-by: Maurizio Lombardi Signed-off-by: Ming Lei Cc: Jet Chen Cc: Tomas Henzl Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/block/bio.c b/block/bio.c index 3d4a072..471d738 100644 --- a/block/bio.c +++ b/block/bio.c @@ -748,6 +748,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page } } + bio->bi_iter.bi_size += len; goto done; } @@ -764,29 +765,32 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page return 0; /* - * we might lose a segment or two here, but rather that than - * make this too complex. + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + bio->bi_vcnt++; + bio->bi_phys_segments++; + bio->bi_iter.bi_size += len; + + /* + * Perform a recount if the number of segments is greater + * than queue_max_segments(q). */ - while (bio->bi_phys_segments >= queue_max_segments(q)) { + while (bio->bi_phys_segments > queue_max_segments(q)) { if (retried_segments) - return 0; + goto failed; retried_segments = 1; blk_recount_segments(q, bio); } /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - - /* * if queue has other restrictions (eg varying max sector size * depending on offset), it can specify a merge_bvec_fn in the * queue to get further control @@ -795,7 +799,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size, + .bi_size = bio->bi_iter.bi_size - len, .bi_rw = bio->bi_rw, }; @@ -803,23 +807,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) + goto failed; } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); - bio->bi_vcnt++; - bio->bi_phys_segments++; done: - bio->bi_iter.bi_size += len; return len; + + failed: + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + bio->bi_vcnt--; + bio->bi_iter.bi_size -= len; + blk_recount_segments(q, bio); + return 0; } /** -- cgit v0.10.2
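A hypothetical caller-side sketch of what this change means for drivers that loop over bio_add_page(): a return of 0 now means the page would require a new segment and the queue is out of segments, not merely that bi_phys_segments reached queue_max_segments(). submit_bio(rw, bio) is the 3.18-era signature.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>

static int add_pages_sketch(struct block_device *bdev, sector_t sector,
                            struct page **pages, int nr_pages)
{
        struct bio *bio;
        int i;

        bio = bio_alloc(GFP_KERNEL, nr_pages);
        if (!bio)
                return -ENOMEM;

        bio->bi_bdev = bdev;
        bio->bi_iter.bi_sector = sector;

        for (i = 0; i < nr_pages; i++) {
                /*
                 * Pages that merge into the bio's last bvec are no longer
                 * refused just because the segment limit has been reached.
                 */
                if (!bio_add_page(bio, pages[i], PAGE_SIZE, 0))
                        break;
        }

        submit_bio(WRITE, bio);
        return i;       /* pages actually added; caller handles the rest */
}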