From ab4bd22d3cce6977dc039664cc2d052e3147d662 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Jun 2011 06:01:13 +0200 Subject: cfq-iosched: fix locking around ioc->ioc_data assignment Since we are modifying this RCU pointer, we need to hold the lock protecting it around it. This fixes a potential reuse and double free of a cfq io_context structure. The bug has been in CFQ for a long time, it hit very few people but those it did hit seemed to see it a lot. Tracked in RH bugzilla here: https://bugzilla.redhat.com/show_bug.cgi?id=577968 Credit goes to Paul Bolle for figuring out that the issue was around the one-hit ioc->ioc_data cache. Thanks to his hard work the issue is now fixed. Cc: stable@kernel.org Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3c7b537..545b8d4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2772,8 +2772,11 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, smp_wmb(); cic->key = cfqd_dead_key(cfqd); - if (ioc->ioc_data == cic) + if (rcu_dereference(ioc->ioc_data) == cic) { + spin_lock(&ioc->lock); rcu_assign_pointer(ioc->ioc_data, NULL); + spin_unlock(&ioc->lock); + } if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); -- cgit v0.10.2 From a9dce2a3b4f0686dd66cb44d4826a59508bce969 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jun 2011 20:43:54 +0200 Subject: block: don't use non-syncing event blocking in disk_check_events() This patch is part of fix for triggering of WARN_ON_ONCE() in disk_clear_events() reported in bug#34662. https://bugzilla.kernel.org/show_bug.cgi?id=34662 disk_clear_events() blocks events, schedules and flushes the event work. It expects the work to have started execution on schedule and finished on return from flush. WARN_ON_ONCE() triggers if the event work hasn't executed as expected. This problem happens because __disk_block_events() fails to guarantee that the event work item is not in flight on return from the function in race-free manner. The problem is two-fold and this patch addresses one of them. When __disk_block_events() is called with @sync == %false, it bumps event block count, calls cancel_delayed_work() and return. This makes it impossible to guarantee that event polling is not in flight on return from syncing __disk_block_events() - if the first blocker was non-syncing, polling could still be in progress and later syncing ones would assume that the first blocker already canceled it. Making __disk_block_events() cancel_sync regardless of block count isn't feasible either as it may race with forced event checking in disk_clear_events(). As disk_check_events() is the only user of non-syncing __disk_block_events(), updating it to directly cancel and schedule event work is the easiest way to solve the issue. Note that there's another bug in __disk_block_events() and this patch doesn't fix the issue completely. Later patch will fix the other bug. Signed-off-by: Tejun Heo Tested-by: Sitsofe Wheeler Reported-by: Sitsofe Wheeler Reported-by: Borislav Petkov Reported-by: Meelis Roos Reported-by: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Kay Sievers Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index 95822ae..3f09330 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1508,10 +1508,18 @@ void disk_unblock_events(struct gendisk *disk) */ void disk_check_events(struct gendisk *disk) { - if (disk->ev) { - __disk_block_events(disk, false); - __disk_unblock_events(disk, true); + struct disk_events *ev = disk->ev; + unsigned long flags; + + if (!ev) + return; + + spin_lock_irqsave(&ev->lock, flags); + if (!ev->block) { + cancel_delayed_work(&ev->dwork); + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); } + spin_unlock_irqrestore(&ev->lock, flags); } EXPORT_SYMBOL_GPL(disk_check_events); -- cgit v0.10.2 From c3af54afbac3675337cedf326b7b127ffa7f7327 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jun 2011 20:43:55 +0200 Subject: block: remove non-syncing __disk_block_events() and fold it into disk_block_events() After the previous update to disk_check_events(), nobody is using non-syncing __disk_block_events(). Remove @sync and, as this makes __disk_block_events() virtually identical to disk_block_events(), remove the underscore prefixed version. Signed-off-by: Tejun Heo Cc: Jens Axboe Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index 3f09330..ab0731d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1414,22 +1414,36 @@ static unsigned long disk_events_poll_jiffies(struct gendisk *disk) return msecs_to_jiffies(intv_msecs); } -static void __disk_block_events(struct gendisk *disk, bool sync) +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) { struct disk_events *ev = disk->ev; unsigned long flags; bool cancel; + if (!ev) + return; + spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); - if (cancel) { - if (sync) - cancel_delayed_work_sync(&disk->ev->dwork); - else - cancel_delayed_work(&disk->ev->dwork); - } + if (cancel) + cancel_delayed_work_sync(&disk->ev->dwork); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) @@ -1461,27 +1475,6 @@ out_unlock: } /** - * disk_block_events - block and flush disk event checking - * @disk: disk to block events for - * - * On return from this function, it is guaranteed that event checking - * isn't in progress and won't happen until unblocked by - * disk_unblock_events(). Events blocking is counted and the actual - * unblocking happens after the matching number of unblocks are done. - * - * Note that this intentionally does not block event checking from - * disk_clear_events(). - * - * CONTEXT: - * Might sleep. - */ -void disk_block_events(struct gendisk *disk) -{ - if (disk->ev) - __disk_block_events(disk, true); -} - -/** * disk_unblock_events - unblock disk event checking * @disk: disk to unblock events for * @@ -1554,7 +1547,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) spin_unlock_irq(&ev->lock); /* uncondtionally schedule event check and wait for it to finish */ - __disk_block_events(disk, true); + disk_block_events(disk); queue_delayed_work(system_nrt_wq, &ev->dwork, 0); flush_delayed_work(&ev->dwork); __disk_unblock_events(disk, false); @@ -1672,7 +1665,7 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev, if (intv < 0 && intv != -1) return -EINVAL; - __disk_block_events(disk, true); + disk_block_events(disk); disk->ev->poll_msecs = intv; __disk_unblock_events(disk, true); @@ -1778,7 +1771,7 @@ static void disk_del_events(struct gendisk *disk) if (!disk->ev) return; - __disk_block_events(disk, true); + disk_block_events(disk); mutex_lock(&disk_events_mutex); list_del_init(&disk->ev->node); -- cgit v0.10.2 From fdd514e16bb2531c0c61ae8a1f87740ce217f630 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jun 2011 20:43:59 +0200 Subject: block: make disk_block_events() properly wait for work cancellation disk_block_events() should guarantee that the event work is not in flight on return and once blocked it shouldn't issue further cancellations. Because there was no synchronization between the first blocker doing cancel_delayed_work_sync() and the following blockers, the following blockers could finish before cancellation was complete, which broke both guarantees - event work could be in flight and cancellation could happen after return. This bug triggered WARN_ON_ONCE() in disk_clear_events() reported in bug#34662. https://bugzilla.kernel.org/show_bug.cgi?id=34662 Fix it by adding an outer mutex which protects both block count manipulation and work cancellation. -v2: Use outer mutex instead of bit waitqueue per Linus. Signed-off-by: Tejun Heo Tested-by: Sitsofe Wheeler Reported-by: Sitsofe Wheeler Reported-by: Borislav Petkov Reported-by: Meelis Roos Reported-by: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Kay Sievers Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index ab0731d..3608289 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1371,6 +1371,7 @@ struct disk_events { struct gendisk *disk; /* the associated disk */ spinlock_t lock; + struct mutex block_mutex; /* protects blocking */ int block; /* event blocking depth */ unsigned int pending; /* events already sent out */ unsigned int clearing; /* events being cleared */ @@ -1438,12 +1439,20 @@ void disk_block_events(struct gendisk *disk) if (!ev) return; + /* + * Outer mutex ensures that the first blocker completes canceling + * the event work before further blockers are allowed to finish. + */ + mutex_lock(&ev->block_mutex); + spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); if (cancel) cancel_delayed_work_sync(&disk->ev->dwork); + + mutex_unlock(&ev->block_mutex); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) @@ -1751,6 +1760,7 @@ static void disk_add_events(struct gendisk *disk) INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); + mutex_init(&ev->block_mutex); ev->block = 1; ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); -- cgit v0.10.2 From 08e8138adebdd511e0955e8d6c051904bb4082af Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 13 Jun 2011 10:42:49 +0200 Subject: block: Add __attribute__((format(printf...) and fix fallout Use the compiler to verify format strings and arguments. Fix fallout. Signed-off-by: Joe Perches Signed-off-by: Jens Axboe diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a62be8d..3689f83 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q) bio_list_init(&bio_list_on_stack); - throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", total_nr_queued(td), td->nr_queued[READ], td->nr_queued[WRITE]); @@ -1204,7 +1204,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) } queue_bio: - throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" " iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 545b8d4..f379943 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -988,9 +988,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" - " sect=%u", used_sl, cfqq->slice_dispatch, charge, - iops_mode(cfqd), cfqq->nr_sectors); + cfq_log_cfqq(cfqq->cfqd, cfqq, + "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", + used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, unaccounted_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); @@ -2023,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) */ if (sample_valid(cic->ttime_samples) && (cfqq->slice_end - jiffies < cic->ttime_mean)) { - cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", - cic->ttime_mean); + cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", + cic->ttime_mean); return; } diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index b22fb0d..8c7c2de 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -169,7 +169,8 @@ extern void blk_trace_shutdown(struct request_queue *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts); -extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); +extern __attribute__((format(printf, 2, 3))) +void __trace_note_message(struct blk_trace *, const char *fmt, ...); /** * blk_add_trace_msg - Add a (simple) message to the blktrace stream -- cgit v0.10.2 From d4c208b86b8be4254eba0e74071496e599f94639 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Jun 2011 12:45:48 +0200 Subject: block: use the passed in @bdev when claiming if partno is zero 6b4517a791 (block: implement bd_claiming and claiming block) introduced claiming block to support O_EXCL blkdev opens properly. bd_start_claiming() looks up the part 0 bdev and starts claiming block. The function assumed that there is only one part 0 bdev and always used bdget_disk(disk, 0) to look it up; unfortunately, this isn't true for some drivers (floppy) which use multiple block devices to denote different operating parameters for the same physical device. There can be multiple part 0 bdev's for the same device number. This incorrect assumption caused the wrong bdev to be used during claiming leading to unbalanced bd_holders as reported in the following bug. https://bugzilla.kernel.org/show_bug.cgi?id=28522 This patch updates bd_start_claiming() such that it uses the bdev specified as argument if its partno is zero. Note that this means that different bdev's can be used for the same device and O_EXCL check can be effectively bypassed. It has always been broken that way and floppy is fortunately on its way out. Leave that breakage alone. Signed-off-by: Tejun Heo Reported-by: Alex Villacis Lasso Tested-by: Alex Villacis Lasso Cc: stable@kernel.org # >= v2.6.36 Signed-off-by: Jens Axboe diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a2421f..610e8e0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -762,7 +762,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, if (!disk) return ERR_PTR(-ENXIO); - whole = bdget_disk(disk, 0); + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. + */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + module_put(disk->fops->owner); put_disk(disk); if (!whole) -- cgit v0.10.2 From 155d109b5f52ffd749219b27702462dcd9cf4f8d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 20 Jun 2011 13:23:14 +0200 Subject: block: add REQ_SECURE to REQ_COMMON_MASK Add REQ_SECURE flag to REQ_COMMON_MASK so that init_request_from_bio() can pass it to @req->cmd_flags. Signed-off-by: Namhyung Kim Acked-by: Adrian Hunter Cc: stable@kernel.org # 2.6.36 and newer Signed-off-by: Jens Axboe diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2a7cea5..6395692 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -167,7 +167,7 @@ enum rq_flag_bits { (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_DISCARD | \ - REQ_NOIDLE | REQ_FLUSH | REQ_FUA) + REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE) #define REQ_CLONE_MASK REQ_COMMON_MASK #define REQ_RAHEAD (1 << __REQ_RAHEAD) -- cgit v0.10.2