diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 14 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/bcache/Kconfig | 1 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/bset.c | 4 | ||||
-rw-r--r-- | drivers/md/bcache/stats.c | 34 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 185 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 2 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 75 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 127 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 76 | ||||
-rw-r--r-- | drivers/md/dm-switch.c | 538 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 35 | ||||
-rw-r--r-- | drivers/md/dm-verity.c | 17 | ||||
-rw-r--r-- | drivers/md/dm.c | 177 | ||||
-rw-r--r-- | drivers/md/md.c | 55 | ||||
-rw-r--r-- | drivers/md/md.h | 8 | ||||
-rw-r--r-- | drivers/md/raid0.c | 1 | ||||
-rw-r--r-- | drivers/md/raid1.c | 45 | ||||
-rw-r--r-- | drivers/md/raid10.c | 112 | ||||
-rw-r--r-- | drivers/md/raid5.c | 12 |
25 files changed, 1148 insertions, 395 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3bfc8f1..30b426e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -412,4 +412,18 @@ config DM_VERITY If unsure, say N. +config DM_SWITCH + tristate "Switch target support (EXPERIMENTAL)" + depends on BLK_DEV_DM + ---help--- + This device-mapper target creates a device that supports an arbitrary + mapping of fixed-size regions of I/O across a fixed set of paths. + The path used for any specific region can be switched dynamically + by sending the target a message. + + To compile this code as a module, choose M here: the module will + be called dm-switch. + + If unsure, say N. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 1439fd4..5ef78ef 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o +obj-$(CONFIG_DM_SWITCH) += dm-switch.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 05c220d..f950c9d 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -1,7 +1,6 @@ config BCACHE tristate "Block device as cache" - select CLOSURES ---help--- Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. 
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 340146d..d3e15b4 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1241,7 +1241,7 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); -void bch_writeback_init_cached_dev(struct cached_dev *); +void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); void bch_cache_allocator_exit(struct cache *ca); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index cb4578a..1d27d3a 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -395,7 +395,7 @@ void inorder_test(void) #endif /* - * Cacheline/offset <-> bkey pointer arithmatic: + * Cacheline/offset <-> bkey pointer arithmetic: * * t->tree is a binary search tree in an array; each node corresponds to a key * in one cacheline in t->set (BSET_CACHELINE bytes). @@ -404,7 +404,7 @@ void inorder_test(void) * the binary tree points to; to_inorder() gives us the cacheline, and then * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes. * - * cacheline_to_bkey() and friends abstract out all the pointer arithmatic to + * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to * make this work. 
* * To construct the bfloat for an arbitrary key we need to know what the key diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 64e6794..b8730e7 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -93,24 +93,6 @@ static struct attribute *bch_stats_files[] = { }; static KTYPE(bch_stats); -static void scale_accounting(unsigned long data); - -void bch_cache_accounting_init(struct cache_accounting *acc, - struct closure *parent) -{ - kobject_init(&acc->total.kobj, &bch_stats_ktype); - kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); - kobject_init(&acc->hour.kobj, &bch_stats_ktype); - kobject_init(&acc->day.kobj, &bch_stats_ktype); - - closure_init(&acc->cl, parent); - init_timer(&acc->timer); - acc->timer.expires = jiffies + accounting_delay; - acc->timer.data = (unsigned long) acc; - acc->timer.function = scale_accounting; - add_timer(&acc->timer); -} - int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, struct kobject *parent) { @@ -244,3 +226,19 @@ void bch_mark_sectors_bypassed(struct search *s, int sectors) atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); } + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent) +{ + kobject_init(&acc->total.kobj, &bch_stats_ktype); + kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); + kobject_init(&acc->hour.kobj, &bch_stats_ktype); + kobject_init(&acc->day.kobj, &bch_stats_ktype); + + closure_init(&acc->cl, parent); + init_timer(&acc->timer); + acc->timer.expires = jiffies + accounting_delay; + acc->timer.data = (unsigned long) acc; + acc->timer.function = scale_accounting; + add_timer(&acc->timer); +} diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c8046bc..f88e2b6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -634,11 +634,10 @@ static int open_dev(struct block_device *b, fmode_t mode) 
return 0; } -static int release_dev(struct gendisk *b, fmode_t mode) +static void release_dev(struct gendisk *b, fmode_t mode) { struct bcache_device *d = b->private_data; closure_put(&d->cl); - return 0; } static int ioctl_dev(struct block_device *b, fmode_t mode, @@ -732,8 +731,7 @@ static void bcache_device_free(struct bcache_device *d) if (d->c) bcache_device_detach(d); - - if (d->disk) + if (d->disk && d->disk->flags & GENHD_FL_UP) del_gendisk(d->disk); if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); @@ -756,12 +754,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || - bio_split_pool_init(&d->bio_split_hook)) - - return -ENOMEM; - - d->disk = alloc_disk(1); - if (!d->disk) + bio_split_pool_init(&d->bio_split_hook) || + !(d->disk = alloc_disk(1)) || + !(q = blk_alloc_queue(GFP_KERNEL))) return -ENOMEM; snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); @@ -771,10 +766,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) d->disk->fops = &bcache_ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); - if (!q) - return -ENOMEM; - blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; @@ -999,14 +990,17 @@ static void cached_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); bcache_device_free(&dc->disk); list_del(&dc->list); mutex_unlock(&bch_register_lock); if (!IS_ERR_OR_NULL(dc->bdev)) { - blk_sync_queue(bdev_get_queue(dc->bdev)); + if (dc->bdev->bd_disk) + blk_sync_queue(bdev_get_queue(dc->bdev)); + blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } @@ -1028,73 +1022,67 @@ static void cached_dev_flush(struct closure *cl) static 
int cached_dev_init(struct cached_dev *dc, unsigned block_size) { - int err; + int ret; struct io *io; - - closure_init(&dc->disk.cl, NULL); - set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + struct request_queue *q = bdev_get_queue(dc->bdev); __module_get(THIS_MODULE); INIT_LIST_HEAD(&dc->list); + closure_init(&dc->disk.cl, NULL); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); - - bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - - err = bcache_device_init(&dc->disk, block_size); - if (err) - goto err; - - spin_lock_init(&dc->io_lock); - closure_init_unlocked(&dc->sb_write); INIT_WORK(&dc->detach, cached_dev_detach_finish); + closure_init_unlocked(&dc->sb_write); + INIT_LIST_HEAD(&dc->io_lru); + spin_lock_init(&dc->io_lock); + bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); dc->sequential_merge = true; dc->sequential_cutoff = 4 << 20; - INIT_LIST_HEAD(&dc->io_lru); - dc->sb_bio.bi_max_vecs = 1; - dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; - for (io = dc->io; io < dc->io + RECENT_IO; io++) { list_add(&io->lru, &dc->io_lru); hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - bch_writeback_init_cached_dev(dc); + ret = bcache_device_init(&dc->disk, block_size); + if (ret) + return ret; + + set_capacity(dc->disk.disk, + dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + + dc->disk.disk->queue->backing_dev_info.ra_pages = + max(dc->disk.disk->queue->backing_dev_info.ra_pages, + q->backing_dev_info.ra_pages); + + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); return 0; -err: - bcache_device_stop(&dc->disk); - return err; } /* Cached device - bcache superblock */ -static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, +static void register_bdev(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cached_dev *dc) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; - struct 
gendisk *g; struct cache_set *c; - if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0) - return err; - memcpy(&dc->sb, sb, sizeof(struct cache_sb)); - dc->sb_bio.bi_io_vec[0].bv_page = sb_page; dc->bdev = bdev; dc->bdev->bd_holder = dc; - g = dc->disk.disk; - - set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - - g->queue->backing_dev_info.ra_pages = - max(g->queue->backing_dev_info.ra_pages, - bdev->bd_queue->backing_dev_info.ra_pages); + bio_init(&dc->sb_bio); + dc->sb_bio.bi_max_vecs = 1; + dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; + dc->sb_bio.bi_io_vec[0].bv_page = sb_page; + get_page(sb_page); - bch_cached_dev_request_init(dc); + if (cached_dev_init(dc, sb->block_size << 9)) + goto err; err = "error creating kobject"; if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, @@ -1103,6 +1091,8 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; + pr_info("registered backing device %s", bdevname(bdev, name)); + list_add(&dc->list, &uncached_devices); list_for_each_entry(c, &bch_cache_sets, list) bch_cached_dev_attach(dc, c); @@ -1111,15 +1101,10 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) bch_cached_dev_run(dc); - return NULL; + return; err: - kobject_put(&dc->disk.kobj); pr_notice("error opening %s: %s", bdevname(bdev, name), err); - /* - * Return NULL instead of an error because kobject_put() cleans - * everything up - */ - return NULL; + bcache_device_stop(&dc->disk); } /* Flash only volumes */ @@ -1717,20 +1702,11 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) size_t free; struct bucket *b; - if (!ca) - return -ENOMEM; - __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - memcpy(&ca->sb, sb, sizeof(struct cache_sb)); - INIT_LIST_HEAD(&ca->discards); - bio_init(&ca->sb_bio); - ca->sb_bio.bi_max_vecs = 
1; - ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; - bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; @@ -1742,18 +1718,17 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) || - !(ca->buckets = vmalloc(sizeof(struct bucket) * + !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->sb.nbuckets)) || !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || bio_split_pool_init(&ca->bio_split_hook)) - goto err; + return -ENOMEM; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); - memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket)); for_each_bucket(b, ca) atomic_set(&b->pin, 0); @@ -1766,22 +1741,28 @@ err: return -ENOMEM; } -static const char *register_cache(struct cache_sb *sb, struct page *sb_page, +static void register_cache(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; - if (cache_alloc(sb, ca) != 0) - return err; - - ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; + bio_init(&ca->sb_bio); + ca->sb_bio.bi_max_vecs = 1; + ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; + ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + get_page(sb_page); + if (blk_queue_discard(bdev_get_queue(ca->bdev))) ca->discard = CACHE_DISCARD(&ca->sb); + if (cache_alloc(sb, ca) != 0) + goto err; + err = "error creating kobject"; if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) goto err; @@ -1791,15 +1772,10 @@ static const char *register_cache(struct cache_sb *sb, 
struct page *sb_page, goto err; pr_info("registered cache device %s", bdevname(bdev, name)); - - return NULL; + return; err: + pr_notice("error opening %s: %s", bdevname(bdev, name), err); kobject_put(&ca->kobj); - pr_info("error opening %s: %s", bdevname(bdev, name), err); - /* Return NULL instead of an error because kobject_put() cleans - * everything up - */ - return NULL; } /* Global interfaces/init */ @@ -1833,12 +1809,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); - if (bdev == ERR_PTR(-EBUSY)) - err = "device busy"; - - if (IS_ERR(bdev) || - set_blocksize(bdev, 4096)) + if (IS_ERR(bdev)) { + if (bdev == ERR_PTR(-EBUSY)) + err = "device busy"; goto err; + } + + err = "failed to set blocksize"; + if (set_blocksize(bdev, 4096)) + goto err_close; err = read_super(sb, bdev, &sb_page); if (err) @@ -1846,33 +1825,33 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) + goto err_close; - err = register_bdev(sb, sb_page, bdev, dc); + register_bdev(sb, sb_page, bdev, dc); } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto err_close; - err = register_cache(sb, sb_page, bdev, ca); + register_cache(sb, sb_page, bdev, ca); } - - if (err) { - /* register_(bdev|cache) will only return an error if they - * didn't get far enough to create the kobject - if they did, - * the kobject destructor will do this cleanup. 
- */ +out: + if (sb_page) put_page(sb_page); -err_close: - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -err: - if (attr != &ksysfs_register_quiet) - pr_info("error opening %s: %s", path, err); - ret = -EINVAL; - } - kfree(sb); kfree(path); mutex_unlock(&bch_register_lock); module_put(THIS_MODULE); return ret; + +err_close: + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +err: + if (attr != &ksysfs_register_quiet) + pr_info("error opening %s: %s", path, err); + ret = -EINVAL; + goto out; } static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 93e7e31..2714ed3 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -375,7 +375,7 @@ err: refill_dirty(cl); } -void bch_writeback_init_cached_dev(struct cached_dev *dc) +void bch_cached_dev_writeback_init(struct cached_dev *dc) { closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5a2c754..a7fd821 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -2002,9 +2002,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } else { int rv; if (buf[0] == '+') - rv = strict_strtoll(buf+1, 10, &offset); + rv = kstrtoll(buf+1, 10, &offset); else - rv = strict_strtoll(buf, 10, &offset); + rv = kstrtoll(buf, 10, &offset); if (rv) return rv; if (offset == 0) @@ -2139,7 +2139,7 @@ static ssize_t backlog_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long backlog; - int rv = strict_strtoul(buf, 10, &backlog); + int rv = kstrtoul(buf, 10, &backlog); if (rv) return rv; if (backlog > COUNTER_MAX) @@ -2165,7 +2165,7 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len) unsigned long csize; if (mddev->bitmap) return -EBUSY; - rv = strict_strtoul(buf, 10, &csize); + rv = kstrtoul(buf, 10, &csize); if (rv) return rv; if (csize < 512 || diff --git 
a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 0387e05..5227e07 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -145,6 +145,7 @@ struct dm_buffer { unsigned long state; unsigned long last_accessed; struct dm_bufio_client *c; + struct list_head write_list; struct bio bio; struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; }; @@ -349,7 +350,7 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, if (gfp_mask & __GFP_NORETRY) noio_flag = memalloc_noio_save(); - ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL); if (gfp_mask & __GFP_NORETRY) memalloc_noio_restore(noio_flag); @@ -630,7 +631,8 @@ static int do_io_schedule(void *word) * - Submit our write and don't wait on it. We set B_WRITING indicating * that there is a write in progress. */ -static void __write_dirty_buffer(struct dm_buffer *b) +static void __write_dirty_buffer(struct dm_buffer *b, + struct list_head *write_list) { if (!test_bit(B_DIRTY, &b->state)) return; @@ -639,7 +641,24 @@ static void __write_dirty_buffer(struct dm_buffer *b) wait_on_bit_lock(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); - submit_io(b, WRITE, b->block, write_endio); + if (!write_list) + submit_io(b, WRITE, b->block, write_endio); + else + list_add_tail(&b->write_list, write_list); +} + +static void __flush_write_list(struct list_head *write_list) +{ + struct blk_plug plug; + blk_start_plug(&plug); + while (!list_empty(write_list)) { + struct dm_buffer *b = + list_entry(write_list->next, struct dm_buffer, write_list); + list_del(&b->write_list); + submit_io(b, WRITE, b->block, write_endio); + dm_bufio_cond_resched(); + } + blk_finish_plug(&plug); } /* @@ -655,7 +674,7 @@ static void __make_buffer_clean(struct dm_buffer *b) return; wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); - __write_dirty_buffer(b); + __write_dirty_buffer(b, NULL); wait_on_bit(&b->state, B_WRITING, 
do_io_schedule, TASK_UNINTERRUPTIBLE); } @@ -802,7 +821,8 @@ static void __free_buffer_wake(struct dm_buffer *b) wake_up(&c->free_buffer_wait); } -static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait) +static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, + struct list_head *write_list) { struct dm_buffer *b, *tmp; @@ -818,7 +838,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait) if (no_wait && test_bit(B_WRITING, &b->state)) return; - __write_dirty_buffer(b); + __write_dirty_buffer(b, write_list); dm_bufio_cond_resched(); } } @@ -853,7 +873,8 @@ static void __get_memory_limit(struct dm_bufio_client *c, * If we are over threshold_buffers, start freeing buffers. * If we're over "limit_buffers", block until we get under the limit. */ -static void __check_watermark(struct dm_bufio_client *c) +static void __check_watermark(struct dm_bufio_client *c, + struct list_head *write_list) { unsigned long threshold_buffers, limit_buffers; @@ -872,7 +893,7 @@ static void __check_watermark(struct dm_bufio_client *c) } if (c->n_buffers[LIST_DIRTY] > threshold_buffers) - __write_dirty_buffers_async(c, 1); + __write_dirty_buffers_async(c, 1, write_list); } /* @@ -897,7 +918,8 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) *--------------------------------------------------------------*/ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, - enum new_flag nf, int *need_submit) + enum new_flag nf, int *need_submit, + struct list_head *write_list) { struct dm_buffer *b, *new_b = NULL; @@ -924,7 +946,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, goto found_buffer; } - __check_watermark(c); + __check_watermark(c, write_list); b = new_b; b->hold_count = 1; @@ -992,10 +1014,14 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, int need_submit; struct dm_buffer *b; + LIST_HEAD(write_list); + 
dm_bufio_lock(c); - b = __bufio_new(c, block, nf, &need_submit); + b = __bufio_new(c, block, nf, &need_submit, &write_list); dm_bufio_unlock(c); + __flush_write_list(&write_list); + if (!b) return b; @@ -1047,6 +1073,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, { struct blk_plug plug; + LIST_HEAD(write_list); + BUG_ON(dm_bufio_in_request()); blk_start_plug(&plug); @@ -1055,7 +1083,15 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, for (; n_blocks--; block++) { int need_submit; struct dm_buffer *b; - b = __bufio_new(c, block, NF_PREFETCH, &need_submit); + b = __bufio_new(c, block, NF_PREFETCH, &need_submit, + &write_list); + if (unlikely(!list_empty(&write_list))) { + dm_bufio_unlock(c); + blk_finish_plug(&plug); + __flush_write_list(&write_list); + blk_start_plug(&plug); + dm_bufio_lock(c); + } if (unlikely(b != NULL)) { dm_bufio_unlock(c); @@ -1069,7 +1105,6 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, goto flush_plug; dm_bufio_lock(c); } - } dm_bufio_unlock(c); @@ -1126,11 +1161,14 @@ EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) { + LIST_HEAD(write_list); + BUG_ON(dm_bufio_in_request()); dm_bufio_lock(c); - __write_dirty_buffers_async(c, 0); + __write_dirty_buffers_async(c, 0, &write_list); dm_bufio_unlock(c); + __flush_write_list(&write_list); } EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); @@ -1147,8 +1185,13 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) unsigned long buffers_processed = 0; struct dm_buffer *b, *tmp; + LIST_HEAD(write_list); + + dm_bufio_lock(c); + __write_dirty_buffers_async(c, 0, &write_list); + dm_bufio_unlock(c); + __flush_write_list(&write_list); dm_bufio_lock(c); - __write_dirty_buffers_async(c, 0); again: list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { @@ -1274,7 +1317,7 @@ retry: BUG_ON(!b->hold_count); BUG_ON(test_bit(B_READING, &b->state)); - __write_dirty_buffer(b); + __write_dirty_buffer(b, 
NULL); if (b->hold_count == 1) { wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index df44b60..0df3ec0 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -425,6 +425,10 @@ static bool block_size_is_power_of_two(struct cache *cache) return cache->sectors_per_block_shift >= 0; } +/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ +#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 +__always_inline +#endif static dm_block_t block_div(dm_block_t b, uint32_t n) { do_div(b, n); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 7fcf21c..c80a0ec 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -176,7 +176,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (!fc) { - ti->error = "Cannot allocate linear context"; + ti->error = "Cannot allocate context"; return -ENOMEM; } fc->start_time = jiffies; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index aa04f02..f1b7586 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -36,6 +36,14 @@ struct hash_cell { struct dm_table *new_map; }; +/* + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. 
+ */ +struct dm_table { + int undefined__; +}; + struct vers_iter { size_t param_size; struct dm_target_versions *vers, *old_vers; @@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi return -EBUSY; } -static void __hash_remove(struct hash_cell *hc) +static struct dm_table *__hash_remove(struct hash_cell *hc) { struct dm_table *table; + int srcu_idx; /* remove from the dev hash */ list_del(&hc->uuid_list); @@ -253,16 +262,18 @@ static void __hash_remove(struct hash_cell *hc) dm_set_mdptr(hc->md, NULL); mutex_unlock(&dm_hash_cells_mutex); - table = dm_get_live_table(hc->md); - if (table) { + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) dm_table_event(table); - dm_table_put(table); - } + dm_put_live_table(hc->md, srcu_idx); + table = NULL; if (hc->new_map) - dm_table_destroy(hc->new_map); + table = hc->new_map; dm_put(hc->md); free_cell(hc); + + return table; } static void dm_hash_remove_all(int keep_open_devices) @@ -270,6 +281,7 @@ static void dm_hash_remove_all(int keep_open_devices) int i, dev_skipped; struct hash_cell *hc; struct mapped_device *md; + struct dm_table *t; retry: dev_skipped = 0; @@ -287,10 +299,14 @@ retry: continue; } - __hash_remove(hc); + t = __hash_remove(hc); up_write(&_hash_lock); + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } dm_put(md); if (likely(keep_open_devices)) dm_destroy(md); @@ -356,6 +372,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, struct dm_table *table; struct mapped_device *md; unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; + int srcu_idx; /* * duplicate new. @@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Wake up any dm event waiters. 
*/ - table = dm_get_live_table(hc->md); - if (table) { + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) dm_table_event(table); - dm_table_put(table); - } + dm_put_live_table(hc->md, srcu_idx); if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) param->flags |= DM_UEVENT_GENERATED_FLAG; @@ -620,11 +636,14 @@ static int check_name(const char *name) * _hash_lock without first calling dm_table_put, because dm_table_destroy * waits for this dm_table_put and could be called under this lock. */ -static struct dm_table *dm_get_inactive_table(struct mapped_device *md) +static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx) { struct hash_cell *hc; struct dm_table *table = NULL; + /* increment rcu count, we don't care about the table pointer */ + dm_get_live_table(md, srcu_idx); + down_read(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { @@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md) } table = hc->new_map; - if (table) - dm_table_get(table); out: up_read(&_hash_lock); @@ -643,10 +660,11 @@ out: } static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, - struct dm_ioctl *param) + struct dm_ioctl *param, + int *srcu_idx) { return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? 
- dm_get_inactive_table(md) : dm_get_live_table(md); + dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx); } /* @@ -657,6 +675,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) { struct gendisk *disk = dm_disk(md); struct dm_table *table; + int srcu_idx; param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | DM_ACTIVE_PRESENT_FLAG); @@ -676,26 +695,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) param->event_nr = dm_get_event_nr(md); param->target_count = 0; - table = dm_get_live_table(md); + table = dm_get_live_table(md, &srcu_idx); if (table) { if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { if (get_disk_ro(disk)) param->flags |= DM_READONLY_FLAG; param->target_count = dm_table_get_num_targets(table); } - dm_table_put(table); param->flags |= DM_ACTIVE_PRESENT_FLAG; } + dm_put_live_table(md, srcu_idx); if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { - table = dm_get_inactive_table(md); + int srcu_idx; + table = dm_get_inactive_table(md, &srcu_idx); if (table) { if (!(dm_table_get_mode(table) & FMODE_WRITE)) param->flags |= DM_READONLY_FLAG; param->target_count = dm_table_get_num_targets(table); - dm_table_put(table); } + dm_put_live_table(md, srcu_idx); } } @@ -796,6 +816,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) struct hash_cell *hc; struct mapped_device *md; int r; + struct dm_table *t; down_write(&_hash_lock); hc = __find_device_hash_cell(param); @@ -819,9 +840,14 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) return r; } - __hash_remove(hc); + t = __hash_remove(hc); up_write(&_hash_lock); + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) param->flags |= DM_UEVENT_GENERATED_FLAG; @@ -986,6 +1012,7 @@ static int do_resume(struct dm_ioctl *param) old_map = dm_swap_table(md, new_map); if (IS_ERR(old_map)) { + dm_sync_table(md); dm_table_destroy(new_map); 
dm_put(md); return PTR_ERR(old_map); @@ -1003,6 +1030,10 @@ static int do_resume(struct dm_ioctl *param) param->flags |= DM_UEVENT_GENERATED_FLAG; } + /* + * Since dm_swap_table synchronizes RCU, nobody should be in + * read-side critical section already. + */ if (old_map) dm_table_destroy(old_map); @@ -1125,6 +1156,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) int r = 0; struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1145,11 +1177,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) */ __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_status(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); out: dm_put(md); @@ -1221,7 +1252,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) { int r; struct hash_cell *hc; - struct dm_table *t; + struct dm_table *t, *old_map = NULL; struct mapped_device *md; struct target_type *immutable_target_type; @@ -1277,14 +1308,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size) hc = dm_get_mdptr(md); if (!hc || hc->md != md) { DMWARN("device has been removed from the dev hash table."); - dm_table_destroy(t); up_write(&_hash_lock); + dm_table_destroy(t); r = -ENXIO; goto out; } if (hc->new_map) - dm_table_destroy(hc->new_map); + old_map = hc->new_map; hc->new_map = t; up_write(&_hash_lock); @@ -1292,6 +1323,11 @@ static int table_load(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); out: + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } + dm_put(md); return r; @@ -1301,6 +1337,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) { struct hash_cell *hc; struct mapped_device *md; + struct dm_table *old_map = NULL; down_write(&_hash_lock); @@ -1312,7 +1349,7 @@ static int table_clear(struct 
dm_ioctl *param, size_t param_size) } if (hc->new_map) { - dm_table_destroy(hc->new_map); + old_map = hc->new_map; hc->new_map = NULL; } @@ -1321,6 +1358,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } dm_put(md); return 0; @@ -1370,6 +1411,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1377,11 +1419,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_deps(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); dm_put(md); @@ -1396,6 +1437,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1403,11 +1445,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_status(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); dm_put(md); @@ -1443,6 +1484,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) struct dm_target_msg *tmsg = (void *) param + param->data_start; size_t maxlen; char *result = get_result_buffer(param, param_size, &maxlen); + int srcu_idx; md = find_device(param); if (!md) @@ -1470,9 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (r <= 1) goto out_argv; - table = dm_get_live_table(md); + table = dm_get_live_table(md, &srcu_idx); if 
(!table) - goto out_argv; + goto out_table; if (dm_deleting_md(md)) { r = -ENXIO; @@ -1491,7 +1533,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) } out_table: - dm_table_put(table); + dm_put_live_table(md, srcu_idx); out_argv: kfree(argv); out: @@ -1644,7 +1686,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern } if (!dmi) { - dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); + unsigned noio_flag; + noio_flag = memalloc_noio_save(); + dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); + memalloc_noio_restore(noio_flag); if (dmi) *param_flags |= DM_PARAMS_VMALLOC; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bdf26f5..5adede1 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1561,7 +1561,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long flags; int r; -again: bdev = NULL; mode = 0; r = 0; @@ -1579,7 +1578,7 @@ again: } if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) - r = -EAGAIN; + r = -ENOTCONN; else if (!bdev) r = -EIO; @@ -1591,11 +1590,8 @@ again: if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) r = scsi_verify_blk_ioctl(NULL, cmd); - if (r == -EAGAIN && !fatal_signal_pending(current)) { + if (r == -ENOTCONN && !fatal_signal_pending(current)) queue_work(kmultipathd, &m->process_queued_ios); - msleep(10); - goto again; - } return r ? 
: __blkdev_driver_ioctl(bdev, mode, cmd, arg); } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1d3fe1a..4880b69 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -380,7 +380,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) static int validate_raid_redundancy(struct raid_set *rs) { unsigned i, rebuild_cnt = 0; - unsigned rebuilds_per_group, copies, d; + unsigned rebuilds_per_group = 0, copies, d; unsigned group_size, last_group_start; for (i = 0; i < rs->md.raid_disks; i++) @@ -504,7 +504,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, * First, parse the in-order required arguments * "chunk_size" is the only argument of this type. */ - if ((strict_strtoul(argv[0], 10, &value) < 0)) { + if ((kstrtoul(argv[0], 10, &value) < 0)) { rs->ti->error = "Bad chunk size"; return -EINVAL; } else if (rs->raid_type->level == 1) { @@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, continue; } - if (strict_strtoul(argv[i], 10, &value) < 0) { + if (kstrtoul(argv[i], 10, &value) < 0) { rs->ti->error = "Bad numerical argument given in raid params"; return -EINVAL; } @@ -1181,7 +1181,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) argv++; /* number of RAID parameters */ - if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { + if (kstrtoul(argv[0], 10, &num_raid_params) < 0) { ti->error = "Cannot understand number of RAID parameters"; return -EINVAL; } @@ -1194,7 +1194,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || (num_raid_devs >= INT_MAX)) { ti->error = "Cannot understand number of raid devices"; return -EINVAL; @@ -1388,6 +1388,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * performing a "check" of the array. 
*/ DMEMIT(" %llu", + (strcmp(rs->md.last_sync_action, "check")) ? 0 : (unsigned long long) atomic64_read(&rs->md.resync_mismatches)); break; @@ -1572,6 +1573,62 @@ static void raid_postsuspend(struct dm_target *ti) mddev_suspend(&rs->md); } +static void attempt_restore_of_faulty_devices(struct raid_set *rs) +{ + int i; + uint64_t failed_devices, cleared_failed_devices = 0; + unsigned long flags; + struct dm_raid_superblock *sb; + struct md_rdev *r; + + for (i = 0; i < rs->md.raid_disks; i++) { + r = &rs->dev[i].rdev; + if (test_bit(Faulty, &r->flags) && r->sb_page && + sync_page_io(r, 0, r->sb_size, r->sb_page, READ, 1)) { + DMINFO("Faulty %s device #%d has readable super block." + " Attempting to revive it.", + rs->raid_type->name, i); + + /* + * Faulty bit may be set, but sometimes the array can + * be suspended before the personalities can respond + * by removing the device from the array (i.e. calling + * 'hot_remove_disk'). If they haven't yet removed + * the failed device, its 'raid_disk' number will be + * '>= 0' - meaning we must call this function + * ourselves. 
+ */ + if ((r->raid_disk >= 0) && + (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0)) + /* Failed to revive this device, try next */ + continue; + + r->raid_disk = i; + r->saved_raid_disk = i; + flags = r->flags; + clear_bit(Faulty, &r->flags); + clear_bit(WriteErrorSeen, &r->flags); + clear_bit(In_sync, &r->flags); + if (r->mddev->pers->hot_add_disk(r->mddev, r)) { + r->raid_disk = -1; + r->saved_raid_disk = -1; + r->flags = flags; + } else { + r->recovery_offset = 0; + cleared_failed_devices |= 1 << i; + } + } + } + if (cleared_failed_devices) { + rdev_for_each(r, &rs->md) { + sb = page_address(r->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + failed_devices &= ~cleared_failed_devices; + sb->failed_devices = cpu_to_le64(failed_devices); + } + } +} + static void raid_resume(struct dm_target *ti) { struct raid_set *rs = ti->private; @@ -1580,6 +1637,13 @@ static void raid_resume(struct dm_target *ti) if (!rs->bitmap_loaded) { bitmap_load(&rs->md); rs->bitmap_loaded = 1; + } else { + /* + * A secondary resume while the device is active. + * Take this opportunity to check whether any failed + * devices are reachable again. + */ + attempt_restore_of_faulty_devices(rs); } clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); @@ -1588,7 +1652,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 5, 0}, + .version = {1, 5, 2}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c new file mode 100644 index 0000000..ff9ac4b --- /dev/null +++ b/drivers/md/dm-switch.c @@ -0,0 +1,538 @@ +/* + * Copyright (C) 2010-2012 by Dell Inc. All rights reserved. + * Copyright (C) 2011-2013 Red Hat, Inc. + * + * This file is released under the GPL. 
+ * + * dm-switch is a device-mapper target that maps IO to underlying block + * devices efficiently when there are a large number of fixed-sized + * address regions but there is no simple pattern to allow for a compact + * mapping representation such as dm-stripe. + */ + +#include <linux/device-mapper.h> + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/vmalloc.h> + +#define DM_MSG_PREFIX "switch" + +/* + * One region_table_slot_t holds <region_entries_per_slot> region table + * entries each of which is <region_table_entry_bits> in size. + */ +typedef unsigned long region_table_slot_t; + +/* + * A device with the offset to its start sector. + */ +struct switch_path { + struct dm_dev *dmdev; + sector_t start; +}; + +/* + * Context block for a dm switch device. + */ +struct switch_ctx { + struct dm_target *ti; + + unsigned nr_paths; /* Number of paths in path_list. */ + + unsigned region_size; /* Region size in 512-byte sectors */ + unsigned long nr_regions; /* Number of regions making up the device */ + signed char region_size_bits; /* log2 of region_size or -1 */ + + unsigned char region_table_entry_bits; /* Number of bits in one region table entry */ + unsigned char region_entries_per_slot; /* Number of entries in one region table slot */ + signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */ + + region_table_slot_t *region_table; /* Region table */ + + /* + * Array of dm devices to switch between. 
+ */ + struct switch_path path_list[0]; +}; + +static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths, + unsigned region_size) +{ + struct switch_ctx *sctx; + + sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path), + GFP_KERNEL); + if (!sctx) + return NULL; + + sctx->ti = ti; + sctx->region_size = region_size; + + ti->private = sctx; + + return sctx; +} + +static int alloc_region_table(struct dm_target *ti, unsigned nr_paths) +{ + struct switch_ctx *sctx = ti->private; + sector_t nr_regions = ti->len; + sector_t nr_slots; + + if (!(sctx->region_size & (sctx->region_size - 1))) + sctx->region_size_bits = __ffs(sctx->region_size); + else + sctx->region_size_bits = -1; + + sctx->region_table_entry_bits = 1; + while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 && + (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths) + sctx->region_table_entry_bits++; + + sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits; + if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1))) + sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot); + else + sctx->region_entries_per_slot_bits = -1; + + if (sector_div(nr_regions, sctx->region_size)) + nr_regions++; + + sctx->nr_regions = nr_regions; + if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) { + ti->error = "Region table too large"; + return -EINVAL; + } + + nr_slots = nr_regions; + if (sector_div(nr_slots, sctx->region_entries_per_slot)) + nr_slots++; + + if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) { + ti->error = "Region table too large"; + return -EINVAL; + } + + sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t)); + if (!sctx->region_table) { + ti->error = "Cannot allocate region table"; + return -ENOMEM; + } + + return 0; +} + +static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr, + 
unsigned long *region_index, unsigned *bit) +{ + if (sctx->region_entries_per_slot_bits >= 0) { + *region_index = region_nr >> sctx->region_entries_per_slot_bits; + *bit = region_nr & (sctx->region_entries_per_slot - 1); + } else { + *region_index = region_nr / sctx->region_entries_per_slot; + *bit = region_nr % sctx->region_entries_per_slot; + } + + *bit *= sctx->region_table_entry_bits; +} + +/* + * Find which path to use at given offset. + */ +static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) +{ + unsigned long region_index; + unsigned bit, path_nr; + sector_t p; + + p = offset; + if (sctx->region_size_bits >= 0) + p >>= sctx->region_size_bits; + else + sector_div(p, sctx->region_size); + + switch_get_position(sctx, p, &region_index, &bit); + path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + ((1 << sctx->region_table_entry_bits) - 1); + + /* This can only happen if the processor uses non-atomic stores. */ + if (unlikely(path_nr >= sctx->nr_paths)) + path_nr = 0; + + return path_nr; +} + +static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr, + unsigned value) +{ + unsigned long region_index; + unsigned bit; + region_table_slot_t pte; + + switch_get_position(sctx, region_nr, &region_index, &bit); + + pte = sctx->region_table[region_index]; + pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit); + pte |= (region_table_slot_t)value << bit; + sctx->region_table[region_index] = pte; +} + +/* + * Fill the region table with an initial round robin pattern. 
+ */ +static void initialise_region_table(struct switch_ctx *sctx) +{ + unsigned path_nr = 0; + unsigned long region_nr; + + for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) { + switch_region_table_write(sctx, region_nr, path_nr); + if (++path_nr >= sctx->nr_paths) + path_nr = 0; + } +} + +static int parse_path(struct dm_arg_set *as, struct dm_target *ti) +{ + struct switch_ctx *sctx = ti->private; + unsigned long long start; + int r; + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &sctx->path_list[sctx->nr_paths].dmdev); + if (r) { + ti->error = "Device lookup failed"; + return r; + } + + if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) { + ti->error = "Invalid device starting offset"; + dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); + return -EINVAL; + } + + sctx->path_list[sctx->nr_paths].start = start; + + sctx->nr_paths++; + + return 0; +} + +/* + * Destructor: Don't free the dm_target, just the ti->private data (if any). + */ +static void switch_dtr(struct dm_target *ti) +{ + struct switch_ctx *sctx = ti->private; + + while (sctx->nr_paths--) + dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); + + vfree(sctx->region_table); + kfree(sctx); +} + +/* + * Constructor arguments: + * <num_paths> <region_size> <num_optional_args> [<optional_args>...] + * [<dev_path> <offset>]+ + * + * Optional args are to allow for future extension: currently this + * parameter must be 0. 
+ */ +static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + static struct dm_arg _args[] = { + {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, + {1, UINT_MAX, "Invalid region size"}, + {0, 0, "Invalid number of optional args"}, + }; + + struct switch_ctx *sctx; + struct dm_arg_set as; + unsigned nr_paths, region_size, nr_optional_args; + int r; + + as.argc = argc; + as.argv = argv; + + r = dm_read_arg(_args, &as, &nr_paths, &ti->error); + if (r) + return -EINVAL; + + r = dm_read_arg(_args + 1, &as, &region_size, &ti->error); + if (r) + return r; + + r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error); + if (r) + return r; + /* parse optional arguments here, if we add any */ + + if (as.argc != nr_paths * 2) { + ti->error = "Incorrect number of path arguments"; + return -EINVAL; + } + + sctx = alloc_switch_ctx(ti, nr_paths, region_size); + if (!sctx) { + ti->error = "Cannot allocate redirection context"; + return -ENOMEM; + } + + r = dm_set_target_max_io_len(ti, region_size); + if (r) + goto error; + + while (as.argc) { + r = parse_path(&as, ti); + if (r) + goto error; + } + + r = alloc_region_table(ti, nr_paths); + if (r) + goto error; + + initialise_region_table(sctx); + + /* For UNMAP, sending the request down any path is sufficient */ + ti->num_discard_bios = 1; + + return 0; + +error: + switch_dtr(ti); + + return r; +} + +static int switch_map(struct dm_target *ti, struct bio *bio) +{ + struct switch_ctx *sctx = ti->private; + sector_t offset = dm_target_offset(ti, bio->bi_sector); + unsigned path_nr = switch_get_path_nr(sctx, offset); + + bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; + bio->bi_sector = sctx->path_list[path_nr].start + offset; + + return DM_MAPIO_REMAPPED; +} + +/* + * We need to parse hex numbers in the message as quickly as possible. + * + * This table-based hex parser improves performance. 
+ * It improves a time to load 1000000 entries compared to the condition-based + * parser. + * table-based parser condition-based parser + * PA-RISC 0.29s 0.31s + * Opteron 0.0495s 0.0498s + */ +static const unsigned char hex_table[256] = { +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, +255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 +}; + +static __always_inline unsigned long parse_hex(const char **string) +{ + unsigned char d; + unsigned long r = 0; + + while ((d = hex_table[(unsigned char)**string]) < 16) { + r = (r << 4) | d; + (*string)++; + } + + return r; +} + +static int process_set_region_mappings(struct switch_ctx *sctx, + unsigned argc, char **argv) +{ + unsigned i; + unsigned long region_index = 0; + + for (i = 1; i < argc; i++) { + unsigned long path_nr; + const char *string = argv[i]; + + if (*string == 
':') + region_index++; + else { + region_index = parse_hex(&string); + if (unlikely(*string != ':')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + } + + string++; + if (unlikely(!*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + + path_nr = parse_hex(&string); + if (unlikely(*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + if (unlikely(region_index >= sctx->nr_regions)) { + DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions); + return -EINVAL; + } + if (unlikely(path_nr >= sctx->nr_paths)) { + DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths); + return -EINVAL; + } + + switch_region_table_write(sctx, region_index, path_nr); + } + + return 0; +} + +/* + * Messages are processed one-at-a-time. + * + * Only set_region_mappings is supported. + */ +static int switch_message(struct dm_target *ti, unsigned argc, char **argv) +{ + static DEFINE_MUTEX(message_mutex); + + struct switch_ctx *sctx = ti->private; + int r = -EINVAL; + + mutex_lock(&message_mutex); + + if (!strcasecmp(argv[0], "set_region_mappings")) + r = process_set_region_mappings(sctx, argc, argv); + else + DMWARN("Unrecognised message received."); + + mutex_unlock(&message_mutex); + + return r; +} + +static void switch_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct switch_ctx *sctx = ti->private; + unsigned sz = 0; + int path_nr; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size); + for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) + DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name, + (unsigned long long)sctx->path_list[path_nr].start); + break; + } +} + +/* + * Switch ioctl: + * + * Passthrough all ioctls 
to the path for sector 0 + */ +static int switch_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg) +{ + struct switch_ctx *sctx = ti->private; + struct block_device *bdev; + fmode_t mode; + unsigned path_nr; + int r = 0; + + path_nr = switch_get_path_nr(sctx, 0); + + bdev = sctx->path_list[path_nr].dmdev->bdev; + mode = sctx->path_list[path_nr].dmdev->mode; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); +} + +static int switch_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct switch_ctx *sctx = ti->private; + int path_nr; + int r; + + for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) { + r = fn(ti, sctx->path_list[path_nr].dmdev, + sctx->path_list[path_nr].start, ti->len, data); + if (r) + return r; + } + + return 0; +} + +static struct target_type switch_target = { + .name = "switch", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = switch_ctr, + .dtr = switch_dtr, + .map = switch_map, + .message = switch_message, + .status = switch_status, + .ioctl = switch_ioctl, + .iterate_devices = switch_iterate_devices, +}; + +static int __init dm_switch_init(void) +{ + int r; + + r = dm_register_target(&switch_target); + if (r < 0) + DMERR("dm_register_target() failed %d", r); + + return r; +} + +static void __exit dm_switch_exit(void) +{ + dm_unregister_target(&switch_target); +} + +module_init(dm_switch_init); +module_exit(dm_switch_exit); + +MODULE_DESCRIPTION(DM_NAME " dynamic path switching target"); +MODULE_AUTHOR("Kevin D. 
O'Kelley <Kevin_OKelley@dell.com>"); +MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>"); +MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>"); +MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1ff252a..f221812 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -26,22 +26,8 @@ #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) -/* - * The table has always exactly one reference from either mapped_device->map - * or hash_cell->new_map. This reference is not counted in table->holders. - * A pair of dm_create_table/dm_destroy_table functions is used for table - * creation/destruction. - * - * Temporary references from the other code increase table->holders. A pair - * of dm_table_get/dm_table_put functions is used to manipulate it. - * - * When the table is about to be destroyed, we wait for table->holders to - * drop to zero. - */ - struct dm_table { struct mapped_device *md; - atomic_t holders; unsigned type; /* btree table */ @@ -208,7 +194,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); INIT_LIST_HEAD(&t->target_callbacks); - atomic_set(&t->holders, 0); if (!num_targets) num_targets = KEYS_PER_NODE; @@ -246,10 +231,6 @@ void dm_table_destroy(struct dm_table *t) if (!t) return; - while (atomic_read(&t->holders)) - msleep(1); - smp_mb(); - /* free the indexes */ if (t->depth >= 2) vfree(t->index[t->depth - 2]); @@ -274,22 +255,6 @@ void dm_table_destroy(struct dm_table *t) kfree(t); } -void dm_table_get(struct dm_table *t) -{ - atomic_inc(&t->holders); -} -EXPORT_SYMBOL(dm_table_get); - -void dm_table_put(struct dm_table *t) -{ - if (!t) - return; - - smp_mb__before_atomic_dec(); - atomic_dec(&t->holders); -} -EXPORT_SYMBOL(dm_table_put); - /* * Checks to see if we need to extend highs or targets. 
*/ diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index b948fd8..4b7941d 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -451,7 +451,7 @@ static void verity_prefetch_io(struct work_struct *work) goto no_prefetch_cluster; if (unlikely(cluster & (cluster - 1))) - cluster = 1 << (fls(cluster) - 1); + cluster = 1 << __fls(cluster); hash_block_start &= ~(sector_t)(cluster - 1); hash_block_end |= cluster - 1; @@ -695,8 +695,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 || - num < 0 || num > 1) { + if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 || + num > 1) { ti->error = "Invalid version"; r = -EINVAL; goto bad; @@ -723,7 +723,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -EINVAL; goto bad; } - v->data_dev_block_bits = ffs(num) - 1; + v->data_dev_block_bits = __ffs(num); if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || !num || (num & (num - 1)) || @@ -733,7 +733,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -EINVAL; goto bad; } - v->hash_dev_block_bits = ffs(num) - 1; + v->hash_dev_block_bits = __ffs(num); if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) @@ -812,7 +812,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } v->hash_per_block_bits = - fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1; + __fls((1 << v->hash_dev_block_bits) / v->digest_size); v->levels = 0; if (v->data_blocks) @@ -831,9 +831,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) for (i = v->levels - 1; i >= 0; i--) { sector_t s; v->hash_level_block[i] = hash_position; - s = verity_position_at_level(v, v->data_blocks, i); - s = (s >> v->hash_per_block_bits) + - !!(s & ((1 << v->hash_per_block_bits) - 1)); + s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 
1) + >> ((i + 1) * v->hash_per_block_bits); if (hash_position + s < hash_position) { ti->error = "Hash device offset overflow"; r = -E2BIG; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d5370a9..9e39d2b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -117,15 +117,29 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_MERGE_IS_OPTIONAL 6 /* + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. + */ +struct dm_table { + int undefined__; +}; + +/* * Work processed by per-device workqueue. */ struct mapped_device { - struct rw_semaphore io_lock; + struct srcu_struct io_barrier; struct mutex suspend_lock; - rwlock_t map_lock; atomic_t holders; atomic_t open_count; + /* + * The current mapping. + * Use dm_get_live_table{_fast} or take suspend_lock for + * dereference. + */ + struct dm_table *map; + unsigned long flags; struct request_queue *queue; @@ -155,11 +169,6 @@ struct mapped_device { struct workqueue_struct *wq; /* - * The current mapping. - */ - struct dm_table *map; - - /* * io objects are allocated from here. 
*/ mempool_t *io_pool; @@ -386,10 +395,14 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - struct dm_table *map = dm_get_live_table(md); + int srcu_idx; + struct dm_table *map; struct dm_target *tgt; int r = -ENOTTY; +retry: + map = dm_get_live_table(md, &srcu_idx); + if (!map || !dm_table_get_size(map)) goto out; @@ -408,7 +421,12 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, r = tgt->type->ioctl(tgt, cmd, arg); out: - dm_table_put(map); + dm_put_live_table(md, srcu_idx); + + if (r == -ENOTCONN) { + msleep(10); + goto retry; + } return r; } @@ -502,20 +520,39 @@ static void queue_io(struct mapped_device *md, struct bio *bio) /* * Everyone (including functions in this file), should use this * function to access the md->map field, and make sure they call - * dm_table_put() when finished. + * dm_put_live_table() when finished. */ -struct dm_table *dm_get_live_table(struct mapped_device *md) +struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) { - struct dm_table *t; - unsigned long flags; + *srcu_idx = srcu_read_lock(&md->io_barrier); + + return srcu_dereference(md->map, &md->io_barrier); +} - read_lock_irqsave(&md->map_lock, flags); - t = md->map; - if (t) - dm_table_get(t); - read_unlock_irqrestore(&md->map_lock, flags); +void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +{ + srcu_read_unlock(&md->io_barrier, srcu_idx); +} - return t; +void dm_sync_table(struct mapped_device *md) +{ + synchronize_srcu(&md->io_barrier); + synchronize_rcu_expedited(); +} + +/* + * A fast alternative to dm_get_live_table/dm_put_live_table. + * The caller must not block between these two functions. 
+ */ +static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(md->map); +} + +static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) +{ + rcu_read_unlock(); } /* @@ -1349,17 +1386,18 @@ static int __split_and_process_non_flush(struct clone_info *ci) /* * Entry point to split a bio into clones and submit them to the targets. */ -static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) +static void __split_and_process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) { struct clone_info ci; int error = 0; - ci.map = dm_get_live_table(md); - if (unlikely(!ci.map)) { + if (unlikely(!map)) { bio_io_error(bio); return; } + ci.map = map; ci.md = md; ci.io = alloc_io(md); ci.io->error = 0; @@ -1386,7 +1424,6 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) /* drop the extra reference count */ dec_pending(ci.io, error); - dm_table_put(ci.map); } /*----------------------------------------------------------------- * CRUD END @@ -1397,7 +1434,7 @@ static int dm_merge_bvec(struct request_queue *q, struct bio_vec *biovec) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); + struct dm_table *map = dm_get_live_table_fast(md); struct dm_target *ti; sector_t max_sectors; int max_size = 0; @@ -1407,7 +1444,7 @@ static int dm_merge_bvec(struct request_queue *q, ti = dm_table_find_target(map, bvm->bi_sector); if (!dm_target_is_valid(ti)) - goto out_table; + goto out; /* * Find maximum amount of I/O that won't need splitting @@ -1436,10 +1473,8 @@ static int dm_merge_bvec(struct request_queue *q, max_size = 0; -out_table: - dm_table_put(map); - out: + dm_put_live_table_fast(md); /* * Always allow an entire first page */ @@ -1458,8 +1493,10 @@ static void _dm_request(struct request_queue *q, struct bio *bio) int rw = bio_data_dir(bio); struct mapped_device *md = 
q->queuedata; int cpu; + int srcu_idx; + struct dm_table *map; - down_read(&md->io_lock); + map = dm_get_live_table(md, &srcu_idx); cpu = part_stat_lock(); part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); @@ -1468,7 +1505,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio) /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { - up_read(&md->io_lock); + dm_put_live_table(md, srcu_idx); if (bio_rw(bio) != READA) queue_io(md, bio); @@ -1477,8 +1514,8 @@ static void _dm_request(struct request_queue *q, struct bio *bio) return; } - __split_and_process_bio(md, bio); - up_read(&md->io_lock); + __split_and_process_bio(md, map, bio); + dm_put_live_table(md, srcu_idx); return; } @@ -1664,7 +1701,8 @@ static struct request *dm_start_request(struct mapped_device *md, struct request static void dm_request_fn(struct request_queue *q) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); struct dm_target *ti; struct request *rq, *clone; sector_t pos; @@ -1719,7 +1757,7 @@ requeued: delay_and_out: blk_delay_queue(q, HZ / 10); out: - dm_table_put(map); + dm_put_live_table(md, srcu_idx); } int dm_underlying_device_busy(struct request_queue *q) @@ -1732,14 +1770,14 @@ static int dm_lld_busy(struct request_queue *q) { int r; struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); + struct dm_table *map = dm_get_live_table_fast(md); if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) r = 1; else r = dm_table_any_busy_target(map); - dm_table_put(map); + dm_put_live_table_fast(md); return r; } @@ -1751,7 +1789,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct dm_table *map; if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_live_table(md); + map = dm_get_live_table_fast(md); if (map) { /* * Request-based 
dm cares about only own queue for @@ -1762,9 +1800,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits) bdi_bits; else r = dm_table_any_congested(map, bdi_bits); - - dm_table_put(map); } + dm_put_live_table_fast(md); } return r; @@ -1869,12 +1906,14 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_minor; + r = init_srcu_struct(&md->io_barrier); + if (r < 0) + goto bad_io_barrier; + md->type = DM_TYPE_NONE; - init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); spin_lock_init(&md->deferred_lock); - rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); @@ -1937,6 +1976,8 @@ bad_thread: bad_disk: blk_cleanup_queue(md->queue); bad_queue: + cleanup_srcu_struct(&md->io_barrier); +bad_io_barrier: free_minor(minor); bad_minor: module_put(THIS_MODULE); @@ -1960,6 +2001,7 @@ static void free_dev(struct mapped_device *md) bioset_free(md->bs); blk_integrity_unregister(md->disk); del_gendisk(md->disk); + cleanup_srcu_struct(&md->io_barrier); free_minor(minor); spin_lock(&_minor_lock); @@ -2102,7 +2144,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct dm_table *old_map; struct request_queue *q = md->queue; sector_t size; - unsigned long flags; int merge_is_optional; size = dm_table_get_size(t); @@ -2131,9 +2172,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, merge_is_optional = dm_table_merge_is_optional(t); - write_lock_irqsave(&md->map_lock, flags); old_map = md->map; - md->map = t; + rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); dm_table_set_restrictions(t, q, limits); @@ -2141,7 +2181,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); else clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - write_unlock_irqrestore(&md->map_lock, flags); + 
dm_sync_table(md); return old_map; } @@ -2152,15 +2192,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, static struct dm_table *__unbind(struct mapped_device *md) { struct dm_table *map = md->map; - unsigned long flags; if (!map) return NULL; dm_table_event_callback(map, NULL, NULL); - write_lock_irqsave(&md->map_lock, flags); - md->map = NULL; - write_unlock_irqrestore(&md->map_lock, flags); + rcu_assign_pointer(md->map, NULL); + dm_sync_table(md); return map; } @@ -2312,11 +2350,12 @@ EXPORT_SYMBOL_GPL(dm_device_name); static void __dm_destroy(struct mapped_device *md, bool wait) { struct dm_table *map; + int srcu_idx; might_sleep(); spin_lock(&_minor_lock); - map = dm_get_live_table(md); + map = dm_get_live_table(md, &srcu_idx); idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); @@ -2326,6 +2365,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait) dm_table_postsuspend_targets(map); } + /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ + dm_put_live_table(md, srcu_idx); + /* * Rare, but there may be I/O requests still going to complete, * for example. Wait for all references to disappear. 
@@ -2340,7 +2382,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) dm_device_name(md), atomic_read(&md->holders)); dm_sysfs_exit(md); - dm_table_put(map); dm_table_destroy(__unbind(md)); free_dev(md); } @@ -2397,8 +2438,10 @@ static void dm_wq_work(struct work_struct *work) struct mapped_device *md = container_of(work, struct mapped_device, work); struct bio *c; + int srcu_idx; + struct dm_table *map; - down_read(&md->io_lock); + map = dm_get_live_table(md, &srcu_idx); while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { spin_lock_irq(&md->deferred_lock); @@ -2408,17 +2451,13 @@ static void dm_wq_work(struct work_struct *work) if (!c) break; - up_read(&md->io_lock); - if (dm_request_based(md)) generic_make_request(c); else - __split_and_process_bio(md, c); - - down_read(&md->io_lock); + __split_and_process_bio(md, map, c); } - up_read(&md->io_lock); + dm_put_live_table(md, srcu_idx); } static void dm_queue_flush(struct mapped_device *md) @@ -2450,10 +2489,10 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) * reappear. */ if (dm_table_has_no_data_devices(table)) { - live_map = dm_get_live_table(md); + live_map = dm_get_live_table_fast(md); if (live_map) limits = md->queue->limits; - dm_table_put(live_map); + dm_put_live_table_fast(md); } if (!live_map) { @@ -2533,7 +2572,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) goto out_unlock; } - map = dm_get_live_table(md); + map = md->map; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2554,7 +2593,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (!noflush && do_lockfs) { r = lock_fs(md); if (r) - goto out; + goto out_unlock; } /* @@ -2569,9 +2608,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call * flush_workqueue(md->wq). 
*/ - down_write(&md->io_lock); set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - up_write(&md->io_lock); + synchronize_srcu(&md->io_barrier); /* * Stop md->queue before flushing md->wq in case request-based @@ -2589,10 +2627,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) */ r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); - down_write(&md->io_lock); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - up_write(&md->io_lock); + synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { @@ -2602,7 +2639,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) start_queue(md->queue); unlock_fs(md); - goto out; /* pushback list is already flushed, so skip flush */ + goto out_unlock; /* pushback list is already flushed, so skip flush */ } /* @@ -2615,9 +2652,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) dm_table_postsuspend_targets(map); -out: - dm_table_put(map); - out_unlock: mutex_unlock(&md->suspend_lock); return r; @@ -2632,7 +2666,7 @@ int dm_resume(struct mapped_device *md) if (!dm_suspended_md(md)) goto out; - map = dm_get_live_table(md); + map = md->map; if (!map || !dm_table_get_size(map)) goto out; @@ -2656,7 +2690,6 @@ int dm_resume(struct mapped_device *md) r = 0; out: - dm_table_put(map); mutex_unlock(&md->suspend_lock); return r; diff --git a/drivers/md/md.c b/drivers/md/md.c index 681d109..dddc87b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -521,6 +521,7 @@ void mddev_init(struct mddev *mddev) init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; + mddev->last_sync_action = "none"; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; @@ -2867,7 +2868,7 @@ static ssize_t offset_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long offset; - if (strict_strtoull(buf, 10, &offset) < 0) + if (kstrtoull(buf, 10, &offset) < 0) return -EINVAL; if 
(rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; @@ -2895,7 +2896,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev, unsigned long long new_offset; struct mddev *mddev = rdev->mddev; - if (strict_strtoull(buf, 10, &new_offset) < 0) + if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL; if (mddev->sync_thread) @@ -2961,7 +2962,7 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) unsigned long long blocks; sector_t new; - if (strict_strtoull(buf, 10, &blocks) < 0) + if (kstrtoull(buf, 10, &blocks) < 0) return -EINVAL; if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) @@ -3069,7 +3070,7 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ if (cmd_match(buf, "none")) recovery_start = MaxSector; - else if (strict_strtoull(buf, 10, &recovery_start)) + else if (kstrtoull(buf, 10, &recovery_start)) return -EINVAL; if (rdev->mddev->pers && @@ -3497,7 +3498,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (clevel[len-1] == '\n') len--; clevel[len] = 0; - if (strict_strtol(clevel, 10, &level)) + if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; if (request_module("md-%s", clevel) != 0) @@ -4272,6 +4273,17 @@ action_store(struct mddev *mddev, const char *page, size_t len) return len; } +static struct md_sysfs_entry md_scan_mode = +__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); + +static ssize_t +last_sync_action_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", mddev->last_sync_action); +} + +static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); + static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { @@ -4280,10 +4292,6 @@ mismatch_cnt_show(struct mddev *mddev, char *page) atomic64_read(&mddev->resync_mismatches)); } -static struct md_sysfs_entry md_scan_mode = -__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); - - static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 
static ssize_t @@ -4356,7 +4364,7 @@ sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) { long n; - if (strict_strtol(buf, 10, &n)) + if (kstrtol(buf, 10, &n)) return -EINVAL; if (n != 0 && n != 1) @@ -4424,7 +4432,7 @@ static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; - if (strict_strtoull(buf, 10, &min)) + if (kstrtoull(buf, 10, &min)) return -EINVAL; if (min > mddev->resync_max) return -EINVAL; @@ -4461,7 +4469,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len) mddev->resync_max = MaxSector; else { unsigned long long max; - if (strict_strtoull(buf, 10, &max)) + if (kstrtoull(buf, 10, &max)) return -EINVAL; if (max < mddev->resync_min) return -EINVAL; @@ -4686,6 +4694,7 @@ static struct attribute *md_default_attrs[] = { static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, + &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, @@ -5268,8 +5277,8 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } @@ -6405,6 +6414,12 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* need to ensure md_delayed_delete() has completed */ flush_workqueue(md_misc_wq); + if (cmd == HOT_REMOVE_DISK) + /* need to ensure recovery thread has run */ + wait_event_interruptible_timeout(mddev->sb_wait, + !test_bit(MD_RECOVERY_NEEDED, + &mddev->flags), + msecs_to_jiffies(5000)); err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -7323,7 +7338,7 @@ void md_do_sync(struct md_thread *thread) sector_t last_check; int skipped = 0; struct md_rdev *rdev; - char *desc; + char *desc, *action = NULL; struct blk_plug plug; /* just incase thread restarts... 
*/ @@ -7333,17 +7348,21 @@ void md_do_sync(struct md_thread *thread) return; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; - else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + action = "check"; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { desc = "requested-resync"; - else + action = "repair"; + } else desc = "resync"; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) desc = "reshape"; else desc = "recovery"; + mddev->last_sync_action = action ?: desc; + /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync @@ -7892,6 +7911,8 @@ void md_check_recovery(struct mddev *mddev) md_new_event(mddev); } unlock: + wake_up(&mddev->sb_wait); + if (!mddev->sync_thread) { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_RECOVER, diff --git a/drivers/md/md.h b/drivers/md/md.h index 653f992b6..20f02c0 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -268,6 +268,14 @@ struct mddev { struct md_thread *thread; /* management thread */ struct md_thread *sync_thread; /* doing resync or reconstruct */ + + /* 'last_sync_action' is initialized to "none". It is set when a + * sync operation (i.e "data-check", "requested-resync", "resync", + * "recovery", or "reshape") is started. It holds this value even + * when the sync thread is "frozen" (interrupted) or "idle" (stopped + * or finished). It is overwritten when a new sync operation is begun. + */ + char *last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. 
So we occasionally pause until diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index fcf65e5..c4d420b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -597,6 +597,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) mdname(mddev)); return ERR_PTR(-EINVAL); } + rdev->sectors = mddev->dev_sectors; } /* Set new parameters */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5595118..ec73458 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -417,7 +417,17 @@ static void raid1_end_write_request(struct bio *bio, int error) r1_bio->bios[mirror] = NULL; to_put = bio; - set_bit(R1BIO_Uptodate, &r1_bio->state); + /* + * Do not set R1BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)) + set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ if (is_badblock(conf->mirrors[mirror].rdev, @@ -870,17 +880,17 @@ static void allow_barrier(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static void freeze_array(struct r1conf *conf) +static void freeze_array(struct r1conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 + * wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for * pending IO requests to complete or be queued for re-try. 
- * Thus the number queued (nr_queued) plus this request (1) + * Thus the number queued (nr_queued) plus this request (extra) * must match the number of pending IOs (nr_pending) before * we continue. */ @@ -888,7 +898,7 @@ static void freeze_array(struct r1conf *conf) conf->barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, + conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); @@ -1509,8 +1519,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) p = conf->mirrors+mirror; if (!p->rdev) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); p->head_position = 0; rdev->raid_disk = mirror; @@ -1544,12 +1555,12 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) * we wait for all outstanding requests to complete. 
*/ synchronize_sched(); - raise_barrier(conf); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; @@ -1595,11 +1606,11 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) */ struct md_rdev *repl = conf->mirrors[conf->raid_disks + number].rdev; - raise_barrier(conf); + freeze_array(conf, 0); clear_bit(Replacement, &repl->flags); p->rdev = repl; conf->mirrors[conf->raid_disks + number].rdev = NULL; - lower_barrier(conf); + unfreeze_array(conf); clear_bit(WantReplacement, &rdev->flags); } else clear_bit(WantReplacement, &rdev->flags); @@ -2195,7 +2206,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) * frozen */ if (mddev->ro == 0) { - freeze_array(conf); + freeze_array(conf, 1); fix_read_error(conf, r1_bio->read_disk, r1_bio->sector, r1_bio->sectors); unfreeze_array(conf); @@ -2780,8 +2791,8 @@ static int run(struct mddev *mddev) return PTR_ERR(conf); if (mddev->queue) - blk_queue_max_write_same_sectors(mddev->queue, - mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, 0); + rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; @@ -2963,7 +2974,7 @@ static int raid1_reshape(struct mddev *mddev) return -ENOMEM; } - raise_barrier(conf); + freeze_array(conf, 0); /* ok, everything is stopped */ oldpool = conf->r1bio_pool; @@ -2994,7 +3005,7 @@ static int raid1_reshape(struct mddev *mddev) conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; - lower_barrier(conf); + unfreeze_array(conf); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 59d4daa..cd066b6 100644 --- a/drivers/md/raid10.c 
+++ b/drivers/md/raid10.c @@ -97,7 +97,7 @@ static int max_queued_requests = 1024; static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); -static int enough(struct r10conf *conf, int ignore); +static int _enough(struct r10conf *conf, int previous, int ignore); static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped); static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); @@ -392,11 +392,9 @@ static void raid10_end_read_request(struct bio *bio, int error) * than fail the last device. Here we redefine * "uptodate" to mean "Don't want to retry" */ - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - if (!enough(conf, rdev->raid_disk)) + if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state), + rdev->raid_disk)) uptodate = 1; - spin_unlock_irqrestore(&conf->device_lock, flags); } if (uptodate) { raid_end_bio_io(r10_bio); @@ -490,7 +488,17 @@ static void raid10_end_write_request(struct bio *bio, int error) sector_t first_bad; int bad_sectors; - set_bit(R10BIO_Uptodate, &r10_bio->state); + /* + * Do not set R10BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + set_bit(R10BIO_Uptodate, &r10_bio->state); /* Maybe we can clear some bad blocks. */ if (is_badblock(rdev, @@ -1055,17 +1063,17 @@ static void allow_barrier(struct r10conf *conf) wake_up(&conf->wait_barrier); } -static void freeze_array(struct r10conf *conf) +static void freeze_array(struct r10conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quiet. 
* We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 + * wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) + * Thus the number queued (nr_queued) plus this request (extra) * must match the number of pending IOs (nr_pending) before * we continue. */ @@ -1073,7 +1081,7 @@ static void freeze_array(struct r10conf *conf) conf->barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, + conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, flush_pending_writes(conf)); @@ -1622,37 +1630,58 @@ static void status(struct seq_file *seq, struct mddev *mddev) * Don't consider the device numbered 'ignore' * as we might be about to remove it. 
*/ -static int _enough(struct r10conf *conf, struct geom *geo, int ignore) +static int _enough(struct r10conf *conf, int previous, int ignore) { int first = 0; + int has_enough = 0; + int disks, ncopies; + if (previous) { + disks = conf->prev.raid_disks; + ncopies = conf->prev.near_copies; + } else { + disks = conf->geo.raid_disks; + ncopies = conf->geo.near_copies; + } + rcu_read_lock(); do { int n = conf->copies; int cnt = 0; int this = first; while (n--) { - if (conf->mirrors[this].rdev && - this != ignore) + struct md_rdev *rdev; + if (this != ignore && + (rdev = rcu_dereference(conf->mirrors[this].rdev)) && + test_bit(In_sync, &rdev->flags)) cnt++; - this = (this+1) % geo->raid_disks; + this = (this+1) % disks; } if (cnt == 0) - return 0; - first = (first + geo->near_copies) % geo->raid_disks; + goto out; + first = (first + ncopies) % disks; } while (first != 0); - return 1; + has_enough = 1; +out: + rcu_read_unlock(); + return has_enough; } static int enough(struct r10conf *conf, int ignore) { - return _enough(conf, &conf->geo, ignore) && - _enough(conf, &conf->prev, ignore); + /* when calling 'enough', both 'prev' and 'geo' must + * be stable. + * This is ensured if ->reconfig_mutex or ->device_lock + * is held. + */ + return _enough(conf, 0, ignore) && + _enough(conf, 1, ignore); } static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r10conf *conf = mddev->private; + unsigned long flags; /* * If it is not operational, then we have already marked it as dead @@ -1660,18 +1689,18 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) * next level up know. * else mark the drive as failed */ + spin_lock_irqsave(&conf->device_lock, flags); if (test_bit(In_sync, &rdev->flags) - && !enough(conf, rdev->raid_disk)) + && !enough(conf, rdev->raid_disk)) { /* * Don't fail the drive, just return an IO error. 
*/ + spin_unlock_irqrestore(&conf->device_lock, flags); return; + } if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - /* + /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); @@ -1679,6 +1708,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" "md/raid10:%s: Operation continuing on %d devices.\n", @@ -1781,7 +1811,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) * very different from resync */ return -EBUSY; - if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) + if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; if (rdev->raid_disk >= 0) @@ -1809,15 +1839,17 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) set_bit(Replacement, &rdev->flags); rdev->raid_disk = mirror; err = 0; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); conf->fullsync = 1; rcu_assign_pointer(p->replacement, rdev); break; } - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); p->head_position = 0; p->recovery_disabled = mddev->recovery_disabled - 1; @@ -1837,8 +1869,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) * we wait for all outstanding requests to complete. 
*/ synchronize_sched(); - raise_barrier(conf, 0); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); @@ -2612,7 +2644,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) r10_bio->devs[slot].bio = NULL; if (mddev->ro == 0) { - freeze_array(conf); + freeze_array(conf, 1); fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); } else @@ -2899,14 +2931,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, */ if (mddev->bitmap == NULL && mddev->recovery_cp == MaxSector && + mddev->reshape_position == MaxSector && + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; - max_sector = mddev->dev_sectors; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || - test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sector = mddev->resync_max_sectors; - return max_sector - sector_nr; + return mddev->dev_sectors - sector_nr; } skipped: @@ -3522,7 +3553,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) /* FIXME calc properly */ conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + - max(0,mddev->delta_disks)), + max(0,-mddev->delta_disks)), GFP_KERNEL); if (!conf->mirrors) goto out; @@ -3609,8 +3640,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_write_same_sectors(mddev->queue, - mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); @@ -3682,7 +3712,7 @@ static int run(struct mddev *mddev) conf->geo.far_offset == 0) goto out_free_conf; if (conf->prev.far_copies != 1 && - conf->geo.far_offset 
== 0) + conf->prev.far_offset == 0) goto out_free_conf; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9359828..2bf094a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -664,6 +664,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_FLUSH; + bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; @@ -701,6 +702,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else rbi->bi_sector = (sh->sector + rrdev->data_offset); + rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; @@ -4922,7 +4924,7 @@ raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) if (!conf) return -ENODEV; - if (strict_strtoul(page, 10, &new)) + if (kstrtoul(page, 10, &new)) return -EINVAL; err = raid5_set_cache_size(mddev, new); if (err) @@ -4955,7 +4957,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) if (!conf) return -ENODEV; - if (strict_strtoul(page, 10, &new)) + if (kstrtoul(page, 10, &new)) return -EINVAL; if (new > conf->max_nr_stripes) return -EINVAL; @@ -5464,7 +5466,7 @@ static int run(struct mddev *mddev) if (mddev->major_version == 0 && mddev->minor_version > 90) rdev->recovery_offset = reshape_offset; - + if (rdev->recovery_offset < reshape_offset) { /* We need to check old and new layout */ if (!only_parity(rdev->raid_disk, @@ -5587,6 +5589,8 @@ static int run(struct mddev *mddev) */ mddev->queue->limits.discard_zeroes_data = 0; + blk_queue_max_write_same_sectors(mddev->queue, 0); + rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -5910,7 +5914,7 @@ static int check_reshape(struct mddev *mddev) return 0; /* nothing to do */ if (has_failed(conf)) return -EINVAL; - if (mddev->delta_disks < 0) { + if 
(mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { /* We might be able to shrink, but the devices must * be made bigger first. * For raid6, 4 is the minimum size. |